1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support six policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * weighted interleave
23 * Allocate memory interleaved over a set of nodes based on
24 * a set of weights (per-node), with normal fallback if it
25 * fails. Otherwise operates the same as interleave.
26 * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27 * on node 0 for every 1 page allocated on node 1.
28 *
29 * bind Only allocate memory on a specific set of nodes,
30 * no fallback.
31 * FIXME: memory is allocated starting with the first node
32 * to the last. It would be better if bind would truly restrict
33 * the allocation to memory nodes instead
34 *
35 * preferred Try a specific node first before normal fallback.
36 * As a special case NUMA_NO_NODE here means do the allocation
37 * on the local CPU. This is normally identical to default,
38 * but useful to set in a VMA when you have a non default
39 * process policy.
40 *
41 * preferred many Try a set of nodes first before normal fallback. This is
42 * similar to preferred without the special case.
43 *
44 * default Allocate on the local node first, or when on a VMA
45 * use the process policy. This is what Linux always did
46 * in a NUMA aware kernel and still does by, ahem, default.
47 *
48 * The process policy is applied for most non interrupt memory allocations
49 * in that process' context. Interrupts ignore the policies and always
50 * try to allocate on the local CPU. The VMA policy is only applied for memory
51 * allocations for a VMA in the VM.
52 *
53 * Currently there are a few corner cases in swapping where the policy
54 * is not applied, but the majority should be handled. When process policy
55 * is used it is not remembered over swap outs/swap ins.
56 *
57 * Only the highest zone in the zone hierarchy gets policied. Allocations
58 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
60 * Same with GFP_DMA allocations.
61 *
62 * For shmem/tmpfs shared memory the policy is shared between
63 * all users and remembered even when nobody has memory mapped.
64 */
65
66 /* Notebook:
67 fix mmap readahead to honour policy and enable policy for any page cache
68 object
69 statistics for bigpages
70 global policy for page cache? currently it uses process policy. Requires
71 first item above.
72 handle mremap for shared memory (currently ignored for the policy)
73 grows down?
74 make bind policy root only? It can trigger oom much faster and the
75 kernel is not always grateful with that.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120
121 #include "internal.h"
122
123 /* Internal flags */
124 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
125 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
126 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
127
128 static struct kmem_cache *policy_cache;
129 static struct kmem_cache *sn_cache;
130
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
133 enum zone_type policy_zone = 0;
134
135 /*
136 * run-time system-wide default policy => local allocation
137 */
138 static struct mempolicy default_policy = {
139 .refcnt = ATOMIC_INIT(1), /* never free it */
140 .mode = MPOL_LOCAL,
141 };
142
143 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
144
145 /*
146 * weightiness balances the tradeoff between small weights (cycles through nodes
147 * faster, more fair/even distribution) and large weights (smaller errors
148 * between actual bandwidth ratios and weight ratios). 32 is a number that has
149 * been found to perform at a reasonable compromise between the two goals.
150 */
151 static const int weightiness = 32;
152
153 /*
154 * A null weighted_interleave_state is interpreted as having .mode="auto",
155 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156 */
157 struct weighted_interleave_state {
158 bool mode_auto;
159 u8 iw_table[];
160 };
161 static struct weighted_interleave_state __rcu *wi_state;
162 static unsigned int *node_bw_table;
163
164 /*
165 * wi_state_lock protects both wi_state and node_bw_table.
166 * node_bw_table is only used by writers to update wi_state.
167 */
168 static DEFINE_MUTEX(wi_state_lock);
169
get_il_weight(int node)170 static u8 get_il_weight(int node)
171 {
172 struct weighted_interleave_state *state;
173 u8 weight = 1;
174
175 rcu_read_lock();
176 state = rcu_dereference(wi_state);
177 if (state)
178 weight = state->iw_table[node];
179 rcu_read_unlock();
180 return weight;
181 }
182
/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock.
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	/* Total bandwidth over all memory nodes; may not fit in 32 bits. */
	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			new_iw[nid] = 1;
		}
		/* Seed the GCD with the first weight, then fold in the rest. */
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}
219
/**
 * mempolicy_set_node_perf - record @node's bandwidth, refresh auto weights
 * @node: node whose performance coordinates changed
 * @coords: access coordinates; min(read, write) bandwidth is used
 *
 * Records the node's bandwidth in node_bw_table and, unless the user has
 * switched weighted interleave to manual mode, rebuilds the weight table
 * from the bandwidth data and publishes it via RCU.
 *
 * Return: 0 on success, -ENOMEM if an allocation fails.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	/* Start from "auto" defaults: every node weighted equally. */
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
		lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	/* Readers may still hold the old state; wait a grace period first. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	kfree(old_bw);
	return 0;
}
276
277 /**
278 * numa_nearest_node - Find nearest node by state
279 * @node: Node id to start the search
280 * @state: State to filter the search
281 *
282 * Lookup the closest node by distance if @nid is not in state.
283 *
284 * Return: this @node if it is in state, otherwise the closest node by distance
285 */
numa_nearest_node(int node,unsigned int state)286 int numa_nearest_node(int node, unsigned int state)
287 {
288 int min_dist = INT_MAX, dist, n, min_node;
289
290 if (state >= NR_NODE_STATES)
291 return -EINVAL;
292
293 if (node == NUMA_NO_NODE || node_state(node, state))
294 return node;
295
296 min_node = node;
297 for_each_node_state(n, state) {
298 dist = node_distance(node, n);
299 if (dist < min_dist) {
300 min_dist = dist;
301 min_node = n;
302 }
303 }
304
305 return min_node;
306 }
307 EXPORT_SYMBOL_GPL(numa_nearest_node);
308
309 /**
310 * nearest_node_nodemask - Find the node in @mask at the nearest distance
311 * from @node.
312 *
313 * @node: a valid node ID to start the search from.
314 * @mask: a pointer to a nodemask representing the allowed nodes.
315 *
316 * This function iterates over all nodes in @mask and calculates the
317 * distance from the starting @node, then it returns the node ID that is
318 * the closest to @node, or MAX_NUMNODES if no node is found.
319 *
320 * Note that @node must be a valid node ID usable with node_distance(),
321 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
322 * or unexpected behavior.
323 */
nearest_node_nodemask(int node,nodemask_t * mask)324 int nearest_node_nodemask(int node, nodemask_t *mask)
325 {
326 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
327
328 for_each_node_mask(n, *mask) {
329 dist = node_distance(node, n);
330 if (dist < min_dist) {
331 min_dist = dist;
332 min_node = n;
333 }
334 }
335
336 return min_node;
337 }
338 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
339
/*
 * Return the memory policy in effect for task @p: the task's own policy
 * if it has one, otherwise the per-node preferred policy for the current
 * CPU's node, falling back to the system default policy when there is no
 * valid node or early in boot.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}
EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
359
360 static const struct mempolicy_operations {
361 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
362 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
363 } mpol_ops[MPOL_MAX];
364
/*
 * Nonzero if the policy remembers the user-supplied nodemask (static or
 * relative nodes); such policies rebind from w.user_nodemask rather than
 * from the cached cpuset mems_allowed.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_USER_NODEMASK_FLAGS;
}
369
/*
 * Compute MPOL_F_RELATIVE_NODES semantics: fold the user's @orig mask to
 * the weight of @rel, then map it onto the nodes actually present in
 * @rel, storing the result in *@ret.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
377
mpol_new_nodemask(struct mempolicy * pol,const nodemask_t * nodes)378 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
379 {
380 if (nodes_empty(*nodes))
381 return -EINVAL;
382 pol->nodes = *nodes;
383 return 0;
384 }
385
mpol_new_preferred(struct mempolicy * pol,const nodemask_t * nodes)386 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
387 {
388 if (nodes_empty(*nodes))
389 return -EINVAL;
390
391 nodes_clear(pol->nodes);
392 node_set(first_node(*nodes), pol->nodes);
393 return 0;
394 }
395
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	/* Restrict the user's nodes to what the cpuset currently allows. */
	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	/* Remember what to rebind against on future cpuset changes. */
	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}
436
/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 *
 * Returns NULL for MPOL_DEFAULT (no policy object needed), an ERR_PTR on
 * invalid mode/flags/nodemask combinations or allocation failure.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	if (mode == MPOL_DEFAULT) {
		/* MPOL_DEFAULT must not be given a nodemask. */
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			/* Empty preferred mask means local allocation. */
			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}
484
485 /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * pol)486 void __mpol_put(struct mempolicy *pol)
487 {
488 if (!atomic_dec_and_test(&pol->refcnt))
489 return;
490 kmem_cache_free(policy_cache, pol);
491 }
492 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
493
/* MPOL_DEFAULT/MPOL_LOCAL carry no nodemask, so rebinding is a no-op. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
497
/*
 * Rebind a nodemask-carrying policy (interleave, weighted interleave,
 * bind) to a changed set of allowed nodes, honouring the policy's
 * static/relative flags.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		/* Static nodes: intersect user's mask with the new set. */
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		/* Relative nodes: remap user's mask onto the new set. */
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/* Default: remap current nodes from old allowed to new. */
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	/* Never leave the policy with an empty nodemask. */
	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}
517
/*
 * MPOL_PREFERRED keeps its preferred node across cpuset changes; only
 * record the new mems_allowed for future comparisons.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
523
524 /*
525 * mpol_rebind_policy - Migrate a policy to a different set of nodes
526 *
527 * Per-vma policies are protected by mmap_lock. Allocations using per-task
528 * policies are protected by task->mems_allowed_seq to prevent a premature
529 * OOM/allocation failure due to parallel nodemask modification.
530 */
mpol_rebind_policy(struct mempolicy * pol,const nodemask_t * newmask)531 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
532 {
533 if (!pol || pol->mode == MPOL_LOCAL)
534 return;
535 if (!mpol_store_user_nodemask(pol) &&
536 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
537 return;
538
539 mpol_ops[pol->mode].rebind(pol, newmask);
540 }
541
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
552
/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		/* Exclude lockless vma readers while the policy changes. */
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}
570
/*
 * Per-mode constructor/rebind operations.  Modes without a ->create
 * (MPOL_DEFAULT, MPOL_LOCAL) carry no nodemask; see mpol_new() and
 * mpol_rebind_policy().
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};
599
600 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
601 unsigned long flags);
602 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
603 pgoff_t ilx, int *nid);
604
strictly_unmovable(unsigned long flags)605 static bool strictly_unmovable(unsigned long flags)
606 {
607 /*
608 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
609 * if any misplaced page is found.
610 */
611 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
612 MPOL_MF_STRICT;
613 }
614
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;	/* policy to allocate the target under */
	pgoff_t ilx;		/* interleave index for the policy */
};

/* Shared walk state for queue_pages_range() and its pagewalk callbacks. */
struct queue_pages {
	struct list_head *pagelist;	/* isolated folios queued here */
	unsigned long flags;		/* MPOL_MF_* control flags */
	nodemask_t *nmask;		/* nodes folios should be on */
	unsigned long start;		/* start of the walked range */
	unsigned long end;		/* end of the walked range */
	struct vm_area_struct *first;	/* first vma seen, for hole checks */
	struct folio *large;	/* note last large folio encountered */
	long nr_failed;		/* could not be isolated at this time */
};
630
631 /*
632 * Check if the folio's nid is in qp->nmask.
633 *
634 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
635 * in the invert of qp->nmask.
636 */
queue_folio_required(struct folio * folio,struct queue_pages * qp)637 static inline bool queue_folio_required(struct folio *folio,
638 struct queue_pages *qp)
639 {
640 int nid = folio_nid(folio);
641 unsigned long flags = qp->flags;
642
643 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
644 }
645
/*
 * Handle a PMD-mapped THP during the queue walk: queue it for migration
 * if it is misplaced relative to the policy.  Called with the pmd lock
 * held (taken in queue_folios_pte_range()).
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	/* Entry already under migration: count it as not isolatable now. */
	if (unlikely(pmd_is_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	/* The huge zero folio is never queued; skip the rest of the range. */
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}
667
/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A transparent huge pmd is handled as one unit under the pmd lock. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* Page table disappeared under us; ask for a re-walk. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			const softleaf_t entry = softleaf_from_pte(ptent);

			/* In-flight migration entries count as failures. */
			if (softleaf_is_migration(entry))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* Advance over all consecutive PTEs mapping this folio. */
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, pte, ptent, max_nr);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios). This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios. This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
764
/*
 * hugetlb walk callback: isolate a hugetlb folio for migration when it is
 * misplaced relative to the policy.  Same return convention as
 * queue_folios_pte_range().
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptep;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptep = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptep)) {
		if (!huge_pte_none(ptep)) {
			const softleaf_t entry = softleaf_from_pte(ptep);

			/* Entry already under migration: count as failed. */
			if (unlikely(softleaf_is_migration(entry)))
				qp->nr_failed++;
		}

		goto unlock;
	}
	folio = pfn_folio(pte_pfn(ptep));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
814
815 #ifdef CONFIG_NUMA_BALANCING
/**
 * folio_can_map_prot_numa() - check whether the folio can map prot numa
 * @folio: The folio whose mapping considered for being made NUMA hintable
 * @vma: The VMA that the folio belongs to.
 * @is_private_single_threaded: Is this a single-threaded private VMA or not
 *
 * This function checks to see if the folio actually indicates that
 * we need to make the mapping one which causes a NUMA hinting fault,
 * as there are cases where it's simply unnecessary, and the folio's
 * access time is adjusted for memory tiering if prot numa needed.
 *
 * Return: True if the mapping of the folio needs to be changed, false otherwise.
 */
bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
			    bool is_private_single_threaded)
{
	int nid;

	/* KSM and device folios cannot usefully take NUMA hinting faults. */
	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
		return false;

	/* Also skip shared copy-on-write folios */
	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
		return false;

	/* Folios are pinned and can't be migrated */
	if (folio_maybe_dma_pinned(folio))
		return false;

	/*
	 * While migration can move some dirty folios,
	 * it cannot move them all from MIGRATE_ASYNC
	 * context.
	 */
	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
		return false;

	/*
	 * Don't mess with PTEs if folio is already on the node
	 * a single-threaded process is running on.
	 */
	nid = folio_nid(folio);
	if (is_private_single_threaded && (nid == numa_node_id()))
		return false;

	/*
	 * Skip scanning top tier node if normal numa
	 * balancing is disabled
	 */
	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	    node_is_toptier(nid))
		return false;

	/* Record the access time used by memory-tiering promotion decisions. */
	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	return true;
}
874
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the number of PTEs that were actually updated.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	/* Account the updates against both the zone and the memcg stats. */
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
902 #endif /* CONFIG_NUMA_BALANCING */
903
/*
 * Decide per-vma whether the queue walk should descend into it.
 * Returns 0 to scan the vma, 1 to skip it, or -EFAULT when the range
 * has a hole and MPOL_MF_DISCONTIG_OK was not given.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}
944
/*
 * Two variants of the same pagewalk: the second write-locks each walked
 * vma (selected via MPOL_MF_WRLOCK in queue_pages_range()).
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
958
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};
	/* MPOL_MF_WRLOCK callers need each walked vma write-locked. */
	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err ? : qp.nr_failed;
}
998
/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	/* Take a private copy so the VMA holds its own reference. */
	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/* Let the backing object store the policy too, if it supports that. */
	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}
1031
/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	/* Clamp the affected region to this VMA. */
	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		/* Range begins inside the VMA: split point is @start. */
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* Nothing to do if the VMA already carries an equal policy. */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	/* Split/merge so that [vmstart, vmend) is a VMA of its own. */
	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}
1059
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* task_lock() serializes against readers of current->mempolicy. */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	/* Restart (weighted) interleaving from scratch for the new policy. */
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
1099
1100 /*
1101 * Return nodemask for policy for get_mempolicy() query
1102 *
1103 * Called with task's alloc_lock held
1104 */
get_policy_nodemask(struct mempolicy * pol,nodemask_t * nodes)1105 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1106 {
1107 nodes_clear(*nodes);
1108 if (pol == &default_policy)
1109 return;
1110
1111 switch (pol->mode) {
1112 case MPOL_BIND:
1113 case MPOL_INTERLEAVE:
1114 case MPOL_PREFERRED:
1115 case MPOL_PREFERRED_MANY:
1116 case MPOL_WEIGHTED_INTERLEAVE:
1117 *nodes = pol->nodes;
1118 break;
1119 case MPOL_LOCAL:
1120 /* return empty node mask for local allocation */
1121 break;
1122 default:
1123 BUG();
1124 }
1125 }
1126
lookup_node(struct mm_struct * mm,unsigned long addr)1127 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1128 {
1129 struct page *p = NULL;
1130 int ret;
1131
1132 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1133 if (ret > 0) {
1134 ret = page_to_nid(p);
1135 put_page(p);
1136 }
1137 return ret;
1138 }
1139
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	/* Reject any flag outside the documented set. */
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MEMS_ALLOWED cannot be combined with NODE or ADDR. */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* Report the node the next allocation would use. */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/* Weight left on il_prev means it is used next. */
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			/* Return exactly what the user originally passed in. */
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1241
1242 #ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio onto @foliolist for later migration.
 *
 * Returns false only when the folio could not be isolated from the LRU
 * (treated as unmovable); skipping a shared folio still returns true.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
		if (folio_isolate_lru(folio)) {
			list_add_tail(&folio->lru, foliolist);
			/* Account the folio as isolated (anon vs file LRU). */
			node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		} else {
			/*
			 * Non-movable folio may reach here. And, there may be
			 * temporary off LRU folios or non-LRU movable folios.
			 * Treat them as unmovable folios since they can't be
			 * isolated, so they can't be moved at the moment.
			 */
			return false;
		}
	}
	return true;
}
1271
/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		/* __GFP_THISNODE: target allocations must land on @dest. */
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	/* Only pages currently on @source are migration candidates. */
	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		/* Empty address space: nothing to migrate. */
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration. Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* Fold isolation failures into the "not migrated" count. */
	if (err >= 0)
		err += nr_failed;
	return err;
}
1323
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of page that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	/* Drain/disable LRU pagevecs so isolation can find all pages. */
	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same. If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient. As we go, we remember the
	 * most recent <s, d> pair that moved (s != d). If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved. If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship. In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	/* Clamp: callers get an int-sized "pages not moved" count. */
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
1424
/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	/* Interleave index is offset by the folio's index in its mapping. */
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		/* hugetlb has its own allocator and gfp mask. */
		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
1459 #else
1460
/* !CONFIG_MIGRATION stub: no folio can ever be queued for migration. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}
1466
/* !CONFIG_MIGRATION stub: page migration is not available. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1472
/* !CONFIG_MIGRATION stub: no migration targets can be allocated. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
1478 #endif
1479
/*
 * mbind(2) worker: apply the policy described by (mode, mode_flags, nmask)
 * to [start, start+len), and optionally migrate misplaced pages.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	/* Moving pages shared with other processes needs CAP_SYS_NICE. */
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* On success the write lock is held until after mbind_range(). */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		/* Apply the new policy to every VMA in the range. */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* Find the first non-KSM folio on the list. */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				/* Locate the VMA that maps this folio. */
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	/* Put back anything that was isolated but never migrated. */
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
1624
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/*
 * Copy a bitmap of @maxnode bits from user space into @mask, using the
 * compat layout for 32-bit callers, and mask off any bits beyond
 * @maxnode in the final word.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	/* Clear the bits above maxnode in the last word. */
	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}
1650
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	/*
	 * NOTE(review): historical ABI — the topmost of the @maxnode bits
	 * is not consumed here; confirm against the mbind(2) man page.
	 */
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/* Partial word: ignore bits below MAX_NUMNODES. */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
1686
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	/* Bytes the user buffer can hold, rounded up to 64-bit words. */
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	/* Bytes of valid nodemask data the kernel actually has. */
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* User buffer is larger: zero the tail beyond kernel data. */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
1713
1714 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)1715 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1716 {
1717 *flags = *mode & MPOL_MODE_FLAGS;
1718 *mode &= ~MPOL_MODE_FLAGS;
1719
1720 if ((unsigned int)(*mode) >= MPOL_MAX)
1721 return -EINVAL;
1722 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1723 return -EINVAL;
1724 if (*flags & MPOL_F_NUMA_BALANCING) {
1725 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1726 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1727 else
1728 return -EINVAL;
1729 }
1730 return 0;
1731 }
1732
/* Decode user-supplied mode/flags/nodemask, then hand off to do_mbind(). */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	nodemask_t nodes;
	unsigned short mode_flags;
	int lmode = mode;
	int ret;

	start = untagged_addr(start);

	ret = sanitize_mpol_flags(&lmode, &mode_flags);
	if (!ret)
		ret = get_nodes(&nodes, nmask, maxnode);
	if (ret)
		return ret;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
1753
/*
 * set_mempolicy_home_node(2): set the preferred "home node" on the
 * MPOL_BIND / MPOL_PREFERRED_MANY policies of VMAs in [start, start+len).
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* Work on a copy; mbind_range() installs its own reference. */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1820
/* mbind(2) syscall entry point: forwards to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1827
1828 /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1829 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1830 unsigned long maxnode)
1831 {
1832 unsigned short mode_flags;
1833 nodemask_t nodes;
1834 int lmode = mode;
1835 int err;
1836
1837 err = sanitize_mpol_flags(&lmode, &mode_flags);
1838 if (err)
1839 return err;
1840
1841 err = get_nodes(&nodes, nmask, maxnode);
1842 if (err)
1843 return err;
1844
1845 return do_set_mempolicy(lmode, mode_flags, &nodes);
1846 }
1847
/* set_mempolicy(2) syscall entry point: forwards to kernel_set_mempolicy(). */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1853
/*
 * migrate_pages(2) worker: move the pages of process @pid from nodes
 * @old_nodes to @new_nodes, subject to ptrace, cpuset and LSM checks.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* Restrict the destination to nodes the caller itself may use. */
	task_nodes = cpuset_mems_allowed(current);
	if (!nodes_and(*new, *new, task_nodes))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
1939
/* migrate_pages(2) syscall entry point: forwards to kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1946
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	/* The user buffer must be able to hold every possible node id. */
	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
1976
/* get_mempolicy(2) syscall entry point: forwards to kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1983
vma_migratable(struct vm_area_struct * vma)1984 bool vma_migratable(struct vm_area_struct *vma)
1985 {
1986 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1987 return false;
1988
1989 /*
1990 * DAX device mappings require predictable access latency, so avoid
1991 * incurring periodic faults.
1992 */
1993 if (vma_is_dax(vma))
1994 return false;
1995
1996 if (is_vm_hugetlb_page(vma) &&
1997 !hugepage_migration_supported(hstate_vma(vma)))
1998 return false;
1999
2000 /*
2001 * Migration allocates pages in the highest zone. If we cannot
2002 * do so then migration (at least from node to node) is not
2003 * possible.
2004 */
2005 if (vma->vm_file &&
2006 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
2007 < policy_zone)
2008 return false;
2009 return true;
2010 }
2011
/*
 * Return the raw policy of @vma (NULL if it has none), preferring the
 * backing object's get_policy() hook when one exists. *ilx is zeroed.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
		unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;

	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr, ilx);

	return vma->vm_policy;
}
2019
/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task. It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
		unsigned long addr, int order, pgoff_t *ilx)
{
	struct mempolicy *pol;

	pol = __get_vma_policy(vma, addr, ilx);
	if (!pol)
		pol = get_task_policy(current);
	if (pol->mode == MPOL_INTERLEAVE ||
	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
		/* Interleave index: page (or huge page) offset of @addr. */
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	}
	return pol;
}
2050
/*
 * vma_policy_mof - does the effective policy for @vma request
 * migrate-on-fault (MPOL_F_MOF)?
 *
 * For shared mappings with a get_policy() vm_op the looked-up policy may
 * carry an extra reference, which is dropped here via mpol_cond_put().
 * Otherwise the VMA policy (or the task policy as fallback) is examined
 * without taking a reference.
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;
		pgoff_t ilx;		/* ignored here */

		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}
2073
/*
 * apply_policy_zone - should @policy's nodemask constrain an allocation
 * for zone type @zone?
 *
 * Returns true when @zone is at or above the zone the policy applies to.
 */
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * If policy->nodes has movable memory only, we apply policy only
	 * when gfp_zone(gfp) == ZONE_MOVABLE.
	 *
	 * policy->nodes intersects with node_states[N_MEMORY], so if the
	 * following test fails, it implies policy->nodes has movable
	 * memory only.  (N_HIGH_MEMORY is used here; presumably it is
	 * equivalent to N_MEMORY for this purpose — confirm against
	 * nodemask.h for the running config.)
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}
2093
/*
 * Pick the next node for dynamic weighted interleave: keep allocating
 * from current->il_prev until its remaining weight (current->il_weight)
 * is exhausted or the node drops out of the policy nodemask, then
 * advance to the next node and reload its weight.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		/* empty nodemask: next_node_in() yields MAX_NUMNODES */
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	/* consume one unit of the current node's weight per allocation */
	current->il_weight--;
	return node;
}
2115
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* nid == MAX_NUMNODES means an empty nodemask; don't record it */
	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}
2132
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.  Falls back to the local memory node when no policy
 * applies (interrupt context, no task policy, or MPOL_LOCAL).
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	/* no task policy to consult outside task context */
	if (!in_task())
		return node;

	policy = current->mempolicy;
	if (!policy)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		return first_node(policy->nodes);

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_WEIGHTED_INTERLEAVE:
		return weighted_interleave_nodes(policy);

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
	{
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->nodes);
		/* fall back to the local node if no zone satisfies the mask */
		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
	}
	case MPOL_LOCAL:
		return node;

	default:
		BUG();
	}
}
2182
/*
 * Snapshot @pol->nodes into *@mask and return the number of set nodes.
 * Callers iterate the local copy without worrying about a concurrent
 * rebind changing the policy's nodemask under them.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes. Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}
2196
/*
 * Static weighted interleave: map interleave index @ilx onto a node by
 * treating the nodemask as a sequence of weighted slots and taking
 * @ilx modulo the total weight.  Weights come from the RCU-protected
 * wi_state table; a missing table means every node weighs 1.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();

	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	/* walk the nodes, consuming each node's weight until target falls */
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}
2236
2237 /*
2238 * Do static interleaving for interleave index @ilx. Returns the ilx'th
2239 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2240 * exceeds the number of present nodes.
2241 */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2242 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2243 {
2244 nodemask_t nodemask;
2245 unsigned int target, nnodes;
2246 int i;
2247 int nid;
2248
2249 nnodes = read_once_policy_nodemask(pol, &nodemask);
2250 if (!nnodes)
2251 return numa_node_id();
2252 target = ilx % nnodes;
2253 nid = first_node(nodemask);
2254 for (i = 0; i < target; i++)
2255 nid = next_node(nid, nodemask);
2256 return nid;
2257 }
2258
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 *
 * Returns NULL (no mask constraint) for modes that express only a
 * preferred node; *@nid is overridden as each mode requires.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				   pgoff_t ilx, int *nid)
{
	nodemask_t *nodemask = NULL;

	switch (pol->mode) {
	case MPOL_PREFERRED:
		/* Override input node id */
		*nid = first_node(pol->nodes);
		break;
	case MPOL_PREFERRED_MANY:
		nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		break;
	case MPOL_BIND:
		/* Restrict to nodemask (but not on lower zones) */
		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
			nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		/*
		 * __GFP_THISNODE shouldn't even be used with the bind policy
		 * because we might easily break the expectation to stay on the
		 * requested node and not break the policy.
		 */
		WARN_ON_ONCE(gfp & __GFP_THISNODE);
		break;
	case MPOL_INTERLEAVE:
		/* Override input node id; NO_INTERLEAVE_INDEX = dynamic */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			interleave_nodes(pol) : interleave_nid(pol, ilx);
		break;
	case MPOL_WEIGHTED_INTERLEAVE:
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			weighted_interleave_nodes(pol) :
			weighted_interleave_nid(pol, ilx);
		break;
	}

	return nodemask;
}
2306
2307 #ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
 * to the mempolicy's @nodemask for filtering the zonelist.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
	      struct mempolicy **mpol, nodemask_t **nodemask)
{
	pgoff_t ilx;
	int nid;

	/* default to the local node; policy_nodemask() may override */
	nid = numa_node_id();
	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
	return nid;
}
2332
/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy. Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;

	if (!(mask && current->mempolicy))
		return false;

	/* task_lock keeps ->mempolicy stable while we copy from it */
	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		*mask = mempolicy->nodes;
		break;

	case MPOL_LOCAL:
		init_nodemask_of_node(mask, numa_node_id());
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
2378 #endif
2379
2380 /*
2381 * mempolicy_in_oom_domain
2382 *
2383 * If tsk's mempolicy is "bind", check for intersection between mask and
2384 * the policy nodemask. Otherwise, return true for all other policies
2385 * including "interleave", as a tsk with "interleave" policy may have
2386 * memory allocated from all nodes in system.
2387 *
2388 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2389 */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2390 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2391 const nodemask_t *mask)
2392 {
2393 struct mempolicy *mempolicy;
2394 bool ret = true;
2395
2396 if (!mask)
2397 return ret;
2398
2399 task_lock(tsk);
2400 mempolicy = tsk->mempolicy;
2401 if (mempolicy && mempolicy->mode == MPOL_BIND)
2402 ret = nodes_intersects(mempolicy->nodes, *mask);
2403 task_unlock(tsk);
2404
2405 return ret;
2406 }
2407
/*
 * Allocate for MPOL_PREFERRED_MANY: first attempt restricted to the
 * preferred nodemask without reclaim, then retry system-wide with the
 * caller's original gfp flags.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
					       int nid, nodemask_t *nodemask)
{
	struct page *page;
	gfp_t preferred_gfp;

	/*
	 * This is a two pass approach. The first pass will only try the
	 * preferred nodes but skip the direct reclaim and allow the
	 * allocation to fail, while the second pass will try all the
	 * nodes in system.
	 */
	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
	if (!page)
		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);

	return page;
}
2428
/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
				     struct mempolicy *pol, pgoff_t ilx, int nid)
{
	nodemask_t *nodemask;
	struct page *page;

	/* resolve policy into a target nid and optional filtering mask */
	nodemask = policy_nodemask(gfp, pol, ilx, &nid);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_preferred_many(gfp, order, nid, nodemask);

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode != MPOL_INTERLEAVE &&
		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
		    (!nodemask || node_isset(nid, *nodemask))) {
			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
			page = __alloc_frozen_pages_noprof(
				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
				nid, NULL);
			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
				return page;
			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
		}
	}

	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

	/* account interleave hits (page landed on the requested node) */
	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
		if (static_branch_likely(&vm_numa_stat_key) &&
		    page_to_nid(page) == nid) {
			preempt_disable();
			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
			preempt_enable();
		}
	}

	return page;
}
2499
/*
 * Folio flavor of alloc_pages_mpol(): forces __GFP_COMP, takes the
 * initial reference on the frozen page, and converts it to a folio.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
				      struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
					     ilx, nid);
	if (!page)
		return NULL;

	set_page_refcounted(page);
	return page_rmappable_folio(page);
}
2511
/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away.  Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of folio_alloc_mpol() is more appropriate.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
				     unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct folio *folio;

	/* droppable memory may vanish under pressure; failures are expected */
	if (vma->vm_flags & VM_DROPPABLE)
		gfp |= __GFP_NOWARN;

	pol = get_vma_policy(vma, addr, order, &ilx);
	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
	/* drop the extra reference a shared (MPOL_F_SHARED) policy carries */
	mpol_cond_put(pol);
	return folio;
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);
2543
/*
 * Allocate frozen (refcount-0) pages honouring the current task's
 * mempolicy when in task context; falls back to the system default
 * policy in interrupt context or for __GFP_THISNODE requests.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}
2558
/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_frozen_pages_noprof(gfp, order);

	/* frozen pages come back with refcount 0; give the caller a ref */
	if (page)
		set_page_refcounted(page);
	return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);
2582
/* Folio flavor of alloc_pages(): forces __GFP_COMP and returns a folio. */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc_noprof);
2588
/*
 * Bulk-allocate @nr_pages spread across the interleave set: every node
 * receives nr_pages / nnodes pages, and the first nr_pages % nnodes
 * nodes visited (in dynamic interleave order) receive one extra page.
 * Returns the number of pages actually allocated; @page_array is filled
 * in allocation order.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nnodes = nodes_weight(pol->nodes);
	unsigned long per_node = nr_pages / nnodes;
	int extra = nr_pages - nnodes * per_node;
	unsigned long done = 0;
	int i;

	for (i = 0; i < nnodes; i++) {
		unsigned long want = per_node;
		unsigned long got;

		/* hand the remainder out one page at a time */
		if (extra) {
			want++;
			extra--;
		}
		got = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol),
					      NULL, want, page_array);
		page_array += got;
		done += got;
	}

	return done;
}
2623
/*
 * Bulk-allocate @nr_pages under MPOL_WEIGHTED_INTERLEAVE.
 *
 * First drains any weight remaining on the task's current interleave
 * node (me->il_prev / me->il_weight), then distributes the rest in
 * whole weighted rounds plus a partial round, updating the task's
 * resume node/weight so subsequent single-page allocations continue
 * where the bulk run left off.  Returns the number of pages allocated.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* no wi_state: all weights default to 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		/* each node gets one full round's worth of pages ... */
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	/* persist resume position so the next allocation continues here */
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2741
/*
 * Bulk flavor of the MPOL_PREFERRED_MANY two-pass scheme: try the
 * preferred nodemask without direct reclaim first, then top up any
 * shortfall from anywhere with the caller's original gfp flags.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t preferred_gfp;
	unsigned long nr_allocated = 0;

	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
						nr_pages, page_array);

	if (nr_allocated < nr_pages)
		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
							nr_pages - nr_allocated,
							page_array + nr_allocated);
	return nr_allocated;
}
2761
2762 /* alloc pages bulk and mempolicy should be considered at the
2763 * same time in some situation such as vmalloc.
2764 *
2765 * It can accelerate memory allocation especially interleaving
2766 * allocate memory.
2767 */
alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,unsigned long nr_pages,struct page ** page_array)2768 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2769 unsigned long nr_pages, struct page **page_array)
2770 {
2771 struct mempolicy *pol = &default_policy;
2772 nodemask_t *nodemask;
2773 int nid;
2774
2775 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2776 pol = get_task_policy(current);
2777
2778 if (pol->mode == MPOL_INTERLEAVE)
2779 return alloc_pages_bulk_interleave(gfp, pol,
2780 nr_pages, page_array);
2781
2782 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2783 return alloc_pages_bulk_weighted_interleave(
2784 gfp, pol, nr_pages, page_array);
2785
2786 if (pol->mode == MPOL_PREFERRED_MANY)
2787 return alloc_pages_bulk_preferred_many(gfp,
2788 numa_node_id(), pol, nr_pages, page_array);
2789
2790 nid = numa_node_id();
2791 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2792 return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2793 nr_pages, page_array);
2794 }
2795
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2796 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2797 {
2798 struct mempolicy *pol = mpol_dup(src->vm_policy);
2799
2800 if (IS_ERR(pol))
2801 return PTR_ERR(pol);
2802 dst->vm_policy = pol;
2803 return 0;
2804 }
2805
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed(). This
 * keeps mempolicies cpuset relative after its cpuset moves. See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we need not do the rebind work for the current task.
 */
2816
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	/* see the comment above: rebind the copy if our cpuset is moving */
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	/* fresh copy starts with its own single reference */
	atomic_set(&new->refcnt, 1);
	return new;
}
2840
2841 /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2842 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2843 {
2844 if (!a || !b)
2845 return false;
2846 if (a->mode != b->mode)
2847 return false;
2848 if (a->flags != b->flags)
2849 return false;
2850 if (a->home_node != b->home_node)
2851 return false;
2852 if (mpol_store_user_nodemask(a))
2853 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2854 return false;
2855
2856 switch (a->mode) {
2857 case MPOL_BIND:
2858 case MPOL_INTERLEAVE:
2859 case MPOL_PREFERRED:
2860 case MPOL_PREFERRED_MANY:
2861 case MPOL_WEIGHTED_INTERLEAVE:
2862 return !!nodes_equal(a->nodes, b->nodes);
2863 case MPOL_LOCAL:
2864 return true;
2865 default:
2866 BUG();
2867 return false;
2868 }
2869 }
2870
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing.
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
				 pgoff_t start, pgoff_t end)
{
	struct rb_node *n = sp->root.rb_node;

	/* standard rb-tree descent: stop at any node overlapping [start, end) */
	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/* walk backwards to the FIRST node that still overlaps start */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
2913
/*
 * Insert a new shared policy into the list.  Caller holds sp->lock for
 * writing.  The new range must not overlap any existing node (overlaps
 * would hit the BUG() below); callers split/remove conflicting ranges
 * beforehand.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
}
2937
/*
 * Find shared policy intersecting idx.  Returns the policy with an extra
 * reference taken (caller must mpol_put/mpol_cond_put), or NULL when the
 * tree is empty or no range covers idx.
 */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
					    pgoff_t idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	/* unlocked fast path: nothing was ever inserted */
	if (!sp->root.rb_node)
		return NULL;
	read_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		/* grab a ref before dropping the lock so the policy survives */
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	read_unlock(&sp->lock);
	return pol;
}
EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2957
/* Release an sp_node: drop its policy reference and free the node. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
2963
2964 /**
2965 * mpol_misplaced - check whether current folio node is valid in policy
2966 *
2967 * @folio: folio to be checked
2968 * @vmf: structure describing the fault
2969 * @addr: virtual address in @vma for shared policy lookup and interleave policy
2970 *
2971 * Lookup current policy node id for vma,addr and "compare to" folio's
2972 * node id. Policy determination "mimics" alloc_page_vma().
2973 * Called from fault path where we know the vma and faulting address.
2974 *
2975 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2976 * policy, or a suitable node ID to allocate a replacement folio from.
2977 */
mpol_misplaced(struct folio * folio,struct vm_fault * vmf,unsigned long addr)2978 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2979 unsigned long addr)
2980 {
2981 struct mempolicy *pol;
2982 pgoff_t ilx;
2983 struct zoneref *z;
2984 int curnid = folio_nid(folio);
2985 struct vm_area_struct *vma = vmf->vma;
2986 int thiscpu = raw_smp_processor_id();
2987 int thisnid = numa_node_id();
2988 int polnid = NUMA_NO_NODE;
2989 int ret = NUMA_NO_NODE;
2990
2991 /*
2992 * Make sure ptl is held so that we don't preempt and we
2993 * have a stable smp processor id
2994 */
2995 lockdep_assert_held(vmf->ptl);
2996 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2997 if (!(pol->flags & MPOL_F_MOF))
2998 goto out;
2999
3000 switch (pol->mode) {
3001 case MPOL_INTERLEAVE:
3002 polnid = interleave_nid(pol, ilx);
3003 break;
3004
3005 case MPOL_WEIGHTED_INTERLEAVE:
3006 polnid = weighted_interleave_nid(pol, ilx);
3007 break;
3008
3009 case MPOL_PREFERRED:
3010 if (node_isset(curnid, pol->nodes))
3011 goto out;
3012 polnid = first_node(pol->nodes);
3013 break;
3014
3015 case MPOL_LOCAL:
3016 polnid = numa_node_id();
3017 break;
3018
3019 case MPOL_BIND:
3020 case MPOL_PREFERRED_MANY:
3021 /*
3022 * Even though MPOL_PREFERRED_MANY can allocate pages outside
3023 * policy nodemask we don't allow numa migration to nodes
3024 * outside policy nodemask for now. This is done so that if we
3025 * want demotion to slow memory to happen, before allocating
3026 * from some DRAM node say 'x', we will end up using a
3027 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
3028 * we should not promote to node 'x' from slow memory node.
3029 */
3030 if (pol->flags & MPOL_F_MORON) {
3031 /*
3032 * Optimize placement among multiple nodes
3033 * via NUMA balancing
3034 */
3035 if (node_isset(thisnid, pol->nodes))
3036 break;
3037 goto out;
3038 }
3039
3040 /*
3041 * use current page if in policy nodemask,
3042 * else select nearest allowed node, if any.
3043 * If no allowed nodes, use current [!misplaced].
3044 */
3045 if (node_isset(curnid, pol->nodes))
3046 goto out;
3047 z = first_zones_zonelist(
3048 node_zonelist(thisnid, GFP_HIGHUSER),
3049 gfp_zone(GFP_HIGHUSER),
3050 &pol->nodes);
3051 polnid = zonelist_node_idx(z);
3052 break;
3053
3054 default:
3055 BUG();
3056 }
3057
3058 /* Migrate the folio towards the node whose CPU is referencing it */
3059 if (pol->flags & MPOL_F_MORON) {
3060 polnid = thisnid;
3061
3062 if (!should_numa_migrate_memory(current, folio, curnid,
3063 thiscpu))
3064 goto out;
3065 }
3066
3067 if (curnid != polnid)
3068 ret = polnid;
3069 out:
3070 mpol_cond_put(pol);
3071
3072 return ret;
3073 }
3074
3075 /*
3076 * Drop the (possibly final) reference to task->mempolicy. It needs to be
3077 * dropped after task->mempolicy is set to NULL so that any allocation done as
3078 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3079 * policy.
3080 */
mpol_put_task_policy(struct task_struct * task)3081 void mpol_put_task_policy(struct task_struct *task)
3082 {
3083 struct mempolicy *pol;
3084
3085 task_lock(task);
3086 pol = task->mempolicy;
3087 task->mempolicy = NULL;
3088 task_unlock(task);
3089 mpol_put(pol);
3090 }
3091
/* Unlink @n from the tree and free it.  Caller holds sp->lock for writing. */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
3097
/* Initialize an sp_node for range [start, end) carrying @pol (no ref taken). */
static void sp_node_init(struct sp_node *node, unsigned long start,
			 unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}
3105
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3106 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3107 struct mempolicy *pol)
3108 {
3109 struct sp_node *n;
3110 struct mempolicy *newpol;
3111
3112 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3113 if (!n)
3114 return NULL;
3115
3116 newpol = mpol_dup(pol);
3117 if (IS_ERR(newpol)) {
3118 kmem_cache_free(sn_cache, n);
3119 return NULL;
3120 }
3121 newpol->flags |= MPOL_F_SHARED;
3122 sp_node_init(n, start, end, newpol);
3123
3124 return n;
3125 }
3126
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* Old range starts inside new range: delete or trim. */
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Must split the old node in two.  The second
				 * half needs a preallocated node + policy;
				 * drop the lock and allocate if we don't
				 * have them yet (then restart from scratch).
				 */
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Free any preallocation that ended up unused (or failed partway). */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* Allocate the split node outside the lock, then retry the walk. */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
3193
3194 /**
3195 * mpol_shared_policy_init - initialize shared policy for inode
3196 * @sp: pointer to inode shared policy
3197 * @mpol: struct mempolicy to install
3198 *
3199 * Install non-NULL @mpol in inode's shared policy rb-tree.
3200 * On entry, the current task has a reference on a non-NULL @mpol.
3201 * This must be released on exit.
3202 * This is called at get_inode() calls and we can use GFP_KERNEL.
3203 */
mpol_shared_policy_init(struct shared_policy * sp,struct mempolicy * mpol)3204 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3205 {
3206 int ret;
3207
3208 sp->root = RB_ROOT; /* empty tree == default mempolicy */
3209 rwlock_init(&sp->lock);
3210
3211 if (mpol) {
3212 struct sp_node *sn;
3213 struct mempolicy *npol;
3214 NODEMASK_SCRATCH(scratch);
3215
3216 if (!scratch)
3217 goto put_mpol;
3218
3219 /* contextualize the tmpfs mount point mempolicy to this file */
3220 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3221 if (IS_ERR(npol))
3222 goto free_scratch; /* no valid nodemask intersection */
3223
3224 task_lock(current);
3225 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3226 task_unlock(current);
3227 if (ret)
3228 goto put_npol;
3229
3230 /* alloc node covering entire file; adds ref to file's npol */
3231 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3232 if (sn)
3233 sp_insert(sp, sn);
3234 put_npol:
3235 mpol_put(npol); /* drop initial ref on file's npol */
3236 free_scratch:
3237 NODEMASK_SCRATCH_FREE(scratch);
3238 put_mpol:
3239 mpol_put(mpol); /* drop our incoming ref on sb mpol */
3240 }
3241 }
3242 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3243
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3244 int mpol_set_shared_policy(struct shared_policy *sp,
3245 struct vm_area_struct *vma, struct mempolicy *pol)
3246 {
3247 int err;
3248 struct sp_node *new = NULL;
3249 unsigned long sz = vma_pages(vma);
3250
3251 if (pol) {
3252 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3253 if (!new)
3254 return -ENOMEM;
3255 }
3256 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3257 if (err && new)
3258 sp_free(new);
3259 return err;
3260 }
3261 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3262
3263 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3264 void mpol_free_shared_policy(struct shared_policy *sp)
3265 {
3266 struct sp_node *n;
3267 struct rb_node *next;
3268
3269 if (!sp->root.rb_node)
3270 return;
3271 write_lock(&sp->lock);
3272 next = rb_first(&sp->root);
3273 while (next) {
3274 n = rb_entry(next, struct sp_node, nd);
3275 next = rb_next(&n->nd);
3276 sp_delete(sp, n);
3277 }
3278 write_unlock(&sp->lock);
3279 }
3280 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3281
3282 #ifdef CONFIG_NUMA_BALANCING
3283 static int __initdata numabalancing_override;
3284
check_numabalancing_enable(void)3285 static void __init check_numabalancing_enable(void)
3286 {
3287 bool numabalancing_default = false;
3288
3289 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3290 numabalancing_default = true;
3291
3292 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3293 if (numabalancing_override)
3294 set_numabalancing_state(numabalancing_override == 1);
3295
3296 if (num_online_nodes() > 1 && !numabalancing_override) {
3297 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3298 numabalancing_default ? "Enabling" : "Disabling");
3299 set_numabalancing_state(numabalancing_default);
3300 }
3301 }
3302
setup_numabalancing(char * str)3303 static int __init setup_numabalancing(char *str)
3304 {
3305 int ret = 0;
3306 if (!str)
3307 goto out;
3308
3309 if (!strcmp(str, "enable")) {
3310 numabalancing_override = 1;
3311 ret = 1;
3312 } else if (!strcmp(str, "disable")) {
3313 numabalancing_override = -1;
3314 ret = 1;
3315 }
3316 out:
3317 if (!ret)
3318 pr_warn("Unable to parse numa_balancing=\n");
3319
3320 return ret;
3321 }
3322 __setup("numa_balancing=", setup_numabalancing);
#else
/* No CONFIG_NUMA_BALANCING: nothing to decide at boot. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
3328
numa_policy_init(void)3329 void __init numa_policy_init(void)
3330 {
3331 nodemask_t interleave_nodes;
3332 unsigned long largest = 0;
3333 int nid, prefer = 0;
3334
3335 policy_cache = kmem_cache_create("numa_policy",
3336 sizeof(struct mempolicy),
3337 0, SLAB_PANIC, NULL);
3338
3339 sn_cache = kmem_cache_create("shared_policy_node",
3340 sizeof(struct sp_node),
3341 0, SLAB_PANIC, NULL);
3342
3343 for_each_node(nid) {
3344 preferred_node_policy[nid] = (struct mempolicy) {
3345 .refcnt = ATOMIC_INIT(1),
3346 .mode = MPOL_PREFERRED,
3347 .flags = MPOL_F_MOF | MPOL_F_MORON,
3348 .nodes = nodemask_of_node(nid),
3349 };
3350 }
3351
3352 /*
3353 * Set interleaving policy for system init. Interleaving is only
3354 * enabled across suitably sized nodes (default is >= 16MB), or
3355 * fall back to the largest node if they're all smaller.
3356 */
3357 nodes_clear(interleave_nodes);
3358 for_each_node_state(nid, N_MEMORY) {
3359 unsigned long total_pages = node_present_pages(nid);
3360
3361 /* Preserve the largest node */
3362 if (largest < total_pages) {
3363 largest = total_pages;
3364 prefer = nid;
3365 }
3366
3367 /* Interleave this node? */
3368 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3369 node_set(nid, interleave_nodes);
3370 }
3371
3372 /* All too small, use the largest */
3373 if (unlikely(nodes_empty(interleave_nodes)))
3374 node_set(prefer, interleave_nodes);
3375
3376 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3377 pr_err("%s: interleaving failed\n", __func__);
3378
3379 check_numabalancing_enable();
3380 }
3381
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
3387
3388 /*
3389 * Parse and format mempolicy from/to strings
3390 */
3391 static const char * const policy_modes[] =
3392 {
3393 [MPOL_DEFAULT] = "default",
3394 [MPOL_PREFERRED] = "prefer",
3395 [MPOL_BIND] = "bind",
3396 [MPOL_INTERLEAVE] = "interleave",
3397 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3398 [MPOL_LOCAL] = "local",
3399 [MPOL_PREFERRED_MANY] = "prefer (many)",
3400 };
3401
#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	/*
	 * @str is split in place by NUL-ing the separators; they are
	 * restored at "out:" so callers can print the original string.
	 */
	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	/* Per-mode validation of whether a nodelist is required/forbidden. */
	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on a empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		/* MPOL_PREFERRED keeps only the first listed node. */
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" with no nodelist degrades to local allocation. */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
#endif /* CONFIG_TMPFS */
3537
3538 /**
3539 * mpol_to_str - format a mempolicy structure for printing
3540 * @buffer: to contain formatted mempolicy string
3541 * @maxlen: length of @buffer
3542 * @pol: pointer to mempolicy to be formatted
3543 *
3544 * Convert @pol into a string. If @buffer is too short, truncate the string.
3545 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3546 * interleave", plus the longest flag flags, "relative|balancing", and to
3547 * display at least a few node ids.
3548 */
mpol_to_str(char * buffer,int maxlen,struct mempolicy * pol)3549 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3550 {
3551 char *p = buffer;
3552 nodemask_t nodes = NODE_MASK_NONE;
3553 unsigned short mode = MPOL_DEFAULT;
3554 unsigned short flags = 0;
3555
3556 if (pol &&
3557 pol != &default_policy &&
3558 !(pol >= &preferred_node_policy[0] &&
3559 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3560 mode = pol->mode;
3561 flags = pol->flags;
3562 }
3563
3564 switch (mode) {
3565 case MPOL_DEFAULT:
3566 case MPOL_LOCAL:
3567 break;
3568 case MPOL_PREFERRED:
3569 case MPOL_PREFERRED_MANY:
3570 case MPOL_BIND:
3571 case MPOL_INTERLEAVE:
3572 case MPOL_WEIGHTED_INTERLEAVE:
3573 nodes = pol->nodes;
3574 break;
3575 default:
3576 WARN_ON_ONCE(1);
3577 snprintf(p, maxlen, "unknown");
3578 return;
3579 }
3580
3581 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3582
3583 if (flags & MPOL_MODE_FLAGS) {
3584 p += snprintf(p, buffer + maxlen - p, "=");
3585
3586 /*
3587 * Static and relative are mutually exclusive.
3588 */
3589 if (flags & MPOL_F_STATIC_NODES)
3590 p += snprintf(p, buffer + maxlen - p, "static");
3591 else if (flags & MPOL_F_RELATIVE_NODES)
3592 p += snprintf(p, buffer + maxlen - p, "relative");
3593
3594 if (flags & MPOL_F_NUMA_BALANCING) {
3595 if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3596 p += snprintf(p, buffer + maxlen - p, "|");
3597 p += snprintf(p, buffer + maxlen - p, "balancing");
3598 }
3599 }
3600
3601 if (!nodes_empty(nodes))
3602 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3603 nodemask_pr_args(&nodes));
3604 }
3605
#ifdef CONFIG_SYSFS
/* One sysfs attribute controlling a single node's interleave weight. */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;	/* node this attribute belongs to */
};

/* The "weighted_interleave" sysfs directory and its per-node attributes. */
struct sysfs_wi_group {
	struct kobject wi_kobj;
	struct mutex kobj_lock;		/* protects nattrs[] */
	struct iw_node_attr *nattrs[];	/* one slot per possible node id */
};

static struct sysfs_wi_group *wi_group;
3619
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3620 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3621 char *buf)
3622 {
3623 struct iw_node_attr *node_attr;
3624 u8 weight;
3625
3626 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3627 weight = get_il_weight(node_attr->nid);
3628 return sysfs_emit(buf, "%d\n", weight);
3629 }
3630
/*
 * sysfs store: set one node's interleave weight.  Publishes a new RCU-
 * protected wi_state copy; readers never see a partially updated table.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	struct iw_node_attr *node_attr;
	u8 weight = 0;
	int i;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	/* Reject empty input, parse failure, and a zero weight. */
	if (count == 0 || sysfs_streq(buf, "") ||
	    kstrtou8(buf, 0, &weight) || weight == 0)
		return -EINVAL;

	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state)
		return -ENOMEM;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
				lockdep_is_held(&wi_state_lock));
	if (old_wi_state) {
		/* Start from the existing table... */
		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
		       nr_node_ids * sizeof(u8));
	} else {
		/* ...or from the all-ones default if none exists yet. */
		for (i = 0; i < nr_node_ids; i++)
			new_wi_state->iw_table[i] = 1;
	}
	new_wi_state->iw_table[node_attr->nid] = weight;
	/* Any manual write takes the table out of auto mode. */
	new_wi_state->mode_auto = false;

	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	/* Free the old state only after all RCU readers are done with it. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3669
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3670 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3671 struct kobj_attribute *attr, char *buf)
3672 {
3673 struct weighted_interleave_state *state;
3674 bool wi_auto = true;
3675
3676 rcu_read_lock();
3677 state = rcu_dereference(wi_state);
3678 if (state)
3679 wi_auto = state->mode_auto;
3680 rcu_read_unlock();
3681
3682 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3683 }
3684
/*
 * sysfs store: switch weighted interleave between automatic (bandwidth-
 * derived weights) and manual mode, publishing a fresh RCU state copy.
 */
static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *bw;
	bool input;
	int i;

	if (kstrtobool(buf, &input))
		return -EINVAL;

	/* Preallocate the replacement state, defaulting every weight to 1. */
	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state)
		return -ENOMEM;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	mutex_lock(&wi_state_lock);
	if (!input) {
		/* Switching to manual mode: keep the current weights. */
		old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
		if (!old_wi_state)
			goto update_wi_state;
		if (input == old_wi_state->mode_auto) {
			/* Already manual: nothing to change. */
			mutex_unlock(&wi_state_lock);
			return count;
		}

		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
		       nr_node_ids * sizeof(u8));
		goto update_wi_state;
	}

	/* Auto mode needs the node bandwidth table to derive weights from. */
	bw = node_bw_table;
	if (!bw) {
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		return -ENODEV;
	}

	new_wi_state->mode_auto = true;
	reduce_interleave_weights(bw, new_wi_state->iw_table);

update_wi_state:
	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	/* Free the superseded state only after an RCU grace period. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3737
sysfs_wi_node_delete(int nid)3738 static void sysfs_wi_node_delete(int nid)
3739 {
3740 struct iw_node_attr *attr;
3741
3742 if (nid < 0 || nid >= nr_node_ids)
3743 return;
3744
3745 mutex_lock(&wi_group->kobj_lock);
3746 attr = wi_group->nattrs[nid];
3747 if (!attr) {
3748 mutex_unlock(&wi_group->kobj_lock);
3749 return;
3750 }
3751
3752 wi_group->nattrs[nid] = NULL;
3753 mutex_unlock(&wi_group->kobj_lock);
3754
3755 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3756 kfree(attr->kobj_attr.attr.name);
3757 kfree(attr);
3758 }
3759
/* Remove the weight attribute of every possible node. */
static void sysfs_wi_node_delete_all(void)
{
	int nid;

	for (nid = 0; nid < nr_node_ids; nid++)
		sysfs_wi_node_delete(nid);
}
3767
/*
 * Drop the global weighted-interleave state.  Readers access it under RCU,
 * so clear the pointer first and wait a grace period before freeing.
 */
static void wi_state_free(void)
{
	struct weighted_interleave_state *old_wi_state;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
			lockdep_is_held(&wi_state_lock));
	rcu_assign_pointer(wi_state, NULL);
	mutex_unlock(&wi_state_lock);

	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
}
3783
/* The "auto" file toggling automatic weight derivation. */
static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
	       weighted_interleave_auto_store);
3787
wi_cleanup(void)3788 static void wi_cleanup(void) {
3789 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3790 sysfs_wi_node_delete_all();
3791 wi_state_free();
3792 }
3793
/* kobject release: wi_kobj is embedded in wi_group, so free the container. */
static void wi_kobj_release(struct kobject *wi_kobj)
{
	kfree(wi_group);
}
3798
/* kobject type for the weighted_interleave directory. */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};
3803
sysfs_wi_node_add(int nid)3804 static int sysfs_wi_node_add(int nid)
3805 {
3806 int ret;
3807 char *name;
3808 struct iw_node_attr *new_attr;
3809
3810 if (nid < 0 || nid >= nr_node_ids) {
3811 pr_err("invalid node id: %d\n", nid);
3812 return -EINVAL;
3813 }
3814
3815 new_attr = kzalloc_obj(*new_attr);
3816 if (!new_attr)
3817 return -ENOMEM;
3818
3819 name = kasprintf(GFP_KERNEL, "node%d", nid);
3820 if (!name) {
3821 kfree(new_attr);
3822 return -ENOMEM;
3823 }
3824
3825 sysfs_attr_init(&new_attr->kobj_attr.attr);
3826 new_attr->kobj_attr.attr.name = name;
3827 new_attr->kobj_attr.attr.mode = 0644;
3828 new_attr->kobj_attr.show = node_show;
3829 new_attr->kobj_attr.store = node_store;
3830 new_attr->nid = nid;
3831
3832 mutex_lock(&wi_group->kobj_lock);
3833 if (wi_group->nattrs[nid]) {
3834 mutex_unlock(&wi_group->kobj_lock);
3835 ret = -EEXIST;
3836 goto out;
3837 }
3838
3839 ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3840 if (ret) {
3841 mutex_unlock(&wi_group->kobj_lock);
3842 goto out;
3843 }
3844 wi_group->nattrs[nid] = new_attr;
3845 mutex_unlock(&wi_group->kobj_lock);
3846 return 0;
3847
3848 out:
3849 kfree(new_attr->kobj_attr.attr.name);
3850 kfree(new_attr);
3851 return ret;
3852 }
3853
wi_node_notifier(struct notifier_block * nb,unsigned long action,void * data)3854 static int wi_node_notifier(struct notifier_block *nb,
3855 unsigned long action, void *data)
3856 {
3857 int err;
3858 struct node_notify *nn = data;
3859 int nid = nn->nid;
3860
3861 switch (action) {
3862 case NODE_ADDED_FIRST_MEMORY:
3863 err = sysfs_wi_node_add(nid);
3864 if (err)
3865 pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3866 nid, err);
3867 break;
3868 case NODE_REMOVED_LAST_MEMORY:
3869 sysfs_wi_node_delete(nid);
3870 break;
3871 }
3872
3873 return NOTIFY_OK;
3874 }
3875
/*
 * Create the "weighted_interleave" sysfs directory under @mempolicy_kobj,
 * populate it with the "auto" file and a weight file per memory node, and
 * register the hotplug notifier that keeps the node files in sync.
 */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	for_each_online_node(nid) {
		/* Only nodes with memory get a weight attribute. */
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

	/*
	 * Unwind order matters: remove files, del the kobject from sysfs,
	 * then put the final reference (release frees wi_group).
	 */
err_cleanup_kobj:
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	kobject_put(&wi_group->wi_kobj);
	return err;
}
3916
/* Create /sys/kernel/mm/mempolicy and its weighted_interleave subtree. */
static int __init mempolicy_sysfs_init(void)
{
	int err;
	static struct kobject *mempolicy_kobj;

	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
	if (!mempolicy_kobj)
		return -ENOMEM;

	err = add_weighted_interleave_group(mempolicy_kobj);
	if (err)
		goto err_kobj;

	return 0;

err_kobj:
	/* del unlinks it from sysfs; put drops the creation reference. */
	kobject_del(mempolicy_kobj);
	kobject_put(mempolicy_kobj);
	return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */
3940