1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints about which node(s) memory
9 * should be allocated on.
10 *
11 * Support six policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * weighted interleave
23 * Allocate memory interleaved over a set of nodes based on
24 * a set of weights (per-node), with normal fallback if it
25 * fails. Otherwise operates the same as interleave.
26 * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27 * on node 0 for every 1 page allocated on node 1.
28 *
29 * bind Only allocate memory on a specific set of nodes,
30 * no fallback.
31 * FIXME: memory is allocated starting with the first node
32 * to the last. It would be better if bind would truly restrict
33 * the allocation to memory nodes instead
34 *
35 * preferred Try a specific node first before normal fallback.
36 * As a special case NUMA_NO_NODE here means do the allocation
37 * on the local CPU. This is normally identical to default,
38 * but useful to set in a VMA when you have a non default
39 * process policy.
40 *
41 * preferred many Try a set of nodes first before normal fallback. This is
42 * similar to preferred without the special case.
43 *
44 * default Allocate on the local node first, or when on a VMA
45 * use the process policy. This is what Linux always did
46 * in a NUMA aware kernel and still does by, ahem, default.
47 *
48 * The process policy is applied for most non-interrupt memory allocations
49 * in that process' context. Interrupts ignore the policies and always
50 * try to allocate on the local CPU. The VMA policy is only applied for memory
51 * allocations for a VMA in the VM.
52 *
53 * Currently there are a few corner cases in swapping where the policy
54 * is not applied, but the majority should be handled. When process policy
55 * is used it is not remembered over swap outs/swap ins.
56 *
57 * Only the highest zone in the zone hierarchy gets policied. Allocations
58 * requesting a lower zone just use default policy. This implies that
59 * on systems with highmem, kernel lowmem allocations don't get policied.
60 * Same with GFP_DMA allocations.
61 *
62 * For shmem/tmpfs shared memory the policy is shared between
63 * all users and remembered even when nobody has memory mapped.
64 */
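
/*
 * Illustrative sketch (not part of the original source): for the weighted
 * interleave description above, nodeset(0,1) with weights (2,1) produces
 * page allocations in the repeating order
 *
 *	node 0, node 0, node 1, node 0, node 0, node 1, ...
 *
 * i.e. two pages land on node 0 for every page that lands on node 1,
 * assuming neither node has to fall back for lack of free memory.
 */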
65
66 /* Notebook:
67 fix mmap readahead to honour policy and enable policy for any page cache
68 object
69 statistics for bigpages
70 global policy for page cache? currently it uses process policy. Requires
71 first item above.
72 handle mremap for shared memory (currently ignored for the policy)
73 grows down?
74 make bind policy root only? It can trigger oom much faster and the
75 kernel is not always grateful with that.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120
121 #include "internal.h"
122
123 /* Internal flags */
124 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
125 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
126 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
127
128 static struct kmem_cache *policy_cache;
129 static struct kmem_cache *sn_cache;
130
131 /* Highest zone. A specific allocation for a zone below that is not
132 policied. */
133 enum zone_type policy_zone = 0;
134
135 /*
136 * run-time system-wide default policy => local allocation
137 */
138 static struct mempolicy default_policy = {
139 .refcnt = ATOMIC_INIT(1), /* never free it */
140 .mode = MPOL_LOCAL,
141 };
142
143 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
144
145 /*
146 * weightiness balances the tradeoff between small weights (cycles through nodes
147 * faster, more fair/even distribution) and large weights (smaller errors
148 * between actual bandwidth ratios and weight ratios). 32 has been found
149 * to be a reasonable compromise between the two goals.
150 */
151 static const int weightiness = 32;
152
153 /*
154 * A null weighted_interleave_state is interpreted as having .mode="auto",
155 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156 */
157 struct weighted_interleave_state {
158 bool mode_auto;
159 u8 iw_table[];
160 };
161 static struct weighted_interleave_state __rcu *wi_state;
162 static unsigned int *node_bw_table;
163
164 /*
165 * wi_state_lock protects both wi_state and node_bw_table.
166 * node_bw_table is only used by writers to update wi_state.
167 */
168 static DEFINE_MUTEX(wi_state_lock);
169
170 static u8 get_il_weight(int node)
171 {
172 struct weighted_interleave_state *state;
173 u8 weight = 1;
174
175 rcu_read_lock();
176 state = rcu_dereference(wi_state);
177 if (state)
178 weight = state->iw_table[node];
179 rcu_read_unlock();
180 return weight;
181 }
182
183 /*
184 * Convert bandwidth values into weighted interleave weights.
185 * Call with wi_state_lock held.
186 */
187 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
188 {
189 u64 sum_bw = 0;
190 unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
191 int nid;
192
193 for_each_node_state(nid, N_MEMORY)
194 sum_bw += bw[nid];
195
196 /* Scale bandwidths to whole numbers in the range [1, weightiness] */
197 for_each_node_state(nid, N_MEMORY) {
198 /*
199 * Try not to perform 64-bit division.
200 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
201 * If sum_bw > scaling_factor, then round the weight up to 1.
202 */
203 scaling_factor = weightiness * bw[nid];
204 if (bw[nid] && sum_bw < scaling_factor) {
205 cast_sum_bw = (unsigned int)sum_bw;
206 new_iw[nid] = scaling_factor / cast_sum_bw;
207 } else {
208 new_iw[nid] = 1;
209 }
210 if (!iw_gcd)
211 iw_gcd = new_iw[nid];
212 iw_gcd = gcd(iw_gcd, new_iw[nid]);
213 }
214
215 /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
216 for_each_node_state(nid, N_MEMORY)
217 new_iw[nid] /= iw_gcd;
218 }
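
/*
 * Worked example (illustrative, with made-up bandwidth numbers): with
 * weightiness = 32 and reported bandwidths bw[0] = 3, bw[1] = 1 (any unit),
 * sum_bw = 4, so
 *
 *	new_iw[0] = (32 * 3) / 4 = 24
 *	new_iw[1] = (32 * 1) / 4 = 8
 *
 * and reducing by gcd(24, 8) = 8 yields final weights 3:1, matching the
 * bandwidth ratio exactly. Ratios that do not divide evenly are truncated,
 * e.g. a 2:1 bandwidth ratio becomes 21:10 rather than exactly 2:1.
 */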
219
220 int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
221 {
222 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
223 unsigned int *old_bw, *new_bw;
224 unsigned int bw_val;
225 int i;
226
227 bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
228 new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
229 if (!new_bw)
230 return -ENOMEM;
231
232 new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
233 GFP_KERNEL);
234 if (!new_wi_state) {
235 kfree(new_bw);
236 return -ENOMEM;
237 }
238 new_wi_state->mode_auto = true;
239 for (i = 0; i < nr_node_ids; i++)
240 new_wi_state->iw_table[i] = 1;
241
242 /*
243 * Update bandwidth info, even in manual mode. That way, when switching
244 * to auto mode in the future, iw_table can be overwritten using
245 * accurate bw data.
246 */
247 mutex_lock(&wi_state_lock);
248
249 old_bw = node_bw_table;
250 if (old_bw)
251 memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
252 new_bw[node] = bw_val;
253 node_bw_table = new_bw;
254
255 old_wi_state = rcu_dereference_protected(wi_state,
256 lockdep_is_held(&wi_state_lock));
257 if (old_wi_state && !old_wi_state->mode_auto) {
258 /* Manual mode; skip reducing weights and updating wi_state */
259 mutex_unlock(&wi_state_lock);
260 kfree(new_wi_state);
261 goto out;
262 }
263
264 /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
265 reduce_interleave_weights(new_bw, new_wi_state->iw_table);
266 rcu_assign_pointer(wi_state, new_wi_state);
267
268 mutex_unlock(&wi_state_lock);
269 if (old_wi_state) {
270 synchronize_rcu();
271 kfree(old_wi_state);
272 }
273 out:
274 kfree(old_bw);
275 return 0;
276 }
277
278 /**
279 * numa_nearest_node - Find nearest node by state
280 * @node: Node id to start the search
281 * @state: State to filter the search
282 *
283 * Lookup the closest node by distance if @node is not in @state.
284 *
285 * Return: this @node if it is in @state, otherwise the closest node by distance
286 */
287 int numa_nearest_node(int node, unsigned int state)
288 {
289 int min_dist = INT_MAX, dist, n, min_node;
290
291 if (state >= NR_NODE_STATES)
292 return -EINVAL;
293
294 if (node == NUMA_NO_NODE || node_state(node, state))
295 return node;
296
297 min_node = node;
298 for_each_node_state(n, state) {
299 dist = node_distance(node, n);
300 if (dist < min_dist) {
301 min_dist = dist;
302 min_node = n;
303 }
304 }
305
306 return min_node;
307 }
308 EXPORT_SYMBOL_GPL(numa_nearest_node);
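
/*
 * Usage sketch (illustrative, not taken from a real caller): a driver that
 * sits on a memoryless node can ask for the closest node that does have
 * memory before allocating, e.g.
 *
 *	int nid = numa_nearest_node(dev_to_node(dev), N_MEMORY);
 *	buf = kzalloc_node(size, GFP_KERNEL, nid);
 *
 * dev, size and buf are hypothetical; only numa_nearest_node() itself is
 * defined in this file.
 */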
309
310 /**
311 * nearest_node_nodemask - Find the node in @mask at the nearest distance
312 * from @node.
313 *
314 * @node: a valid node ID to start the search from.
315 * @mask: a pointer to a nodemask representing the allowed nodes.
316 *
317 * This function iterates over all nodes in @mask and calculates the
318 * distance from the starting @node, then it returns the node ID that is
319 * the closest to @node, or MAX_NUMNODES if no node is found.
320 *
321 * Note that @node must be a valid node ID usable with node_distance(),
322 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
323 * or unexpected behavior.
324 */
325 int nearest_node_nodemask(int node, nodemask_t *mask)
326 {
327 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
328
329 for_each_node_mask(n, *mask) {
330 dist = node_distance(node, n);
331 if (dist < min_dist) {
332 min_dist = dist;
333 min_node = n;
334 }
335 }
336
337 return min_node;
338 }
339 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
340
341 struct mempolicy *get_task_policy(struct task_struct *p)
342 {
343 struct mempolicy *pol = p->mempolicy;
344 int node;
345
346 if (pol)
347 return pol;
348
349 node = numa_node_id();
350 if (node != NUMA_NO_NODE) {
351 pol = &preferred_node_policy[node];
352 /* preferred_node_policy is not initialised early in boot */
353 if (pol->mode)
354 return pol;
355 }
356
357 return &default_policy;
358 }
359 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
360
361 static const struct mempolicy_operations {
362 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
363 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
364 } mpol_ops[MPOL_MAX];
365
366 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
367 {
368 return pol->flags & MPOL_MODE_FLAGS;
369 }
370
371 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
372 const nodemask_t *rel)
373 {
374 nodemask_t tmp;
375 nodes_fold(tmp, *orig, nodes_weight(*rel));
376 nodes_onto(*ret, tmp, *rel);
377 }
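
/*
 * Worked example (illustrative): with MPOL_F_RELATIVE_NODES, a user
 * nodemask of {0,2} remapped relative to an allowed set of {4,5,6} first
 * folds modulo the weight 3 (both bits are already below 3, so it stays
 * {0,2}) and then maps onto the 0th and 2nd set bits of the allowed set,
 * giving {4,6}.
 */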
378
379 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
380 {
381 if (nodes_empty(*nodes))
382 return -EINVAL;
383 pol->nodes = *nodes;
384 return 0;
385 }
386
387 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
388 {
389 if (nodes_empty(*nodes))
390 return -EINVAL;
391
392 nodes_clear(pol->nodes);
393 node_set(first_node(*nodes), pol->nodes);
394 return 0;
395 }
396
397 /*
398 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
399 * any, for the new policy. mpol_new() has already validated the nodes
400 * parameter with respect to the policy mode and flags.
401 *
402 * Must be called holding task's alloc_lock to protect task's mems_allowed
403 * and mempolicy. May also be called holding the mmap_lock for write.
404 */
405 static int mpol_set_nodemask(struct mempolicy *pol,
406 const nodemask_t *nodes, struct nodemask_scratch *nsc)
407 {
408 int ret;
409
410 /*
411 * Default (pol==NULL) and local memory policies are not
412 * subject to any remapping. They also do not need any special
413 * constructor.
414 */
415 if (!pol || pol->mode == MPOL_LOCAL)
416 return 0;
417
418 /* Check N_MEMORY */
419 nodes_and(nsc->mask1,
420 cpuset_current_mems_allowed, node_states[N_MEMORY]);
421
422 VM_BUG_ON(!nodes);
423
424 if (pol->flags & MPOL_F_RELATIVE_NODES)
425 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
426 else
427 nodes_and(nsc->mask2, *nodes, nsc->mask1);
428
429 if (mpol_store_user_nodemask(pol))
430 pol->w.user_nodemask = *nodes;
431 else
432 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
433
434 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
435 return ret;
436 }
437
438 /*
439 * This function just creates a new policy, does some checks and simple
440 * initialization. You must invoke mpol_set_nodemask() to set nodes.
441 */
442 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
443 nodemask_t *nodes)
444 {
445 struct mempolicy *policy;
446
447 if (mode == MPOL_DEFAULT) {
448 if (nodes && !nodes_empty(*nodes))
449 return ERR_PTR(-EINVAL);
450 return NULL;
451 }
452 VM_BUG_ON(!nodes);
453
454 /*
455 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
456 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
457 * All other modes require a valid pointer to a non-empty nodemask.
458 */
459 if (mode == MPOL_PREFERRED) {
460 if (nodes_empty(*nodes)) {
461 if (((flags & MPOL_F_STATIC_NODES) ||
462 (flags & MPOL_F_RELATIVE_NODES)))
463 return ERR_PTR(-EINVAL);
464
465 mode = MPOL_LOCAL;
466 }
467 } else if (mode == MPOL_LOCAL) {
468 if (!nodes_empty(*nodes) ||
469 (flags & MPOL_F_STATIC_NODES) ||
470 (flags & MPOL_F_RELATIVE_NODES))
471 return ERR_PTR(-EINVAL);
472 } else if (nodes_empty(*nodes))
473 return ERR_PTR(-EINVAL);
474
475 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
476 if (!policy)
477 return ERR_PTR(-ENOMEM);
478 atomic_set(&policy->refcnt, 1);
479 policy->mode = mode;
480 policy->flags = flags;
481 policy->home_node = NUMA_NO_NODE;
482
483 return policy;
484 }
485
486 /* Slow path of a mpol destructor. */
487 void __mpol_put(struct mempolicy *pol)
488 {
489 if (!atomic_dec_and_test(&pol->refcnt))
490 return;
491 kmem_cache_free(policy_cache, pol);
492 }
493 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
494
495 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
496 {
497 }
498
499 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
500 {
501 nodemask_t tmp;
502
503 if (pol->flags & MPOL_F_STATIC_NODES)
504 nodes_and(tmp, pol->w.user_nodemask, *nodes);
505 else if (pol->flags & MPOL_F_RELATIVE_NODES)
506 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
507 else {
508 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
509 *nodes);
510 pol->w.cpuset_mems_allowed = *nodes;
511 }
512
513 if (nodes_empty(tmp))
514 tmp = *nodes;
515
516 pol->nodes = tmp;
517 }
518
519 static void mpol_rebind_preferred(struct mempolicy *pol,
520 const nodemask_t *nodes)
521 {
522 pol->w.cpuset_mems_allowed = *nodes;
523 }
524
525 /*
526 * mpol_rebind_policy - Migrate a policy to a different set of nodes
527 *
528 * Per-vma policies are protected by mmap_lock. Allocations using per-task
529 * policies are protected by task->mems_allowed_seq to prevent a premature
530 * OOM/allocation failure due to parallel nodemask modification.
531 */
532 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
533 {
534 if (!pol || pol->mode == MPOL_LOCAL)
535 return;
536 if (!mpol_store_user_nodemask(pol) &&
537 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
538 return;
539
540 mpol_ops[pol->mode].rebind(pol, newmask);
541 }
542
543 /*
544 * Wrapper for mpol_rebind_policy() that just requires task
545 * pointer, and updates task mempolicy.
546 *
547 * Called with task's alloc_lock held.
548 */
549 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
550 {
551 mpol_rebind_policy(tsk->mempolicy, new);
552 }
553
554 /*
555 * Rebind each vma in mm to new nodemask.
556 *
557 * Call holding a reference to mm. Takes mm->mmap_lock during call.
558 */
559 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
560 {
561 struct vm_area_struct *vma;
562 VMA_ITERATOR(vmi, mm, 0);
563
564 mmap_write_lock(mm);
565 for_each_vma(vmi, vma) {
566 vma_start_write(vma);
567 mpol_rebind_policy(vma->vm_policy, new);
568 }
569 mmap_write_unlock(mm);
570 }
571
572 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
573 [MPOL_DEFAULT] = {
574 .rebind = mpol_rebind_default,
575 },
576 [MPOL_INTERLEAVE] = {
577 .create = mpol_new_nodemask,
578 .rebind = mpol_rebind_nodemask,
579 },
580 [MPOL_PREFERRED] = {
581 .create = mpol_new_preferred,
582 .rebind = mpol_rebind_preferred,
583 },
584 [MPOL_BIND] = {
585 .create = mpol_new_nodemask,
586 .rebind = mpol_rebind_nodemask,
587 },
588 [MPOL_LOCAL] = {
589 .rebind = mpol_rebind_default,
590 },
591 [MPOL_PREFERRED_MANY] = {
592 .create = mpol_new_nodemask,
593 .rebind = mpol_rebind_preferred,
594 },
595 [MPOL_WEIGHTED_INTERLEAVE] = {
596 .create = mpol_new_nodemask,
597 .rebind = mpol_rebind_nodemask,
598 },
599 };
600
601 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
602 unsigned long flags);
603 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
604 pgoff_t ilx, int *nid);
605
606 static bool strictly_unmovable(unsigned long flags)
607 {
608 /*
609 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
610 * if any misplaced page is found.
611 */
612 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
613 MPOL_MF_STRICT;
614 }
615
616 struct migration_mpol { /* for alloc_migration_target_by_mpol() */
617 struct mempolicy *pol;
618 pgoff_t ilx;
619 };
620
621 struct queue_pages {
622 struct list_head *pagelist;
623 unsigned long flags;
624 nodemask_t *nmask;
625 unsigned long start;
626 unsigned long end;
627 struct vm_area_struct *first;
628 struct folio *large; /* note last large folio encountered */
629 long nr_failed; /* could not be isolated at this time */
630 };
631
632 /*
633 * Check if the folio's nid is in qp->nmask.
634 *
635 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
636 * in the invert of qp->nmask.
637 */
638 static inline bool queue_folio_required(struct folio *folio,
639 struct queue_pages *qp)
640 {
641 int nid = folio_nid(folio);
642 unsigned long flags = qp->flags;
643
644 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
645 }
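
/*
 * For example, without MPOL_MF_INVERT a folio is "required" (queued) when
 * its node is in qp->nmask; do_mbind() instead passes MPOL_MF_INVERT, so
 * folios residing outside the requested nodemask are the ones queued for
 * migration.
 */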
646
647 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
648 {
649 struct folio *folio;
650 struct queue_pages *qp = walk->private;
651
652 if (unlikely(pmd_is_migration_entry(*pmd))) {
653 qp->nr_failed++;
654 return;
655 }
656 folio = pmd_folio(*pmd);
657 if (is_huge_zero_folio(folio)) {
658 walk->action = ACTION_CONTINUE;
659 return;
660 }
661 if (!queue_folio_required(folio, qp))
662 return;
663 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
664 !vma_migratable(walk->vma) ||
665 !migrate_folio_add(folio, qp->pagelist, qp->flags))
666 qp->nr_failed++;
667 }
668
669 /*
670 * Scan through folios, checking if they satisfy the required conditions,
671 * moving them from the LRU to a local pagelist for migration if they do.
672 *
673 * queue_folios_pte_range() has two possible return values:
674 * 0 - continue walking to scan for more, even if an existing folio on the
675 * wrong node could not be isolated and queued for migration.
676 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
677 * and an existing folio was on a node that does not follow the policy.
678 */
679 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
680 unsigned long end, struct mm_walk *walk)
681 {
682 struct vm_area_struct *vma = walk->vma;
683 struct folio *folio;
684 struct queue_pages *qp = walk->private;
685 unsigned long flags = qp->flags;
686 pte_t *pte, *mapped_pte;
687 pte_t ptent;
688 spinlock_t *ptl;
689 int max_nr, nr;
690
691 ptl = pmd_trans_huge_lock(pmd, vma);
692 if (ptl) {
693 queue_folios_pmd(pmd, walk);
694 spin_unlock(ptl);
695 goto out;
696 }
697
698 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
699 if (!pte) {
700 walk->action = ACTION_AGAIN;
701 return 0;
702 }
703 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
704 max_nr = (end - addr) >> PAGE_SHIFT;
705 nr = 1;
706 ptent = ptep_get(pte);
707 if (pte_none(ptent))
708 continue;
709 if (!pte_present(ptent)) {
710 const softleaf_t entry = softleaf_from_pte(ptent);
711
712 if (softleaf_is_migration(entry))
713 qp->nr_failed++;
714 continue;
715 }
716 folio = vm_normal_folio(vma, addr, ptent);
717 if (!folio || folio_is_zone_device(folio))
718 continue;
719 if (folio_test_large(folio) && max_nr != 1)
720 nr = folio_pte_batch(folio, pte, ptent, max_nr);
721 /*
722 * vm_normal_folio() filters out zero pages, but there might
723 * still be reserved folios to skip, perhaps in a VDSO.
724 */
725 if (folio_test_reserved(folio))
726 continue;
727 if (!queue_folio_required(folio, qp))
728 continue;
729 if (folio_test_large(folio)) {
730 /*
731 * A large folio can only be isolated from LRU once,
732 * but may be mapped by many PTEs (and Copy-On-Write may
733 * intersperse PTEs of other, order 0, folios). This is
734 * a common case, so don't mistake it for failure (but
735 * there can be other cases of multi-mapped pages which
736 * this quick check does not help to filter out - and a
737 * search of the pagelist might grow to be prohibitive).
738 *
739 * migrate_pages(&pagelist) returns nr_failed folios, so
740 * check "large" now so that queue_pages_range() returns
741 * a comparable nr_failed folios. This does imply that
742 * if folio could not be isolated for some racy reason
743 * at its first PTE, later PTEs will not give it another
744 * chance of isolation; but keeps the accounting simple.
745 */
746 if (folio == qp->large)
747 continue;
748 qp->large = folio;
749 }
750 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
751 !vma_migratable(vma) ||
752 !migrate_folio_add(folio, qp->pagelist, flags)) {
753 qp->nr_failed += nr;
754 if (strictly_unmovable(flags))
755 break;
756 }
757 }
758 pte_unmap_unlock(mapped_pte, ptl);
759 cond_resched();
760 out:
761 if (qp->nr_failed && strictly_unmovable(flags))
762 return -EIO;
763 return 0;
764 }
765
766 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
767 unsigned long addr, unsigned long end,
768 struct mm_walk *walk)
769 {
770 #ifdef CONFIG_HUGETLB_PAGE
771 struct queue_pages *qp = walk->private;
772 unsigned long flags = qp->flags;
773 struct folio *folio;
774 spinlock_t *ptl;
775 pte_t ptep;
776
777 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
778 ptep = huge_ptep_get(walk->mm, addr, pte);
779 if (!pte_present(ptep)) {
780 if (!huge_pte_none(ptep)) {
781 const softleaf_t entry = softleaf_from_pte(ptep);
782
783 if (unlikely(softleaf_is_migration(entry)))
784 qp->nr_failed++;
785 }
786
787 goto unlock;
788 }
789 folio = pfn_folio(pte_pfn(ptep));
790 if (!queue_folio_required(folio, qp))
791 goto unlock;
792 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
793 !vma_migratable(walk->vma)) {
794 qp->nr_failed++;
795 goto unlock;
796 }
797 /*
798 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
799 * Choosing not to migrate a shared folio is not counted as a failure.
800 *
801 * See folio_maybe_mapped_shared() on possible imprecision when we
802 * cannot easily detect if a folio is shared.
803 */
804 if ((flags & MPOL_MF_MOVE_ALL) ||
805 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
806 if (!folio_isolate_hugetlb(folio, qp->pagelist))
807 qp->nr_failed++;
808 unlock:
809 spin_unlock(ptl);
810 if (qp->nr_failed && strictly_unmovable(flags))
811 return -EIO;
812 #endif
813 return 0;
814 }
815
816 #ifdef CONFIG_NUMA_BALANCING
817 /**
818 * folio_can_map_prot_numa() - check whether the folio can map prot numa
819 * @folio: The folio whose mapping is considered for being made NUMA hintable
820 * @vma: The VMA that the folio belongs to.
821 * @is_private_single_threaded: Is this a single-threaded private VMA or not
822 *
823 * This function checks to see if the folio actually indicates that
824 * we need to make the mapping one which causes a NUMA hinting fault,
825 * as there are cases where it's simply unnecessary, and the folio's
826 * access time is adjusted for memory tiering if prot numa is needed.
827 *
828 * Return: True if the mapping of the folio needs to be changed, false otherwise.
829 */
830 bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
831 bool is_private_single_threaded)
832 {
833 int nid;
834
835 if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
836 return false;
837
838 /* Also skip shared copy-on-write folios */
839 if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
840 return false;
841
842 /* Folios are pinned and can't be migrated */
843 if (folio_maybe_dma_pinned(folio))
844 return false;
845
846 /*
847 * While migration can move some dirty folios,
848 * it cannot move them all from MIGRATE_ASYNC
849 * context.
850 */
851 if (folio_is_file_lru(folio) && folio_test_dirty(folio))
852 return false;
853
854 /*
855 * Don't mess with PTEs if folio is already on the node
856 * a single-threaded process is running on.
857 */
858 nid = folio_nid(folio);
859 if (is_private_single_threaded && (nid == numa_node_id()))
860 return false;
861
862 /*
863 * Skip scanning top tier node if normal numa
864 * balancing is disabled
865 */
866 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
867 node_is_toptier(nid))
868 return false;
869
870 if (folio_use_access_time(folio))
871 folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
872
873 return true;
874 }
875
876 /*
877 * This is used to mark a range of virtual addresses to be inaccessible.
878 * These are later cleared by a NUMA hinting fault. Depending on these
879 * faults, pages may be migrated for better NUMA placement.
880 *
881 * This is assuming that NUMA faults are handled using PROT_NONE. If
882 * an architecture makes a different choice, it will need further
883 * changes to the core.
884 */
885 unsigned long change_prot_numa(struct vm_area_struct *vma,
886 unsigned long addr, unsigned long end)
887 {
888 struct mmu_gather tlb;
889 long nr_updated;
890
891 tlb_gather_mmu(&tlb, vma->vm_mm);
892
893 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
894 if (nr_updated > 0) {
895 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
896 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
897 }
898
899 tlb_finish_mmu(&tlb);
900
901 return nr_updated;
902 }
903 #endif /* CONFIG_NUMA_BALANCING */
904
905 static int queue_pages_test_walk(unsigned long start, unsigned long end,
906 struct mm_walk *walk)
907 {
908 struct vm_area_struct *next, *vma = walk->vma;
909 struct queue_pages *qp = walk->private;
910 unsigned long flags = qp->flags;
911
912 /* range check first */
913 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
914
915 if (!qp->first) {
916 qp->first = vma;
917 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
918 (qp->start < vma->vm_start))
919 /* hole at head side of range */
920 return -EFAULT;
921 }
922 next = find_vma(vma->vm_mm, vma->vm_end);
923 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
924 ((vma->vm_end < qp->end) &&
925 (!next || vma->vm_end < next->vm_start)))
926 /* hole at middle or tail of range */
927 return -EFAULT;
928
929 /*
930 * Need to check MPOL_MF_STRICT to return -EIO if possible,
931 * regardless of vma_migratable
932 */
933 if (!vma_migratable(vma) &&
934 !(flags & MPOL_MF_STRICT))
935 return 1;
936
937 /*
938 * Check page nodes, and queue pages to move, in the current vma.
939 * But if neither moving nor strict checking is requested, the scan can be skipped.
940 */
941 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
942 return 0;
943 return 1;
944 }
945
946 static const struct mm_walk_ops queue_pages_walk_ops = {
947 .hugetlb_entry = queue_folios_hugetlb,
948 .pmd_entry = queue_folios_pte_range,
949 .test_walk = queue_pages_test_walk,
950 .walk_lock = PGWALK_RDLOCK,
951 };
952
953 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
954 .hugetlb_entry = queue_folios_hugetlb,
955 .pmd_entry = queue_folios_pte_range,
956 .test_walk = queue_pages_test_walk,
957 .walk_lock = PGWALK_WRLOCK,
958 };
959
960 /*
961 * Walk through page tables and collect pages to be migrated.
962 *
963 * If pages found in a given range are not on the required set of @nodes,
964 * and migration is allowed, they are isolated and queued to @pagelist.
965 *
966 * queue_pages_range() may return:
967 * 0 - all pages already on the right node, or successfully queued for moving
968 * (or neither strict checking nor moving requested: only range checking).
969 * >0 - this number of misplaced folios could not be queued for moving
970 * (a hugetlbfs page or a transparent huge page being counted as 1).
971 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
972 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
973 */
974 static long
975 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
976 nodemask_t *nodes, unsigned long flags,
977 struct list_head *pagelist)
978 {
979 int err;
980 struct queue_pages qp = {
981 .pagelist = pagelist,
982 .flags = flags,
983 .nmask = nodes,
984 .start = start,
985 .end = end,
986 .first = NULL,
987 };
988 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
989 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
990
991 err = walk_page_range(mm, start, end, ops, &qp);
992
993 if (!qp.first)
994 /* whole range in hole */
995 err = -EFAULT;
996
997 return err ? : qp.nr_failed;
998 }
999
1000 /*
1001 * Apply policy to a single VMA
1002 * This must be called with the mmap_lock held for writing.
1003 */
1004 static int vma_replace_policy(struct vm_area_struct *vma,
1005 struct mempolicy *pol)
1006 {
1007 int err;
1008 struct mempolicy *old;
1009 struct mempolicy *new;
1010
1011 vma_assert_write_locked(vma);
1012
1013 new = mpol_dup(pol);
1014 if (IS_ERR(new))
1015 return PTR_ERR(new);
1016
1017 if (vma->vm_ops && vma->vm_ops->set_policy) {
1018 err = vma->vm_ops->set_policy(vma, new);
1019 if (err)
1020 goto err_out;
1021 }
1022
1023 old = vma->vm_policy;
1024 vma->vm_policy = new; /* protected by mmap_lock */
1025 mpol_put(old);
1026
1027 return 0;
1028 err_out:
1029 mpol_put(new);
1030 return err;
1031 }
1032
1033 /* Split or merge the VMA (if required) and apply the new policy */
1034 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
1035 struct vm_area_struct **prev, unsigned long start,
1036 unsigned long end, struct mempolicy *new_pol)
1037 {
1038 unsigned long vmstart, vmend;
1039
1040 vmend = min(end, vma->vm_end);
1041 if (start > vma->vm_start) {
1042 *prev = vma;
1043 vmstart = start;
1044 } else {
1045 vmstart = vma->vm_start;
1046 }
1047
1048 if (mpol_equal(vma->vm_policy, new_pol)) {
1049 *prev = vma;
1050 return 0;
1051 }
1052
1053 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
1054 if (IS_ERR(vma))
1055 return PTR_ERR(vma);
1056
1057 *prev = vma;
1058 return vma_replace_policy(vma, new_pol);
1059 }
1060
1061 /* Set the process memory policy */
1062 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
1063 nodemask_t *nodes)
1064 {
1065 struct mempolicy *new, *old;
1066 NODEMASK_SCRATCH(scratch);
1067 int ret;
1068
1069 if (!scratch)
1070 return -ENOMEM;
1071
1072 new = mpol_new(mode, flags, nodes);
1073 if (IS_ERR(new)) {
1074 ret = PTR_ERR(new);
1075 goto out;
1076 }
1077
1078 task_lock(current);
1079 ret = mpol_set_nodemask(new, nodes, scratch);
1080 if (ret) {
1081 task_unlock(current);
1082 mpol_put(new);
1083 goto out;
1084 }
1085
1086 old = current->mempolicy;
1087 current->mempolicy = new;
1088 if (new && (new->mode == MPOL_INTERLEAVE ||
1089 new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
1090 current->il_prev = MAX_NUMNODES-1;
1091 current->il_weight = 0;
1092 }
1093 task_unlock(current);
1094 mpol_put(old);
1095 ret = 0;
1096 out:
1097 NODEMASK_SCRATCH_FREE(scratch);
1098 return ret;
1099 }
1100
1101 /*
1102 * Return nodemask for policy for get_mempolicy() query
1103 *
1104 * Called with task's alloc_lock held
1105 */
1106 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1107 {
1108 nodes_clear(*nodes);
1109 if (pol == &default_policy)
1110 return;
1111
1112 switch (pol->mode) {
1113 case MPOL_BIND:
1114 case MPOL_INTERLEAVE:
1115 case MPOL_PREFERRED:
1116 case MPOL_PREFERRED_MANY:
1117 case MPOL_WEIGHTED_INTERLEAVE:
1118 *nodes = pol->nodes;
1119 break;
1120 case MPOL_LOCAL:
1121 /* return empty node mask for local allocation */
1122 break;
1123 default:
1124 BUG();
1125 }
1126 }
1127
1128 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1129 {
1130 struct page *p = NULL;
1131 int ret;
1132
1133 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1134 if (ret > 0) {
1135 ret = page_to_nid(p);
1136 put_page(p);
1137 }
1138 return ret;
1139 }
1140
1141 /* Retrieve NUMA policy */
1142 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
1143 unsigned long addr, unsigned long flags)
1144 {
1145 int err;
1146 struct mm_struct *mm = current->mm;
1147 struct vm_area_struct *vma = NULL;
1148 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1149
1150 if (flags &
1151 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1152 return -EINVAL;
1153
1154 if (flags & MPOL_F_MEMS_ALLOWED) {
1155 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
1156 return -EINVAL;
1157 *policy = 0; /* just so it's initialized */
1158 task_lock(current);
1159 *nmask = cpuset_current_mems_allowed;
1160 task_unlock(current);
1161 return 0;
1162 }
1163
1164 if (flags & MPOL_F_ADDR) {
1165 pgoff_t ilx; /* ignored here */
1166 /*
1167 * Do NOT fall back to task policy if the
1168 * vma/shared policy at addr is NULL. We
1169 * want to return MPOL_DEFAULT in this case.
1170 */
1171 mmap_read_lock(mm);
1172 vma = vma_lookup(mm, addr);
1173 if (!vma) {
1174 mmap_read_unlock(mm);
1175 return -EFAULT;
1176 }
1177 pol = __get_vma_policy(vma, addr, &ilx);
1178 } else if (addr)
1179 return -EINVAL;
1180
1181 if (!pol)
1182 pol = &default_policy; /* indicates default behavior */
1183
1184 if (flags & MPOL_F_NODE) {
1185 if (flags & MPOL_F_ADDR) {
1186 /*
1187 * Take a refcount on the mpol, because we are about to
1188 * drop the mmap_lock, after which only "pol" remains
1189 * valid, "vma" is stale.
1190 */
1191 pol_refcount = pol;
1192 vma = NULL;
1193 mpol_get(pol);
1194 mmap_read_unlock(mm);
1195 err = lookup_node(mm, addr);
1196 if (err < 0)
1197 goto out;
1198 *policy = err;
1199 } else if (pol == current->mempolicy &&
1200 pol->mode == MPOL_INTERLEAVE) {
1201 *policy = next_node_in(current->il_prev, pol->nodes);
1202 } else if (pol == current->mempolicy &&
1203 pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1204 if (current->il_weight)
1205 *policy = current->il_prev;
1206 else
1207 *policy = next_node_in(current->il_prev,
1208 pol->nodes);
1209 } else {
1210 err = -EINVAL;
1211 goto out;
1212 }
1213 } else {
1214 *policy = pol == &default_policy ? MPOL_DEFAULT :
1215 pol->mode;
1216 /*
1217 * Internal mempolicy flags must be masked off before exposing
1218 * the policy to userspace.
1219 */
1220 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1221 }
1222
1223 err = 0;
1224 if (nmask) {
1225 if (mpol_store_user_nodemask(pol)) {
1226 *nmask = pol->w.user_nodemask;
1227 } else {
1228 task_lock(current);
1229 get_policy_nodemask(pol, nmask);
1230 task_unlock(current);
1231 }
1232 }
1233
1234 out:
1235 mpol_cond_put(pol);
1236 if (vma)
1237 mmap_read_unlock(mm);
1238 if (pol_refcount)
1239 mpol_put(pol_refcount);
1240 return err;
1241 }
1242
1243 #ifdef CONFIG_MIGRATION
1244 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1245 unsigned long flags)
1246 {
1247 /*
1248 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1249 * Choosing not to migrate a shared folio is not counted as a failure.
1250 *
1251 * See folio_maybe_mapped_shared() on possible imprecision when we
1252 * cannot easily detect if a folio is shared.
1253 */
1254 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1255 if (folio_isolate_lru(folio)) {
1256 list_add_tail(&folio->lru, foliolist);
1257 node_stat_mod_folio(folio,
1258 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1259 folio_nr_pages(folio));
1260 } else {
1261 /*
1262 * Non-movable folio may reach here. And, there may be
1263 * temporary off LRU folios or non-LRU movable folios.
1264 * Treat them as unmovable folios since they can't be
1265 * isolated, so they can't be moved at the moment.
1266 */
1267 return false;
1268 }
1269 }
1270 return true;
1271 }
1272
1273 /*
1274 * Migrate pages from one node to a target node.
1275 * Returns error or the number of pages not migrated.
1276 */
1277 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1278 int flags)
1279 {
1280 nodemask_t nmask;
1281 struct vm_area_struct *vma;
1282 LIST_HEAD(pagelist);
1283 long nr_failed;
1284 long err = 0;
1285 struct migration_target_control mtc = {
1286 .nid = dest,
1287 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1288 .reason = MR_SYSCALL,
1289 };
1290
1291 nodes_clear(nmask);
1292 node_set(source, nmask);
1293
1294 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1295
1296 mmap_read_lock(mm);
1297 vma = find_vma(mm, 0);
1298 if (unlikely(!vma)) {
1299 mmap_read_unlock(mm);
1300 return 0;
1301 }
1302
1303 /*
1304 * This does not migrate the range, but isolates all pages that
1305 * need migration. Between passing in the full user address
1306 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1307 * but passes back the count of pages which could not be isolated.
1308 */
1309 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1310 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1311 mmap_read_unlock(mm);
1312
1313 if (!list_empty(&pagelist)) {
1314 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1315 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1316 if (err)
1317 putback_movable_pages(&pagelist);
1318 }
1319
1320 if (err >= 0)
1321 err += nr_failed;
1322 return err;
1323 }
1324
1325 /*
1326 * Move pages between the two nodesets so as to preserve the physical
1327 * layout as much as possible.
1328 *
1329 * Returns the number of pages that could not be moved.
1330 */
1331 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1332 const nodemask_t *to, int flags)
1333 {
1334 long nr_failed = 0;
1335 long err = 0;
1336 nodemask_t tmp;
1337
1338 lru_cache_disable();
1339
1340 /*
1341 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1342 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1343 * bit in 'tmp', and return that <source, dest> pair for migration.
1344 * The pair of nodemasks 'to' and 'from' define the map.
1345 *
1346 * If no pair of bits is found that way, fall back to picking some
1347 * pair of 'source' and 'dest' bits that are not the same. If the
1348 * 'source' and 'dest' bits are the same, this represents a node
1349 * that will be migrating to itself, so no pages need move.
1350 *
1351 * If no bits are left in 'tmp', or if all remaining bits left
1352 * in 'tmp' correspond to the same bit in 'to', return false
1353 * (nothing left to migrate).
1354 *
1355 * This lets us pick a pair of nodes to migrate between, such that
1356 * if possible the dest node is not already occupied by some other
1357 * source node, minimizing the risk of overloading the memory on a
1358 * node that would happen if we migrated incoming memory to a node
1359 * before migrating outgoing memory from that same node.
1360 *
1361 * A single scan of tmp is sufficient. As we go, we remember the
1362 * most recent <s, d> pair that moved (s != d). If we find a pair
1363 * that not only moved, but what's better, moved to an empty slot
1364 * (d is not set in tmp), then we break out then, with that pair.
1365 * Otherwise when we finish scanning tmp, we at least have the
1366 * most recent <s, d> pair that moved. If we get all the way through
1367 * the scan of tmp without finding any node that moved, much less
1368 * moved to an empty node, then there is nothing left worth migrating.
1369 */
1370
1371 tmp = *from;
1372 while (!nodes_empty(tmp)) {
1373 int s, d;
1374 int source = NUMA_NO_NODE;
1375 int dest = 0;
1376
1377 for_each_node_mask(s, tmp) {
1378
1379 /*
1380 * do_migrate_pages() tries to maintain the relative
1381 * node relationship of the pages established between
1382 * threads and memory areas.
1383 *
1384 * However if the number of source nodes is not equal to
1385 * the number of destination nodes we can not preserve
1386 * this node relative relationship. In that case, skip
1387 * copying memory from a node that is in the destination
1388 * mask.
1389 *
1390 * Example: [2,3,4] -> [3,4,5] moves everything.
1391 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1392 */
1393
1394 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1395 (node_isset(s, *to)))
1396 continue;
1397
1398 d = node_remap(s, *from, *to);
1399 if (s == d)
1400 continue;
1401
1402 source = s; /* Node moved. Memorize */
1403 dest = d;
1404
1405 /* dest not in remaining from nodes? */
1406 if (!node_isset(dest, tmp))
1407 break;
1408 }
1409 if (source == NUMA_NO_NODE)
1410 break;
1411
1412 node_clear(source, tmp);
1413 err = migrate_to_node(mm, source, dest, flags);
1414 if (err > 0)
1415 nr_failed += err;
1416 if (err < 0)
1417 break;
1418 }
1419
1420 lru_cache_enable();
1421 if (err < 0)
1422 return err;
1423 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1424 }
1425
1426 /*
1427 * Allocate a new folio for page migration, according to NUMA mempolicy.
1428 */
1429 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1430 unsigned long private)
1431 {
1432 struct migration_mpol *mmpol = (struct migration_mpol *)private;
1433 struct mempolicy *pol = mmpol->pol;
1434 pgoff_t ilx = mmpol->ilx;
1435 unsigned int order;
1436 int nid = numa_node_id();
1437 gfp_t gfp;
1438
1439 order = folio_order(src);
1440 ilx += src->index >> order;
1441
1442 if (folio_test_hugetlb(src)) {
1443 nodemask_t *nodemask;
1444 struct hstate *h;
1445
1446 h = folio_hstate(src);
1447 gfp = htlb_alloc_mask(h);
1448 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1449 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1450 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1451 }
1452
1453 if (folio_test_large(src))
1454 gfp = GFP_TRANSHUGE;
1455 else
1456 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1457
1458 return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1459 }
1460 #else
1461
1462 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1463 unsigned long flags)
1464 {
1465 return false;
1466 }
1467
1468 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1469 const nodemask_t *to, int flags)
1470 {
1471 return -ENOSYS;
1472 }
1473
1474 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1475 unsigned long private)
1476 {
1477 return NULL;
1478 }
1479 #endif
1480
1481 static long do_mbind(unsigned long start, unsigned long len,
1482 unsigned short mode, unsigned short mode_flags,
1483 nodemask_t *nmask, unsigned long flags)
1484 {
1485 struct mm_struct *mm = current->mm;
1486 struct vm_area_struct *vma, *prev;
1487 struct vma_iterator vmi;
1488 struct migration_mpol mmpol;
1489 struct mempolicy *new;
1490 unsigned long end;
1491 long err;
1492 long nr_failed;
1493 LIST_HEAD(pagelist);
1494
1495 if (flags & ~(unsigned long)MPOL_MF_VALID)
1496 return -EINVAL;
1497 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1498 return -EPERM;
1499
1500 if (start & ~PAGE_MASK)
1501 return -EINVAL;
1502
1503 if (mode == MPOL_DEFAULT)
1504 flags &= ~MPOL_MF_STRICT;
1505
1506 len = PAGE_ALIGN(len);
1507 end = start + len;
1508
1509 if (end < start)
1510 return -EINVAL;
1511 if (end == start)
1512 return 0;
1513
1514 new = mpol_new(mode, mode_flags, nmask);
1515 if (IS_ERR(new))
1516 return PTR_ERR(new);
1517
1518 /*
1519 * If we are using the default policy then operation
1520 * on discontinuous address spaces is okay after all
1521 */
1522 if (!new)
1523 flags |= MPOL_MF_DISCONTIG_OK;
1524
1525 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1526 lru_cache_disable();
1527 {
1528 NODEMASK_SCRATCH(scratch);
1529 if (scratch) {
1530 mmap_write_lock(mm);
1531 err = mpol_set_nodemask(new, nmask, scratch);
1532 if (err)
1533 mmap_write_unlock(mm);
1534 } else
1535 err = -ENOMEM;
1536 NODEMASK_SCRATCH_FREE(scratch);
1537 }
1538 if (err)
1539 goto mpol_out;
1540
1541 /*
1542 * Lock the VMAs before scanning for pages to migrate,
1543 * to ensure we don't miss a concurrently inserted page.
1544 */
1545 nr_failed = queue_pages_range(mm, start, end, nmask,
1546 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1547
1548 if (nr_failed < 0) {
1549 err = nr_failed;
1550 nr_failed = 0;
1551 } else {
1552 vma_iter_init(&vmi, mm, start);
1553 prev = vma_prev(&vmi);
1554 for_each_vma_range(vmi, vma, end) {
1555 err = mbind_range(&vmi, vma, &prev, start, end, new);
1556 if (err)
1557 break;
1558 }
1559 }
1560
1561 if (!err && !list_empty(&pagelist)) {
1562 /* Convert MPOL_DEFAULT's NULL to task or default policy */
1563 if (!new) {
1564 new = get_task_policy(current);
1565 mpol_get(new);
1566 }
1567 mmpol.pol = new;
1568 mmpol.ilx = 0;
1569
1570 /*
1571 * In the interleaved case, attempt to allocate on exactly the
1572 * targeted nodes, for the first VMA to be migrated; for later
1573 * VMAs, the nodes will still be interleaved from the targeted
1574 * nodemask, but one by one may be selected differently.
1575 */
1576 if (new->mode == MPOL_INTERLEAVE ||
1577 new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1578 struct folio *folio;
1579 unsigned int order;
1580 unsigned long addr = -EFAULT;
1581
1582 list_for_each_entry(folio, &pagelist, lru) {
1583 if (!folio_test_ksm(folio))
1584 break;
1585 }
1586 if (!list_entry_is_head(folio, &pagelist, lru)) {
1587 vma_iter_init(&vmi, mm, start);
1588 for_each_vma_range(vmi, vma, end) {
1589 addr = page_address_in_vma(folio,
1590 folio_page(folio, 0), vma);
1591 if (addr != -EFAULT)
1592 break;
1593 }
1594 }
1595 if (addr != -EFAULT) {
1596 order = folio_order(folio);
1597 /* We already know the pol, but not the ilx */
1598 mpol_cond_put(get_vma_policy(vma, addr, order,
1599 &mmpol.ilx));
1600 /* Set base from which to increment by index */
1601 mmpol.ilx -= folio->index >> order;
1602 }
1603 }
1604 }
1605
1606 mmap_write_unlock(mm);
1607
1608 if (!err && !list_empty(&pagelist)) {
1609 nr_failed |= migrate_pages(&pagelist,
1610 alloc_migration_target_by_mpol, NULL,
1611 (unsigned long)&mmpol, MIGRATE_SYNC,
1612 MR_MEMPOLICY_MBIND, NULL);
1613 }
1614
1615 if (nr_failed && (flags & MPOL_MF_STRICT))
1616 err = -EIO;
1617 if (!list_empty(&pagelist))
1618 putback_movable_pages(&pagelist);
1619 mpol_out:
1620 mpol_put(new);
1621 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1622 lru_cache_enable();
1623 return err;
1624 }
1625
1626 /*
1627 * User space interface with variable sized bitmaps for nodelists.
1628 */
1629 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1630 unsigned long maxnode)
1631 {
1632 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1633 int ret;
1634
1635 if (in_compat_syscall())
1636 ret = compat_get_bitmap(mask,
1637 (const compat_ulong_t __user *)nmask,
1638 maxnode);
1639 else
1640 ret = copy_from_user(mask, nmask,
1641 nlongs * sizeof(unsigned long));
1642
1643 if (ret)
1644 return -EFAULT;
1645
1646 if (maxnode % BITS_PER_LONG)
1647 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1648
1649 return 0;
1650 }
1651
1652 /* Copy a node mask from user space. */
1653 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1654 unsigned long maxnode)
1655 {
1656 --maxnode;
1657 nodes_clear(*nodes);
1658 if (maxnode == 0 || !nmask)
1659 return 0;
1660 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1661 return -EINVAL;
1662
1663 /*
1664 * When the user specifies more nodes than supported, just check
1665 * that the non-supported part is all zero, one word at a time,
1666 * starting at the end.
1667 */
1668 while (maxnode > MAX_NUMNODES) {
1669 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1670 unsigned long t;
1671
1672 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1673 return -EFAULT;
1674
1675 if (maxnode - bits >= MAX_NUMNODES) {
1676 maxnode -= bits;
1677 } else {
1678 maxnode = MAX_NUMNODES;
1679 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1680 }
1681 if (t)
1682 return -EINVAL;
1683 }
1684
1685 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1686 }
1687
1688 /* Copy a kernel node mask to user space */
1689 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1690 nodemask_t *nodes)
1691 {
1692 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1693 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1694 bool compat = in_compat_syscall();
1695
1696 if (compat)
1697 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1698
1699 if (copy > nbytes) {
1700 if (copy > PAGE_SIZE)
1701 return -EINVAL;
1702 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1703 return -EFAULT;
1704 copy = nbytes;
1705 maxnode = nr_node_ids;
1706 }
1707
1708 if (compat)
1709 return compat_put_bitmap((compat_ulong_t __user *)mask,
1710 nodes_addr(*nodes), maxnode);
1711
1712 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1713 }
1714
1715 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1716 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1717 {
1718 *flags = *mode & MPOL_MODE_FLAGS;
1719 *mode &= ~MPOL_MODE_FLAGS;
1720
1721 if ((unsigned int)(*mode) >= MPOL_MAX)
1722 return -EINVAL;
1723 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1724 return -EINVAL;
1725 if (*flags & MPOL_F_NUMA_BALANCING) {
1726 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1727 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1728 else
1729 return -EINVAL;
1730 }
1731 return 0;
1732 }
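
/*
 * Example (illustrative): a caller of mbind()/set_mempolicy() passing
 * mode = MPOL_INTERLEAVE | MPOL_F_STATIC_NODES ends up here with
 * *mode = MPOL_INTERLEAVE and *flags = MPOL_F_STATIC_NODES. Combining
 * MPOL_F_STATIC_NODES with MPOL_F_RELATIVE_NODES, or MPOL_F_NUMA_BALANCING
 * with anything other than MPOL_BIND/MPOL_PREFERRED_MANY, is rejected
 * with -EINVAL.
 */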
1733
1734 static long kernel_mbind(unsigned long start, unsigned long len,
1735 unsigned long mode, const unsigned long __user *nmask,
1736 unsigned long maxnode, unsigned int flags)
1737 {
1738 unsigned short mode_flags;
1739 nodemask_t nodes;
1740 int lmode = mode;
1741 int err;
1742
1743 start = untagged_addr(start);
1744 err = sanitize_mpol_flags(&lmode, &mode_flags);
1745 if (err)
1746 return err;
1747
1748 err = get_nodes(&nodes, nmask, maxnode);
1749 if (err)
1750 return err;
1751
1752 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1753 }
1754
1755 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1756 unsigned long, home_node, unsigned long, flags)
1757 {
1758 struct mm_struct *mm = current->mm;
1759 struct vm_area_struct *vma, *prev;
1760 struct mempolicy *new, *old;
1761 unsigned long end;
1762 int err = -ENOENT;
1763 VMA_ITERATOR(vmi, mm, start);
1764
1765 start = untagged_addr(start);
1766 if (start & ~PAGE_MASK)
1767 return -EINVAL;
1768 /*
1769 * flags is used for future extension if any.
1770 */
1771 if (flags != 0)
1772 return -EINVAL;
1773
1774 /*
1775 * Check home_node is online to avoid accessing uninitialized
1776 * NODE_DATA.
1777 */
1778 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1779 return -EINVAL;
1780
1781 len = PAGE_ALIGN(len);
1782 end = start + len;
1783
1784 if (end < start)
1785 return -EINVAL;
1786 if (end == start)
1787 return 0;
1788 mmap_write_lock(mm);
1789 prev = vma_prev(&vmi);
1790 for_each_vma_range(vmi, vma, end) {
1791 /*
1792 		 * If any vma in the range has a policy other than MPOL_BIND
1793 		 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1794 * the home node for vmas we already updated before.
1795 */
1796 old = vma_policy(vma);
1797 if (!old) {
1798 prev = vma;
1799 continue;
1800 }
1801 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1802 err = -EOPNOTSUPP;
1803 break;
1804 }
1805 new = mpol_dup(old);
1806 if (IS_ERR(new)) {
1807 err = PTR_ERR(new);
1808 break;
1809 }
1810
1811 vma_start_write(vma);
1812 new->home_node = home_node;
1813 err = mbind_range(&vmi, vma, &prev, start, end, new);
1814 mpol_put(new);
1815 if (err)
1816 break;
1817 }
1818 mmap_write_unlock(mm);
1819 return err;
1820 }
1821
1822 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1823 unsigned long, mode, const unsigned long __user *, nmask,
1824 unsigned long, maxnode, unsigned int, flags)
1825 {
1826 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1827 }
1828
1829 /* Set the process memory policy */
1830 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1831 unsigned long maxnode)
1832 {
1833 unsigned short mode_flags;
1834 nodemask_t nodes;
1835 int lmode = mode;
1836 int err;
1837
1838 err = sanitize_mpol_flags(&lmode, &mode_flags);
1839 if (err)
1840 return err;
1841
1842 err = get_nodes(&nodes, nmask, maxnode);
1843 if (err)
1844 return err;
1845
1846 return do_set_mempolicy(lmode, mode_flags, &nodes);
1847 }
1848
1849 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1850 unsigned long, maxnode)
1851 {
1852 return kernel_set_mempolicy(mode, nmask, maxnode);
1853 }
1854
1855 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1856 const unsigned long __user *old_nodes,
1857 const unsigned long __user *new_nodes)
1858 {
1859 struct mm_struct *mm = NULL;
1860 struct task_struct *task;
1861 nodemask_t task_nodes;
1862 int err;
1863 nodemask_t *old;
1864 nodemask_t *new;
1865 NODEMASK_SCRATCH(scratch);
1866
1867 if (!scratch)
1868 return -ENOMEM;
1869
1870 old = &scratch->mask1;
1871 new = &scratch->mask2;
1872
1873 err = get_nodes(old, old_nodes, maxnode);
1874 if (err)
1875 goto out;
1876
1877 err = get_nodes(new, new_nodes, maxnode);
1878 if (err)
1879 goto out;
1880
1881 /* Find the mm_struct */
1882 rcu_read_lock();
1883 task = pid ? find_task_by_vpid(pid) : current;
1884 if (!task) {
1885 rcu_read_unlock();
1886 err = -ESRCH;
1887 goto out;
1888 }
1889 get_task_struct(task);
1890
1891 err = -EINVAL;
1892
1893 /*
1894 * Check if this process has the right to modify the specified process.
1895 * Use the regular "ptrace_may_access()" checks.
1896 */
1897 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1898 rcu_read_unlock();
1899 err = -EPERM;
1900 goto out_put;
1901 }
1902 rcu_read_unlock();
1903
1904 task_nodes = cpuset_mems_allowed(task);
1905 /* Is the user allowed to access the target nodes? */
1906 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1907 err = -EPERM;
1908 goto out_put;
1909 }
1910
1911 task_nodes = cpuset_mems_allowed(current);
1912 nodes_and(*new, *new, task_nodes);
1913 if (nodes_empty(*new))
1914 goto out_put;
1915
1916 err = security_task_movememory(task);
1917 if (err)
1918 goto out_put;
1919
1920 mm = get_task_mm(task);
1921 put_task_struct(task);
1922
1923 if (!mm) {
1924 err = -EINVAL;
1925 goto out;
1926 }
1927
1928 err = do_migrate_pages(mm, old, new,
1929 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1930
1931 mmput(mm);
1932 out:
1933 NODEMASK_SCRATCH_FREE(scratch);
1934
1935 return err;
1936
1937 out_put:
1938 put_task_struct(task);
1939 goto out;
1940 }
1941
1942 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1943 const unsigned long __user *, old_nodes,
1944 const unsigned long __user *, new_nodes)
1945 {
1946 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1947 }
1948
1949 /* Retrieve NUMA policy */
1950 static int kernel_get_mempolicy(int __user *policy,
1951 unsigned long __user *nmask,
1952 unsigned long maxnode,
1953 unsigned long addr,
1954 unsigned long flags)
1955 {
1956 int err;
1957 int pval;
1958 nodemask_t nodes;
1959
1960 if (nmask != NULL && maxnode < nr_node_ids)
1961 return -EINVAL;
1962
1963 addr = untagged_addr(addr);
1964
1965 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1966
1967 if (err)
1968 return err;
1969
1970 if (policy && put_user(pval, policy))
1971 return -EFAULT;
1972
1973 if (nmask)
1974 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1975
1976 return err;
1977 }
1978
1979 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1980 unsigned long __user *, nmask, unsigned long, maxnode,
1981 unsigned long, addr, unsigned long, flags)
1982 {
1983 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1984 }
1985
1986 bool vma_migratable(struct vm_area_struct *vma)
1987 {
1988 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1989 return false;
1990
1991 /*
1992 * DAX device mappings require predictable access latency, so avoid
1993 * incurring periodic faults.
1994 */
1995 if (vma_is_dax(vma))
1996 return false;
1997
1998 if (is_vm_hugetlb_page(vma) &&
1999 !hugepage_migration_supported(hstate_vma(vma)))
2000 return false;
2001
2002 /*
2003 * Migration allocates pages in the highest zone. If we cannot
2004 * do so then migration (at least from node to node) is not
2005 * possible.
2006 */
2007 if (vma->vm_file &&
2008 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
2009 < policy_zone)
2010 return false;
2011 return true;
2012 }
2013
2014 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
2015 unsigned long addr, pgoff_t *ilx)
2016 {
2017 *ilx = 0;
2018 return (vma->vm_ops && vma->vm_ops->get_policy) ?
2019 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
2020 }
2021
2022 /*
2023 * get_vma_policy(@vma, @addr, @order, @ilx)
2024 * @vma: virtual memory area whose policy is sought
2025 * @addr: address in @vma for shared policy lookup
2026 * @order: 0, or appropriate huge_page_order for interleaving
2027 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2028 * MPOL_WEIGHTED_INTERLEAVE
2029 *
2030 * Returns effective policy for a VMA at specified address.
2031 * Falls back to current->mempolicy or system default policy, as necessary.
2032 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2033 * count--added by the get_policy() vm_op, as appropriate--to protect against
2034 * freeing by another task. It is the caller's responsibility to free the
2035 * extra reference for shared policies.
2036 */
2037 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
2038 unsigned long addr, int order, pgoff_t *ilx)
2039 {
2040 struct mempolicy *pol;
2041
2042 pol = __get_vma_policy(vma, addr, ilx);
2043 if (!pol)
2044 pol = get_task_policy(current);
2045 if (pol->mode == MPOL_INTERLEAVE ||
2046 pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2047 *ilx += vma->vm_pgoff >> order;
2048 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2049 }
2050 return pol;
2051 }
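
/*
 * Example of the interleave index (hypothetical VMA): with an interleave
 * policy, vm_pgoff == 0, order == 0 and an address three pages past
 * vma->vm_start, *ilx ends up as 3, so the fourth page of the mapping
 * selects the fourth position in the interleave sequence (wrapping as
 * interleave_nid() dictates).
 */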
2052
2053 bool vma_policy_mof(struct vm_area_struct *vma)
2054 {
2055 struct mempolicy *pol;
2056
2057 if (vma->vm_ops && vma->vm_ops->get_policy) {
2058 bool ret = false;
2059 pgoff_t ilx; /* ignored here */
2060
2061 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2062 if (pol && (pol->flags & MPOL_F_MOF))
2063 ret = true;
2064 mpol_cond_put(pol);
2065
2066 return ret;
2067 }
2068
2069 pol = vma->vm_policy;
2070 if (!pol)
2071 pol = get_task_policy(current);
2072
2073 return pol->flags & MPOL_F_MOF;
2074 }
2075
2076 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2077 {
2078 enum zone_type dynamic_policy_zone = policy_zone;
2079
2080 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2081
2082 /*
2083 	 * If policy->nodes has movable memory only,
2084 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
2085 	 *
2086 	 * policy->nodes is intersected with node_states[N_MEMORY],
2087 	 * so if the following test fails, it implies that
2088 	 * policy->nodes has movable memory only.
2089 */
2090 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2091 dynamic_policy_zone = ZONE_MOVABLE;
2092
2093 return zone >= dynamic_policy_zone;
2094 }
2095
2096 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2097 {
2098 unsigned int node;
2099 unsigned int cpuset_mems_cookie;
2100
2101 retry:
2102 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2103 cpuset_mems_cookie = read_mems_allowed_begin();
2104 node = current->il_prev;
2105 if (!current->il_weight || !node_isset(node, policy->nodes)) {
2106 node = next_node_in(node, policy->nodes);
2107 if (read_mems_allowed_retry(cpuset_mems_cookie))
2108 goto retry;
2109 if (node == MAX_NUMNODES)
2110 return node;
2111 current->il_prev = node;
2112 current->il_weight = get_il_weight(node);
2113 }
2114 current->il_weight--;
2115 return node;
2116 }
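
/*
 * Example (hypothetical weights): with nodes {0,1} and per-node weights
 * {3,1}, current->il_weight counts down on the current node before
 * next_node_in() advances, so a task doing single-page allocations lands
 * on node 0 three times for every allocation on node 1.
 */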
2117
2118 /* Do dynamic interleaving for a process */
2119 static unsigned int interleave_nodes(struct mempolicy *policy)
2120 {
2121 unsigned int nid;
2122 unsigned int cpuset_mems_cookie;
2123
2124 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2125 do {
2126 cpuset_mems_cookie = read_mems_allowed_begin();
2127 nid = next_node_in(current->il_prev, policy->nodes);
2128 } while (read_mems_allowed_retry(cpuset_mems_cookie));
2129
2130 if (nid < MAX_NUMNODES)
2131 current->il_prev = nid;
2132 return nid;
2133 }
2134
2135 /*
2136  * Depending on the memory policy, provide a node from which to allocate the
2137 * next slab entry.
2138 */
2139 unsigned int mempolicy_slab_node(void)
2140 {
2141 struct mempolicy *policy;
2142 int node = numa_mem_id();
2143
2144 if (!in_task())
2145 return node;
2146
2147 policy = current->mempolicy;
2148 if (!policy)
2149 return node;
2150
2151 switch (policy->mode) {
2152 case MPOL_PREFERRED:
2153 return first_node(policy->nodes);
2154
2155 case MPOL_INTERLEAVE:
2156 return interleave_nodes(policy);
2157
2158 case MPOL_WEIGHTED_INTERLEAVE:
2159 return weighted_interleave_nodes(policy);
2160
2161 case MPOL_BIND:
2162 case MPOL_PREFERRED_MANY:
2163 {
2164 struct zoneref *z;
2165
2166 /*
2167 * Follow bind policy behavior and start allocation at the
2168 * first node.
2169 */
2170 struct zonelist *zonelist;
2171 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2172 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2173 z = first_zones_zonelist(zonelist, highest_zoneidx,
2174 &policy->nodes);
2175 return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2176 }
2177 case MPOL_LOCAL:
2178 return node;
2179
2180 default:
2181 BUG();
2182 }
2183 }
2184
2185 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2186 nodemask_t *mask)
2187 {
2188 /*
2189 * barrier stabilizes the nodemask locally so that it can be iterated
2190 	 * over safely without concern for changes. Allocators validate that node
2191 * selection does not violate mems_allowed, so this is safe.
2192 */
2193 barrier();
2194 memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2195 barrier();
2196 return nodes_weight(*mask);
2197 }
2198
2199 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2200 {
2201 struct weighted_interleave_state *state;
2202 nodemask_t nodemask;
2203 unsigned int target, nr_nodes;
2204 u8 *table = NULL;
2205 unsigned int weight_total = 0;
2206 u8 weight;
2207 int nid = 0;
2208
2209 nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2210 if (!nr_nodes)
2211 return numa_node_id();
2212
2213 rcu_read_lock();
2214
2215 state = rcu_dereference(wi_state);
2216 /* Uninitialized wi_state means we should assume all weights are 1 */
2217 if (state)
2218 table = state->iw_table;
2219
2220 /* calculate the total weight */
2221 for_each_node_mask(nid, nodemask)
2222 weight_total += table ? table[nid] : 1;
2223
2224 /* Calculate the node offset based on totals */
2225 target = ilx % weight_total;
2226 nid = first_node(nodemask);
2227 while (target) {
2228 /* detect system default usage */
2229 weight = table ? table[nid] : 1;
2230 if (target < weight)
2231 break;
2232 target -= weight;
2233 nid = next_node_in(nid, nodemask);
2234 }
2235 rcu_read_unlock();
2236 return nid;
2237 }
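
/*
 * Worked example (hypothetical weights): nodemask {0,1} with weights
 * {3,1} gives weight_total == 4.  For ilx == 5, target == 5 % 4 == 1,
 * which is consumed by node 0's weight of 3, so node 0 is returned;
 * for ilx == 7, target == 3 exhausts node 0's weight and node 1 is
 * returned.  The cycle is therefore 0,0,0,1 repeating.
 */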
2238
2239 /*
2240 * Do static interleaving for interleave index @ilx. Returns the ilx'th
2241 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2242 * exceeds the number of present nodes.
2243 */
2244 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2245 {
2246 nodemask_t nodemask;
2247 unsigned int target, nnodes;
2248 int i;
2249 int nid;
2250
2251 nnodes = read_once_policy_nodemask(pol, &nodemask);
2252 if (!nnodes)
2253 return numa_node_id();
2254 target = ilx % nnodes;
2255 nid = first_node(nodemask);
2256 for (i = 0; i < target; i++)
2257 nid = next_node(nid, nodemask);
2258 return nid;
2259 }
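
/*
 * Worked example (hypothetical nodemask): with pol->nodes == {0,2,5}
 * and ilx == 4, nnodes == 3 and target == 4 % 3 == 1, so one step from
 * first_node() yields node 2; ilx == 7 wraps around to node 2 as well.
 */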
2260
2261 /*
2262 * Return a nodemask representing a mempolicy for filtering nodes for
2263 * page allocation, together with preferred node id (or the input node id).
2264 */
2265 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2266 pgoff_t ilx, int *nid)
2267 {
2268 nodemask_t *nodemask = NULL;
2269
2270 switch (pol->mode) {
2271 case MPOL_PREFERRED:
2272 /* Override input node id */
2273 *nid = first_node(pol->nodes);
2274 break;
2275 case MPOL_PREFERRED_MANY:
2276 nodemask = &pol->nodes;
2277 if (pol->home_node != NUMA_NO_NODE)
2278 *nid = pol->home_node;
2279 break;
2280 case MPOL_BIND:
2281 /* Restrict to nodemask (but not on lower zones) */
2282 if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2283 cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2284 nodemask = &pol->nodes;
2285 if (pol->home_node != NUMA_NO_NODE)
2286 *nid = pol->home_node;
2287 /*
2288 * __GFP_THISNODE shouldn't even be used with the bind policy
2289 * because we might easily break the expectation to stay on the
2290 * requested node and not break the policy.
2291 */
2292 WARN_ON_ONCE(gfp & __GFP_THISNODE);
2293 break;
2294 case MPOL_INTERLEAVE:
2295 /* Override input node id */
2296 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2297 interleave_nodes(pol) : interleave_nid(pol, ilx);
2298 break;
2299 case MPOL_WEIGHTED_INTERLEAVE:
2300 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2301 weighted_interleave_nodes(pol) :
2302 weighted_interleave_nid(pol, ilx);
2303 break;
2304 }
2305
2306 return nodemask;
2307 }
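
/*
 * For example (hypothetical policies): for MPOL_BIND over nodes {0,1}
 * with home_node == 1 and a regular GFP_HIGHUSER allocation (with the
 * cpuset check passing), *nid is set to 1 and &pol->nodes is returned to
 * restrict fallback; for MPOL_PREFERRED over {3}, *nid becomes 3 and
 * NULL is returned, so fallback beyond node 3 stays unrestricted.
 */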
2308
2309 #ifdef CONFIG_HUGETLBFS
2310 /*
2311 * huge_node(@vma, @addr, @gfp_flags, @mpol)
2312 * @vma: virtual memory area whose policy is sought
2313 * @addr: address in @vma for shared policy lookup and interleave policy
2314 * @gfp_flags: for requested zone
2315 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2316 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2317 *
2318 * Returns a nid suitable for a huge page allocation and a pointer
2319 * to the struct mempolicy for conditional unref after allocation.
2320 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2321 * to the mempolicy's @nodemask for filtering the zonelist.
2322 */
2323 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2324 struct mempolicy **mpol, nodemask_t **nodemask)
2325 {
2326 pgoff_t ilx;
2327 int nid;
2328
2329 nid = numa_node_id();
2330 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2331 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2332 return nid;
2333 }
2334
2335 /*
2336 * init_nodemask_of_mempolicy
2337 *
2338 * If the current task's mempolicy is "default" [NULL], return 'false'
2339 * to indicate default policy. Otherwise, extract the policy nodemask
2340 * for 'bind' or 'interleave' policy into the argument nodemask, or
2341 * initialize the argument nodemask to contain the single node for
2342 * 'preferred' or 'local' policy and return 'true' to indicate presence
2343 * of non-default mempolicy.
2344 *
2345 * We don't bother with reference counting the mempolicy [mpol_get/put]
2346  * because the current task is examining its own mempolicy and a task's
2347 * mempolicy is only ever changed by the task itself.
2348 *
2349 * N.B., it is the caller's responsibility to free a returned nodemask.
2350 */
2351 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2352 {
2353 struct mempolicy *mempolicy;
2354
2355 if (!(mask && current->mempolicy))
2356 return false;
2357
2358 task_lock(current);
2359 mempolicy = current->mempolicy;
2360 switch (mempolicy->mode) {
2361 case MPOL_PREFERRED:
2362 case MPOL_PREFERRED_MANY:
2363 case MPOL_BIND:
2364 case MPOL_INTERLEAVE:
2365 case MPOL_WEIGHTED_INTERLEAVE:
2366 *mask = mempolicy->nodes;
2367 break;
2368
2369 case MPOL_LOCAL:
2370 init_nodemask_of_node(mask, numa_node_id());
2371 break;
2372
2373 default:
2374 BUG();
2375 }
2376 task_unlock(current);
2377
2378 return true;
2379 }
2380 #endif
2381
2382 /*
2383 * mempolicy_in_oom_domain
2384 *
2385 * If tsk's mempolicy is "bind", check for intersection between mask and
2386 * the policy nodemask. Otherwise, return true for all other policies
2387 * including "interleave", as a tsk with "interleave" policy may have
2388 * memory allocated from all nodes in system.
2389 *
2390 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2391 */
2392 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2393 const nodemask_t *mask)
2394 {
2395 struct mempolicy *mempolicy;
2396 bool ret = true;
2397
2398 if (!mask)
2399 return ret;
2400
2401 task_lock(tsk);
2402 mempolicy = tsk->mempolicy;
2403 if (mempolicy && mempolicy->mode == MPOL_BIND)
2404 ret = nodes_intersects(mempolicy->nodes, *mask);
2405 task_unlock(tsk);
2406
2407 return ret;
2408 }
2409
2410 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2411 int nid, nodemask_t *nodemask)
2412 {
2413 struct page *page;
2414 gfp_t preferred_gfp;
2415
2416 /*
2417 * This is a two pass approach. The first pass will only try the
2418 * preferred nodes but skip the direct reclaim and allow the
2419 * allocation to fail, while the second pass will try all the
2420 * nodes in system.
2421 */
2422 preferred_gfp = gfp | __GFP_NOWARN;
2423 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2424 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2425 if (!page)
2426 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2427
2428 return page;
2429 }
2430
2431 /**
2432 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2433 * @gfp: GFP flags.
2434 * @order: Order of the page allocation.
2435 * @pol: Pointer to the NUMA mempolicy.
2436 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2437 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2438 *
2439 * Return: The page on success or NULL if allocation fails.
2440 */
2441 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2442 struct mempolicy *pol, pgoff_t ilx, int nid)
2443 {
2444 nodemask_t *nodemask;
2445 struct page *page;
2446
2447 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2448
2449 if (pol->mode == MPOL_PREFERRED_MANY)
2450 return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2451
2452 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2453 /* filter "hugepage" allocation, unless from alloc_pages() */
2454 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2455 /*
2456 * For hugepage allocation and non-interleave policy which
2457 * allows the current node (or other explicitly preferred
2458 * node) we only try to allocate from the current/preferred
2459 * node and don't fall back to other nodes, as the cost of
2460 * remote accesses would likely offset THP benefits.
2461 *
2462 * If the policy is interleave or does not allow the current
2463 * node in its nodemask, we allocate the standard way.
2464 */
2465 if (pol->mode != MPOL_INTERLEAVE &&
2466 pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2467 (!nodemask || node_isset(nid, *nodemask))) {
2468 /*
2469 * First, try to allocate THP only on local node, but
2470 * don't reclaim unnecessarily, just compact.
2471 */
2472 page = __alloc_frozen_pages_noprof(
2473 gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2474 nid, NULL);
2475 if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2476 return page;
2477 /*
2478 * If hugepage allocations are configured to always
2479 * synchronous compact or the vma has been madvised
2480 * to prefer hugepage backing, retry allowing remote
2481 * memory with both reclaim and compact as well.
2482 */
2483 }
2484 }
2485
2486 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2487
2488 if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2489 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2490 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2491 if (static_branch_likely(&vm_numa_stat_key) &&
2492 page_to_nid(page) == nid) {
2493 preempt_disable();
2494 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2495 preempt_enable();
2496 }
2497 }
2498
2499 return page;
2500 }
2501
2502 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2503 struct mempolicy *pol, pgoff_t ilx, int nid)
2504 {
2505 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2506 ilx, nid);
2507 if (!page)
2508 return NULL;
2509
2510 set_page_refcounted(page);
2511 return page_rmappable_folio(page);
2512 }
2513
2514 /**
2515 * vma_alloc_folio - Allocate a folio for a VMA.
2516 * @gfp: GFP flags.
2517 * @order: Order of the folio.
2518 * @vma: Pointer to VMA.
2519 * @addr: Virtual address of the allocation. Must be inside @vma.
2520 *
2521 * Allocate a folio for a specific address in @vma, using the appropriate
2522 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2523 * VMA to prevent it from going away. Should be used for all allocations
2524 * for folios that will be mapped into user space, excepting hugetlbfs, and
2525 * excepting where direct use of folio_alloc_mpol() is more appropriate.
2526 *
2527 * Return: The folio on success or NULL if allocation fails.
2528 */
2529 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2530 unsigned long addr)
2531 {
2532 struct mempolicy *pol;
2533 pgoff_t ilx;
2534 struct folio *folio;
2535
2536 if (vma->vm_flags & VM_DROPPABLE)
2537 gfp |= __GFP_NOWARN;
2538
2539 pol = get_vma_policy(vma, addr, order, &ilx);
2540 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2541 mpol_cond_put(pol);
2542 return folio;
2543 }
2544 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2545
2546 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2547 {
2548 struct mempolicy *pol = &default_policy;
2549
2550 /*
2551 * No reference counting needed for current->mempolicy
2552 * nor system default_policy
2553 */
2554 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2555 pol = get_task_policy(current);
2556
2557 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2558 numa_node_id());
2559 }
2560
2561 /**
2562 * alloc_pages - Allocate pages.
2563 * @gfp: GFP flags.
2564 * @order: Power of two of number of pages to allocate.
2565 *
2566 * Allocate 1 << @order contiguous pages. The physical address of the
2567 * first page is naturally aligned (eg an order-3 allocation will be aligned
2568 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2569 * process is honoured when in process context.
2570 *
2571 * Context: Can be called from any context, providing the appropriate GFP
2572 * flags are used.
2573 * Return: The page on success or NULL if allocation fails.
2574 */
2575 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2576 {
2577 struct page *page = alloc_frozen_pages_noprof(gfp, order);
2578
2579 if (page)
2580 set_page_refcounted(page);
2581 return page;
2582 }
2583 EXPORT_SYMBOL(alloc_pages_noprof);
2584
2585 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2586 {
2587 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2588 }
2589 EXPORT_SYMBOL(folio_alloc_noprof);
2590
2591 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2592 struct mempolicy *pol, unsigned long nr_pages,
2593 struct page **page_array)
2594 {
2595 int nodes;
2596 unsigned long nr_pages_per_node;
2597 int delta;
2598 int i;
2599 unsigned long nr_allocated;
2600 unsigned long total_allocated = 0;
2601
2602 nodes = nodes_weight(pol->nodes);
2603 nr_pages_per_node = nr_pages / nodes;
2604 delta = nr_pages - nodes * nr_pages_per_node;
2605
2606 for (i = 0; i < nodes; i++) {
2607 if (delta) {
2608 nr_allocated = alloc_pages_bulk_noprof(gfp,
2609 interleave_nodes(pol), NULL,
2610 nr_pages_per_node + 1,
2611 page_array);
2612 delta--;
2613 } else {
2614 nr_allocated = alloc_pages_bulk_noprof(gfp,
2615 interleave_nodes(pol), NULL,
2616 nr_pages_per_node, page_array);
2617 }
2618
2619 page_array += nr_allocated;
2620 total_allocated += nr_allocated;
2621 }
2622
2623 return total_allocated;
2624 }
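
/*
 * Worked example: nr_pages == 10 spread over 3 interleave nodes gives
 * nr_pages_per_node == 3 and delta == 1, so the first node visited
 * receives 4 pages and the remaining two receive 3 pages each.
 */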
2625
2626 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2627 struct mempolicy *pol, unsigned long nr_pages,
2628 struct page **page_array)
2629 {
2630 struct weighted_interleave_state *state;
2631 struct task_struct *me = current;
2632 unsigned int cpuset_mems_cookie;
2633 unsigned long total_allocated = 0;
2634 unsigned long nr_allocated = 0;
2635 unsigned long rounds;
2636 unsigned long node_pages, delta;
2637 u8 *weights, weight;
2638 unsigned int weight_total = 0;
2639 unsigned long rem_pages = nr_pages;
2640 nodemask_t nodes;
2641 int nnodes, node;
2642 int resume_node = MAX_NUMNODES - 1;
2643 u8 resume_weight = 0;
2644 int prev_node;
2645 int i;
2646
2647 if (!nr_pages)
2648 return 0;
2649
2650 /* read the nodes onto the stack, retry if done during rebind */
2651 do {
2652 cpuset_mems_cookie = read_mems_allowed_begin();
2653 nnodes = read_once_policy_nodemask(pol, &nodes);
2654 } while (read_mems_allowed_retry(cpuset_mems_cookie));
2655
2656 /* if the nodemask has become invalid, we cannot do anything */
2657 if (!nnodes)
2658 return 0;
2659
2660 /* Continue allocating from most recent node and adjust the nr_pages */
2661 node = me->il_prev;
2662 weight = me->il_weight;
2663 if (weight && node_isset(node, nodes)) {
2664 node_pages = min(rem_pages, weight);
2665 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2666 page_array);
2667 page_array += nr_allocated;
2668 total_allocated += nr_allocated;
2669 /* if that's all the pages, no need to interleave */
2670 if (rem_pages <= weight) {
2671 me->il_weight -= rem_pages;
2672 return total_allocated;
2673 }
2674 /* Otherwise we adjust remaining pages, continue from there */
2675 rem_pages -= weight;
2676 }
2677 /* clear active weight in case of an allocation failure */
2678 me->il_weight = 0;
2679 prev_node = node;
2680
2681 /* create a local copy of node weights to operate on outside rcu */
2682 weights = kzalloc(nr_node_ids, GFP_KERNEL);
2683 if (!weights)
2684 return total_allocated;
2685
2686 rcu_read_lock();
2687 state = rcu_dereference(wi_state);
2688 if (state) {
2689 memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
2690 rcu_read_unlock();
2691 } else {
2692 rcu_read_unlock();
2693 for (i = 0; i < nr_node_ids; i++)
2694 weights[i] = 1;
2695 }
2696
2697 /* calculate total, detect system default usage */
2698 for_each_node_mask(node, nodes)
2699 weight_total += weights[node];
2700
2701 /*
2702 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2703 * Track which node weighted interleave should resume from.
2704 *
2705 * if (rounds > 0) and (delta == 0), resume_node will always be
2706 * the node following prev_node and its weight.
2707 */
2708 rounds = rem_pages / weight_total;
2709 delta = rem_pages % weight_total;
2710 resume_node = next_node_in(prev_node, nodes);
2711 resume_weight = weights[resume_node];
2712 for (i = 0; i < nnodes; i++) {
2713 node = next_node_in(prev_node, nodes);
2714 weight = weights[node];
2715 node_pages = weight * rounds;
2716 /* If a delta exists, add this node's portion of the delta */
2717 if (delta > weight) {
2718 node_pages += weight;
2719 delta -= weight;
2720 } else if (delta) {
2721 /* when delta is depleted, resume from that node */
2722 node_pages += delta;
2723 resume_node = node;
2724 resume_weight = weight - delta;
2725 delta = 0;
2726 }
2727 /* node_pages can be 0 if an allocation fails and rounds == 0 */
2728 if (!node_pages)
2729 break;
2730 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2731 page_array);
2732 page_array += nr_allocated;
2733 total_allocated += nr_allocated;
2734 if (total_allocated == nr_pages)
2735 break;
2736 prev_node = node;
2737 }
2738 me->il_prev = resume_node;
2739 me->il_weight = resume_weight;
2740 kfree(weights);
2741 return total_allocated;
2742 }
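
/*
 * Worked example (hypothetical weights): rem_pages == 10 over nodes
 * {0,1} with weights {3,1} and a fresh cycle starting at node 0:
 * weight_total == 4, rounds == 2, delta == 2, so node 0 gets
 * 3 * 2 + 2 == 8 pages, node 1 gets 1 * 2 == 2, and the task resumes
 * with one unit of weight left on node 0 - the same distribution the
 * single-page path would have produced.
 */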
2743
2744 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2745 struct mempolicy *pol, unsigned long nr_pages,
2746 struct page **page_array)
2747 {
2748 gfp_t preferred_gfp;
2749 unsigned long nr_allocated = 0;
2750
2751 preferred_gfp = gfp | __GFP_NOWARN;
2752 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2753
2754 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2755 nr_pages, page_array);
2756
2757 if (nr_allocated < nr_pages)
2758 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2759 nr_pages - nr_allocated,
2760 page_array + nr_allocated);
2761 return nr_allocated;
2762 }
2763
2764 /* Bulk page allocation and the mempolicy should be considered at the
2765  * same time in some situations, such as vmalloc.
2766  *
2767  * It can accelerate memory allocation, especially for interleaved
2768  * allocations.
2769 */
2770 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2771 unsigned long nr_pages, struct page **page_array)
2772 {
2773 struct mempolicy *pol = &default_policy;
2774 nodemask_t *nodemask;
2775 int nid;
2776
2777 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2778 pol = get_task_policy(current);
2779
2780 if (pol->mode == MPOL_INTERLEAVE)
2781 return alloc_pages_bulk_interleave(gfp, pol,
2782 nr_pages, page_array);
2783
2784 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2785 return alloc_pages_bulk_weighted_interleave(
2786 gfp, pol, nr_pages, page_array);
2787
2788 if (pol->mode == MPOL_PREFERRED_MANY)
2789 return alloc_pages_bulk_preferred_many(gfp,
2790 numa_node_id(), pol, nr_pages, page_array);
2791
2792 nid = numa_node_id();
2793 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2794 return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2795 nr_pages, page_array);
2796 }
2797
2798 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2799 {
2800 struct mempolicy *pol = mpol_dup(src->vm_policy);
2801
2802 if (IS_ERR(pol))
2803 return PTR_ERR(pol);
2804 dst->vm_policy = pol;
2805 return 0;
2806 }
2807
2808 /*
2809 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2810  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2811 * with the mems_allowed returned by cpuset_mems_allowed(). This
2812 * keeps mempolicies cpuset relative after its cpuset moves. See
2813 * further kernel/cpuset.c update_nodemask().
2814 *
2815  * current's mempolicy may be rebound by another task (the task that changes
2816  * the cpuset's mems), so we needn't do rebind work for the current task.
2817 */
2818
2819 /* Slow path of a mempolicy duplicate */
2820 struct mempolicy *__mpol_dup(struct mempolicy *old)
2821 {
2822 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2823
2824 if (!new)
2825 return ERR_PTR(-ENOMEM);
2826
2827 /* task's mempolicy is protected by alloc_lock */
2828 if (old == current->mempolicy) {
2829 task_lock(current);
2830 *new = *old;
2831 task_unlock(current);
2832 } else
2833 *new = *old;
2834
2835 if (current_cpuset_is_being_rebound()) {
2836 nodemask_t mems = cpuset_mems_allowed(current);
2837 mpol_rebind_policy(new, &mems);
2838 }
2839 atomic_set(&new->refcnt, 1);
2840 return new;
2841 }
2842
2843 /* Slow path of a mempolicy comparison */
2844 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2845 {
2846 if (!a || !b)
2847 return false;
2848 if (a->mode != b->mode)
2849 return false;
2850 if (a->flags != b->flags)
2851 return false;
2852 if (a->home_node != b->home_node)
2853 return false;
2854 if (mpol_store_user_nodemask(a))
2855 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2856 return false;
2857
2858 switch (a->mode) {
2859 case MPOL_BIND:
2860 case MPOL_INTERLEAVE:
2861 case MPOL_PREFERRED:
2862 case MPOL_PREFERRED_MANY:
2863 case MPOL_WEIGHTED_INTERLEAVE:
2864 return !!nodes_equal(a->nodes, b->nodes);
2865 case MPOL_LOCAL:
2866 return true;
2867 default:
2868 BUG();
2869 return false;
2870 }
2871 }
2872
2873 /*
2874 * Shared memory backing store policy support.
2875 *
2876 * Remember policies even when nobody has shared memory mapped.
2877 * The policies are kept in Red-Black tree linked from the inode.
2878 * They are protected by the sp->lock rwlock, which should be held
2879 * for any accesses to the tree.
2880 */
2881
2882 /*
2883 * lookup first element intersecting start-end. Caller holds sp->lock for
2884 * reading or for writing
2885 */
2886 static struct sp_node *sp_lookup(struct shared_policy *sp,
2887 pgoff_t start, pgoff_t end)
2888 {
2889 struct rb_node *n = sp->root.rb_node;
2890
2891 while (n) {
2892 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2893
2894 if (start >= p->end)
2895 n = n->rb_right;
2896 else if (end <= p->start)
2897 n = n->rb_left;
2898 else
2899 break;
2900 }
2901 if (!n)
2902 return NULL;
2903 for (;;) {
2904 struct sp_node *w = NULL;
2905 struct rb_node *prev = rb_prev(n);
2906 if (!prev)
2907 break;
2908 w = rb_entry(prev, struct sp_node, nd);
2909 if (w->end <= start)
2910 break;
2911 n = prev;
2912 }
2913 return rb_entry(n, struct sp_node, nd);
2914 }
2915
2916 /*
2917 * Insert a new shared policy into the list. Caller holds sp->lock for
2918 * writing.
2919 */
2920 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2921 {
2922 struct rb_node **p = &sp->root.rb_node;
2923 struct rb_node *parent = NULL;
2924 struct sp_node *nd;
2925
2926 while (*p) {
2927 parent = *p;
2928 nd = rb_entry(parent, struct sp_node, nd);
2929 if (new->start < nd->start)
2930 p = &(*p)->rb_left;
2931 else if (new->end > nd->end)
2932 p = &(*p)->rb_right;
2933 else
2934 BUG();
2935 }
2936 rb_link_node(&new->nd, parent, p);
2937 rb_insert_color(&new->nd, &sp->root);
2938 }
2939
2940 /* Find shared policy intersecting idx */
2941 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2942 pgoff_t idx)
2943 {
2944 struct mempolicy *pol = NULL;
2945 struct sp_node *sn;
2946
2947 if (!sp->root.rb_node)
2948 return NULL;
2949 read_lock(&sp->lock);
2950 sn = sp_lookup(sp, idx, idx+1);
2951 if (sn) {
2952 mpol_get(sn->policy);
2953 pol = sn->policy;
2954 }
2955 read_unlock(&sp->lock);
2956 return pol;
2957 }
2958 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2959
2960 static void sp_free(struct sp_node *n)
2961 {
2962 mpol_put(n->policy);
2963 kmem_cache_free(sn_cache, n);
2964 }
2965
2966 /**
2967 * mpol_misplaced - check whether current folio node is valid in policy
2968 *
2969 * @folio: folio to be checked
2970 * @vmf: structure describing the fault
2971 * @addr: virtual address in @vma for shared policy lookup and interleave policy
2972 *
2973 * Lookup current policy node id for vma,addr and "compare to" folio's
2974 * node id. Policy determination "mimics" alloc_page_vma().
2975 * Called from fault path where we know the vma and faulting address.
2976 *
2977 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2978 * policy, or a suitable node ID to allocate a replacement folio from.
2979 */
2980 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2981 unsigned long addr)
2982 {
2983 struct mempolicy *pol;
2984 pgoff_t ilx;
2985 struct zoneref *z;
2986 int curnid = folio_nid(folio);
2987 struct vm_area_struct *vma = vmf->vma;
2988 int thiscpu = raw_smp_processor_id();
2989 int thisnid = numa_node_id();
2990 int polnid = NUMA_NO_NODE;
2991 int ret = NUMA_NO_NODE;
2992
2993 /*
2994 * Make sure ptl is held so that we don't preempt and we
2995 * have a stable smp processor id
2996 */
2997 lockdep_assert_held(vmf->ptl);
2998 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2999 if (!(pol->flags & MPOL_F_MOF))
3000 goto out;
3001
3002 switch (pol->mode) {
3003 case MPOL_INTERLEAVE:
3004 polnid = interleave_nid(pol, ilx);
3005 break;
3006
3007 case MPOL_WEIGHTED_INTERLEAVE:
3008 polnid = weighted_interleave_nid(pol, ilx);
3009 break;
3010
3011 case MPOL_PREFERRED:
3012 if (node_isset(curnid, pol->nodes))
3013 goto out;
3014 polnid = first_node(pol->nodes);
3015 break;
3016
3017 case MPOL_LOCAL:
3018 polnid = numa_node_id();
3019 break;
3020
3021 case MPOL_BIND:
3022 case MPOL_PREFERRED_MANY:
3023 /*
3024 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside the
3025 		 * policy nodemask, we don't allow NUMA migration to nodes outside
3026 		 * the policy nodemask for now. This is done so that if we want
3027 		 * demotion to slow memory to happen, before allocating from some
3028 		 * DRAM node, say 'x', we will end up using a MPOL_PREFERRED_MANY
3029 		 * mask excluding node 'x'. In such a scenario we should not
3030 		 * promote to node 'x' from a slow memory node.
3031 */
3032 if (pol->flags & MPOL_F_MORON) {
3033 /*
3034 * Optimize placement among multiple nodes
3035 * via NUMA balancing
3036 */
3037 if (node_isset(thisnid, pol->nodes))
3038 break;
3039 goto out;
3040 }
3041
3042 /*
3043 * use current page if in policy nodemask,
3044 * else select nearest allowed node, if any.
3045 * If no allowed nodes, use current [!misplaced].
3046 */
3047 if (node_isset(curnid, pol->nodes))
3048 goto out;
3049 z = first_zones_zonelist(
3050 node_zonelist(thisnid, GFP_HIGHUSER),
3051 gfp_zone(GFP_HIGHUSER),
3052 &pol->nodes);
3053 polnid = zonelist_node_idx(z);
3054 break;
3055
3056 default:
3057 BUG();
3058 }
3059
3060 /* Migrate the folio towards the node whose CPU is referencing it */
3061 if (pol->flags & MPOL_F_MORON) {
3062 polnid = thisnid;
3063
3064 if (!should_numa_migrate_memory(current, folio, curnid,
3065 thiscpu))
3066 goto out;
3067 }
3068
3069 if (curnid != polnid)
3070 ret = polnid;
3071 out:
3072 mpol_cond_put(pol);
3073
3074 return ret;
3075 }
3076
3077 /*
3078 * Drop the (possibly final) reference to task->mempolicy. It needs to be
3079 * dropped after task->mempolicy is set to NULL so that any allocation done as
3080 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3081 * policy.
3082 */
3083 void mpol_put_task_policy(struct task_struct *task)
3084 {
3085 struct mempolicy *pol;
3086
3087 task_lock(task);
3088 pol = task->mempolicy;
3089 task->mempolicy = NULL;
3090 task_unlock(task);
3091 mpol_put(pol);
3092 }
3093
3094 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
3095 {
3096 rb_erase(&n->nd, &sp->root);
3097 sp_free(n);
3098 }
3099
3100 static void sp_node_init(struct sp_node *node, unsigned long start,
3101 unsigned long end, struct mempolicy *pol)
3102 {
3103 node->start = start;
3104 node->end = end;
3105 node->policy = pol;
3106 }
3107
3108 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3109 struct mempolicy *pol)
3110 {
3111 struct sp_node *n;
3112 struct mempolicy *newpol;
3113
3114 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3115 if (!n)
3116 return NULL;
3117
3118 newpol = mpol_dup(pol);
3119 if (IS_ERR(newpol)) {
3120 kmem_cache_free(sn_cache, n);
3121 return NULL;
3122 }
3123 newpol->flags |= MPOL_F_SHARED;
3124 sp_node_init(n, start, end, newpol);
3125
3126 return n;
3127 }
3128
3129 /* Replace a policy range. */
3130 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
3131 pgoff_t end, struct sp_node *new)
3132 {
3133 struct sp_node *n;
3134 struct sp_node *n_new = NULL;
3135 struct mempolicy *mpol_new = NULL;
3136 int ret = 0;
3137
3138 restart:
3139 write_lock(&sp->lock);
3140 n = sp_lookup(sp, start, end);
3141 /* Take care of old policies in the same range. */
3142 while (n && n->start < end) {
3143 struct rb_node *next = rb_next(&n->nd);
3144 if (n->start >= start) {
3145 if (n->end <= end)
3146 sp_delete(sp, n);
3147 else
3148 n->start = end;
3149 } else {
3150 /* Old policy spanning whole new range. */
3151 if (n->end > end) {
3152 if (!n_new)
3153 goto alloc_new;
3154
3155 *mpol_new = *n->policy;
3156 atomic_set(&mpol_new->refcnt, 1);
3157 sp_node_init(n_new, end, n->end, mpol_new);
3158 n->end = start;
3159 sp_insert(sp, n_new);
3160 n_new = NULL;
3161 mpol_new = NULL;
3162 break;
3163 } else
3164 n->end = start;
3165 }
3166 if (!next)
3167 break;
3168 n = rb_entry(next, struct sp_node, nd);
3169 }
3170 if (new)
3171 sp_insert(sp, new);
3172 write_unlock(&sp->lock);
3173 ret = 0;
3174
3175 err_out:
3176 if (mpol_new)
3177 mpol_put(mpol_new);
3178 if (n_new)
3179 kmem_cache_free(sn_cache, n_new);
3180
3181 return ret;
3182
3183 alloc_new:
3184 write_unlock(&sp->lock);
3185 ret = -ENOMEM;
3186 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3187 if (!n_new)
3188 goto err_out;
3189 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3190 if (!mpol_new)
3191 goto err_out;
3192 atomic_set(&mpol_new->refcnt, 1);
3193 goto restart;
3194 }
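
/*
 * Worked example: if an existing node spans pgoff [0, 10) and the
 * replacement covers [3, 7), the old entry's tail [7, 10) is re-inserted
 * as a copy of the old policy (allocated via alloc_new with the lock
 * dropped), the old entry is trimmed to [0, 3), and the new node is
 * inserted for [3, 7).
 */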
3195
3196 /**
3197 * mpol_shared_policy_init - initialize shared policy for inode
3198 * @sp: pointer to inode shared policy
3199 * @mpol: struct mempolicy to install
3200 *
3201 * Install non-NULL @mpol in inode's shared policy rb-tree.
3202 * On entry, the current task has a reference on a non-NULL @mpol.
3203 * This must be released on exit.
3204  * This is called at get_inode() time, so we can use GFP_KERNEL.
3205 */
3206 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3207 {
3208 int ret;
3209
3210 sp->root = RB_ROOT; /* empty tree == default mempolicy */
3211 rwlock_init(&sp->lock);
3212
3213 if (mpol) {
3214 struct sp_node *sn;
3215 struct mempolicy *npol;
3216 NODEMASK_SCRATCH(scratch);
3217
3218 if (!scratch)
3219 goto put_mpol;
3220
3221 /* contextualize the tmpfs mount point mempolicy to this file */
3222 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3223 if (IS_ERR(npol))
3224 goto free_scratch; /* no valid nodemask intersection */
3225
3226 task_lock(current);
3227 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3228 task_unlock(current);
3229 if (ret)
3230 goto put_npol;
3231
3232 /* alloc node covering entire file; adds ref to file's npol */
3233 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3234 if (sn)
3235 sp_insert(sp, sn);
3236 put_npol:
3237 mpol_put(npol); /* drop initial ref on file's npol */
3238 free_scratch:
3239 NODEMASK_SCRATCH_FREE(scratch);
3240 put_mpol:
3241 mpol_put(mpol); /* drop our incoming ref on sb mpol */
3242 }
3243 }
3244 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3245
3246 int mpol_set_shared_policy(struct shared_policy *sp,
3247 struct vm_area_struct *vma, struct mempolicy *pol)
3248 {
3249 int err;
3250 struct sp_node *new = NULL;
3251 unsigned long sz = vma_pages(vma);
3252
3253 if (pol) {
3254 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3255 if (!new)
3256 return -ENOMEM;
3257 }
3258 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3259 if (err && new)
3260 sp_free(new);
3261 return err;
3262 }
3263 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3264
3265 /* Free a backing policy store on inode delete. */
3266 void mpol_free_shared_policy(struct shared_policy *sp)
3267 {
3268 struct sp_node *n;
3269 struct rb_node *next;
3270
3271 if (!sp->root.rb_node)
3272 return;
3273 write_lock(&sp->lock);
3274 next = rb_first(&sp->root);
3275 while (next) {
3276 n = rb_entry(next, struct sp_node, nd);
3277 next = rb_next(&n->nd);
3278 sp_delete(sp, n);
3279 }
3280 write_unlock(&sp->lock);
3281 }
3282 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3283
3284 #ifdef CONFIG_NUMA_BALANCING
3285 static int __initdata numabalancing_override;
3286
3287 static void __init check_numabalancing_enable(void)
3288 {
3289 bool numabalancing_default = false;
3290
3291 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3292 numabalancing_default = true;
3293
3294 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3295 if (numabalancing_override)
3296 set_numabalancing_state(numabalancing_override == 1);
3297
3298 if (num_online_nodes() > 1 && !numabalancing_override) {
3299 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3300 numabalancing_default ? "Enabling" : "Disabling");
3301 set_numabalancing_state(numabalancing_default);
3302 }
3303 }
3304
3305 static int __init setup_numabalancing(char *str)
3306 {
3307 int ret = 0;
3308 if (!str)
3309 goto out;
3310
3311 if (!strcmp(str, "enable")) {
3312 numabalancing_override = 1;
3313 ret = 1;
3314 } else if (!strcmp(str, "disable")) {
3315 numabalancing_override = -1;
3316 ret = 1;
3317 }
3318 out:
3319 if (!ret)
3320 pr_warn("Unable to parse numa_balancing=\n");
3321
3322 return ret;
3323 }
3324 __setup("numa_balancing=", setup_numabalancing);
3325 #else
3326 static inline void __init check_numabalancing_enable(void)
3327 {
3328 }
3329 #endif /* CONFIG_NUMA_BALANCING */
3330
3331 void __init numa_policy_init(void)
3332 {
3333 nodemask_t interleave_nodes;
3334 unsigned long largest = 0;
3335 int nid, prefer = 0;
3336
3337 policy_cache = kmem_cache_create("numa_policy",
3338 sizeof(struct mempolicy),
3339 0, SLAB_PANIC, NULL);
3340
3341 sn_cache = kmem_cache_create("shared_policy_node",
3342 sizeof(struct sp_node),
3343 0, SLAB_PANIC, NULL);
3344
3345 for_each_node(nid) {
3346 preferred_node_policy[nid] = (struct mempolicy) {
3347 .refcnt = ATOMIC_INIT(1),
3348 .mode = MPOL_PREFERRED,
3349 .flags = MPOL_F_MOF | MPOL_F_MORON,
3350 .nodes = nodemask_of_node(nid),
3351 };
3352 }
3353
3354 /*
3355 * Set interleaving policy for system init. Interleaving is only
3356 * enabled across suitably sized nodes (default is >= 16MB), or
3357 * fall back to the largest node if they're all smaller.
3358 */
3359 nodes_clear(interleave_nodes);
3360 for_each_node_state(nid, N_MEMORY) {
3361 unsigned long total_pages = node_present_pages(nid);
3362
3363 /* Preserve the largest node */
3364 if (largest < total_pages) {
3365 largest = total_pages;
3366 prefer = nid;
3367 }
3368
3369 /* Interleave this node? */
3370 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3371 node_set(nid, interleave_nodes);
3372 }
3373
3374 /* All too small, use the largest */
3375 if (unlikely(nodes_empty(interleave_nodes)))
3376 node_set(prefer, interleave_nodes);
3377
3378 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3379 pr_err("%s: interleaving failed\n", __func__);
3380
3381 check_numabalancing_enable();
3382 }
3383
3384 /* Reset policy of current process to default */
3385 void numa_default_policy(void)
3386 {
3387 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3388 }
3389
3390 /*
3391 * Parse and format mempolicy from/to strings
3392 */
3393 static const char * const policy_modes[] =
3394 {
3395 [MPOL_DEFAULT] = "default",
3396 [MPOL_PREFERRED] = "prefer",
3397 [MPOL_BIND] = "bind",
3398 [MPOL_INTERLEAVE] = "interleave",
3399 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3400 [MPOL_LOCAL] = "local",
3401 [MPOL_PREFERRED_MANY] = "prefer (many)",
3402 };
3403
3404 #ifdef CONFIG_TMPFS
3405 /**
3406 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3407 * @str: string containing mempolicy to parse
3408 * @mpol: pointer to struct mempolicy pointer, returned on success.
3409 *
3410 * Format of input:
3411 * <mode>[=<flags>][:<nodelist>]
3412 *
3413 * Return: %0 on success, else %1
3414 */
3415 int mpol_parse_str(char *str, struct mempolicy **mpol)
3416 {
3417 struct mempolicy *new = NULL;
3418 unsigned short mode_flags;
3419 nodemask_t nodes;
3420 char *nodelist = strchr(str, ':');
3421 char *flags = strchr(str, '=');
3422 int err = 1, mode;
3423
3424 if (flags)
3425 *flags++ = '\0'; /* terminate mode string */
3426
3427 if (nodelist) {
3428 /* NUL-terminate mode or flags string */
3429 *nodelist++ = '\0';
3430 if (nodelist_parse(nodelist, nodes))
3431 goto out;
3432 if (!nodes_subset(nodes, node_states[N_MEMORY]))
3433 goto out;
3434 } else
3435 nodes_clear(nodes);
3436
3437 mode = match_string(policy_modes, MPOL_MAX, str);
3438 if (mode < 0)
3439 goto out;
3440
3441 switch (mode) {
3442 case MPOL_PREFERRED:
3443 /*
3444 * Insist on a nodelist of one node only, although later
3445 * we use first_node(nodes) to grab a single node, so here
3446 * nodelist (or nodes) cannot be empty.
3447 */
3448 if (nodelist) {
3449 char *rest = nodelist;
3450 while (isdigit(*rest))
3451 rest++;
3452 if (*rest)
3453 goto out;
3454 if (nodes_empty(nodes))
3455 goto out;
3456 }
3457 break;
3458 case MPOL_INTERLEAVE:
3459 case MPOL_WEIGHTED_INTERLEAVE:
3460 /*
3461 * Default to online nodes with memory if no nodelist
3462 */
3463 if (!nodelist)
3464 nodes = node_states[N_MEMORY];
3465 break;
3466 case MPOL_LOCAL:
3467 /*
3468 * Don't allow a nodelist; mpol_new() checks flags
3469 */
3470 if (nodelist)
3471 goto out;
3472 break;
3473 case MPOL_DEFAULT:
3474 /*
3475 		 * Insist on an empty nodelist
3476 */
3477 if (!nodelist)
3478 err = 0;
3479 goto out;
3480 case MPOL_PREFERRED_MANY:
3481 case MPOL_BIND:
3482 /*
3483 * Insist on a nodelist
3484 */
3485 if (!nodelist)
3486 goto out;
3487 }
3488
3489 mode_flags = 0;
3490 if (flags) {
3491 /*
3492 * Currently, we only support two mutually exclusive
3493 * mode flags.
3494 */
3495 if (!strcmp(flags, "static"))
3496 mode_flags |= MPOL_F_STATIC_NODES;
3497 else if (!strcmp(flags, "relative"))
3498 mode_flags |= MPOL_F_RELATIVE_NODES;
3499 else
3500 goto out;
3501 }
3502
3503 new = mpol_new(mode, mode_flags, &nodes);
3504 if (IS_ERR(new))
3505 goto out;
3506
3507 /*
3508 * Save nodes for mpol_to_str() to show the tmpfs mount options
3509 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3510 */
3511 if (mode != MPOL_PREFERRED) {
3512 new->nodes = nodes;
3513 } else if (nodelist) {
3514 nodes_clear(new->nodes);
3515 node_set(first_node(nodes), new->nodes);
3516 } else {
3517 new->mode = MPOL_LOCAL;
3518 }
3519
3520 /*
3521 * Save nodes for contextualization: this will be used to "clone"
3522 * the mempolicy in a specific context [cpuset] at a later time.
3523 */
3524 new->w.user_nodemask = nodes;
3525
3526 err = 0;
3527
3528 out:
3529 /* Restore string for error message */
3530 if (nodelist)
3531 *--nodelist = ':';
3532 if (flags)
3533 *--flags = '=';
3534 if (!err)
3535 *mpol = new;
3536 return err;
3537 }
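
/*
 * Example option strings (hypothetical node numbers): "interleave:0-3"
 * yields MPOL_INTERLEAVE over nodes 0-3; "bind=static:0,2" yields
 * MPOL_BIND with MPOL_F_STATIC_NODES over nodes 0 and 2; "prefer:2"
 * yields MPOL_PREFERRED with a single-node mask; "local" and "default"
 * take no nodelist.  Any nodelist must be a subset of
 * node_states[N_MEMORY].
 */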
3538 #endif /* CONFIG_TMPFS */
3539
3540 /**
3541 * mpol_to_str - format a mempolicy structure for printing
3542 * @buffer: to contain formatted mempolicy string
3543 * @maxlen: length of @buffer
3544 * @pol: pointer to mempolicy to be formatted
3545 *
3546 * Convert @pol into a string. If @buffer is too short, truncate the string.
3547 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3548  * interleave", plus the longest flags, "relative|balancing", and to
3549 * display at least a few node ids.
3550 */
3551 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3552 {
3553 char *p = buffer;
3554 nodemask_t nodes = NODE_MASK_NONE;
3555 unsigned short mode = MPOL_DEFAULT;
3556 unsigned short flags = 0;
3557
3558 if (pol &&
3559 pol != &default_policy &&
3560 !(pol >= &preferred_node_policy[0] &&
3561 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3562 mode = pol->mode;
3563 flags = pol->flags;
3564 }
3565
3566 switch (mode) {
3567 case MPOL_DEFAULT:
3568 case MPOL_LOCAL:
3569 break;
3570 case MPOL_PREFERRED:
3571 case MPOL_PREFERRED_MANY:
3572 case MPOL_BIND:
3573 case MPOL_INTERLEAVE:
3574 case MPOL_WEIGHTED_INTERLEAVE:
3575 nodes = pol->nodes;
3576 break;
3577 default:
3578 WARN_ON_ONCE(1);
3579 snprintf(p, maxlen, "unknown");
3580 return;
3581 }
3582
3583 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3584
3585 if (flags & MPOL_MODE_FLAGS) {
3586 p += snprintf(p, buffer + maxlen - p, "=");
3587
3588 /*
3589 * Static and relative are mutually exclusive.
3590 */
3591 if (flags & MPOL_F_STATIC_NODES)
3592 p += snprintf(p, buffer + maxlen - p, "static");
3593 else if (flags & MPOL_F_RELATIVE_NODES)
3594 p += snprintf(p, buffer + maxlen - p, "relative");
3595
3596 if (flags & MPOL_F_NUMA_BALANCING) {
3597 if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3598 p += snprintf(p, buffer + maxlen - p, "|");
3599 p += snprintf(p, buffer + maxlen - p, "balancing");
3600 }
3601 }
3602
3603 if (!nodes_empty(nodes))
3604 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3605 nodemask_pr_args(&nodes));
3606 }
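/*
 * Examples of strings produced above, assuming the policy_modes[] names
 * defined earlier in this file (e.g. "bind", "interleave"):
 *
 *	interleave:0-3
 *	bind=static:0,2
 *
 * When MPOL_F_NUMA_BALANCING is set as well, "balancing" is appended to
 * the flags ("|balancing" if static/relative is also present).
 * MPOL_DEFAULT and MPOL_LOCAL print just the mode name, since their
 * nodemask stays empty.
 */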
3607
3608 #ifdef CONFIG_SYSFS
3609 struct iw_node_attr {
3610 struct kobj_attribute kobj_attr;
3611 int nid;
3612 };
3613
3614 struct sysfs_wi_group {
3615 struct kobject wi_kobj;
3616 struct mutex kobj_lock;
3617 struct iw_node_attr *nattrs[];
3618 };
3619
3620 static struct sysfs_wi_group *wi_group;
3621
3622 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3623 char *buf)
3624 {
3625 struct iw_node_attr *node_attr;
3626 u8 weight;
3627
3628 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3629 weight = get_il_weight(node_attr->nid);
3630 return sysfs_emit(buf, "%d\n", weight);
3631 }
3632
3633 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3634 const char *buf, size_t count)
3635 {
3636 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3637 struct iw_node_attr *node_attr;
3638 u8 weight = 0;
3639 int i;
3640
3641 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3642 if (count == 0 || sysfs_streq(buf, "") ||
3643 kstrtou8(buf, 0, &weight) || weight == 0)
3644 return -EINVAL;
3645
3646 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3647 GFP_KERNEL);
3648 if (!new_wi_state)
3649 return -ENOMEM;
3650
3651 mutex_lock(&wi_state_lock);
3652 old_wi_state = rcu_dereference_protected(wi_state,
3653 lockdep_is_held(&wi_state_lock));
3654 if (old_wi_state) {
3655 memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3656 nr_node_ids * sizeof(u8));
3657 } else {
3658 for (i = 0; i < nr_node_ids; i++)
3659 new_wi_state->iw_table[i] = 1;
3660 }
3661 new_wi_state->iw_table[node_attr->nid] = weight;
3662 new_wi_state->mode_auto = false;
3663
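/*
 * Publish the new table first; only after every RCU reader that may
 * still see the old table has finished (synchronize_rcu() below) is
 * the old one freed.
 */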
3664 rcu_assign_pointer(wi_state, new_wi_state);
3665 mutex_unlock(&wi_state_lock);
3666 if (old_wi_state) {
3667 synchronize_rcu();
3668 kfree(old_wi_state);
3669 }
3670 return count;
3671 }
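/*
 * Usage sketch for the per-node weight files created further down by
 * sysfs_wi_node_add() (the weight value is an arbitrary example):
 *
 *	echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	cat /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *
 * Writing any weight switches the table to manual mode (mode_auto is
 * cleared); zero or non-numeric input is rejected with -EINVAL.
 */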
3672
3673 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3674 struct kobj_attribute *attr, char *buf)
3675 {
3676 struct weighted_interleave_state *state;
3677 bool wi_auto = true;
3678
3679 rcu_read_lock();
3680 state = rcu_dereference(wi_state);
3681 if (state)
3682 wi_auto = state->mode_auto;
3683 rcu_read_unlock();
3684
3685 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3686 }
3687
3688 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3689 struct kobj_attribute *attr, const char *buf, size_t count)
3690 {
3691 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3692 unsigned int *bw;
3693 bool input;
3694 int i;
3695
3696 if (kstrtobool(buf, &input))
3697 return -EINVAL;
3698
3699 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3700 GFP_KERNEL);
3701 if (!new_wi_state)
3702 return -ENOMEM;
3703 for (i = 0; i < nr_node_ids; i++)
3704 new_wi_state->iw_table[i] = 1;
3705
3706 mutex_lock(&wi_state_lock);
3707 if (!input) {
3708 old_wi_state = rcu_dereference_protected(wi_state,
3709 lockdep_is_held(&wi_state_lock));
3710 if (!old_wi_state)
3711 goto update_wi_state;
3712 if (input == old_wi_state->mode_auto) {
3713 mutex_unlock(&wi_state_lock);
3714 return count;
3715 }
3716
3717 memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3718 nr_node_ids * sizeof(u8));
3719 goto update_wi_state;
3720 }
3721
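/*
 * Switching auto mode on: the weights are re-derived from the per-node
 * bandwidth table, if one has been registered.
 */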
3722 bw = node_bw_table;
3723 if (!bw) {
3724 mutex_unlock(&wi_state_lock);
3725 kfree(new_wi_state);
3726 return -ENODEV;
3727 }
3728
3729 new_wi_state->mode_auto = true;
3730 reduce_interleave_weights(bw, new_wi_state->iw_table);
3731
3732 update_wi_state:
3733 rcu_assign_pointer(wi_state, new_wi_state);
3734 mutex_unlock(&wi_state_lock);
3735 if (old_wi_state) {
3736 synchronize_rcu();
3737 kfree(old_wi_state);
3738 }
3739 return count;
3740 }
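/*
 * Usage sketch for the "auto" toggle (wi_auto_attr below); kstrtobool()
 * accepts 1/0, y/n and on/off style input:
 *
 *	echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/auto
 *	echo 0 > /sys/kernel/mm/mempolicy/weighted_interleave/auto
 *
 * Enabling auto fails with -ENODEV if no node bandwidth table has been
 * registered; disabling it keeps the current weights but marks the
 * table as manually managed.
 */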
3741
3742 static void sysfs_wi_node_delete(int nid)
3743 {
3744 struct iw_node_attr *attr;
3745
3746 if (nid < 0 || nid >= nr_node_ids)
3747 return;
3748
3749 mutex_lock(&wi_group->kobj_lock);
3750 attr = wi_group->nattrs[nid];
3751 if (!attr) {
3752 mutex_unlock(&wi_group->kobj_lock);
3753 return;
3754 }
3755
3756 wi_group->nattrs[nid] = NULL;
3757 mutex_unlock(&wi_group->kobj_lock);
3758
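/*
 * The attribute is no longer reachable via wi_group, so the sysfs file
 * and the attribute itself can be torn down without holding kobj_lock.
 */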
3759 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3760 kfree(attr->kobj_attr.attr.name);
3761 kfree(attr);
3762 }
3763
3764 static void sysfs_wi_node_delete_all(void)
3765 {
3766 int nid;
3767
3768 for (nid = 0; nid < nr_node_ids; nid++)
3769 sysfs_wi_node_delete(nid);
3770 }
3771
3772 static void wi_state_free(void)
3773 {
3774 struct weighted_interleave_state *old_wi_state;
3775
3776 mutex_lock(&wi_state_lock);
3777 old_wi_state = rcu_dereference_protected(wi_state,
3778 lockdep_is_held(&wi_state_lock));
3779 rcu_assign_pointer(wi_state, NULL);
3780 mutex_unlock(&wi_state_lock);
3781
3782 if (old_wi_state) {
3783 synchronize_rcu();
3784 kfree(old_wi_state);
3785 }
3786 }
3787
3788 static struct kobj_attribute wi_auto_attr =
3789 __ATTR(auto, 0664, weighted_interleave_auto_show,
3790 weighted_interleave_auto_store);
3791
3792 static void wi_cleanup(void) {
3793 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3794 sysfs_wi_node_delete_all();
3795 wi_state_free();
3796 }
3797
3798 static void wi_kobj_release(struct kobject *wi_kobj)
3799 {
3800 kfree(wi_group);
3801 }
3802
3803 static const struct kobj_type wi_ktype = {
3804 .sysfs_ops = &kobj_sysfs_ops,
3805 .release = wi_kobj_release,
3806 };
3807
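/*
 * Create the weighted_interleave/node<nid> attribute for @nid. Fails
 * with -EINVAL for an out-of-range node id, -ENOMEM on allocation
 * failure and -EEXIST if the attribute already exists.
 */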
3808 static int sysfs_wi_node_add(int nid)
3809 {
3810 int ret;
3811 char *name;
3812 struct iw_node_attr *new_attr;
3813
3814 if (nid < 0 || nid >= nr_node_ids) {
3815 pr_err("invalid node id: %d\n", nid);
3816 return -EINVAL;
3817 }
3818
3819 new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3820 if (!new_attr)
3821 return -ENOMEM;
3822
3823 name = kasprintf(GFP_KERNEL, "node%d", nid);
3824 if (!name) {
3825 kfree(new_attr);
3826 return -ENOMEM;
3827 }
3828
3829 sysfs_attr_init(&new_attr->kobj_attr.attr);
3830 new_attr->kobj_attr.attr.name = name;
3831 new_attr->kobj_attr.attr.mode = 0644;
3832 new_attr->kobj_attr.show = node_show;
3833 new_attr->kobj_attr.store = node_store;
3834 new_attr->nid = nid;
3835
3836 mutex_lock(&wi_group->kobj_lock);
3837 if (wi_group->nattrs[nid]) {
3838 mutex_unlock(&wi_group->kobj_lock);
3839 ret = -EEXIST;
3840 goto out;
3841 }
3842
3843 ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3844 if (ret) {
3845 mutex_unlock(&wi_group->kobj_lock);
3846 goto out;
3847 }
3848 wi_group->nattrs[nid] = new_attr;
3849 mutex_unlock(&wi_group->kobj_lock);
3850 return 0;
3851
3852 out:
3853 kfree(new_attr->kobj_attr.attr.name);
3854 kfree(new_attr);
3855 return ret;
3856 }
3857
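/*
 * Memory hotplug callback: create the per-node weight attribute when a
 * node gains its first memory and remove it again when the node loses
 * its last memory.
 */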
3858 static int wi_node_notifier(struct notifier_block *nb,
3859 unsigned long action, void *data)
3860 {
3861 int err;
3862 struct node_notify *nn = data;
3863 int nid = nn->nid;
3864
3865 switch (action) {
3866 case NODE_ADDED_FIRST_MEMORY:
3867 err = sysfs_wi_node_add(nid);
3868 if (err)
3869 pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3870 nid, err);
3871 break;
3872 case NODE_REMOVED_LAST_MEMORY:
3873 sysfs_wi_node_delete(nid);
3874 break;
3875 }
3876
3877 return NOTIFY_OK;
3878 }
3879
3880 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3881 {
3882 int nid, err;
3883
3884 wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
3885 GFP_KERNEL);
3886 if (!wi_group)
3887 return -ENOMEM;
3888 mutex_init(&wi_group->kobj_lock);
3889
3890 err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
3891 "weighted_interleave");
3892 if (err)
3893 goto err_put_kobj;
3894
3895 err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3896 if (err)
3897 goto err_put_kobj;
3898
3899 for_each_online_node(nid) {
3900 if (!node_state(nid, N_MEMORY))
3901 continue;
3902
3903 err = sysfs_wi_node_add(nid);
3904 if (err) {
3905 pr_err("failed to add sysfs for node%d during init: %d\n",
3906 nid, err);
3907 goto err_cleanup_kobj;
3908 }
3909 }
3910
3911 hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3912 return 0;
3913
3914 err_cleanup_kobj:
3915 wi_cleanup();
3916 kobject_del(&wi_group->wi_kobj);
3917 err_put_kobj:
3918 kobject_put(&wi_group->wi_kobj);
3919 return err;
3920 }
3921
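/*
 * Builds the mempolicy sysfs hierarchy under mm_kobj, i.e. (on a
 * typical system) the following files, with node entries only for
 * nodes that currently have memory:
 *
 *	/sys/kernel/mm/mempolicy/weighted_interleave/auto
 *	/sys/kernel/mm/mempolicy/weighted_interleave/node0, node1, ...
 */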
3922 static int __init mempolicy_sysfs_init(void)
3923 {
3924 int err;
3925 static struct kobject *mempolicy_kobj;
3926
3927 mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3928 if (!mempolicy_kobj)
3929 return -ENOMEM;
3930
3931 err = add_weighted_interleave_group(mempolicy_kobj);
3932 if (err)
3933 goto err_kobj;
3934
3935 return 0;
3936
3937 err_kobj:
3938 kobject_del(mempolicy_kobj);
3939 kobject_put(mempolicy_kobj);
3940 return err;
3941 }
3942
3943 late_initcall(mempolicy_sysfs_init);
3944 #endif /* CONFIG_SYSFS */
3945