1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support six policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * weighted interleave
23 * Allocate memory interleaved over a set of nodes based on
24 * a set of weights (per-node), with normal fallback if it
25 * fails. Otherwise operates the same as interleave.
26 * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27 * on node 0 for every 1 page allocated on node 1.
28 *
29 * bind Only allocate memory on a specific set of nodes,
30 * no fallback.
31 * FIXME: memory is allocated starting with the first node
32 * to the last. It would be better if bind would truly restrict
33 * the allocation to memory nodes instead
34 *
35 * preferred Try a specific node first before normal fallback.
36 * As a special case NUMA_NO_NODE here means do the allocation
37 * on the local CPU. This is normally identical to default,
38 * but useful to set in a VMA when you have a non default
39 * process policy.
40 *
41 * preferred many Try a set of nodes first before normal fallback. This is
42 * similar to preferred without the special case.
43 *
44 * default Allocate on the local node first, or when on a VMA
45 * use the process policy. This is what Linux always did
46 * in a NUMA aware kernel and still does by, ahem, default.
47 *
48 * The process policy is applied for most non interrupt memory allocations
49 * in that process' context. Interrupts ignore the policies and always
50 * try to allocate on the local CPU. The VMA policy is only applied for memory
51 * allocations for a VMA in the VM.
52 *
53 * Currently there are a few corner cases in swapping where the policy
54 * is not applied, but the majority should be handled. When process policy
55 * is used it is not remembered over swap outs/swap ins.
56 *
57 * Only the highest zone in the zone hierarchy gets policied. Allocations
58 * requesting a lower zone just use default policy. This implies that
59 * on systems with highmem kernel lowmem allocation don't get policied.
60 * Same with GFP_DMA allocations.
61 *
62 * For shmem/tmpfs shared memory the policy is shared between
63 * all users and remembered even when nobody has memory mapped.
64 */
65
66 /* Notebook:
67 fix mmap readahead to honour policy and enable policy for any page cache
68 object
69 statistics for bigpages
70 global policy for page cache? currently it uses process policy. Requires
71 first item above.
72 handle mremap for shared memory (currently ignored for the policy)
73 grows down?
74 make bind policy root only? It can trigger oom much faster and the
75 kernel is not always grateful with that.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120
121 #include "internal.h"
122
123 /* Internal flags */
124 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
125 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
126 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
127
128 static struct kmem_cache *policy_cache;
129 static struct kmem_cache *sn_cache;
130
131 /* Highest zone. An specific allocation for a zone below that is not
132 policied. */
133 enum zone_type policy_zone = 0;
134
135 /*
136 * run-time system-wide default policy => local allocation
137 */
138 static struct mempolicy default_policy = {
139 .refcnt = ATOMIC_INIT(1), /* never free it */
140 .mode = MPOL_LOCAL,
141 };
142
143 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
144
145 /*
146 * weightiness balances the tradeoff between small weights (cycles through nodes
147 * faster, more fair/even distribution) and large weights (smaller errors
148 * between actual bandwidth ratios and weight ratios). 32 is a number that has
149 * been found to perform at a reasonable compromise between the two goals.
150 */
151 static const int weightiness = 32;
152
153 /*
154 * A null weighted_interleave_state is interpreted as having .mode="auto",
155 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156 */
157 struct weighted_interleave_state {
158 bool mode_auto;
159 u8 iw_table[];
160 };
161 static struct weighted_interleave_state __rcu *wi_state;
162 static unsigned int *node_bw_table;
163
164 /*
165 * wi_state_lock protects both wi_state and node_bw_table.
166 * node_bw_table is only used by writers to update wi_state.
167 */
168 static DEFINE_MUTEX(wi_state_lock);
169
get_il_weight(int node)170 static u8 get_il_weight(int node)
171 {
172 struct weighted_interleave_state *state;
173 u8 weight = 1;
174
175 rcu_read_lock();
176 state = rcu_dereference(wi_state);
177 if (state)
178 weight = state->iw_table[node];
179 rcu_read_unlock();
180 return weight;
181 }
182
183 /*
184 * Convert bandwidth values into weighted interleave weights.
185 * Call with wi_state_lock.
186 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	/* Total bandwidth across all memory nodes; may exceed 32 bits. */
	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			/* no/negligible bandwidth data: minimum weight */
			new_iw[nid] = 1;
		}
		/* Seed the running GCD with the first weight computed. */
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}
219
/*
 * Record bandwidth info for @node and, in "auto" mode, recompute and
 * publish a new weighted-interleave weight table derived from it.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.  Safe against
 * concurrent readers of wi_state: the new table is published with
 * rcu_assign_pointer() and the old one freed only after a grace period.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	/* The node is only as fast as its slower direction. */
	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	/* Allocate struct plus trailing iw_table[nr_node_ids]. */
	new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	/* Swap in a bandwidth table that includes this node's new value. */
	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
		lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		/* Wait out RCU readers of the old table before freeing it. */
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	/* old_bw is NULL on first call; kfree(NULL) is a no-op. */
	kfree(old_bw);
	return 0;
}
276
277 /**
278 * numa_nearest_node - Find nearest node by state
279 * @node: Node id to start the search
280 * @state: State to filter the search
281 *
 * Lookup the closest node by distance if @node is not in @state.
283 *
284 * Return: this @node if it is in state, otherwise the closest node by distance
285 */
numa_nearest_node(int node,unsigned int state)286 int numa_nearest_node(int node, unsigned int state)
287 {
288 int min_dist = INT_MAX, dist, n, min_node;
289
290 if (state >= NR_NODE_STATES)
291 return -EINVAL;
292
293 if (node == NUMA_NO_NODE || node_state(node, state))
294 return node;
295
296 min_node = node;
297 for_each_node_state(n, state) {
298 dist = node_distance(node, n);
299 if (dist < min_dist) {
300 min_dist = dist;
301 min_node = n;
302 }
303 }
304
305 return min_node;
306 }
307 EXPORT_SYMBOL_GPL(numa_nearest_node);
308
309 /**
310 * nearest_node_nodemask - Find the node in @mask at the nearest distance
311 * from @node.
312 *
313 * @node: a valid node ID to start the search from.
314 * @mask: a pointer to a nodemask representing the allowed nodes.
315 *
316 * This function iterates over all nodes in @mask and calculates the
317 * distance from the starting @node, then it returns the node ID that is
318 * the closest to @node, or MAX_NUMNODES if no node is found.
319 *
320 * Note that @node must be a valid node ID usable with node_distance(),
321 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
322 * or unexpected behavior.
323 */
int nearest_node_nodemask(int node, nodemask_t *mask)
{
	int best = MAX_NUMNODES;	/* returned when @mask is empty */
	int best_dist = INT_MAX;
	int candidate;

	for_each_node_mask(candidate, *mask) {
		int dist = node_distance(node, candidate);

		if (dist < best_dist) {
			best_dist = dist;
			best = candidate;
		}
	}

	return best;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);
339
get_task_policy(struct task_struct * p)340 struct mempolicy *get_task_policy(struct task_struct *p)
341 {
342 struct mempolicy *pol = p->mempolicy;
343 int node;
344
345 if (pol)
346 return pol;
347
348 node = numa_node_id();
349 if (node != NUMA_NO_NODE) {
350 pol = &preferred_node_policy[node];
351 /* preferred_node_policy is not initialised early in boot */
352 if (pol->mode)
353 return pol;
354 }
355
356 return &default_policy;
357 }
358 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
359
360 static const struct mempolicy_operations {
361 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
362 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
363 } mpol_ops[MPOL_MAX];
364
/*
 * Does @pol remember the user's original nodemask (MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES)?  Such policies rebind against w.user_nodemask
 * rather than a cpuset mems snapshot.  Returns the raw flag bits
 * (non-zero == true).
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_USER_NODEMASK_FLAGS;
}
369
/*
 * Map a relative nodemask @orig onto the actual allowed set @rel:
 * fold @orig onto the first nodes_weight(@rel) bits, then spread those
 * bits onto the members of @rel.  Result is written to @ret.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t folded;

	nodes_fold(folded, *orig, nodes_weight(*rel));
	nodes_onto(*ret, folded, *rel);
}
377
mpol_new_nodemask(struct mempolicy * pol,const nodemask_t * nodes)378 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
379 {
380 if (nodes_empty(*nodes))
381 return -EINVAL;
382 pol->nodes = *nodes;
383 return 0;
384 }
385
mpol_new_preferred(struct mempolicy * pol,const nodemask_t * nodes)386 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
387 {
388 if (nodes_empty(*nodes))
389 return -EINVAL;
390
391 nodes_clear(pol->nodes);
392 node_set(first_node(*nodes), pol->nodes);
393 return 0;
394 }
395
396 /*
397 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
398 * any, for the new policy. mpol_new() has already validated the nodes
399 * parameter with respect to the policy mode and flags.
400 *
401 * Must be called holding task's alloc_lock to protect task's mems_allowed
402 * and mempolicy. May also be called holding the mmap_lock for write.
403 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	/* Every non-DEFAULT/LOCAL mode requires a nodemask at this point. */
	VM_BUG_ON(!nodes);

	/*
	 * mask2 becomes the effective nodemask: either the user's relative
	 * mask mapped onto the allowed nodes, or the plain intersection.
	 */
	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	/*
	 * Remember what to rebind against later: the user's original mask
	 * for STATIC/RELATIVE policies, otherwise a snapshot of the current
	 * cpuset mems.
	 */
	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}
436
437 /*
438 * This function just creates a new policy, does some check and simple
439 * initialization. You must invoke mpol_set_nodemask() to set nodes.
440 */
/*
 * Allocate and minimally initialize a mempolicy for (@mode, @flags).
 * Validates the mode/flags/nodemask combination; the nodemask itself is
 * installed later via mpol_set_nodemask().
 *
 * Returns NULL for MPOL_DEFAULT (no policy object needed), an ERR_PTR on
 * invalid combinations or allocation failure, otherwise a refcounted policy.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *pol;
	bool have_nodes;

	if (mode == MPOL_DEFAULT) {
		/* DEFAULT takes no nodemask and needs no object at all. */
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	have_nodes = !nodes_empty(*nodes);

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * An empty mask means "prefer local", unless STATIC/RELATIVE
		 * flags were given — those require a user nodemask.
		 */
		if (!have_nodes) {
			if (flags & (MPOL_F_STATIC_NODES |
				     MPOL_F_RELATIVE_NODES))
				return ERR_PTR(-EINVAL);
			mode = MPOL_LOCAL;
		}
		break;
	case MPOL_LOCAL:
		/* LOCAL accepts neither a nodemask nor nodemask flags. */
		if (have_nodes ||
		    (flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)))
			return ERR_PTR(-EINVAL);
		break;
	default:
		/* All remaining modes require a non-empty nodemask. */
		if (!have_nodes)
			return ERR_PTR(-EINVAL);
		break;
	}

	pol = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!pol)
		return ERR_PTR(-ENOMEM);
	atomic_set(&pol->refcnt, 1);
	pol->mode = mode;
	pol->flags = flags;
	pol->home_node = NUMA_NO_NODE;

	return pol;
}
484
485 /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * pol)486 void __mpol_put(struct mempolicy *pol)
487 {
488 if (!atomic_dec_and_test(&pol->refcnt))
489 return;
490 /*
491 * Required to allow mmap_lock_speculative*() access, see for example
492 * futex_key_to_node_opt(). All accesses are serialized by mmap_lock,
493 * however the speculative lock section unbound by the normal lock
494 * boundaries, requiring RCU freeing.
495 */
496 kfree_rcu(pol, rcu);
497 }
498 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
499
/* ->rebind for MPOL_DEFAULT/MPOL_LOCAL: no nodemask to migrate, no-op. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
503
/*
 * ->rebind for the nodemask-carrying modes: recompute pol->nodes after the
 * allowed set changed to @nodes.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		/* static: just clip the user's mask to the new allowed set */
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		/* relative: remap the user's mask onto the new allowed set */
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * Neither flag: remap the current nodes from the old cpuset
		 * mems to the new ones, and remember the new snapshot.
		 */
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	/* Never leave the policy with an empty nodemask. */
	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}
523
/*
 * ->rebind for MPOL_PREFERRED/MPOL_PREFERRED_MANY: only the cpuset mems
 * snapshot is refreshed; the preferred node(s) are left as they are.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
529
530 /*
531 * mpol_rebind_policy - Migrate a policy to a different set of nodes
532 *
533 * Per-vma policies are protected by mmap_lock. Allocations using per-task
534 * policies are protected by task->mems_allowed_seq to prevent a premature
535 * OOM/allocation failure due to parallel nodemask modification.
536 */
mpol_rebind_policy(struct mempolicy * pol,const nodemask_t * newmask)537 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
538 {
539 if (!pol || pol->mode == MPOL_LOCAL)
540 return;
541 if (!mpol_store_user_nodemask(pol) &&
542 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
543 return;
544
545 mpol_ops[pol->mode].rebind(pol, newmask);
546 }
547
548 /*
549 * Wrapper for mpol_rebind_policy() that just requires task
550 * pointer, and updates task mempolicy.
551 *
552 * Called with task's alloc_lock held.
553 */
/* Rebind @tsk's own mempolicy to the new allowed nodemask @new. */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
558
559 /*
560 * Rebind each vma in mm to new nodemask.
561 *
562 * Call holding a reference to mm. Takes mm->mmap_lock during call.
563 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);

	/* Visit every vma and rebind its (possibly NULL) policy. */
	for_each_vma(vmi, vma) {
		/* Exclude lockless vma readers before touching vm_policy. */
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}

	mmap_write_unlock(mm);
}
576
/*
 * Per-mode operations.  Modes without a ->create (DEFAULT, LOCAL) carry no
 * nodemask; every mode provides a ->rebind.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};

/* Defined later in this file; needed by the page-walk callbacks below. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				pgoff_t ilx, int *nid);
610
strictly_unmovable(unsigned long flags)611 static bool strictly_unmovable(unsigned long flags)
612 {
613 /*
614 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
615 * if any misplaced page is found.
616 */
617 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
618 MPOL_MF_STRICT;
619 }
620
/* Bundles a policy + interleave index for alloc_migration_target_by_mpol(). */
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;
	pgoff_t ilx;
};

/* Shared state for the queue_pages page-table walk (walk->private). */
struct queue_pages {
	struct list_head *pagelist;	/* isolated folios queued for migration */
	unsigned long flags;		/* MPOL_MF_* incl. internal bits */
	nodemask_t *nmask;		/* nodes folios are required to be on */
	unsigned long start;		/* range being walked */
	unsigned long end;
	struct vm_area_struct *first;	/* first vma seen; NULL => all in hole */
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
636
637 /*
638 * Check if the folio's nid is in qp->nmask.
639 *
640 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
641 * in the invert of qp->nmask.
642 */
queue_folio_required(struct folio * folio,struct queue_pages * qp)643 static inline bool queue_folio_required(struct folio *folio,
644 struct queue_pages *qp)
645 {
646 int nid = folio_nid(folio);
647 unsigned long flags = qp->flags;
648
649 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
650 }
651
/*
 * Handle a PMD-mapped THP during the queue_pages walk: isolate and queue
 * the folio if it is misplaced and movable, otherwise count it as failed.
 * Called with the pmd lock held by queue_folios_pte_range().
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	/* A THP already under migration cannot be queued again. */
	if (unlikely(pmd_is_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	/* The huge zero folio is never migrated; keep walking past it. */
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	/* Misplaced: queue it for migration, or record the failure. */
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}
673
674 /*
675 * Scan through folios, checking if they satisfy the required conditions,
676 * moving them from LRU to local pagelist for migration if they do (or not).
677 *
678 * queue_folios_pte_range() has two possible return values:
679 * 0 - continue walking to scan for more, even if an existing folio on the
680 * wrong node could not be isolated and queued for migration.
681 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
682 * and an existing folio was on a node that does not follow the policy.
683 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A PMD-mapped THP is handled wholesale, without a PTE scan. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* PTE table vanished under us (e.g. THP collapse): retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	/* nr is how many PTEs the current iteration consumed (batching). */
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			const softleaf_t entry = softleaf_from_pte(ptent);

			/* Pages already under migration count as failures. */
			if (softleaf_is_migration(entry))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* Batch consecutive PTEs of the same large folio. */
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, pte, ptent, max_nr);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	/* See function header comment for the 0 / -EIO contract. */
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
770
/*
 * queue_pages walk callback for hugetlb mappings: isolate a misplaced
 * hugetlb folio for migration, or account it in qp->nr_failed.
 * Compiled out to "return 0" when CONFIG_HUGETLB_PAGE is off.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptep;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptep = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptep)) {
		/* A non-none non-present entry may be a migration entry. */
		if (!huge_pte_none(ptep)) {
			const softleaf_t entry = softleaf_from_pte(ptep);

			if (unlikely(softleaf_is_migration(entry)))
				qp->nr_failed++;
		}

		goto unlock;
	}
	folio = pfn_folio(pte_pfn(ptep));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	/* Same 0 / -EIO contract as queue_folios_pte_range(). */
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
820
821 #ifdef CONFIG_NUMA_BALANCING
822 /**
823 * folio_can_map_prot_numa() - check whether the folio can map prot numa
824 * @folio: The folio whose mapping considered for being made NUMA hintable
825 * @vma: The VMA that the folio belongs to.
826 * @is_private_single_threaded: Is this a single-threaded private VMA or not
827 *
828 * This function checks to see if the folio actually indicates that
829 * we need to make the mapping one which causes a NUMA hinting fault,
830 * as there are cases where it's simply unnecessary, and the folio's
831 * access time is adjusted for memory tiering if prot numa needed.
832 *
833 * Return: True if the mapping of the folio needs to be changed, false otherwise.
834 */
bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
			     bool is_private_single_threaded)
{
	int nid;

	/* Zone-device and KSM folios are not candidates for NUMA hinting. */
	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
		return false;

	/* Also skip shared copy-on-write folios */
	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
		return false;

	/* Folios are pinned and can't be migrated */
	if (folio_maybe_dma_pinned(folio))
		return false;

	/*
	 * While migration can move some dirty folios,
	 * it cannot move them all from MIGRATE_ASYNC
	 * context.
	 */
	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
		return false;

	/*
	 * Don't mess with PTEs if folio is already on the node
	 * a single-threaded process is running on.
	 */
	nid = folio_nid(folio);
	if (is_private_single_threaded && (nid == numa_node_id()))
		return false;

	/*
	 * Skip scanning top tier node if normal numa
	 * balancing is disabled
	 */
	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	    node_is_toptier(nid))
		return false;

	/* Memory tiering: stamp the access time before making it hintable. */
	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	return true;
}
880
881 /*
882 * This is used to mark a range of virtual addresses to be inaccessible.
883 * These are later cleared by a NUMA hinting fault. Depending on these
884 * faults, pages may be migrated for better NUMA placement.
885 *
886 * This is assuming that NUMA faults are handled using PROT_NONE. If
887 * an architecture makes a different choice, it will need further
888 * changes to the core.
889 */
change_prot_numa(struct vm_area_struct * vma,unsigned long addr,unsigned long end)890 unsigned long change_prot_numa(struct vm_area_struct *vma,
891 unsigned long addr, unsigned long end)
892 {
893 struct mmu_gather tlb;
894 long nr_updated;
895
896 tlb_gather_mmu(&tlb, vma->vm_mm);
897
898 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
899 if (nr_updated > 0) {
900 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
901 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
902 }
903
904 tlb_finish_mmu(&tlb);
905
906 return nr_updated;
907 }
908 #endif /* CONFIG_NUMA_BALANCING */
909
/*
 * ->test_walk for the queue_pages walk: enforce range contiguity (unless
 * MPOL_MF_DISCONTIG_OK) and decide whether this vma needs a page scan.
 * Returns 0 to scan the vma, 1 to skip it, -EFAULT on a hole.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}
950
/* Walk ops used when the vmas only need to be read-locked. */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

/* Same callbacks, but write-locks each walked vma (MPOL_MF_WRLOCK). */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
964
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};
	/* MPOL_MF_WRLOCK selects the variant that write-locks each VMA. */
	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	/* On success report how many folios could not be isolated. */
	return err ? : qp.nr_failed;
}
1004
/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 *
 * Duplicates @pol so the VMA holds its own reference; on success the
 * old policy's reference is dropped.  Returns 0 or a negative errno.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/* Let the backing object (e.g. a shared mapping) record it first. */
	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */
	mpol_put(old);

	return 0;
err_out:
	mpol_put(new);
	return err;
}
1037
/*
 * Split or merge the VMA (if required) and apply the new policy.
 *
 * @prev is updated to the last VMA processed so the caller's iteration
 * can continue from it.  Returns 0 or a negative errno.
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	/* Clamp the affected span to this VMA's extent. */
	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* Nothing to do when the VMA already has an equivalent policy. */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	/* May split @vma, or merge it with neighbours, to fit the span. */
	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}
1065
/*
 * Set the process memory policy.
 *
 * Installs a new mempolicy on 'current', dropping the reference on the
 * old one.  Returns 0 or a negative errno.
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* task_lock() serializes against readers of current->mempolicy. */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	/* Restart (weighted) interleaving from the start of the new mask. */
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
1105
1106 /*
1107 * Return nodemask for policy for get_mempolicy() query
1108 *
1109 * Called with task's alloc_lock held
1110 */
get_policy_nodemask(struct mempolicy * pol,nodemask_t * nodes)1111 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1112 {
1113 nodes_clear(*nodes);
1114 if (pol == &default_policy)
1115 return;
1116
1117 switch (pol->mode) {
1118 case MPOL_BIND:
1119 case MPOL_INTERLEAVE:
1120 case MPOL_PREFERRED:
1121 case MPOL_PREFERRED_MANY:
1122 case MPOL_WEIGHTED_INTERLEAVE:
1123 *nodes = pol->nodes;
1124 break;
1125 case MPOL_LOCAL:
1126 /* return empty node mask for local allocation */
1127 break;
1128 default:
1129 BUG();
1130 }
1131 }
1132
lookup_node(struct mm_struct * mm,unsigned long addr)1133 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1134 {
1135 struct page *p = NULL;
1136 int ret;
1137
1138 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1139 if (ret > 0) {
1140 ret = page_to_nid(p);
1141 put_page(p);
1142 }
1143 return ret;
1144 }
1145
/*
 * Retrieve NUMA policy.
 *
 * Reports the policy mode in *@policy and, when @nmask is non-NULL, the
 * policy's nodemask.  With MPOL_F_ADDR the VMA/shared policy at @addr is
 * queried instead of the task policy.  Returns 0 or a negative errno.
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MEMS_ALLOWED stands alone: report the cpuset mask only. */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			/* lookup_node() may fault, hence the unlock above */
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* report the node the next allocation would use */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1247
1248 #ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio from its LRU and queue it on @foliolist for
 * migration.  Returns false only when a folio that should move could
 * not be isolated; deliberately skipping a shared folio (without
 * MPOL_MF_MOVE_ALL) still returns true.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
		if (folio_isolate_lru(folio)) {
			list_add_tail(&folio->lru, foliolist);
			node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		} else {
			/*
			 * Non-movable folio may reach here.  And, there may be
			 * temporary off LRU folios or non-LRU movable folios.
			 * Treat them as unmovable folios since they can't be
			 * isolated, so they can't be moved at the moment.
			 */
			return false;
		}
	}
	return true;
}
1277
/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	/* __GFP_THISNODE pins new pages to @dest, no fallback node. */
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	/* queue everything currently resident on @source */
	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* fold isolation failures into the not-migrated count */
	if (err >= 0)
		err += nr_failed;
	return err;
}
1329
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved, or a negative
 * errno on failure.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	/* keep pages off the per-CPU LRU caches while we migrate */
	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	/* clamp: the syscall ABI returns an int-sized count */
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
1430
/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 *
 * @private carries a struct migration_mpol with the policy and the base
 * interleave index; the source folio's index is added so interleaving
 * stays consistent across the migrated range.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	/* advance the interleave index by the folio's file position */
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		/* hugetlb has its own allocator and gfp mask */
		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
1465 #else
1466
/* !CONFIG_MIGRATION: nothing can ever be queued for migration. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}
1472
/* !CONFIG_MIGRATION: migrate_pages(2) is unsupported. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1478
/* !CONFIG_MIGRATION: no migration targets can be allocated. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
1484 #endif
1485
/*
 * Implement mbind(2): set policy @mode for [start, start+len) of the
 * current task's address space and, with MPOL_MF_MOVE/MOVE_ALL, migrate
 * already-present pages to conform to it.  Returns 0 or a negative errno.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	/* moving pages regardless of sharing needs CAP_SYS_NICE */
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	/* NOTE: mpol_new() returns NULL (not an error) for MPOL_DEFAULT */
	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* on success the mmap_lock stays held for writing
			 * until the mmap_write_unlock() below */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		/* apply the new policy across every VMA in the range */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* find a queued folio usable for the ilx lookup */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
1630
1631 /*
1632 * User space interface with variable sized bitmaps for nodelists.
1633 */
/*
 * Fetch a user bitmap of @maxnode bits into @mask, handling the compat
 * layout, and clear any bits beyond @maxnode in the final word.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long partial = maxnode % BITS_PER_LONG;
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int err;

	if (in_compat_syscall())
		err = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		err = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));
	if (err)
		return -EFAULT;

	/* the last word may be only partially covered by @maxnode */
	if (partial)
		mask[nlongs - 1] &= (1UL << partial) - 1;

	return 0;
}
1656
/*
 * Copy a node mask from user space.
 *
 * @maxnode is the user-supplied bit count (one more than the highest
 * usable bit index, hence the decrement below).  Bits at or above
 * MAX_NUMNODES must all be zero, otherwise -EINVAL.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/* word straddles MAX_NUMNODES: keep only the bits above it */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
1692
/*
 * Copy a kernel node mask to user space.
 *
 * When the user buffer covers more bits than the kernel tracks
 * (nr_node_ids), the excess tail is zeroed with clear_user().
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	/* bytes needed for maxnode bits, rounded up to 64-bit words */
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
1719
1720 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)1721 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1722 {
1723 *flags = *mode & MPOL_MODE_FLAGS;
1724 *mode &= ~MPOL_MODE_FLAGS;
1725
1726 if ((unsigned int)(*mode) >= MPOL_MAX)
1727 return -EINVAL;
1728 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1729 return -EINVAL;
1730 if (*flags & MPOL_F_NUMA_BALANCING) {
1731 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1732 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1733 else
1734 return -EINVAL;
1735 }
1736 return 0;
1737 }
1738
/* Common entry for the mbind() syscall: validate and dispatch to do_mbind(). */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	nodemask_t nodes;
	unsigned short mode_flags;
	int lmode = mode;
	int ret;

	start = untagged_addr(start);

	ret = sanitize_mpol_flags(&lmode, &mode_flags);
	if (!ret)
		ret = get_nodes(&nodes, nmask, maxnode);
	if (ret)
		return ret;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
1759
/*
 * set_mempolicy_home_node(2): set the preferred "home node" for all
 * MPOL_BIND / MPOL_PREFERRED_MANY policies in [start, start+len).
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error.  We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* work on a copy; mbind_range() installs it on the VMA */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1826
/* mbind(2) syscall entry point. */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1833
1834 /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1835 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1836 unsigned long maxnode)
1837 {
1838 unsigned short mode_flags;
1839 nodemask_t nodes;
1840 int lmode = mode;
1841 int err;
1842
1843 err = sanitize_mpol_flags(&lmode, &mode_flags);
1844 if (err)
1845 return err;
1846
1847 err = get_nodes(&nodes, nmask, maxnode);
1848 if (err)
1849 return err;
1850
1851 return do_set_mempolicy(lmode, mode_flags, &nodes);
1852 }
1853
/* set_mempolicy(2) syscall entry point. */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1859
/*
 * Common entry for migrate_pages(2): move pages of task @pid from
 * @old_nodes to @new_nodes, after permission and cpuset checks.
 * Returns the number of pages not moved, or a negative errno.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	/* pin the task so it stays valid after rcu_read_unlock() */
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* restrict targets to the caller's own allowed nodes */
	task_nodes = cpuset_mems_allowed(current);
	if (!nodes_and(*new, *new, task_nodes))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
1945
/* migrate_pages(2) syscall entry point. */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1952
1953 /* Retrieve NUMA policy */
kernel_get_mempolicy(int __user * policy,unsigned long __user * nmask,unsigned long maxnode,unsigned long addr,unsigned long flags)1954 static int kernel_get_mempolicy(int __user *policy,
1955 unsigned long __user *nmask,
1956 unsigned long maxnode,
1957 unsigned long addr,
1958 unsigned long flags)
1959 {
1960 int err;
1961 int pval;
1962 nodemask_t nodes;
1963
1964 if (nmask != NULL && maxnode < nr_node_ids)
1965 return -EINVAL;
1966
1967 addr = untagged_addr(addr);
1968
1969 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1970
1971 if (err)
1972 return err;
1973
1974 if (policy && put_user(pval, policy))
1975 return -EFAULT;
1976
1977 if (nmask)
1978 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1979
1980 return err;
1981 }
1982
/* get_mempolicy(2) syscall entry point. */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1989
/* Can pages in @vma be migrated between NUMA nodes? */
bool vma_migratable(struct vm_area_struct *vma)
{
	/* I/O and raw-PFN mappings have no migratable struct pages */
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	/* some hugetlb page sizes cannot be migrated */
	if (is_vm_hugetlb_page(vma) &&
		!hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone.  If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}
2017
/*
 * Return the raw policy for @vma at @addr, via the vm_ops get_policy()
 * hook when one exists, else the VMA's own policy (may be NULL).
 * *@ilx is zeroed first; get_policy() may set an interleave index.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr, ilx);
	return vma->vm_policy;
}
2025
2026 /*
2027 * get_vma_policy(@vma, @addr, @order, @ilx)
2028 * @vma: virtual memory area whose policy is sought
2029 * @addr: address in @vma for shared policy lookup
2030 * @order: 0, or appropriate huge_page_order for interleaving
2031 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2032 * MPOL_WEIGHTED_INTERLEAVE
2033 *
2034 * Returns effective policy for a VMA at specified address.
2035 * Falls back to current->mempolicy or system default policy, as necessary.
2036 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2037 * count--added by the get_policy() vm_op, as appropriate--to protect against
2038 * freeing by another task. It is the caller's responsibility to free the
2039 * extra reference for shared policies.
2040 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
		unsigned long addr, int order, pgoff_t *ilx)
{
	struct mempolicy *pol = __get_vma_policy(vma, addr, ilx);

	if (!pol)
		pol = get_task_policy(current);

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/* Fold the mapping offset of @addr into the interleave index. */
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
		break;
	default:
		break;
	}
	return pol;
}
2056
vma_policy_mof(struct vm_area_struct * vma)2057 bool vma_policy_mof(struct vm_area_struct *vma)
2058 {
2059 struct mempolicy *pol;
2060
2061 if (vma->vm_ops && vma->vm_ops->get_policy) {
2062 bool ret = false;
2063 pgoff_t ilx; /* ignored here */
2064
2065 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2066 if (pol && (pol->flags & MPOL_F_MOF))
2067 ret = true;
2068 mpol_cond_put(pol);
2069
2070 return ret;
2071 }
2072
2073 pol = vma->vm_policy;
2074 if (!pol)
2075 pol = get_task_policy(current);
2076
2077 return pol->flags & MPOL_F_MOF;
2078 }
2079
/*
 * apply_policy_zone - should the policy's nodemask constrain an allocation
 * for @zone? True when @zone is at or above the lowest zone the policy's
 * nodes can actually serve.
 */
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->nodes is intersect with node_states[N_MEMORY].
	 * so if the following test fails, it implies
	 * policy->nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}
2099
/*
 * Pick the next node for dynamic MPOL_WEIGHTED_INTERLEAVE.
 *
 * The task keeps a cursor in current->il_prev plus a countdown in
 * current->il_weight: a node is reused until its weight is exhausted,
 * then the cursor advances to the next node in policy->nodes and the
 * countdown is reloaded from the node's interleave weight.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		/* weight used up (or node dropped from the mask): advance */
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		/* next_node_in() yields MAX_NUMNODES on an empty mask */
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}
2121
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* next_node_in() returns MAX_NUMNODES when the nodemask is empty */
	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}
2138
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	/* Interrupt context has no task policy; use the local memory node. */
	if (!in_task())
		return node;

	policy = current->mempolicy;
	if (!policy)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		return first_node(policy->nodes);

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_WEIGHTED_INTERLEAVE:
		return weighted_interleave_nodes(policy);

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
	{
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->nodes);
		/* fall back to the local node if no zone matched the mask */
		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
	}
	case MPOL_LOCAL:
		return node;

	default:
		BUG();
	}
}
2188
/*
 * Snapshot pol->nodes into *mask and return the number of nodes set.
 * The snapshot lets callers iterate a stable copy even if the policy
 * is concurrently rebound.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes. Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}
2202
/*
 * Static weighted interleave: map interleave index @ilx to a node id.
 * Conceptually each node in pol->nodes owns a run of consecutive slots
 * equal to its weight in wi_state->iw_table; @ilx modulo the total weight
 * selects the slot and hence the node.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();

	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}
2242
2243 /*
2244 * Do static interleaving for interleave index @ilx. Returns the ilx'th
2245 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2246 * exceeds the number of present nodes.
2247 */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2248 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2249 {
2250 nodemask_t nodemask;
2251 unsigned int target, nnodes;
2252 int i;
2253 int nid;
2254
2255 nnodes = read_once_policy_nodemask(pol, &nodemask);
2256 if (!nnodes)
2257 return numa_node_id();
2258 target = ilx % nnodes;
2259 nid = first_node(nodemask);
2260 for (i = 0; i < target; i++)
2261 nid = next_node(nid, nodemask);
2262 return nid;
2263 }
2264
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 * A NULL return means the allocation is not constrained by a nodemask.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				   pgoff_t ilx, int *nid)
{
	nodemask_t *nodemask = NULL;

	switch (pol->mode) {
	case MPOL_PREFERRED:
		/* Override input node id */
		*nid = first_node(pol->nodes);
		break;
	case MPOL_PREFERRED_MANY:
		nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		break;
	case MPOL_BIND:
		/* Restrict to nodemask (but not on lower zones) */
		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
			nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		/*
		 * __GFP_THISNODE shouldn't even be used with the bind policy
		 * because we might easily break the expectation to stay on the
		 * requested node and not break the policy.
		 */
		WARN_ON_ONCE(gfp & __GFP_THISNODE);
		break;
	case MPOL_INTERLEAVE:
		/* Override input node id */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			interleave_nodes(pol) : interleave_nid(pol, ilx);
		break;
	case MPOL_WEIGHTED_INTERLEAVE:
		/* dynamic (task-cursor) vs static (index-based) selection */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			weighted_interleave_nodes(pol) :
			weighted_interleave_nid(pol, ilx);
		break;
	}

	return nodemask;
}
2312
2313 #ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
 * to the mempolicy's @nodemask for filtering the zonelist.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
		struct mempolicy **mpol, nodemask_t **nodemask)
{
	pgoff_t ilx;
	int nid;

	nid = numa_node_id();
	/* interleave at huge-page granularity, hence the hstate order */
	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
	return nid;
}
2338
/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy. Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining it's own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;

	if (!(mask && current->mempolicy))
		return false;

	/* task_lock() stabilizes ->mempolicy against concurrent replacement */
	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		*mask = mempolicy->nodes;
		break;

	case MPOL_LOCAL:
		/* local policy: the mask is just the current node */
		init_nodemask_of_node(mask, numa_node_id());
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
2384 #endif
2385
2386 /*
2387 * mempolicy_in_oom_domain
2388 *
2389 * If tsk's mempolicy is "bind", check for intersection between mask and
2390 * the policy nodemask. Otherwise, return true for all other policies
2391 * including "interleave", as a tsk with "interleave" policy may have
2392 * memory allocated from all nodes in system.
2393 *
2394 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2395 */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2396 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2397 const nodemask_t *mask)
2398 {
2399 struct mempolicy *mempolicy;
2400 bool ret = true;
2401
2402 if (!mask)
2403 return ret;
2404
2405 task_lock(tsk);
2406 mempolicy = tsk->mempolicy;
2407 if (mempolicy && mempolicy->mode == MPOL_BIND)
2408 ret = nodes_intersects(mempolicy->nodes, *mask);
2409 task_unlock(tsk);
2410
2411 return ret;
2412 }
2413
/*
 * Two-pass allocation for MPOL_PREFERRED_MANY: first restrict to the
 * preferred nodes with reclaim disabled so a miss fails fast and quietly,
 * then fall back to any node with the caller's original constraints.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
		int nid, nodemask_t *nodemask)
{
	gfp_t first_pass_gfp;
	struct page *page;

	first_pass_gfp = (gfp | __GFP_NOWARN) &
			 ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	page = __alloc_frozen_pages_noprof(first_pass_gfp, order, nid, nodemask);
	if (page)
		return page;

	return __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
}
2434
/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	nodemask_t *nodemask;
	struct page *page;

	/* may override @nid and/or supply a filtering nodemask, per @pol */
	nodemask = policy_nodemask(gfp, pol, ilx, &nid);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_preferred_many(gfp, order, nid, nodemask);

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode != MPOL_INTERLEAVE &&
		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
		    (!nodemask || node_isset(nid, *nodemask))) {
			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
			page = __alloc_frozen_pages_noprof(
				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
				nid, NULL);
			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
				return page;
			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
		}
	}

	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
		if (static_branch_likely(&vm_numa_stat_key) &&
		    page_to_nid(page) == nid) {
			preempt_disable();
			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
			preempt_enable();
		}
	}

	return page;
}
2505
/*
 * Allocate a folio of @order according to @pol and set its initial
 * refcount. Returns NULL on allocation failure.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page;

	/* folios are compound pages, so force __GFP_COMP */
	page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid);
	if (page) {
		set_page_refcounted(page);
		return page_rmappable_folio(page);
	}
	return NULL;
}
2517
/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation. Must be inside @vma.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away. Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of folio_alloc_mpol() is more appropriate.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct folio *folio;

	/* suppress allocation-failure warnings for droppable mappings */
	if (vma->vm_flags & VM_DROPPABLE)
		gfp |= __GFP_NOWARN;

	pol = get_vma_policy(vma, addr, order, &ilx);
	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
	/* drop the extra reference taken for shared policies */
	mpol_cond_put(pol);
	return folio;
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);
2549
/*
 * Allocate pages without setting the initial refcount (callers such as
 * alloc_pages_noprof() do that), honouring the current task's mempolicy
 * except in interrupt context or for __GFP_THISNODE requests, where the
 * system default policy is used instead.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}
2564
2565 /**
2566 * alloc_pages - Allocate pages.
2567 * @gfp: GFP flags.
2568 * @order: Power of two of number of pages to allocate.
2569 *
2570 * Allocate 1 << @order contiguous pages. The physical address of the
2571 * first page is naturally aligned (eg an order-3 allocation will be aligned
2572 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2573 * process is honoured when in process context.
2574 *
2575 * Context: Can be called from any context, providing the appropriate GFP
2576 * flags are used.
2577 * Return: The page on success or NULL if allocation fails.
2578 */
alloc_pages_noprof(gfp_t gfp,unsigned int order)2579 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2580 {
2581 struct page *page = alloc_frozen_pages_noprof(gfp, order);
2582
2583 if (page)
2584 set_page_refcounted(page);
2585 return page;
2586 }
2587 EXPORT_SYMBOL(alloc_pages_noprof);
2588
/* Folio-returning wrapper around alloc_pages_noprof(); forces __GFP_COMP. */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

	return page_rmappable_folio(page);
}
EXPORT_SYMBOL(folio_alloc_noprof);
2594
/*
 * Bulk allocation under MPOL_INTERLEAVE: split @nr_pages evenly across
 * the policy nodes, handing the remainder out one extra page at a time.
 * Returns the number of pages actually allocated.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	unsigned long pages_per_node, batch, got;
	unsigned long total = 0;
	int nnodes, extra, i;

	nnodes = nodes_weight(pol->nodes);
	pages_per_node = nr_pages / nnodes;
	extra = nr_pages - nnodes * pages_per_node;

	for (i = 0; i < nnodes; i++) {
		/* the first 'extra' nodes receive one additional page */
		batch = pages_per_node;
		if (extra) {
			batch++;
			extra--;
		}
		got = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol),
					      NULL, batch, page_array);
		page_array += got;
		total += got;
	}

	return total;
}
2629
/*
 * Bulk allocation under MPOL_WEIGHTED_INTERLEAVE task policy.
 *
 * Resumes from the task's cursor (current->il_prev / il_weight), spreads
 * @nr_pages over the policy nodes in proportion to their interleave
 * weights, and leaves the cursor pointing at the node/weight a subsequent
 * allocation should continue from. Returns the number of pages allocated.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* no global weight table: every node weighs 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2747
/*
 * Bulk allocation under MPOL_PREFERRED_MANY: first pass restricted to the
 * preferred nodes with reclaim disabled, second pass from any node for
 * whatever remains. Returns the number of pages allocated.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t first_pass_gfp;
	unsigned long got;

	first_pass_gfp = (gfp | __GFP_NOWARN) &
			 ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	got = alloc_pages_bulk_noprof(first_pass_gfp, nid, &pol->nodes,
				      nr_pages, page_array);
	if (got < nr_pages)
		got += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
					       nr_pages - got,
					       page_array + got);
	return got;
}
2767
2768 /* alloc pages bulk and mempolicy should be considered at the
2769 * same time in some situation such as vmalloc.
2770 *
2771 * It can accelerate memory allocation especially interleaving
2772 * allocate memory.
2773 */
alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,unsigned long nr_pages,struct page ** page_array)2774 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2775 unsigned long nr_pages, struct page **page_array)
2776 {
2777 struct mempolicy *pol = &default_policy;
2778 nodemask_t *nodemask;
2779 int nid;
2780
2781 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2782 pol = get_task_policy(current);
2783
2784 if (pol->mode == MPOL_INTERLEAVE)
2785 return alloc_pages_bulk_interleave(gfp, pol,
2786 nr_pages, page_array);
2787
2788 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2789 return alloc_pages_bulk_weighted_interleave(
2790 gfp, pol, nr_pages, page_array);
2791
2792 if (pol->mode == MPOL_PREFERRED_MANY)
2793 return alloc_pages_bulk_preferred_many(gfp,
2794 numa_node_id(), pol, nr_pages, page_array);
2795
2796 nid = numa_node_id();
2797 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2798 return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2799 nr_pages, page_array);
2800 }
2801
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2802 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2803 {
2804 struct mempolicy *pol = mpol_dup(src->vm_policy);
2805
2806 if (IS_ERR(pol))
2807 return PTR_ERR(pol);
2808 dst->vm_policy = pol;
2809 return 0;
2810 }
2811
2812 /*
2813 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2814 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2815 * with the mems_allowed returned by cpuset_mems_allowed(). This
2816 * keeps mempolicies cpuset relative after its cpuset moves. See
2817 * further kernel/cpuset.c update_nodemask().
2818 *
2819 * current's mempolicy may be rebinded by the other task(the task that changes
2820 * cpuset's mems), so we needn't do rebind work for current task.
2821 */
2822
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		/* keep the copy's nodemask valid across the cpuset rebind */
		mpol_rebind_policy(new, &mems);
	}
	/* fresh copy starts with its own single reference */
	atomic_set(&new->refcnt, 1);
	return new;
}
2846
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (a->home_node != b->home_node)
		return false;
	/* also compare the user-supplied mask when the policy retains one */
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		return !!nodes_equal(a->nodes, b->nodes);
	case MPOL_LOCAL:
		/* MPOL_LOCAL carries no nodemask; mode equality suffices */
		return true;
	default:
		BUG();
		return false;
	}
}
2876
2877 /*
2878 * Shared memory backing store policy support.
2879 *
2880 * Remember policies even when nobody has shared memory mapped.
2881 * The policies are kept in Red-Black tree linked from the inode.
2882 * They are protected by the sp->lock rwlock, which should be held
2883 * for any accesses to the tree.
2884 */
2885
/*
 * lookup first element intersecting start-end. Caller holds sp->lock for
 * reading or for writing
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
					pgoff_t start, pgoff_t end)
{
	struct rb_node *n = sp->root.rb_node;

	/* descend to any node whose [start, end) range overlaps the query */
	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/* walk left to the lowest-offset node that still overlaps */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
2919
/*
 * Insert a new shared policy into the list. Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	/* standard rbtree insertion keyed by the node's range */
	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();	/* stored ranges must never overlap */
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
}
2943
2944 /* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy * sp,pgoff_t idx)2945 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2946 pgoff_t idx)
2947 {
2948 struct mempolicy *pol = NULL;
2949 struct sp_node *sn;
2950
2951 if (!sp->root.rb_node)
2952 return NULL;
2953 read_lock(&sp->lock);
2954 sn = sp_lookup(sp, idx, idx+1);
2955 if (sn) {
2956 mpol_get(sn->policy);
2957 pol = sn->policy;
2958 }
2959 read_unlock(&sp->lock);
2960 return pol;
2961 }
2962 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2963
/* Free a shared-policy node: drop its mempolicy ref and release the node. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
2969
/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id. Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	int curnid = folio_nid(folio);
	struct vm_area_struct *vma = vmf->vma;
	int thiscpu = raw_smp_processor_id();
	int thisnid = numa_node_id();
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	/*
	 * Make sure ptl is held so that we don't preempt and we
	 * have a stable smp processor id
	 */
	lockdep_assert_held(vmf->ptl);
	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	/* Only policies flagged migrate-on-fault participate at all. */
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	/* Determine the node the policy says this folio should live on. */
	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_WEIGHTED_INTERLEAVE:
		polnid = weighted_interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		/* folio already on the preferred node: not misplaced */
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
		/*
		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
		 * policy nodemask we don't allow numa migration to nodes
		 * outside policy nodemask for now. This is done so that if we
		 * want demotion to slow memory to happen, before allocating
		 * from some DRAM node say 'x', we will end up using a
		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
		 * we should not promote to node 'x' from slow memory node.
		 */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Optimize placement among multiple nodes
			 * via NUMA balancing
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}

		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(thisnid, GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zonelist_node_idx(z);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
3080
/*
 * Drop the (possibly final) reference to task->mempolicy. It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;	/* clear before the final put, see above */
	task_unlock(task);
	mpol_put(pol);
}
3097
/* Unlink @n from the tree and free it. Caller holds sp->lock for writing. */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
3103
sp_node_init(struct sp_node * node,unsigned long start,unsigned long end,struct mempolicy * pol)3104 static void sp_node_init(struct sp_node *node, unsigned long start,
3105 unsigned long end, struct mempolicy *pol)
3106 {
3107 node->start = start;
3108 node->end = end;
3109 node->policy = pol;
3110 }
3111
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3112 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3113 struct mempolicy *pol)
3114 {
3115 struct sp_node *n;
3116 struct mempolicy *newpol;
3117
3118 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3119 if (!n)
3120 return NULL;
3121
3122 newpol = mpol_dup(pol);
3123 if (IS_ERR(newpol)) {
3124 kmem_cache_free(sn_cache, n);
3125 return NULL;
3126 }
3127 newpol->flags |= MPOL_F_SHARED;
3128 sp_node_init(n, start, end, newpol);
3129
3130 return n;
3131 }
3132
/*
 * Replace a policy range [start, end) with @new (which may be NULL to
 * simply erase). Existing nodes are deleted, trimmed, or split as
 * needed. Splitting a node that spans the whole new range needs an
 * extra node + policy, which must be allocated with the lock dropped;
 * hence the alloc_new/restart dance.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;		/* pre-allocated split node */
	struct mempolicy *mpol_new = NULL;	/* policy for the split node */
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);	/* fully covered: remove */
			else
				n->start = end;		/* overlaps tail: trim front */
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				/* split: keep [n->start,start), add [end,n->end) */
				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;		/* overlaps head: trim tail */
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* free any pre-allocated split material that went unused */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* drop the lock to allocate with GFP_KERNEL, then retry from scratch */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
3199
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 *
 * On any failure the tree is simply left empty, which means the
 * default mempolicy applies.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct sp_node *sn;
		struct mempolicy *npol;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;	/* scratch alloc failed: keep default */

		/* contextualize the tmpfs mount point mempolicy to this file */
		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(npol))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_npol;

		/* alloc node covering entire file; adds ref to file's npol */
		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
		if (sn)
			sp_insert(sp, sn);
put_npol:
		mpol_put(npol);	/* drop initial ref on file's npol */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3249
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3250 int mpol_set_shared_policy(struct shared_policy *sp,
3251 struct vm_area_struct *vma, struct mempolicy *pol)
3252 {
3253 int err;
3254 struct sp_node *new = NULL;
3255 unsigned long sz = vma_pages(vma);
3256
3257 if (pol) {
3258 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3259 if (!new)
3260 return -ENOMEM;
3261 }
3262 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3263 if (err && new)
3264 sp_free(new);
3265 return err;
3266 }
3267 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3268
3269 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3270 void mpol_free_shared_policy(struct shared_policy *sp)
3271 {
3272 struct sp_node *n;
3273 struct rb_node *next;
3274
3275 if (!sp->root.rb_node)
3276 return;
3277 write_lock(&sp->lock);
3278 next = rb_first(&sp->root);
3279 while (next) {
3280 n = rb_entry(next, struct sp_node, nd);
3281 next = rb_next(&n->nd);
3282 sp_delete(sp, n);
3283 }
3284 write_unlock(&sp->lock);
3285 }
3286 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3287
3288 #ifdef CONFIG_NUMA_BALANCING
/* numa_balancing= boot parameter: 1 = enable, -1 = disable, 0 = not set */
static int __initdata numabalancing_override;
3290
check_numabalancing_enable(void)3291 static void __init check_numabalancing_enable(void)
3292 {
3293 bool numabalancing_default = false;
3294
3295 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3296 numabalancing_default = true;
3297
3298 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3299 if (numabalancing_override)
3300 set_numabalancing_state(numabalancing_override == 1);
3301
3302 if (num_online_nodes() > 1 && !numabalancing_override) {
3303 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3304 numabalancing_default ? "Enabling" : "Disabling");
3305 set_numabalancing_state(numabalancing_default);
3306 }
3307 }
3308
setup_numabalancing(char * str)3309 static int __init setup_numabalancing(char *str)
3310 {
3311 int ret = 0;
3312 if (!str)
3313 goto out;
3314
3315 if (!strcmp(str, "enable")) {
3316 numabalancing_override = 1;
3317 ret = 1;
3318 } else if (!strcmp(str, "disable")) {
3319 numabalancing_override = -1;
3320 ret = 1;
3321 }
3322 out:
3323 if (!ret)
3324 pr_warn("Unable to parse numa_balancing=\n");
3325
3326 return ret;
3327 }
3328 __setup("numa_balancing=", setup_numabalancing);
3329 #else
/* NUMA balancing not configured: nothing to decide at boot. */
static inline void __init check_numabalancing_enable(void)
{
}
3333 #endif /* CONFIG_NUMA_BALANCING */
3334
/* Set up the mempolicy slab caches and the boot-time interleave policy. */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	/*
	 * One static MPOL_PREFERRED policy per node, flagged for
	 * migrate-on-fault NUMA balancing (MPOL_F_MOF | MPOL_F_MORON).
	 */
	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.nodes = nodemask_of_node(nid),
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}
3387
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
3393
/*
 * Parse and format mempolicy from/to strings
 */

/* Mode names as accepted by mpol_parse_str() and emitted by mpol_to_str(). */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
3407
3408 #ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * @str is modified in place during parsing and restored before return,
 * so callers can still print it in error messages.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	/* per-mode validation of the (possibly absent) nodelist */
	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on a empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		/* MPOL_PREFERRED keeps only the first listed node */
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" with no nodelist degenerates to local */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
3542 #endif /* CONFIG_TMPFS */
3543
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flag flags, "relative|balancing", and to
 * display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	/*
	 * The static default policy and the per-node preferred policies
	 * are both reported as "default".
	 */
	if (pol &&
	    pol != &default_policy &&
	    !(pol >= &preferred_node_policy[0] &&
	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_LOCAL:
		break;
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		nodes = pol->nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	/* append "=<flag>[|balancing]" when any mode flag is set */
	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Static and relative are mutually exclusive.
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");

		if (flags & MPOL_F_NUMA_BALANCING) {
			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
				p += snprintf(p, buffer + maxlen - p, "|");
			p += snprintf(p, buffer + maxlen - p, "balancing");
		}
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}
3611
3612 #ifdef CONFIG_SYSFS
/* One sysfs attribute ("nodeN") controlling a node's interleave weight. */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;	/* node this attribute reads/writes the weight for */
};

/* Container for the "weighted_interleave" kobject and its node files. */
struct sysfs_wi_group {
	struct kobject wi_kobj;
	struct mutex kobj_lock;		/* serializes updates to nattrs[] */
	struct iw_node_attr *nattrs[];	/* indexed by nid, nr_node_ids slots */
};

static struct sysfs_wi_group *wi_group;
3625
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3626 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3627 char *buf)
3628 {
3629 struct iw_node_attr *node_attr;
3630 u8 weight;
3631
3632 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3633 weight = get_il_weight(node_attr->nid);
3634 return sysfs_emit(buf, "%d\n", weight);
3635 }
3636
/*
 * Store a new weight for one node: build a fresh wi_state copy, publish
 * it via RCU and free the previous state after a grace period.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	struct iw_node_attr *node_attr;
	u8 weight = 0;
	int i;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	/* weights must parse as a non-zero u8 */
	if (count == 0 || sysfs_streq(buf, "") ||
	    kstrtou8(buf, 0, &weight) || weight == 0)
		return -EINVAL;

	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state)
		return -ENOMEM;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
				lockdep_is_held(&wi_state_lock));
	if (old_wi_state) {
		/* start from the current weights */
		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
		       nr_node_ids * sizeof(u8));
	} else {
		/* no previous state: all weights default to 1 */
		for (i = 0; i < nr_node_ids; i++)
			new_wi_state->iw_table[i] = 1;
	}
	new_wi_state->iw_table[node_attr->nid] = weight;
	new_wi_state->mode_auto = false;	/* a manual write disables auto mode */

	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		/* wait for readers of the old state before freeing it */
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3675
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3676 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3677 struct kobj_attribute *attr, char *buf)
3678 {
3679 struct weighted_interleave_state *state;
3680 bool wi_auto = true;
3681
3682 rcu_read_lock();
3683 state = rcu_dereference(wi_state);
3684 if (state)
3685 wi_auto = state->mode_auto;
3686 rcu_read_unlock();
3687
3688 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3689 }
3690
weighted_interleave_auto_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3691 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3692 struct kobj_attribute *attr, const char *buf, size_t count)
3693 {
3694 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3695 unsigned int *bw;
3696 bool input;
3697 int i;
3698
3699 if (kstrtobool(buf, &input))
3700 return -EINVAL;
3701
3702 new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
3703 if (!new_wi_state)
3704 return -ENOMEM;
3705 for (i = 0; i < nr_node_ids; i++)
3706 new_wi_state->iw_table[i] = 1;
3707
3708 mutex_lock(&wi_state_lock);
3709 if (!input) {
3710 old_wi_state = rcu_dereference_protected(wi_state,
3711 lockdep_is_held(&wi_state_lock));
3712 if (!old_wi_state)
3713 goto update_wi_state;
3714 if (input == old_wi_state->mode_auto) {
3715 mutex_unlock(&wi_state_lock);
3716 return count;
3717 }
3718
3719 memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3720 nr_node_ids * sizeof(u8));
3721 goto update_wi_state;
3722 }
3723
3724 bw = node_bw_table;
3725 if (!bw) {
3726 mutex_unlock(&wi_state_lock);
3727 kfree(new_wi_state);
3728 return -ENODEV;
3729 }
3730
3731 new_wi_state->mode_auto = true;
3732 reduce_interleave_weights(bw, new_wi_state->iw_table);
3733
3734 update_wi_state:
3735 rcu_assign_pointer(wi_state, new_wi_state);
3736 mutex_unlock(&wi_state_lock);
3737 if (old_wi_state) {
3738 synchronize_rcu();
3739 kfree(old_wi_state);
3740 }
3741 return count;
3742 }
3743
sysfs_wi_node_delete(int nid)3744 static void sysfs_wi_node_delete(int nid)
3745 {
3746 struct iw_node_attr *attr;
3747
3748 if (nid < 0 || nid >= nr_node_ids)
3749 return;
3750
3751 mutex_lock(&wi_group->kobj_lock);
3752 attr = wi_group->nattrs[nid];
3753 if (!attr) {
3754 mutex_unlock(&wi_group->kobj_lock);
3755 return;
3756 }
3757
3758 wi_group->nattrs[nid] = NULL;
3759 mutex_unlock(&wi_group->kobj_lock);
3760
3761 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3762 kfree(attr->kobj_attr.attr.name);
3763 kfree(attr);
3764 }
3765
/* Remove every per-node weight attribute that is currently registered. */
static void sysfs_wi_node_delete_all(void)
{
	int nid;

	for (nid = 0; nid < nr_node_ids; nid++)
		sysfs_wi_node_delete(nid);
}
3773
/*
 * Detach the global weighted-interleave state and free it once all RCU
 * readers are done with it.
 */
static void wi_state_free(void)
{
	struct weighted_interleave_state *old_wi_state;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
			lockdep_is_held(&wi_state_lock));
	rcu_assign_pointer(wi_state, NULL);
	mutex_unlock(&wi_state_lock);

	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
}
3789
/* The "auto" sysfs file toggling automatic weighted-interleave mode. */
static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
	       weighted_interleave_auto_store);
3793
wi_cleanup(void)3794 static void wi_cleanup(void) {
3795 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3796 sysfs_wi_node_delete_all();
3797 wi_state_free();
3798 }
3799
/* kobject release: frees the whole group once the last ref is dropped. */
static void wi_kobj_release(struct kobject *wi_kobj)
{
	kfree(wi_group);
}
3804
/* kobject type for the "weighted_interleave" directory. */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};
3809
/*
 * Allocate and register the "nodeN" weight attribute for @nid.
 * Returns 0 on success, -EEXIST if the node already has an attribute,
 * or a negative errno on allocation/sysfs failure.
 */
static int sysfs_wi_node_add(int nid)
{
	int ret;
	char *name;
	struct iw_node_attr *new_attr;

	if (nid < 0 || nid >= nr_node_ids) {
		pr_err("invalid node id: %d\n", nid);
		return -EINVAL;
	}

	new_attr = kzalloc_obj(*new_attr);
	if (!new_attr)
		return -ENOMEM;

	name = kasprintf(GFP_KERNEL, "node%d", nid);
	if (!name) {
		kfree(new_attr);
		return -ENOMEM;
	}

	sysfs_attr_init(&new_attr->kobj_attr.attr);
	new_attr->kobj_attr.attr.name = name;
	new_attr->kobj_attr.attr.mode = 0644;
	new_attr->kobj_attr.show = node_show;
	new_attr->kobj_attr.store = node_store;
	new_attr->nid = nid;

	mutex_lock(&wi_group->kobj_lock);
	/* lose the race only after taking the lock */
	if (wi_group->nattrs[nid]) {
		mutex_unlock(&wi_group->kobj_lock);
		ret = -EEXIST;
		goto out;
	}

	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
	if (ret) {
		mutex_unlock(&wi_group->kobj_lock);
		goto out;
	}
	wi_group->nattrs[nid] = new_attr;
	mutex_unlock(&wi_group->kobj_lock);
	return 0;

out:
	/* failure: free both the name string and the attribute */
	kfree(new_attr->kobj_attr.attr.name);
	kfree(new_attr);
	return ret;
}
3859
wi_node_notifier(struct notifier_block * nb,unsigned long action,void * data)3860 static int wi_node_notifier(struct notifier_block *nb,
3861 unsigned long action, void *data)
3862 {
3863 int err;
3864 struct node_notify *nn = data;
3865 int nid = nn->nid;
3866
3867 switch (action) {
3868 case NODE_ADDED_FIRST_MEMORY:
3869 err = sysfs_wi_node_add(nid);
3870 if (err)
3871 pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3872 nid, err);
3873 break;
3874 case NODE_REMOVED_LAST_MEMORY:
3875 sysfs_wi_node_delete(nid);
3876 break;
3877 }
3878
3879 return NOTIFY_OK;
3880 }
3881
/* Create the "weighted_interleave" sysfs group under @mempolicy_kobj. */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	/* one weight file per online node that currently has memory */
	for_each_online_node(nid) {
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	/* keep the files in sync with memory hotplug from now on */
	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

err_cleanup_kobj:
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	/* final put invokes wi_kobj_release(), freeing wi_group */
	kobject_put(&wi_group->wi_kobj);
	return err;
}
3922
mempolicy_sysfs_init(void)3923 static int __init mempolicy_sysfs_init(void)
3924 {
3925 int err;
3926 static struct kobject *mempolicy_kobj;
3927
3928 mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3929 if (!mempolicy_kobj)
3930 return -ENOMEM;
3931
3932 err = add_weighted_interleave_group(mempolicy_kobj);
3933 if (err)
3934 goto err_kobj;
3935
3936 return 0;
3937
3938 err_kobj:
3939 kobject_del(mempolicy_kobj);
3940 kobject_put(mempolicy_kobj);
3941 return err;
3942 }
3943
3944 late_initcall(mempolicy_sysfs_init);
3945 #endif /* CONFIG_SYSFS */
3946