xref: /linux/mm/mempolicy.c (revision b05f8d7e077952d14acb63e3ccdf5f64404b59a4)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA-based allocations this interleaves based on the
18  *                offset into the backing object, or the offset into the
19  *                mapping for anonymous memory. For the process policy a
20  *                per-process counter is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind truly restricted
33  *                the allocation to the specified memory nodes instead.
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the node of the local CPU. This is normally identical to
38  *                default, but useful to set in a VMA when you have a
39  *                non-default process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non-interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem, kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
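
/*
 * For illustration, userspace selects these policies via set_mempolicy(2)
 * and mbind(2), roughly as sketched below (assuming <numaif.h> from libnuma
 * for the prototypes; addr/length refer to an existing mapping and the node
 * numbers are only examples):
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	unsigned long node0 = 1UL << 0;
 *
 *	// task-wide interleaving over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *
 *	// bind one mapping to node 0 only, migrating existing pages
 *	mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8,
 *	      MPOL_MF_MOVE);
 */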
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always graceful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112 
113 #include <asm/tlbflush.h>
114 #include <asm/tlb.h>
115 #include <linux/uaccess.h>
116 
117 #include "internal.h"
118 
119 /* Internal flags */
120 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous VMAs */
121 #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
122 #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
123 
124 static struct kmem_cache *policy_cache;
125 static struct kmem_cache *sn_cache;
126 
127 /* Highest zone. A specific allocation for a zone below that is not
128    policied. */
129 enum zone_type policy_zone = 0;
130 
131 /*
132  * run-time system-wide default policy => local allocation
133  */
134 static struct mempolicy default_policy = {
135 	.refcnt = ATOMIC_INIT(1), /* never free it */
136 	.mode = MPOL_LOCAL,
137 };
138 
139 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
140 
141 /*
142  * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
143  * that the system-default value should be used. A NULL iw_table also means
144  * that system-default values should be used. Until the system-default table
145  * is implemented, the system default is always 1.
146  *
147  * iw_table is RCU protected.
148  */
149 static u8 __rcu *iw_table;
150 static DEFINE_MUTEX(iw_table_lock);
151 
152 static u8 get_il_weight(int node)
153 {
154 	u8 *table;
155 	u8 weight;
156 
157 	rcu_read_lock();
158 	table = rcu_dereference(iw_table);
159 	/* if no iw_table, use system default */
160 	weight = table ? table[node] : 1;
161 	/* if value in iw_table is 0, use system default */
162 	weight = weight ? weight : 1;
163 	rcu_read_unlock();
164 	return weight;
165 }
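
/*
 * Example: if the per-node sysfs weights (under
 * /sys/kernel/mm/mempolicy/weighted_interleave/, where available) are set
 * to 3 for node 0 and 1 for node 1, a MPOL_WEIGHTED_INTERLEAVE task whose
 * nodemask is {0,1} allocates in roughly the pattern 0,0,0,1,0,0,0,1,...
 * get_il_weight() then returns 3 and 1 respectively, and 1 for any node
 * without an explicit weight.
 */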
166 
167 /**
168  * numa_nearest_node - Find nearest node by state
169  * @node: Node id to start the search
170  * @state: State to filter the search
171  *
172  * Lookup the closest node by distance if @node is not in @state.
173  *
174  * Return: @node if it is in @state, otherwise the closest node by distance.
175  */
176 int numa_nearest_node(int node, unsigned int state)
177 {
178 	int min_dist = INT_MAX, dist, n, min_node;
179 
180 	if (state >= NR_NODE_STATES)
181 		return -EINVAL;
182 
183 	if (node == NUMA_NO_NODE || node_state(node, state))
184 		return node;
185 
186 	min_node = node;
187 	for_each_node_state(n, state) {
188 		dist = node_distance(node, n);
189 		if (dist < min_dist) {
190 			min_dist = dist;
191 			min_node = n;
192 		}
193 	}
194 
195 	return min_node;
196 }
197 EXPORT_SYMBOL_GPL(numa_nearest_node);
198 
199 /**
200  * nearest_node_nodemask - Find the node in @mask at the nearest distance
201  *			   from @node.
202  *
203  * @node: a valid node ID to start the search from.
204  * @mask: a pointer to a nodemask representing the allowed nodes.
205  *
206  * This function iterates over all nodes in @mask and calculates the
207  * distance from the starting @node, then it returns the node ID that is
208  * the closest to @node, or MAX_NUMNODES if no node is found.
209  *
210  * Note that @node must be a valid node ID usable with node_distance(),
211  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
212  * or unexpected behavior.
213  */
214 int nearest_node_nodemask(int node, nodemask_t *mask)
215 {
216 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
217 
218 	for_each_node_mask(n, *mask) {
219 		dist = node_distance(node, n);
220 		if (dist < min_dist) {
221 			min_dist = dist;
222 			min_node = n;
223 		}
224 	}
225 
226 	return min_node;
227 }
228 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
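
/*
 * Sketch of a possible in-kernel use: pick the member of a candidate
 * nodemask closest to the local node (nodes 2 and 5 are illustrative):
 *
 *	nodemask_t candidates = NODE_MASK_NONE;
 *	int target;
 *
 *	node_set(2, candidates);
 *	node_set(5, candidates);
 *	target = nearest_node_nodemask(numa_node_id(), &candidates);
 *	if (target == MAX_NUMNODES)	// mask was empty
 *		target = numa_node_id();
 */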
229 
230 struct mempolicy *get_task_policy(struct task_struct *p)
231 {
232 	struct mempolicy *pol = p->mempolicy;
233 	int node;
234 
235 	if (pol)
236 		return pol;
237 
238 	node = numa_node_id();
239 	if (node != NUMA_NO_NODE) {
240 		pol = &preferred_node_policy[node];
241 		/* preferred_node_policy is not initialised early in boot */
242 		if (pol->mode)
243 			return pol;
244 	}
245 
246 	return &default_policy;
247 }
248 
249 static const struct mempolicy_operations {
250 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
251 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
252 } mpol_ops[MPOL_MAX];
253 
254 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
255 {
256 	return pol->flags & MPOL_MODE_FLAGS;
257 }
258 
259 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
260 				   const nodemask_t *rel)
261 {
262 	nodemask_t tmp;
263 	nodes_fold(tmp, *orig, nodes_weight(*rel));
264 	nodes_onto(*ret, tmp, *rel);
265 }
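
/*
 * Example: with MPOL_F_RELATIVE_NODES, a user nodemask of {0,2} applied
 * while the allowed nodemask (@rel) is {4,5,6} is first folded modulo the
 * three allowed nodes, still {0,2}, and then mapped onto the allowed set,
 * yielding {4,6}: relative bit N selects the Nth allowed node.
 */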
266 
267 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
268 {
269 	if (nodes_empty(*nodes))
270 		return -EINVAL;
271 	pol->nodes = *nodes;
272 	return 0;
273 }
274 
275 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
276 {
277 	if (nodes_empty(*nodes))
278 		return -EINVAL;
279 
280 	nodes_clear(pol->nodes);
281 	node_set(first_node(*nodes), pol->nodes);
282 	return 0;
283 }
284 
285 /*
286  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
287  * any, for the new policy.  mpol_new() has already validated the nodes
288  * parameter with respect to the policy mode and flags.
289  *
290  * Must be called holding task's alloc_lock to protect task's mems_allowed
291  * and mempolicy.  May also be called holding the mmap_lock for write.
292  */
293 static int mpol_set_nodemask(struct mempolicy *pol,
294 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
295 {
296 	int ret;
297 
298 	/*
299 	 * Default (pol==NULL) and local memory policies are not subject
300 	 * to any remapping. They also do not need any special
301 	 * constructor.
302 	 */
303 	if (!pol || pol->mode == MPOL_LOCAL)
304 		return 0;
305 
306 	/* Check N_MEMORY */
307 	nodes_and(nsc->mask1,
308 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
309 
310 	VM_BUG_ON(!nodes);
311 
312 	if (pol->flags & MPOL_F_RELATIVE_NODES)
313 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
314 	else
315 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
316 
317 	if (mpol_store_user_nodemask(pol))
318 		pol->w.user_nodemask = *nodes;
319 	else
320 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
321 
322 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
323 	return ret;
324 }
325 
326 /*
327  * This function just creates a new policy, does some checks and simple
328  * initialization. You must invoke mpol_set_nodemask() to set nodes.
329  */
330 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
331 				  nodemask_t *nodes)
332 {
333 	struct mempolicy *policy;
334 
335 	if (mode == MPOL_DEFAULT) {
336 		if (nodes && !nodes_empty(*nodes))
337 			return ERR_PTR(-EINVAL);
338 		return NULL;
339 	}
340 	VM_BUG_ON(!nodes);
341 
342 	/*
343 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
344 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
345 	 * All other modes require a valid pointer to a non-empty nodemask.
346 	 */
347 	if (mode == MPOL_PREFERRED) {
348 		if (nodes_empty(*nodes)) {
349 			if (((flags & MPOL_F_STATIC_NODES) ||
350 			     (flags & MPOL_F_RELATIVE_NODES)))
351 				return ERR_PTR(-EINVAL);
352 
353 			mode = MPOL_LOCAL;
354 		}
355 	} else if (mode == MPOL_LOCAL) {
356 		if (!nodes_empty(*nodes) ||
357 		    (flags & MPOL_F_STATIC_NODES) ||
358 		    (flags & MPOL_F_RELATIVE_NODES))
359 			return ERR_PTR(-EINVAL);
360 	} else if (nodes_empty(*nodes))
361 		return ERR_PTR(-EINVAL);
362 
363 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
364 	if (!policy)
365 		return ERR_PTR(-ENOMEM);
366 	atomic_set(&policy->refcnt, 1);
367 	policy->mode = mode;
368 	policy->flags = flags;
369 	policy->home_node = NUMA_NO_NODE;
370 
371 	return policy;
372 }
373 
374 /* Slow path of a mpol destructor. */
375 void __mpol_put(struct mempolicy *pol)
376 {
377 	if (!atomic_dec_and_test(&pol->refcnt))
378 		return;
379 	kmem_cache_free(policy_cache, pol);
380 }
381 
382 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
383 {
384 }
385 
386 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
387 {
388 	nodemask_t tmp;
389 
390 	if (pol->flags & MPOL_F_STATIC_NODES)
391 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
392 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
393 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
394 	else {
395 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
396 								*nodes);
397 		pol->w.cpuset_mems_allowed = *nodes;
398 	}
399 
400 	if (nodes_empty(tmp))
401 		tmp = *nodes;
402 
403 	pol->nodes = tmp;
404 }
405 
406 static void mpol_rebind_preferred(struct mempolicy *pol,
407 						const nodemask_t *nodes)
408 {
409 	pol->w.cpuset_mems_allowed = *nodes;
410 }
411 
412 /*
413  * mpol_rebind_policy - Migrate a policy to a different set of nodes
414  *
415  * Per-vma policies are protected by mmap_lock. Allocations using per-task
416  * policies are protected by task->mems_allowed_seq to prevent a premature
417  * OOM/allocation failure due to parallel nodemask modification.
418  */
419 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
420 {
421 	if (!pol || pol->mode == MPOL_LOCAL)
422 		return;
423 	if (!mpol_store_user_nodemask(pol) &&
424 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
425 		return;
426 
427 	mpol_ops[pol->mode].rebind(pol, newmask);
428 }
429 
430 /*
431  * Wrapper for mpol_rebind_policy() that just requires a task
432  * pointer, and updates the task's mempolicy.
433  *
434  * Called with task's alloc_lock held.
435  */
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
437 {
438 	mpol_rebind_policy(tsk->mempolicy, new);
439 }
440 
441 /*
442  * Rebind each vma in mm to new nodemask.
443  *
444  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
445  */
446 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
447 {
448 	struct vm_area_struct *vma;
449 	VMA_ITERATOR(vmi, mm, 0);
450 
451 	mmap_write_lock(mm);
452 	for_each_vma(vmi, vma) {
453 		vma_start_write(vma);
454 		mpol_rebind_policy(vma->vm_policy, new);
455 	}
456 	mmap_write_unlock(mm);
457 }
458 
459 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
460 	[MPOL_DEFAULT] = {
461 		.rebind = mpol_rebind_default,
462 	},
463 	[MPOL_INTERLEAVE] = {
464 		.create = mpol_new_nodemask,
465 		.rebind = mpol_rebind_nodemask,
466 	},
467 	[MPOL_PREFERRED] = {
468 		.create = mpol_new_preferred,
469 		.rebind = mpol_rebind_preferred,
470 	},
471 	[MPOL_BIND] = {
472 		.create = mpol_new_nodemask,
473 		.rebind = mpol_rebind_nodemask,
474 	},
475 	[MPOL_LOCAL] = {
476 		.rebind = mpol_rebind_default,
477 	},
478 	[MPOL_PREFERRED_MANY] = {
479 		.create = mpol_new_nodemask,
480 		.rebind = mpol_rebind_preferred,
481 	},
482 	[MPOL_WEIGHTED_INTERLEAVE] = {
483 		.create = mpol_new_nodemask,
484 		.rebind = mpol_rebind_nodemask,
485 	},
486 };
487 
488 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
489 				unsigned long flags);
490 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
491 				pgoff_t ilx, int *nid);
492 
493 static bool strictly_unmovable(unsigned long flags)
494 {
495 	/*
496 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
497 	 * if any misplaced page is found.
498 	 */
499 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
500 			 MPOL_MF_STRICT;
501 }
502 
503 struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
504 	struct mempolicy *pol;
505 	pgoff_t ilx;
506 };
507 
508 struct queue_pages {
509 	struct list_head *pagelist;
510 	unsigned long flags;
511 	nodemask_t *nmask;
512 	unsigned long start;
513 	unsigned long end;
514 	struct vm_area_struct *first;
515 	struct folio *large;		/* note last large folio encountered */
516 	long nr_failed;			/* could not be isolated at this time */
517 };
518 
519 /*
520  * Check if the folio's nid is in qp->nmask.
521  *
522  * If MPOL_MF_INVERT is set in qp->flags, check instead whether the nid
523  * is NOT in qp->nmask.
524  */
525 static inline bool queue_folio_required(struct folio *folio,
526 					struct queue_pages *qp)
527 {
528 	int nid = folio_nid(folio);
529 	unsigned long flags = qp->flags;
530 
531 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
532 }
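
/*
 * Example: do_mbind() sets MPOL_MF_INVERT along with the requested
 * nodemask, so with nmask = {0,1} a folio resident on node 2 is "required"
 * (misplaced, hence a migration candidate), while folios already on node 0
 * or 1 are skipped.
 */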
533 
534 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
535 {
536 	struct folio *folio;
537 	struct queue_pages *qp = walk->private;
538 
539 	if (unlikely(is_pmd_migration_entry(*pmd))) {
540 		qp->nr_failed++;
541 		return;
542 	}
543 	folio = pmd_folio(*pmd);
544 	if (is_huge_zero_folio(folio)) {
545 		walk->action = ACTION_CONTINUE;
546 		return;
547 	}
548 	if (!queue_folio_required(folio, qp))
549 		return;
550 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
551 	    !vma_migratable(walk->vma) ||
552 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
553 		qp->nr_failed++;
554 }
555 
556 /*
557  * Scan through folios, checking if they satisfy the required conditions,
558  * moving those that do from the LRU to a local pagelist for migration.
559  *
560  * queue_folios_pte_range() has two possible return values:
561  * 0 - continue walking to scan for more, even if an existing folio on the
562  *     wrong node could not be isolated and queued for migration.
563  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
564  *        and an existing folio was on a node that does not follow the policy.
565  */
566 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
567 			unsigned long end, struct mm_walk *walk)
568 {
569 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
570 	struct vm_area_struct *vma = walk->vma;
571 	struct folio *folio;
572 	struct queue_pages *qp = walk->private;
573 	unsigned long flags = qp->flags;
574 	pte_t *pte, *mapped_pte;
575 	pte_t ptent;
576 	spinlock_t *ptl;
577 	int max_nr, nr;
578 
579 	ptl = pmd_trans_huge_lock(pmd, vma);
580 	if (ptl) {
581 		queue_folios_pmd(pmd, walk);
582 		spin_unlock(ptl);
583 		goto out;
584 	}
585 
586 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
587 	if (!pte) {
588 		walk->action = ACTION_AGAIN;
589 		return 0;
590 	}
591 	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
592 		max_nr = (end - addr) >> PAGE_SHIFT;
593 		nr = 1;
594 		ptent = ptep_get(pte);
595 		if (pte_none(ptent))
596 			continue;
597 		if (!pte_present(ptent)) {
598 			if (is_migration_entry(pte_to_swp_entry(ptent)))
599 				qp->nr_failed++;
600 			continue;
601 		}
602 		folio = vm_normal_folio(vma, addr, ptent);
603 		if (!folio || folio_is_zone_device(folio))
604 			continue;
605 		if (folio_test_large(folio) && max_nr != 1)
606 			nr = folio_pte_batch(folio, addr, pte, ptent,
607 					     max_nr, fpb_flags,
608 					     NULL, NULL, NULL);
609 		/*
610 		 * vm_normal_folio() filters out zero pages, but there might
611 		 * still be reserved folios to skip, perhaps in a VDSO.
612 		 */
613 		if (folio_test_reserved(folio))
614 			continue;
615 		if (!queue_folio_required(folio, qp))
616 			continue;
617 		if (folio_test_large(folio)) {
618 			/*
619 			 * A large folio can only be isolated from LRU once,
620 			 * but may be mapped by many PTEs (and Copy-On-Write may
621 			 * intersperse PTEs of other, order 0, folios).  This is
622 			 * a common case, so don't mistake it for failure (but
623 			 * there can be other cases of multi-mapped pages which
624 			 * this quick check does not help to filter out - and a
625 			 * search of the pagelist might grow to be prohibitive).
626 			 *
627 			 * migrate_pages(&pagelist) returns nr_failed folios, so
628 			 * check "large" now so that queue_pages_range() returns
629 			 * a comparable nr_failed folios.  This does imply that
630 			 * if folio could not be isolated for some racy reason
631 			 * at its first PTE, later PTEs will not give it another
632 			 * chance of isolation; but keeps the accounting simple.
633 			 */
634 			if (folio == qp->large)
635 				continue;
636 			qp->large = folio;
637 		}
638 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
639 		    !vma_migratable(vma) ||
640 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
641 			qp->nr_failed += nr;
642 			if (strictly_unmovable(flags))
643 				break;
644 		}
645 	}
646 	pte_unmap_unlock(mapped_pte, ptl);
647 	cond_resched();
648 out:
649 	if (qp->nr_failed && strictly_unmovable(flags))
650 		return -EIO;
651 	return 0;
652 }
653 
654 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
655 			       unsigned long addr, unsigned long end,
656 			       struct mm_walk *walk)
657 {
658 #ifdef CONFIG_HUGETLB_PAGE
659 	struct queue_pages *qp = walk->private;
660 	unsigned long flags = qp->flags;
661 	struct folio *folio;
662 	spinlock_t *ptl;
663 	pte_t entry;
664 
665 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
666 	entry = huge_ptep_get(walk->mm, addr, pte);
667 	if (!pte_present(entry)) {
668 		if (unlikely(is_hugetlb_entry_migration(entry)))
669 			qp->nr_failed++;
670 		goto unlock;
671 	}
672 	folio = pfn_folio(pte_pfn(entry));
673 	if (!queue_folio_required(folio, qp))
674 		goto unlock;
675 	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
676 	    !vma_migratable(walk->vma)) {
677 		qp->nr_failed++;
678 		goto unlock;
679 	}
680 	/*
681 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
682 	 * Choosing not to migrate a shared folio is not counted as a failure.
683 	 *
684 	 * See folio_maybe_mapped_shared() on possible imprecision when we
685 	 * cannot easily detect if a folio is shared.
686 	 */
687 	if ((flags & MPOL_MF_MOVE_ALL) ||
688 	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
689 		if (!folio_isolate_hugetlb(folio, qp->pagelist))
690 			qp->nr_failed++;
691 unlock:
692 	spin_unlock(ptl);
693 	if (qp->nr_failed && strictly_unmovable(flags))
694 		return -EIO;
695 #endif
696 	return 0;
697 }
698 
699 #ifdef CONFIG_NUMA_BALANCING
700 /*
701  * This is used to mark a range of virtual addresses to be inaccessible.
702  * These are later cleared by a NUMA hinting fault. Depending on these
703  * faults, pages may be migrated for better NUMA placement.
704  *
705  * This assumes that NUMA faults are handled using PROT_NONE. If
706  * an architecture makes a different choice, it will need further
707  * changes to the core.
708  */
709 unsigned long change_prot_numa(struct vm_area_struct *vma,
710 			unsigned long addr, unsigned long end)
711 {
712 	struct mmu_gather tlb;
713 	long nr_updated;
714 
715 	tlb_gather_mmu(&tlb, vma->vm_mm);
716 
717 	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
718 	if (nr_updated > 0) {
719 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
720 		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
721 	}
722 
723 	tlb_finish_mmu(&tlb);
724 
725 	return nr_updated;
726 }
727 #endif /* CONFIG_NUMA_BALANCING */
728 
729 static int queue_pages_test_walk(unsigned long start, unsigned long end,
730 				struct mm_walk *walk)
731 {
732 	struct vm_area_struct *next, *vma = walk->vma;
733 	struct queue_pages *qp = walk->private;
734 	unsigned long flags = qp->flags;
735 
736 	/* range check first */
737 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
738 
739 	if (!qp->first) {
740 		qp->first = vma;
741 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
742 			(qp->start < vma->vm_start))
743 			/* hole at head side of range */
744 			return -EFAULT;
745 	}
746 	next = find_vma(vma->vm_mm, vma->vm_end);
747 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
748 		((vma->vm_end < qp->end) &&
749 		(!next || vma->vm_end < next->vm_start)))
750 		/* hole at middle or tail of range */
751 		return -EFAULT;
752 
753 	/*
754 	 * We need to check MPOL_MF_STRICT so that -EIO can be returned
755 	 * if necessary, regardless of vma_migratable().
756 	 */
757 	if (!vma_migratable(vma) &&
758 	    !(flags & MPOL_MF_STRICT))
759 		return 1;
760 
761 	/*
762 	 * Check page nodes, and queue pages to move, in the current vma.
763 	 * But if neither moving nor strict checking is requested, skip the scan.
764 	 */
765 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
766 		return 0;
767 	return 1;
768 }
769 
770 static const struct mm_walk_ops queue_pages_walk_ops = {
771 	.hugetlb_entry		= queue_folios_hugetlb,
772 	.pmd_entry		= queue_folios_pte_range,
773 	.test_walk		= queue_pages_test_walk,
774 	.walk_lock		= PGWALK_RDLOCK,
775 };
776 
777 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
778 	.hugetlb_entry		= queue_folios_hugetlb,
779 	.pmd_entry		= queue_folios_pte_range,
780 	.test_walk		= queue_pages_test_walk,
781 	.walk_lock		= PGWALK_WRLOCK,
782 };
783 
784 /*
785  * Walk through page tables and collect pages to be migrated.
786  *
787  * If pages found in a given range are not on the required set of @nodes,
788  * and migration is allowed, they are isolated and queued to @pagelist.
789  *
790  * queue_pages_range() may return:
791  * 0 - all pages already on the right node, or successfully queued for moving
792  *     (or neither strict checking nor moving requested: only range checking).
793  * >0 - this number of misplaced folios could not be queued for moving
794  *      (a hugetlbfs page or a transparent huge page being counted as 1).
795  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
796  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
797  */
798 static long
799 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
800 		nodemask_t *nodes, unsigned long flags,
801 		struct list_head *pagelist)
802 {
803 	int err;
804 	struct queue_pages qp = {
805 		.pagelist = pagelist,
806 		.flags = flags,
807 		.nmask = nodes,
808 		.start = start,
809 		.end = end,
810 		.first = NULL,
811 	};
812 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
813 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
814 
815 	err = walk_page_range(mm, start, end, ops, &qp);
816 
817 	if (!qp.first)
818 		/* whole range in hole */
819 		err = -EFAULT;
820 
821 	return err ? : qp.nr_failed;
822 }
823 
824 /*
825  * Apply policy to a single VMA
826  * This must be called with the mmap_lock held for writing.
827  */
828 static int vma_replace_policy(struct vm_area_struct *vma,
829 				struct mempolicy *pol)
830 {
831 	int err;
832 	struct mempolicy *old;
833 	struct mempolicy *new;
834 
835 	vma_assert_write_locked(vma);
836 
837 	new = mpol_dup(pol);
838 	if (IS_ERR(new))
839 		return PTR_ERR(new);
840 
841 	if (vma->vm_ops && vma->vm_ops->set_policy) {
842 		err = vma->vm_ops->set_policy(vma, new);
843 		if (err)
844 			goto err_out;
845 	}
846 
847 	old = vma->vm_policy;
848 	vma->vm_policy = new; /* protected by mmap_lock */
849 	mpol_put(old);
850 
851 	return 0;
852  err_out:
853 	mpol_put(new);
854 	return err;
855 }
856 
857 /* Split or merge the VMA (if required) and apply the new policy */
858 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
859 		struct vm_area_struct **prev, unsigned long start,
860 		unsigned long end, struct mempolicy *new_pol)
861 {
862 	unsigned long vmstart, vmend;
863 
864 	vmend = min(end, vma->vm_end);
865 	if (start > vma->vm_start) {
866 		*prev = vma;
867 		vmstart = start;
868 	} else {
869 		vmstart = vma->vm_start;
870 	}
871 
872 	if (mpol_equal(vma->vm_policy, new_pol)) {
873 		*prev = vma;
874 		return 0;
875 	}
876 
877 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
878 	if (IS_ERR(vma))
879 		return PTR_ERR(vma);
880 
881 	*prev = vma;
882 	return vma_replace_policy(vma, new_pol);
883 }
884 
885 /* Set the process memory policy */
886 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
887 			     nodemask_t *nodes)
888 {
889 	struct mempolicy *new, *old;
890 	NODEMASK_SCRATCH(scratch);
891 	int ret;
892 
893 	if (!scratch)
894 		return -ENOMEM;
895 
896 	new = mpol_new(mode, flags, nodes);
897 	if (IS_ERR(new)) {
898 		ret = PTR_ERR(new);
899 		goto out;
900 	}
901 
902 	task_lock(current);
903 	ret = mpol_set_nodemask(new, nodes, scratch);
904 	if (ret) {
905 		task_unlock(current);
906 		mpol_put(new);
907 		goto out;
908 	}
909 
910 	old = current->mempolicy;
911 	current->mempolicy = new;
912 	if (new && (new->mode == MPOL_INTERLEAVE ||
913 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
914 		current->il_prev = MAX_NUMNODES-1;
915 		current->il_weight = 0;
916 	}
917 	task_unlock(current);
918 	mpol_put(old);
919 	ret = 0;
920 out:
921 	NODEMASK_SCRATCH_FREE(scratch);
922 	return ret;
923 }
924 
925 /*
926  * Return the nodemask of a policy for a get_mempolicy() query
927  *
928  * Called with task's alloc_lock held
929  */
930 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
931 {
932 	nodes_clear(*nodes);
933 	if (pol == &default_policy)
934 		return;
935 
936 	switch (pol->mode) {
937 	case MPOL_BIND:
938 	case MPOL_INTERLEAVE:
939 	case MPOL_PREFERRED:
940 	case MPOL_PREFERRED_MANY:
941 	case MPOL_WEIGHTED_INTERLEAVE:
942 		*nodes = pol->nodes;
943 		break;
944 	case MPOL_LOCAL:
945 		/* return empty node mask for local allocation */
946 		break;
947 	default:
948 		BUG();
949 	}
950 }
951 
952 static int lookup_node(struct mm_struct *mm, unsigned long addr)
953 {
954 	struct page *p = NULL;
955 	int ret;
956 
957 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
958 	if (ret > 0) {
959 		ret = page_to_nid(p);
960 		put_page(p);
961 	}
962 	return ret;
963 }
964 
965 /* Retrieve NUMA policy */
966 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
967 			     unsigned long addr, unsigned long flags)
968 {
969 	int err;
970 	struct mm_struct *mm = current->mm;
971 	struct vm_area_struct *vma = NULL;
972 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
973 
974 	if (flags &
975 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
976 		return -EINVAL;
977 
978 	if (flags & MPOL_F_MEMS_ALLOWED) {
979 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
980 			return -EINVAL;
981 		*policy = 0;	/* just so it's initialized */
982 		task_lock(current);
983 		*nmask  = cpuset_current_mems_allowed;
984 		task_unlock(current);
985 		return 0;
986 	}
987 
988 	if (flags & MPOL_F_ADDR) {
989 		pgoff_t ilx;		/* ignored here */
990 		/*
991 		 * Do NOT fall back to task policy if the
992 		 * vma/shared policy at addr is NULL.  We
993 		 * want to return MPOL_DEFAULT in this case.
994 		 */
995 		mmap_read_lock(mm);
996 		vma = vma_lookup(mm, addr);
997 		if (!vma) {
998 			mmap_read_unlock(mm);
999 			return -EFAULT;
1000 		}
1001 		pol = __get_vma_policy(vma, addr, &ilx);
1002 	} else if (addr)
1003 		return -EINVAL;
1004 
1005 	if (!pol)
1006 		pol = &default_policy;	/* indicates default behavior */
1007 
1008 	if (flags & MPOL_F_NODE) {
1009 		if (flags & MPOL_F_ADDR) {
1010 			/*
1011 			 * Take a refcount on the mpol, because we are about to
1012 			 * drop the mmap_lock, after which only "pol" remains
1013 			 * valid, "vma" is stale.
1014 			 */
1015 			pol_refcount = pol;
1016 			vma = NULL;
1017 			mpol_get(pol);
1018 			mmap_read_unlock(mm);
1019 			err = lookup_node(mm, addr);
1020 			if (err < 0)
1021 				goto out;
1022 			*policy = err;
1023 		} else if (pol == current->mempolicy &&
1024 				pol->mode == MPOL_INTERLEAVE) {
1025 			*policy = next_node_in(current->il_prev, pol->nodes);
1026 		} else if (pol == current->mempolicy &&
1027 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1028 			if (current->il_weight)
1029 				*policy = current->il_prev;
1030 			else
1031 				*policy = next_node_in(current->il_prev,
1032 						       pol->nodes);
1033 		} else {
1034 			err = -EINVAL;
1035 			goto out;
1036 		}
1037 	} else {
1038 		*policy = pol == &default_policy ? MPOL_DEFAULT :
1039 						pol->mode;
1040 		/*
1041 		 * Internal mempolicy flags must be masked off before exposing
1042 		 * the policy to userspace.
1043 		 */
1044 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1045 	}
1046 
1047 	err = 0;
1048 	if (nmask) {
1049 		if (mpol_store_user_nodemask(pol)) {
1050 			*nmask = pol->w.user_nodemask;
1051 		} else {
1052 			task_lock(current);
1053 			get_policy_nodemask(pol, nmask);
1054 			task_unlock(current);
1055 		}
1056 	}
1057 
1058  out:
1059 	mpol_cond_put(pol);
1060 	if (vma)
1061 		mmap_read_unlock(mm);
1062 	if (pol_refcount)
1063 		mpol_put(pol_refcount);
1064 	return err;
1065 }
1066 
1067 #ifdef CONFIG_MIGRATION
1068 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1069 				unsigned long flags)
1070 {
1071 	/*
1072 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1073 	 * Choosing not to migrate a shared folio is not counted as a failure.
1074 	 *
1075 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1076 	 * cannot easily detect if a folio is shared.
1077 	 */
1078 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1079 		if (folio_isolate_lru(folio)) {
1080 			list_add_tail(&folio->lru, foliolist);
1081 			node_stat_mod_folio(folio,
1082 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1083 				folio_nr_pages(folio));
1084 		} else {
1085 			/*
1086 			 * Non-movable folio may reach here.  And, there may be
1087 			 * temporary off LRU folios or non-LRU movable folios.
1088 			 * Treat them as unmovable folios since they can't be
1089 			 * isolated, so they can't be moved at the moment.
1090 			 */
1091 			return false;
1092 		}
1093 	}
1094 	return true;
1095 }
1096 
1097 /*
1098  * Migrate pages from one node to a target node.
1099  * Returns error or the number of pages not migrated.
1100  */
1101 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1102 			    int flags)
1103 {
1104 	nodemask_t nmask;
1105 	struct vm_area_struct *vma;
1106 	LIST_HEAD(pagelist);
1107 	long nr_failed;
1108 	long err = 0;
1109 	struct migration_target_control mtc = {
1110 		.nid = dest,
1111 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1112 		.reason = MR_SYSCALL,
1113 	};
1114 
1115 	nodes_clear(nmask);
1116 	node_set(source, nmask);
1117 
1118 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1119 
1120 	mmap_read_lock(mm);
1121 	vma = find_vma(mm, 0);
1122 	if (unlikely(!vma)) {
1123 		mmap_read_unlock(mm);
1124 		return 0;
1125 	}
1126 
1127 	/*
1128 	 * This does not migrate the range, but isolates all pages that
1129 	 * need migration.  Between passing in the full user address
1130 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1131 	 * but passes back the count of pages which could not be isolated.
1132 	 */
1133 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1134 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1135 	mmap_read_unlock(mm);
1136 
1137 	if (!list_empty(&pagelist)) {
1138 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1139 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1140 		if (err)
1141 			putback_movable_pages(&pagelist);
1142 	}
1143 
1144 	if (err >= 0)
1145 		err += nr_failed;
1146 	return err;
1147 }
1148 
1149 /*
1150  * Move pages between the two nodesets so as to preserve the physical
1151  * layout as much as possible.
1152  *
1153  * Returns the number of pages that could not be moved.
1154  */
1155 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1156 		     const nodemask_t *to, int flags)
1157 {
1158 	long nr_failed = 0;
1159 	long err = 0;
1160 	nodemask_t tmp;
1161 
1162 	lru_cache_disable();
1163 
1164 	/*
1165 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1166 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1167 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1168 	 * The pair of nodemasks 'to' and 'from' define the map.
1169 	 *
1170 	 * If no pair of bits is found that way, fall back to picking some
1171 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1172 	 * 'source' and 'dest' bits are the same, this represents a node
1173 	 * that will be migrating to itself, so no pages need move.
1174 	 *
1175 	 * If no bits are left in 'tmp', or if all remaining bits left
1176 	 * in 'tmp' correspond to the same bit in 'to', return false
1177 	 * (nothing left to migrate).
1178 	 *
1179 	 * This lets us pick a pair of nodes to migrate between, such that
1180 	 * if possible the dest node is not already occupied by some other
1181 	 * source node, minimizing the risk of overloading the memory on a
1182 	 * node that would happen if we migrated incoming memory to a node
1183 	 * before migrating the outgoing memory from that same node.
1184 	 *
1185 	 * A single scan of tmp is sufficient.  As we go, we remember the
1186 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1187 	 * that not only moved, but what's better, moved to an empty slot
1188 	 * (d is not set in tmp), then we break out then, with that pair.
1189 	 * Otherwise when we finish scanning tmp, we at least have the
1190 	 * most recent <s, d> pair that moved.  If we get all the way through
1191 	 * the scan of tmp without finding any node that moved, much less
1192 	 * moved to an empty node, then there is nothing left worth migrating.
1193 	 */
1194 
1195 	tmp = *from;
1196 	while (!nodes_empty(tmp)) {
1197 		int s, d;
1198 		int source = NUMA_NO_NODE;
1199 		int dest = 0;
1200 
1201 		for_each_node_mask(s, tmp) {
1202 
1203 			/*
1204 			 * do_migrate_pages() tries to maintain the relative
1205 			 * node relationship of the pages established between
1206 			 * threads and memory areas.
1207 			 *
1208 			 * However, if the number of source nodes is not equal to
1209 			 * the number of destination nodes we cannot preserve
1210 			 * this relative node relationship.  In that case, skip
1211 			 * copying memory from a node that is in the destination
1212 			 * mask.
1213 			 *
1214 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1215 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1216 			 */
1217 
1218 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1219 						(node_isset(s, *to)))
1220 				continue;
1221 
1222 			d = node_remap(s, *from, *to);
1223 			if (s == d)
1224 				continue;
1225 
1226 			source = s;	/* Node moved. Memorize */
1227 			dest = d;
1228 
1229 			/* dest not in remaining from nodes? */
1230 			if (!node_isset(dest, tmp))
1231 				break;
1232 		}
1233 		if (source == NUMA_NO_NODE)
1234 			break;
1235 
1236 		node_clear(source, tmp);
1237 		err = migrate_to_node(mm, source, dest, flags);
1238 		if (err > 0)
1239 			nr_failed += err;
1240 		if (err < 0)
1241 			break;
1242 	}
1243 
1244 	lru_cache_enable();
1245 	if (err < 0)
1246 		return err;
1247 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1248 }
1249 
1250 /*
1251  * Allocate a new folio for page migration, according to NUMA mempolicy.
1252  */
1253 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1254 						    unsigned long private)
1255 {
1256 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1257 	struct mempolicy *pol = mmpol->pol;
1258 	pgoff_t ilx = mmpol->ilx;
1259 	unsigned int order;
1260 	int nid = numa_node_id();
1261 	gfp_t gfp;
1262 
1263 	order = folio_order(src);
1264 	ilx += src->index >> order;
1265 
1266 	if (folio_test_hugetlb(src)) {
1267 		nodemask_t *nodemask;
1268 		struct hstate *h;
1269 
1270 		h = folio_hstate(src);
1271 		gfp = htlb_alloc_mask(h);
1272 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1273 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1274 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1275 	}
1276 
1277 	if (folio_test_large(src))
1278 		gfp = GFP_TRANSHUGE;
1279 	else
1280 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1281 
1282 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1283 }
1284 #else
1285 
1286 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1287 				unsigned long flags)
1288 {
1289 	return false;
1290 }
1291 
1292 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1293 		     const nodemask_t *to, int flags)
1294 {
1295 	return -ENOSYS;
1296 }
1297 
1298 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1299 						    unsigned long private)
1300 {
1301 	return NULL;
1302 }
1303 #endif
1304 
1305 static long do_mbind(unsigned long start, unsigned long len,
1306 		     unsigned short mode, unsigned short mode_flags,
1307 		     nodemask_t *nmask, unsigned long flags)
1308 {
1309 	struct mm_struct *mm = current->mm;
1310 	struct vm_area_struct *vma, *prev;
1311 	struct vma_iterator vmi;
1312 	struct migration_mpol mmpol;
1313 	struct mempolicy *new;
1314 	unsigned long end;
1315 	long err;
1316 	long nr_failed;
1317 	LIST_HEAD(pagelist);
1318 
1319 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1320 		return -EINVAL;
1321 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1322 		return -EPERM;
1323 
1324 	if (start & ~PAGE_MASK)
1325 		return -EINVAL;
1326 
1327 	if (mode == MPOL_DEFAULT)
1328 		flags &= ~MPOL_MF_STRICT;
1329 
1330 	len = PAGE_ALIGN(len);
1331 	end = start + len;
1332 
1333 	if (end < start)
1334 		return -EINVAL;
1335 	if (end == start)
1336 		return 0;
1337 
1338 	new = mpol_new(mode, mode_flags, nmask);
1339 	if (IS_ERR(new))
1340 		return PTR_ERR(new);
1341 
1342 	/*
1343 	 * If we are using the default policy then operating
1344 	 * on discontiguous address ranges is okay after all.
1345 	 */
1346 	if (!new)
1347 		flags |= MPOL_MF_DISCONTIG_OK;
1348 
1349 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1350 		lru_cache_disable();
1351 	{
1352 		NODEMASK_SCRATCH(scratch);
1353 		if (scratch) {
1354 			mmap_write_lock(mm);
1355 			err = mpol_set_nodemask(new, nmask, scratch);
1356 			if (err)
1357 				mmap_write_unlock(mm);
1358 		} else
1359 			err = -ENOMEM;
1360 		NODEMASK_SCRATCH_FREE(scratch);
1361 	}
1362 	if (err)
1363 		goto mpol_out;
1364 
1365 	/*
1366 	 * Lock the VMAs before scanning for pages to migrate,
1367 	 * to ensure we don't miss a concurrently inserted page.
1368 	 */
1369 	nr_failed = queue_pages_range(mm, start, end, nmask,
1370 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1371 
1372 	if (nr_failed < 0) {
1373 		err = nr_failed;
1374 		nr_failed = 0;
1375 	} else {
1376 		vma_iter_init(&vmi, mm, start);
1377 		prev = vma_prev(&vmi);
1378 		for_each_vma_range(vmi, vma, end) {
1379 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1380 			if (err)
1381 				break;
1382 		}
1383 	}
1384 
1385 	if (!err && !list_empty(&pagelist)) {
1386 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
1387 		if (!new) {
1388 			new = get_task_policy(current);
1389 			mpol_get(new);
1390 		}
1391 		mmpol.pol = new;
1392 		mmpol.ilx = 0;
1393 
1394 		/*
1395 		 * In the interleaved case, attempt to allocate on exactly the
1396 		 * targeted nodes, for the first VMA to be migrated; for later
1397 		 * VMAs, the nodes will still be interleaved from the targeted
1398 		 * nodemask, but each one may be selected differently.
1399 		 */
1400 		if (new->mode == MPOL_INTERLEAVE ||
1401 		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1402 			struct folio *folio;
1403 			unsigned int order;
1404 			unsigned long addr = -EFAULT;
1405 
1406 			list_for_each_entry(folio, &pagelist, lru) {
1407 				if (!folio_test_ksm(folio))
1408 					break;
1409 			}
1410 			if (!list_entry_is_head(folio, &pagelist, lru)) {
1411 				vma_iter_init(&vmi, mm, start);
1412 				for_each_vma_range(vmi, vma, end) {
1413 					addr = page_address_in_vma(folio,
1414 						folio_page(folio, 0), vma);
1415 					if (addr != -EFAULT)
1416 						break;
1417 				}
1418 			}
1419 			if (addr != -EFAULT) {
1420 				order = folio_order(folio);
1421 				/* We already know the pol, but not the ilx */
1422 				mpol_cond_put(get_vma_policy(vma, addr, order,
1423 							     &mmpol.ilx));
1424 				/* Set base from which to increment by index */
1425 				mmpol.ilx -= folio->index >> order;
1426 			}
1427 		}
1428 	}
1429 
1430 	mmap_write_unlock(mm);
1431 
1432 	if (!err && !list_empty(&pagelist)) {
1433 		nr_failed |= migrate_pages(&pagelist,
1434 				alloc_migration_target_by_mpol, NULL,
1435 				(unsigned long)&mmpol, MIGRATE_SYNC,
1436 				MR_MEMPOLICY_MBIND, NULL);
1437 	}
1438 
1439 	if (nr_failed && (flags & MPOL_MF_STRICT))
1440 		err = -EIO;
1441 	if (!list_empty(&pagelist))
1442 		putback_movable_pages(&pagelist);
1443 mpol_out:
1444 	mpol_put(new);
1445 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1446 		lru_cache_enable();
1447 	return err;
1448 }
1449 
1450 /*
1451  * User space interface with variable sized bitmaps for nodelists.
1452  */
1453 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1454 		      unsigned long maxnode)
1455 {
1456 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1457 	int ret;
1458 
1459 	if (in_compat_syscall())
1460 		ret = compat_get_bitmap(mask,
1461 					(const compat_ulong_t __user *)nmask,
1462 					maxnode);
1463 	else
1464 		ret = copy_from_user(mask, nmask,
1465 				     nlongs * sizeof(unsigned long));
1466 
1467 	if (ret)
1468 		return -EFAULT;
1469 
1470 	if (maxnode % BITS_PER_LONG)
1471 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1472 
1473 	return 0;
1474 }
1475 
1476 /* Copy a node mask from user space. */
1477 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1478 		     unsigned long maxnode)
1479 {
1480 	--maxnode;
1481 	nodes_clear(*nodes);
1482 	if (maxnode == 0 || !nmask)
1483 		return 0;
1484 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1485 		return -EINVAL;
1486 
1487 	/*
1488 	 * When the user specified more nodes than supported, just check
1489 	 * that the unsupported part is all zero, one word at a time,
1490 	 * starting at the end.
1491 	 */
1492 	while (maxnode > MAX_NUMNODES) {
1493 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1494 		unsigned long t;
1495 
1496 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1497 			return -EFAULT;
1498 
1499 		if (maxnode - bits >= MAX_NUMNODES) {
1500 			maxnode -= bits;
1501 		} else {
1502 			maxnode = MAX_NUMNODES;
1503 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1504 		}
1505 		if (t)
1506 			return -EINVAL;
1507 	}
1508 
1509 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1510 }
1511 
1512 /* Copy a kernel node mask to user space */
1513 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1514 			      nodemask_t *nodes)
1515 {
1516 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1517 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1518 	bool compat = in_compat_syscall();
1519 
1520 	if (compat)
1521 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1522 
1523 	if (copy > nbytes) {
1524 		if (copy > PAGE_SIZE)
1525 			return -EINVAL;
1526 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1527 			return -EFAULT;
1528 		copy = nbytes;
1529 		maxnode = nr_node_ids;
1530 	}
1531 
1532 	if (compat)
1533 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1534 					 nodes_addr(*nodes), maxnode);
1535 
1536 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1537 }
1538 
1539 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1540 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1541 {
1542 	*flags = *mode & MPOL_MODE_FLAGS;
1543 	*mode &= ~MPOL_MODE_FLAGS;
1544 
1545 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1546 		return -EINVAL;
1547 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1548 		return -EINVAL;
1549 	if (*flags & MPOL_F_NUMA_BALANCING) {
1550 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1551 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1552 		else
1553 			return -EINVAL;
1554 	}
1555 	return 0;
1556 }
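
/*
 * Example: userspace passes the optional mode flags OR'ed into the mode
 * argument, e.g. MPOL_BIND | MPOL_F_STATIC_NODES.  sanitize_mpol_flags()
 * splits this into *mode == MPOL_BIND and *flags == MPOL_F_STATIC_NODES,
 * and rejects MPOL_F_STATIC_NODES combined with MPOL_F_RELATIVE_NODES.
 */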
1557 
1558 static long kernel_mbind(unsigned long start, unsigned long len,
1559 			 unsigned long mode, const unsigned long __user *nmask,
1560 			 unsigned long maxnode, unsigned int flags)
1561 {
1562 	unsigned short mode_flags;
1563 	nodemask_t nodes;
1564 	int lmode = mode;
1565 	int err;
1566 
1567 	start = untagged_addr(start);
1568 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1569 	if (err)
1570 		return err;
1571 
1572 	err = get_nodes(&nodes, nmask, maxnode);
1573 	if (err)
1574 		return err;
1575 
1576 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1577 }
1578 
1579 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1580 		unsigned long, home_node, unsigned long, flags)
1581 {
1582 	struct mm_struct *mm = current->mm;
1583 	struct vm_area_struct *vma, *prev;
1584 	struct mempolicy *new, *old;
1585 	unsigned long end;
1586 	int err = -ENOENT;
1587 	VMA_ITERATOR(vmi, mm, start);
1588 
1589 	start = untagged_addr(start);
1590 	if (start & ~PAGE_MASK)
1591 		return -EINVAL;
1592 	/*
1593 	 * flags is reserved for future extensions and must be zero for now.
1594 	 */
1595 	if (flags != 0)
1596 		return -EINVAL;
1597 
1598 	/*
1599 	 * Check that home_node is online to avoid accessing uninitialized
1600 	 * NODE_DATA.
1601 	 */
1602 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1603 		return -EINVAL;
1604 
1605 	len = PAGE_ALIGN(len);
1606 	end = start + len;
1607 
1608 	if (end < start)
1609 		return -EINVAL;
1610 	if (end == start)
1611 		return 0;
1612 	mmap_write_lock(mm);
1613 	prev = vma_prev(&vmi);
1614 	for_each_vma_range(vmi, vma, end) {
1615 		/*
1616 		 * If any vma in the range has a policy other than MPOL_BIND
1617 		 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1618 		 * the home node for vmas we already updated before.
1619 		 */
1620 		old = vma_policy(vma);
1621 		if (!old) {
1622 			prev = vma;
1623 			continue;
1624 		}
1625 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1626 			err = -EOPNOTSUPP;
1627 			break;
1628 		}
1629 		new = mpol_dup(old);
1630 		if (IS_ERR(new)) {
1631 			err = PTR_ERR(new);
1632 			break;
1633 		}
1634 
1635 		vma_start_write(vma);
1636 		new->home_node = home_node;
1637 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1638 		mpol_put(new);
1639 		if (err)
1640 			break;
1641 	}
1642 	mmap_write_unlock(mm);
1643 	return err;
1644 }
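
/*
 * For illustration, userspace might pick a home node for an existing
 * MPOL_BIND range roughly like this (illustrative values; shown as a raw
 * syscall since a libc wrapper may not be available):
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	mbind(addr, length, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);
 *	syscall(SYS_set_mempolicy_home_node,
 *		(unsigned long)addr, length, 1UL, 0UL);	// home node 1
 */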
1645 
1646 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1647 		unsigned long, mode, const unsigned long __user *, nmask,
1648 		unsigned long, maxnode, unsigned int, flags)
1649 {
1650 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1651 }
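
/*
 * Example invocation from userspace (illustrative values): migrate the
 * pages of an existing mapping to node 1, failing with EIO if any page
 * could not be moved:
 *
 *	unsigned long node1 = 1UL << 1;
 *
 *	if (mbind(addr, length, MPOL_BIND, &node1, sizeof(node1) * 8,
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT))
 *		perror("mbind");
 */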
1652 
1653 /* Set the process memory policy */
1654 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1655 				 unsigned long maxnode)
1656 {
1657 	unsigned short mode_flags;
1658 	nodemask_t nodes;
1659 	int lmode = mode;
1660 	int err;
1661 
1662 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1663 	if (err)
1664 		return err;
1665 
1666 	err = get_nodes(&nodes, nmask, maxnode);
1667 	if (err)
1668 		return err;
1669 
1670 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1671 }
1672 
1673 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1674 		unsigned long, maxnode)
1675 {
1676 	return kernel_set_mempolicy(mode, nmask, maxnode);
1677 }
1678 
1679 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1680 				const unsigned long __user *old_nodes,
1681 				const unsigned long __user *new_nodes)
1682 {
1683 	struct mm_struct *mm = NULL;
1684 	struct task_struct *task;
1685 	nodemask_t task_nodes;
1686 	int err;
1687 	nodemask_t *old;
1688 	nodemask_t *new;
1689 	NODEMASK_SCRATCH(scratch);
1690 
1691 	if (!scratch)
1692 		return -ENOMEM;
1693 
1694 	old = &scratch->mask1;
1695 	new = &scratch->mask2;
1696 
1697 	err = get_nodes(old, old_nodes, maxnode);
1698 	if (err)
1699 		goto out;
1700 
1701 	err = get_nodes(new, new_nodes, maxnode);
1702 	if (err)
1703 		goto out;
1704 
1705 	/* Find the mm_struct */
1706 	rcu_read_lock();
1707 	task = pid ? find_task_by_vpid(pid) : current;
1708 	if (!task) {
1709 		rcu_read_unlock();
1710 		err = -ESRCH;
1711 		goto out;
1712 	}
1713 	get_task_struct(task);
1714 
1715 	err = -EINVAL;
1716 
1717 	/*
1718 	 * Check if this process has the right to modify the specified process.
1719 	 * Use the regular "ptrace_may_access()" checks.
1720 	 */
1721 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1722 		rcu_read_unlock();
1723 		err = -EPERM;
1724 		goto out_put;
1725 	}
1726 	rcu_read_unlock();
1727 
1728 	task_nodes = cpuset_mems_allowed(task);
1729 	/* Is the user allowed to access the target nodes? */
1730 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1731 		err = -EPERM;
1732 		goto out_put;
1733 	}
1734 
1735 	task_nodes = cpuset_mems_allowed(current);
1736 	nodes_and(*new, *new, task_nodes);
1737 	if (nodes_empty(*new))
1738 		goto out_put;
1739 
1740 	err = security_task_movememory(task);
1741 	if (err)
1742 		goto out_put;
1743 
1744 	mm = get_task_mm(task);
1745 	put_task_struct(task);
1746 
1747 	if (!mm) {
1748 		err = -EINVAL;
1749 		goto out;
1750 	}
1751 
1752 	err = do_migrate_pages(mm, old, new,
1753 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1754 
1755 	mmput(mm);
1756 out:
1757 	NODEMASK_SCRATCH_FREE(scratch);
1758 
1759 	return err;
1760 
1761 out_put:
1762 	put_task_struct(task);
1763 	goto out;
1764 }
1765 
1766 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1767 		const unsigned long __user *, old_nodes,
1768 		const unsigned long __user *, new_nodes)
1769 {
1770 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1771 }
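
/*
 * Example (userspace, illustrative): move the pages of process <pid> that
 * currently sit on node 0 over to node 1:
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long ret = migrate_pages(pid, sizeof(from) * 8, &from, &to);
 *	// ret < 0 on error, otherwise the number of pages not moved
 */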
1772 
1773 /* Retrieve NUMA policy */
1774 static int kernel_get_mempolicy(int __user *policy,
1775 				unsigned long __user *nmask,
1776 				unsigned long maxnode,
1777 				unsigned long addr,
1778 				unsigned long flags)
1779 {
1780 	int err;
1781 	int pval;
1782 	nodemask_t nodes;
1783 
1784 	if (nmask != NULL && maxnode < nr_node_ids)
1785 		return -EINVAL;
1786 
1787 	addr = untagged_addr(addr);
1788 
1789 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1790 
1791 	if (err)
1792 		return err;
1793 
1794 	if (policy && put_user(pval, policy))
1795 		return -EFAULT;
1796 
1797 	if (nmask)
1798 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1799 
1800 	return err;
1801 }
1802 
1803 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1804 		unsigned long __user *, nmask, unsigned long, maxnode,
1805 		unsigned long, addr, unsigned long, flags)
1806 {
1807 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1808 }
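
/*
 * Example (userspace, illustrative): ask which node backs the page at
 * "addr"; MPOL_F_NODE | MPOL_F_ADDR returns the node id instead of the
 * policy mode:
 *
 *	int node;
 *
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
 *		perror("get_mempolicy");
 */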
1809 
1810 bool vma_migratable(struct vm_area_struct *vma)
1811 {
1812 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1813 		return false;
1814 
1815 	/*
1816 	 * DAX device mappings require predictable access latency, so avoid
1817 	 * incurring periodic faults.
1818 	 */
1819 	if (vma_is_dax(vma))
1820 		return false;
1821 
1822 	if (is_vm_hugetlb_page(vma) &&
1823 		!hugepage_migration_supported(hstate_vma(vma)))
1824 		return false;
1825 
1826 	/*
1827 	 * Migration allocates pages in the highest zone. If we cannot
1828 	 * do so then migration (at least from node to node) is not
1829 	 * possible.
1830 	 */
1831 	if (vma->vm_file &&
1832 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1833 			< policy_zone)
1834 		return false;
1835 	return true;
1836 }
1837 
1838 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1839 				   unsigned long addr, pgoff_t *ilx)
1840 {
1841 	*ilx = 0;
1842 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
1843 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1844 }
1845 
1846 /*
1847  * get_vma_policy(@vma, @addr, @order, @ilx)
1848  * @vma: virtual memory area whose policy is sought
1849  * @addr: address in @vma for shared policy lookup
1850  * @order: 0, or appropriate huge_page_order for interleaving
1851  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1852  *       MPOL_WEIGHTED_INTERLEAVE
1853  *
1854  * Returns effective policy for a VMA at specified address.
1855  * Falls back to current->mempolicy or system default policy, as necessary.
1856  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1857  * count--added by the get_policy() vm_op, as appropriate--to protect against
1858  * freeing by another task.  It is the caller's responsibility to free the
1859  * extra reference for shared policies.
1860  */
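/*
 * Illustrative example (assuming the base index returned through
 * __get_vma_policy() is 0): for an interleave policy with @order == 0,
 * vm_pgoff == 16 and a fault 3 pages into the VMA, *ilx becomes
 * 16 + 3 = 19, i.e. the page's offset into the backing object.
 */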
1861 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1862 				 unsigned long addr, int order, pgoff_t *ilx)
1863 {
1864 	struct mempolicy *pol;
1865 
1866 	pol = __get_vma_policy(vma, addr, ilx);
1867 	if (!pol)
1868 		pol = get_task_policy(current);
1869 	if (pol->mode == MPOL_INTERLEAVE ||
1870 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1871 		*ilx += vma->vm_pgoff >> order;
1872 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1873 	}
1874 	return pol;
1875 }
1876 
1877 bool vma_policy_mof(struct vm_area_struct *vma)
1878 {
1879 	struct mempolicy *pol;
1880 
1881 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1882 		bool ret = false;
1883 		pgoff_t ilx;		/* ignored here */
1884 
1885 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1886 		if (pol && (pol->flags & MPOL_F_MOF))
1887 			ret = true;
1888 		mpol_cond_put(pol);
1889 
1890 		return ret;
1891 	}
1892 
1893 	pol = vma->vm_policy;
1894 	if (!pol)
1895 		pol = get_task_policy(current);
1896 
1897 	return pol->flags & MPOL_F_MOF;
1898 }
1899 
1900 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1901 {
1902 	enum zone_type dynamic_policy_zone = policy_zone;
1903 
1904 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1905 
1906 	/*
1907 	 * If policy->nodes has movable memory only, we apply the policy
1908 	 * only when gfp_zone(gfp) == ZONE_MOVABLE.
1909 	 *
1910 	 * policy->nodes intersects with node_states[N_MEMORY], so if the
1911 	 * following test fails, it implies policy->nodes has movable
1912 	 * memory only.
1913 	 */
1914 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1915 		dynamic_policy_zone = ZONE_MOVABLE;
1916 
1917 	return zone >= dynamic_policy_zone;
1918 }
1919 
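/*
 * Illustrative example (assuming il_weight starts at 0 and the walk
 * resumes at node 0): with nodes {0,1} and weights {2,1}, successive
 * calls return 0, 0, 1, 0, 0, 1, ... as il_weight counts down each
 * node's weight before il_prev advances to the next node.
 */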
1920 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
1921 {
1922 	unsigned int node;
1923 	unsigned int cpuset_mems_cookie;
1924 
1925 retry:
1926 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1927 	cpuset_mems_cookie = read_mems_allowed_begin();
1928 	node = current->il_prev;
1929 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
1930 		node = next_node_in(node, policy->nodes);
1931 		if (read_mems_allowed_retry(cpuset_mems_cookie))
1932 			goto retry;
1933 		if (node == MAX_NUMNODES)
1934 			return node;
1935 		current->il_prev = node;
1936 		current->il_weight = get_il_weight(node);
1937 	}
1938 	current->il_weight--;
1939 	return node;
1940 }
1941 
1942 /* Do dynamic interleaving for a process */
1943 static unsigned int interleave_nodes(struct mempolicy *policy)
1944 {
1945 	unsigned int nid;
1946 	unsigned int cpuset_mems_cookie;
1947 
1948 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1949 	do {
1950 		cpuset_mems_cookie = read_mems_allowed_begin();
1951 		nid = next_node_in(current->il_prev, policy->nodes);
1952 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1953 
1954 	if (nid < MAX_NUMNODES)
1955 		current->il_prev = nid;
1956 	return nid;
1957 }
1958 
1959 /*
1960  * Depending on the memory policy provide a node from which to allocate the
1961  * next slab entry.
1962  */
1963 unsigned int mempolicy_slab_node(void)
1964 {
1965 	struct mempolicy *policy;
1966 	int node = numa_mem_id();
1967 
1968 	if (!in_task())
1969 		return node;
1970 
1971 	policy = current->mempolicy;
1972 	if (!policy)
1973 		return node;
1974 
1975 	switch (policy->mode) {
1976 	case MPOL_PREFERRED:
1977 		return first_node(policy->nodes);
1978 
1979 	case MPOL_INTERLEAVE:
1980 		return interleave_nodes(policy);
1981 
1982 	case MPOL_WEIGHTED_INTERLEAVE:
1983 		return weighted_interleave_nodes(policy);
1984 
1985 	case MPOL_BIND:
1986 	case MPOL_PREFERRED_MANY:
1987 	{
1988 		struct zoneref *z;
1989 
1990 		/*
1991 		 * Follow bind policy behavior and start allocation at the
1992 		 * first node.
1993 		 */
1994 		struct zonelist *zonelist;
1995 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1996 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1997 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1998 							&policy->nodes);
1999 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2000 	}
2001 	case MPOL_LOCAL:
2002 		return node;
2003 
2004 	default:
2005 		BUG();
2006 	}
2007 }
2008 
2009 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2010 					      nodemask_t *mask)
2011 {
2012 	/*
2013 	 * The barrier stabilizes the nodemask locally so that it can be
2014 	 * iterated over safely without concern for changes. Allocators validate
2015 	 * that node selection does not violate mems_allowed, so this is safe.
2016 	 */
2017 	barrier();
2018 	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2019 	barrier();
2020 	return nodes_weight(*mask);
2021 }
2022 
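/*
 * Illustrative example: with nodemask {0,1,2} and weights {3,2,1},
 * weight_total == 6, so ilx == 0..5 maps to nodes 0,0,0,1,1,2 and
 * ilx == 6 wraps back to node 0.
 */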
2023 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2024 {
2025 	nodemask_t nodemask;
2026 	unsigned int target, nr_nodes;
2027 	u8 *table;
2028 	unsigned int weight_total = 0;
2029 	u8 weight;
2030 	int nid;
2031 
2032 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2033 	if (!nr_nodes)
2034 		return numa_node_id();
2035 
2036 	rcu_read_lock();
2037 	table = rcu_dereference(iw_table);
2038 	/* calculate the total weight */
2039 	for_each_node_mask(nid, nodemask) {
2040 		/* detect system default usage */
2041 		weight = table ? table[nid] : 1;
2042 		weight = weight ? weight : 1;
2043 		weight_total += weight;
2044 	}
2045 
2046 	/* Calculate the node offset based on totals */
2047 	target = ilx % weight_total;
2048 	nid = first_node(nodemask);
2049 	while (target) {
2050 		/* detect system default usage */
2051 		weight = table ? table[nid] : 1;
2052 		weight = weight ? weight : 1;
2053 		if (target < weight)
2054 			break;
2055 		target -= weight;
2056 		nid = next_node_in(nid, nodemask);
2057 	}
2058 	rcu_read_unlock();
2059 	return nid;
2060 }
2061 
2062 /*
2063  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2064  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2065  * exceeds the number of present nodes.
2066  */
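/*
 * Illustrative example: with pol->nodes == {0,2,5} and ilx == 4,
 * target == 4 % 3 == 1, i.e. the second node in the mask, node 2.
 */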
2067 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2068 {
2069 	nodemask_t nodemask;
2070 	unsigned int target, nnodes;
2071 	int i;
2072 	int nid;
2073 
2074 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2075 	if (!nnodes)
2076 		return numa_node_id();
2077 	target = ilx % nnodes;
2078 	nid = first_node(nodemask);
2079 	for (i = 0; i < target; i++)
2080 		nid = next_node(nid, nodemask);
2081 	return nid;
2082 }
2083 
2084 /*
2085  * Return a nodemask representing a mempolicy for filtering nodes for
2086  * page allocation, together with preferred node id (or the input node id).
2087  */
2088 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2089 				   pgoff_t ilx, int *nid)
2090 {
2091 	nodemask_t *nodemask = NULL;
2092 
2093 	switch (pol->mode) {
2094 	case MPOL_PREFERRED:
2095 		/* Override input node id */
2096 		*nid = first_node(pol->nodes);
2097 		break;
2098 	case MPOL_PREFERRED_MANY:
2099 		nodemask = &pol->nodes;
2100 		if (pol->home_node != NUMA_NO_NODE)
2101 			*nid = pol->home_node;
2102 		break;
2103 	case MPOL_BIND:
2104 		/* Restrict to nodemask (but not on lower zones) */
2105 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2106 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2107 			nodemask = &pol->nodes;
2108 		if (pol->home_node != NUMA_NO_NODE)
2109 			*nid = pol->home_node;
2110 		/*
2111 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2112 		 * because we might easily break the expectation to stay on the
2113 		 * requested node and not break the policy.
2114 		 */
2115 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2116 		break;
2117 	case MPOL_INTERLEAVE:
2118 		/* Override input node id */
2119 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2120 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2121 		break;
2122 	case MPOL_WEIGHTED_INTERLEAVE:
2123 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2124 			weighted_interleave_nodes(pol) :
2125 			weighted_interleave_nid(pol, ilx);
2126 		break;
2127 	}
2128 
2129 	return nodemask;
2130 }
2131 
2132 #ifdef CONFIG_HUGETLBFS
2133 /*
2134  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2135  * @vma: virtual memory area whose policy is sought
2136  * @addr: address in @vma for shared policy lookup and interleave policy
2137  * @gfp_flags: for requested zone
2138  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2139  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2140  *
2141  * Returns a nid suitable for a huge page allocation and a pointer
2142  * to the struct mempolicy for conditional unref after allocation.
2143  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2144  * to the mempolicy's @nodemask for filtering the zonelist.
2145  */
2146 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2147 		struct mempolicy **mpol, nodemask_t **nodemask)
2148 {
2149 	pgoff_t ilx;
2150 	int nid;
2151 
2152 	nid = numa_node_id();
2153 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2154 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2155 	return nid;
2156 }
2157 
2158 /*
2159  * init_nodemask_of_mempolicy
2160  *
2161  * If the current task's mempolicy is "default" [NULL], return 'false'
2162  * to indicate default policy.  Otherwise, extract the policy nodemask
2163  * for 'bind' or 'interleave' policy into the argument nodemask, or
2164  * initialize the argument nodemask to contain the single node for
2165  * 'preferred' or 'local' policy and return 'true' to indicate presence
2166  * of non-default mempolicy.
2167  *
2168  * We don't bother with reference counting the mempolicy [mpol_get/put]
2169  * because the current task is examining its own mempolicy and a task's
2170  * mempolicy is only ever changed by the task itself.
2171  *
2172  * N.B., it is the caller's responsibility to free a returned nodemask.
2173  */
2174 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2175 {
2176 	struct mempolicy *mempolicy;
2177 
2178 	if (!(mask && current->mempolicy))
2179 		return false;
2180 
2181 	task_lock(current);
2182 	mempolicy = current->mempolicy;
2183 	switch (mempolicy->mode) {
2184 	case MPOL_PREFERRED:
2185 	case MPOL_PREFERRED_MANY:
2186 	case MPOL_BIND:
2187 	case MPOL_INTERLEAVE:
2188 	case MPOL_WEIGHTED_INTERLEAVE:
2189 		*mask = mempolicy->nodes;
2190 		break;
2191 
2192 	case MPOL_LOCAL:
2193 		init_nodemask_of_node(mask, numa_node_id());
2194 		break;
2195 
2196 	default:
2197 		BUG();
2198 	}
2199 	task_unlock(current);
2200 
2201 	return true;
2202 }
2203 #endif
2204 
2205 /*
2206  * mempolicy_in_oom_domain
2207  *
2208  * If tsk's mempolicy is "bind", check for intersection between mask and
2209  * the policy nodemask. Otherwise, return true for all other policies
2210  * including "interleave", as a tsk with "interleave" policy may have
2211  * memory allocated from all nodes in the system.
2212  *
2213  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2214  */
2215 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2216 					const nodemask_t *mask)
2217 {
2218 	struct mempolicy *mempolicy;
2219 	bool ret = true;
2220 
2221 	if (!mask)
2222 		return ret;
2223 
2224 	task_lock(tsk);
2225 	mempolicy = tsk->mempolicy;
2226 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2227 		ret = nodes_intersects(mempolicy->nodes, *mask);
2228 	task_unlock(tsk);
2229 
2230 	return ret;
2231 }
2232 
2233 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2234 						int nid, nodemask_t *nodemask)
2235 {
2236 	struct page *page;
2237 	gfp_t preferred_gfp;
2238 
2239 	/*
2240 	 * This is a two-pass approach. The first pass will only try the
2241 	 * preferred nodes but skip direct reclaim and allow the
2242 	 * allocation to fail, while the second pass will try all the
2243 	 * nodes in the system.
2244 	 */
2245 	preferred_gfp = gfp | __GFP_NOWARN;
2246 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2247 	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2248 	if (!page)
2249 		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2250 
2251 	return page;
2252 }
2253 
2254 /**
2255  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2256  * @gfp: GFP flags.
2257  * @order: Order of the page allocation.
2258  * @pol: Pointer to the NUMA mempolicy.
2259  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2260  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2261  *
2262  * Return: The page on success or NULL if allocation fails.
2263  */
2264 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2265 		struct mempolicy *pol, pgoff_t ilx, int nid)
2266 {
2267 	nodemask_t *nodemask;
2268 	struct page *page;
2269 
2270 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2271 
2272 	if (pol->mode == MPOL_PREFERRED_MANY)
2273 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2274 
2275 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2276 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2277 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2278 		/*
2279 		 * For hugepage allocation and non-interleave policy which
2280 		 * allows the current node (or other explicitly preferred
2281 		 * node) we only try to allocate from the current/preferred
2282 		 * node and don't fall back to other nodes, as the cost of
2283 		 * remote accesses would likely offset THP benefits.
2284 		 *
2285 		 * If the policy is interleave or does not allow the current
2286 		 * node in its nodemask, we allocate the standard way.
2287 		 */
2288 		if (pol->mode != MPOL_INTERLEAVE &&
2289 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2290 		    (!nodemask || node_isset(nid, *nodemask))) {
2291 			/*
2292 			 * First, try to allocate THP only on local node, but
2293 			 * don't reclaim unnecessarily, just compact.
2294 			 */
2295 			page = __alloc_frozen_pages_noprof(
2296 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2297 				nid, NULL);
2298 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2299 				return page;
2300 			/*
2301 			 * If hugepage allocations are configured to always
2302 			 * synchronous compact or the vma has been madvised
2303 			 * to prefer hugepage backing, retry allowing remote
2304 			 * memory with both reclaim and compact as well.
2305 			 */
2306 		}
2307 	}
2308 
2309 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2310 
2311 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2312 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2313 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2314 		if (static_branch_likely(&vm_numa_stat_key) &&
2315 		    page_to_nid(page) == nid) {
2316 			preempt_disable();
2317 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2318 			preempt_enable();
2319 		}
2320 	}
2321 
2322 	return page;
2323 }
2324 
2325 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2326 		struct mempolicy *pol, pgoff_t ilx, int nid)
2327 {
2328 	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2329 			ilx, nid);
2330 	if (!page)
2331 		return NULL;
2332 
2333 	set_page_refcounted(page);
2334 	return page_rmappable_folio(page);
2335 }
2336 
2337 /**
2338  * vma_alloc_folio - Allocate a folio for a VMA.
2339  * @gfp: GFP flags.
2340  * @order: Order of the folio.
2341  * @vma: Pointer to VMA.
2342  * @addr: Virtual address of the allocation.  Must be inside @vma.
2343  *
2344  * Allocate a folio for a specific address in @vma, using the appropriate
2345  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2346  * VMA to prevent it from going away.  Should be used for all allocations
2347  * for folios that will be mapped into user space, excepting hugetlbfs, and
2348  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2349  *
2350  * Return: The folio on success or NULL if allocation fails.
2351  */
2352 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2353 		unsigned long addr)
2354 {
2355 	struct mempolicy *pol;
2356 	pgoff_t ilx;
2357 	struct folio *folio;
2358 
2359 	if (vma->vm_flags & VM_DROPPABLE)
2360 		gfp |= __GFP_NOWARN;
2361 
2362 	pol = get_vma_policy(vma, addr, order, &ilx);
2363 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2364 	mpol_cond_put(pol);
2365 	return folio;
2366 }
2367 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2368 
2369 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2370 {
2371 	struct mempolicy *pol = &default_policy;
2372 
2373 	/*
2374 	 * No reference counting needed for current->mempolicy
2375 	 * nor system default_policy
2376 	 */
2377 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2378 		pol = get_task_policy(current);
2379 
2380 	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2381 				       numa_node_id());
2382 }
2383 
2384 /**
2385  * alloc_pages - Allocate pages.
2386  * @gfp: GFP flags.
2387  * @order: Power of two of number of pages to allocate.
2388  *
2389  * Allocate 1 << @order contiguous pages.  The physical address of the
2390  * first page is naturally aligned (eg an order-3 allocation will be aligned
2391  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2392  * process is honoured when in process context.
2393  *
2394  * Context: Can be called from any context, providing the appropriate GFP
2395  * flags are used.
2396  * Return: The page on success or NULL if allocation fails.
2397  */
2398 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2399 {
2400 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2401 
2402 	if (page)
2403 		set_page_refcounted(page);
2404 	return page;
2405 }
2406 EXPORT_SYMBOL(alloc_pages_noprof);
2407 
2408 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2409 {
2410 	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2411 }
2412 EXPORT_SYMBOL(folio_alloc_noprof);
2413 
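/*
 * Illustrative example: interleaving nr_pages == 10 over 3 nodes gives
 * nr_pages_per_node == 3 and delta == 1, so the first node visited
 * receives 4 pages and the remaining two receive 3 each.
 */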
2414 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2415 		struct mempolicy *pol, unsigned long nr_pages,
2416 		struct page **page_array)
2417 {
2418 	int nodes;
2419 	unsigned long nr_pages_per_node;
2420 	int delta;
2421 	int i;
2422 	unsigned long nr_allocated;
2423 	unsigned long total_allocated = 0;
2424 
2425 	nodes = nodes_weight(pol->nodes);
2426 	nr_pages_per_node = nr_pages / nodes;
2427 	delta = nr_pages - nodes * nr_pages_per_node;
2428 
2429 	for (i = 0; i < nodes; i++) {
2430 		if (delta) {
2431 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2432 					interleave_nodes(pol), NULL,
2433 					nr_pages_per_node + 1,
2434 					page_array);
2435 			delta--;
2436 		} else {
2437 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2438 					interleave_nodes(pol), NULL,
2439 					nr_pages_per_node, page_array);
2440 		}
2441 
2442 		page_array += nr_allocated;
2443 		total_allocated += nr_allocated;
2444 	}
2445 
2446 	return total_allocated;
2447 }
2448 
2449 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2450 		struct mempolicy *pol, unsigned long nr_pages,
2451 		struct page **page_array)
2452 {
2453 	struct task_struct *me = current;
2454 	unsigned int cpuset_mems_cookie;
2455 	unsigned long total_allocated = 0;
2456 	unsigned long nr_allocated = 0;
2457 	unsigned long rounds;
2458 	unsigned long node_pages, delta;
2459 	u8 *table, *weights, weight;
2460 	unsigned int weight_total = 0;
2461 	unsigned long rem_pages = nr_pages;
2462 	nodemask_t nodes;
2463 	int nnodes, node;
2464 	int resume_node = MAX_NUMNODES - 1;
2465 	u8 resume_weight = 0;
2466 	int prev_node;
2467 	int i;
2468 
2469 	if (!nr_pages)
2470 		return 0;
2471 
2472 	/* read the nodes onto the stack, retry if done during rebind */
2473 	do {
2474 		cpuset_mems_cookie = read_mems_allowed_begin();
2475 		nnodes = read_once_policy_nodemask(pol, &nodes);
2476 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2477 
2478 	/* if the nodemask has become invalid, we cannot do anything */
2479 	if (!nnodes)
2480 		return 0;
2481 
2482 	/* Continue allocating from most recent node and adjust the nr_pages */
2483 	node = me->il_prev;
2484 	weight = me->il_weight;
2485 	if (weight && node_isset(node, nodes)) {
2486 		node_pages = min(rem_pages, weight);
2487 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2488 						  page_array);
2489 		page_array += nr_allocated;
2490 		total_allocated += nr_allocated;
2491 		/* if that's all the pages, no need to interleave */
2492 		if (rem_pages <= weight) {
2493 			me->il_weight -= rem_pages;
2494 			return total_allocated;
2495 		}
2496 		/* Otherwise we adjust remaining pages, continue from there */
2497 		rem_pages -= weight;
2498 	}
2499 	/* clear active weight in case of an allocation failure */
2500 	me->il_weight = 0;
2501 	prev_node = node;
2502 
2503 	/* create a local copy of node weights to operate on outside rcu */
2504 	weights = kzalloc(nr_node_ids, GFP_KERNEL);
2505 	if (!weights)
2506 		return total_allocated;
2507 
2508 	rcu_read_lock();
2509 	table = rcu_dereference(iw_table);
2510 	if (table)
2511 		memcpy(weights, table, nr_node_ids);
2512 	rcu_read_unlock();
2513 
2514 	/* calculate total, detect system default usage */
2515 	for_each_node_mask(node, nodes) {
2516 		if (!weights[node])
2517 			weights[node] = 1;
2518 		weight_total += weights[node];
2519 	}
2520 
2521 	/*
2522 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2523 	 * Track which node weighted interleave should resume from.
2524 	 *
2525 	 * If (rounds > 0) and (delta == 0), resume_node will always be
2526 	 * the node following prev_node, and resume_weight its full weight.
2527 	 */
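	/*
	 * Illustrative example: with nodes {0,1}, weights {3,1}
	 * (weight_total == 4), rem_pages == 10 and the walk resuming at
	 * node 0: rounds == 2 and delta == 2, so node 0 gets 3*2+2 = 8
	 * pages, node 1 gets 1*2 = 2, and the next allocation resumes on
	 * node 0 with il_weight == 1.
	 */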
2528 	rounds = rem_pages / weight_total;
2529 	delta = rem_pages % weight_total;
2530 	resume_node = next_node_in(prev_node, nodes);
2531 	resume_weight = weights[resume_node];
2532 	for (i = 0; i < nnodes; i++) {
2533 		node = next_node_in(prev_node, nodes);
2534 		weight = weights[node];
2535 		node_pages = weight * rounds;
2536 		/* If a delta exists, add this node's portion of the delta */
2537 		if (delta > weight) {
2538 			node_pages += weight;
2539 			delta -= weight;
2540 		} else if (delta) {
2541 			/* when delta is depleted, resume from that node */
2542 			node_pages += delta;
2543 			resume_node = node;
2544 			resume_weight = weight - delta;
2545 			delta = 0;
2546 		}
2547 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
2548 		if (!node_pages)
2549 			break;
2550 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2551 						  page_array);
2552 		page_array += nr_allocated;
2553 		total_allocated += nr_allocated;
2554 		if (total_allocated == nr_pages)
2555 			break;
2556 		prev_node = node;
2557 	}
2558 	me->il_prev = resume_node;
2559 	me->il_weight = resume_weight;
2560 	kfree(weights);
2561 	return total_allocated;
2562 }
2563 
2564 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2565 		struct mempolicy *pol, unsigned long nr_pages,
2566 		struct page **page_array)
2567 {
2568 	gfp_t preferred_gfp;
2569 	unsigned long nr_allocated = 0;
2570 
2571 	preferred_gfp = gfp | __GFP_NOWARN;
2572 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2573 
2574 	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2575 					   nr_pages, page_array);
2576 
2577 	if (nr_allocated < nr_pages)
2578 		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2579 				nr_pages - nr_allocated,
2580 				page_array + nr_allocated);
2581 	return nr_allocated;
2582 }
2583 
2584 /*
2585  * Bulk page allocation and mempolicy should be considered together in some
2586  * situations, such as vmalloc.
2587  *
2588  * It can accelerate memory allocation, especially interleaved allocation.
2589  */
2590 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2591 		unsigned long nr_pages, struct page **page_array)
2592 {
2593 	struct mempolicy *pol = &default_policy;
2594 	nodemask_t *nodemask;
2595 	int nid;
2596 
2597 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2598 		pol = get_task_policy(current);
2599 
2600 	if (pol->mode == MPOL_INTERLEAVE)
2601 		return alloc_pages_bulk_interleave(gfp, pol,
2602 							 nr_pages, page_array);
2603 
2604 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2605 		return alloc_pages_bulk_weighted_interleave(
2606 				  gfp, pol, nr_pages, page_array);
2607 
2608 	if (pol->mode == MPOL_PREFERRED_MANY)
2609 		return alloc_pages_bulk_preferred_many(gfp,
2610 				numa_node_id(), pol, nr_pages, page_array);
2611 
2612 	nid = numa_node_id();
2613 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2614 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2615 				       nr_pages, page_array);
2616 }
2617 
2618 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2619 {
2620 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2621 
2622 	if (IS_ERR(pol))
2623 		return PTR_ERR(pol);
2624 	dst->vm_policy = pol;
2625 	return 0;
2626 }
2627 
2628 /*
2629  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2630  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2631  * with the mems_allowed returned by cpuset_mems_allowed().  This
2632  * keeps mempolicies cpuset relative after its cpuset moves.  See
2633  * further kernel/cpuset.c update_nodemask().
2634  *
2635  * current's mempolicy may be rebound by another task (the task that changes
2636  * the cpuset's mems), so we needn't do rebind work for the current task.
2637  */
2638 
2639 /* Slow path of a mempolicy duplicate */
2640 struct mempolicy *__mpol_dup(struct mempolicy *old)
2641 {
2642 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2643 
2644 	if (!new)
2645 		return ERR_PTR(-ENOMEM);
2646 
2647 	/* task's mempolicy is protected by alloc_lock */
2648 	if (old == current->mempolicy) {
2649 		task_lock(current);
2650 		*new = *old;
2651 		task_unlock(current);
2652 	} else
2653 		*new = *old;
2654 
2655 	if (current_cpuset_is_being_rebound()) {
2656 		nodemask_t mems = cpuset_mems_allowed(current);
2657 		mpol_rebind_policy(new, &mems);
2658 	}
2659 	atomic_set(&new->refcnt, 1);
2660 	return new;
2661 }
2662 
2663 /* Slow path of a mempolicy comparison */
2664 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2665 {
2666 	if (!a || !b)
2667 		return false;
2668 	if (a->mode != b->mode)
2669 		return false;
2670 	if (a->flags != b->flags)
2671 		return false;
2672 	if (a->home_node != b->home_node)
2673 		return false;
2674 	if (mpol_store_user_nodemask(a))
2675 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2676 			return false;
2677 
2678 	switch (a->mode) {
2679 	case MPOL_BIND:
2680 	case MPOL_INTERLEAVE:
2681 	case MPOL_PREFERRED:
2682 	case MPOL_PREFERRED_MANY:
2683 	case MPOL_WEIGHTED_INTERLEAVE:
2684 		return !!nodes_equal(a->nodes, b->nodes);
2685 	case MPOL_LOCAL:
2686 		return true;
2687 	default:
2688 		BUG();
2689 		return false;
2690 	}
2691 }
2692 
2693 /*
2694  * Shared memory backing store policy support.
2695  *
2696  * Remember policies even when nobody has shared memory mapped.
2697  * The policies are kept in Red-Black tree linked from the inode.
2698  * They are protected by the sp->lock rwlock, which should be held
2699  * for any accesses to the tree.
2700  */
2701 
2702 /*
2703  * lookup first element intersecting start-end.  Caller holds sp->lock for
2704  * reading or for writing
2705  */
2706 static struct sp_node *sp_lookup(struct shared_policy *sp,
2707 					pgoff_t start, pgoff_t end)
2708 {
2709 	struct rb_node *n = sp->root.rb_node;
2710 
2711 	while (n) {
2712 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2713 
2714 		if (start >= p->end)
2715 			n = n->rb_right;
2716 		else if (end <= p->start)
2717 			n = n->rb_left;
2718 		else
2719 			break;
2720 	}
2721 	if (!n)
2722 		return NULL;
2723 	for (;;) {
2724 		struct sp_node *w = NULL;
2725 		struct rb_node *prev = rb_prev(n);
2726 		if (!prev)
2727 			break;
2728 		w = rb_entry(prev, struct sp_node, nd);
2729 		if (w->end <= start)
2730 			break;
2731 		n = prev;
2732 	}
2733 	return rb_entry(n, struct sp_node, nd);
2734 }
2735 
2736 /*
2737  * Insert a new shared policy into the list.  Caller holds sp->lock for
2738  * writing.
2739  */
2740 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2741 {
2742 	struct rb_node **p = &sp->root.rb_node;
2743 	struct rb_node *parent = NULL;
2744 	struct sp_node *nd;
2745 
2746 	while (*p) {
2747 		parent = *p;
2748 		nd = rb_entry(parent, struct sp_node, nd);
2749 		if (new->start < nd->start)
2750 			p = &(*p)->rb_left;
2751 		else if (new->end > nd->end)
2752 			p = &(*p)->rb_right;
2753 		else
2754 			BUG();
2755 	}
2756 	rb_link_node(&new->nd, parent, p);
2757 	rb_insert_color(&new->nd, &sp->root);
2758 }
2759 
2760 /* Find shared policy intersecting idx */
2761 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2762 						pgoff_t idx)
2763 {
2764 	struct mempolicy *pol = NULL;
2765 	struct sp_node *sn;
2766 
2767 	if (!sp->root.rb_node)
2768 		return NULL;
2769 	read_lock(&sp->lock);
2770 	sn = sp_lookup(sp, idx, idx+1);
2771 	if (sn) {
2772 		mpol_get(sn->policy);
2773 		pol = sn->policy;
2774 	}
2775 	read_unlock(&sp->lock);
2776 	return pol;
2777 }
2778 
2779 static void sp_free(struct sp_node *n)
2780 {
2781 	mpol_put(n->policy);
2782 	kmem_cache_free(sn_cache, n);
2783 }
2784 
2785 /**
2786  * mpol_misplaced - check whether current folio node is valid in policy
2787  *
2788  * @folio: folio to be checked
2789  * @vmf: structure describing the fault
2790  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2791  *
2792  * Lookup current policy node id for vma,addr and "compare to" folio's
2793  * node id.  Policy determination "mimics" alloc_page_vma().
2794  * Called from fault path where we know the vma and faulting address.
2795  *
2796  * Return: NUMA_NO_NODE if the folio is in a node that is valid for this
2797  * policy, or a suitable node ID to allocate a replacement folio from.
2798  */
2799 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2800 		   unsigned long addr)
2801 {
2802 	struct mempolicy *pol;
2803 	pgoff_t ilx;
2804 	struct zoneref *z;
2805 	int curnid = folio_nid(folio);
2806 	struct vm_area_struct *vma = vmf->vma;
2807 	int thiscpu = raw_smp_processor_id();
2808 	int thisnid = numa_node_id();
2809 	int polnid = NUMA_NO_NODE;
2810 	int ret = NUMA_NO_NODE;
2811 
2812 	/*
2813 	 * Make sure ptl is held so that we don't preempt and we
2814 	 * have a stable smp processor id
2815 	 */
2816 	lockdep_assert_held(vmf->ptl);
2817 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2818 	if (!(pol->flags & MPOL_F_MOF))
2819 		goto out;
2820 
2821 	switch (pol->mode) {
2822 	case MPOL_INTERLEAVE:
2823 		polnid = interleave_nid(pol, ilx);
2824 		break;
2825 
2826 	case MPOL_WEIGHTED_INTERLEAVE:
2827 		polnid = weighted_interleave_nid(pol, ilx);
2828 		break;
2829 
2830 	case MPOL_PREFERRED:
2831 		if (node_isset(curnid, pol->nodes))
2832 			goto out;
2833 		polnid = first_node(pol->nodes);
2834 		break;
2835 
2836 	case MPOL_LOCAL:
2837 		polnid = numa_node_id();
2838 		break;
2839 
2840 	case MPOL_BIND:
2841 	case MPOL_PREFERRED_MANY:
2842 		/*
2843 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
2844 		 * policy nodemask we don't allow numa migration to nodes
2845 		 * outside policy nodemask for now. This is done so that if we
2846 		 * want demotion to slow memory to happen, before allocating
2847 		 * from some DRAM node say 'x', we will end up using a
2848 		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
2849 		 * we should not promote to node 'x' from slow memory node.
2850 		 */
2851 		if (pol->flags & MPOL_F_MORON) {
2852 			/*
2853 			 * Optimize placement among multiple nodes
2854 			 * via NUMA balancing
2855 			 */
2856 			if (node_isset(thisnid, pol->nodes))
2857 				break;
2858 			goto out;
2859 		}
2860 
2861 		/*
2862 		 * use current page if in policy nodemask,
2863 		 * else select nearest allowed node, if any.
2864 		 * If no allowed nodes, use current [!misplaced].
2865 		 */
2866 		if (node_isset(curnid, pol->nodes))
2867 			goto out;
2868 		z = first_zones_zonelist(
2869 				node_zonelist(thisnid, GFP_HIGHUSER),
2870 				gfp_zone(GFP_HIGHUSER),
2871 				&pol->nodes);
2872 		polnid = zonelist_node_idx(z);
2873 		break;
2874 
2875 	default:
2876 		BUG();
2877 	}
2878 
2879 	/* Migrate the folio towards the node whose CPU is referencing it */
2880 	if (pol->flags & MPOL_F_MORON) {
2881 		polnid = thisnid;
2882 
2883 		if (!should_numa_migrate_memory(current, folio, curnid,
2884 						thiscpu))
2885 			goto out;
2886 	}
2887 
2888 	if (curnid != polnid)
2889 		ret = polnid;
2890 out:
2891 	mpol_cond_put(pol);
2892 
2893 	return ret;
2894 }
2895 
2896 /*
2897  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2898  * dropped after task->mempolicy is set to NULL so that any allocation done as
2899  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2900  * policy.
2901  */
2902 void mpol_put_task_policy(struct task_struct *task)
2903 {
2904 	struct mempolicy *pol;
2905 
2906 	task_lock(task);
2907 	pol = task->mempolicy;
2908 	task->mempolicy = NULL;
2909 	task_unlock(task);
2910 	mpol_put(pol);
2911 }
2912 
2913 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2914 {
2915 	rb_erase(&n->nd, &sp->root);
2916 	sp_free(n);
2917 }
2918 
2919 static void sp_node_init(struct sp_node *node, unsigned long start,
2920 			unsigned long end, struct mempolicy *pol)
2921 {
2922 	node->start = start;
2923 	node->end = end;
2924 	node->policy = pol;
2925 }
2926 
2927 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2928 				struct mempolicy *pol)
2929 {
2930 	struct sp_node *n;
2931 	struct mempolicy *newpol;
2932 
2933 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2934 	if (!n)
2935 		return NULL;
2936 
2937 	newpol = mpol_dup(pol);
2938 	if (IS_ERR(newpol)) {
2939 		kmem_cache_free(sn_cache, n);
2940 		return NULL;
2941 	}
2942 	newpol->flags |= MPOL_F_SHARED;
2943 	sp_node_init(n, start, end, newpol);
2944 
2945 	return n;
2946 }
2947 
2948 /* Replace a policy range. */
2949 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2950 				 pgoff_t end, struct sp_node *new)
2951 {
2952 	struct sp_node *n;
2953 	struct sp_node *n_new = NULL;
2954 	struct mempolicy *mpol_new = NULL;
2955 	int ret = 0;
2956 
2957 restart:
2958 	write_lock(&sp->lock);
2959 	n = sp_lookup(sp, start, end);
2960 	/* Take care of old policies in the same range. */
2961 	while (n && n->start < end) {
2962 		struct rb_node *next = rb_next(&n->nd);
2963 		if (n->start >= start) {
2964 			if (n->end <= end)
2965 				sp_delete(sp, n);
2966 			else
2967 				n->start = end;
2968 		} else {
2969 			/* Old policy spanning whole new range. */
2970 			if (n->end > end) {
2971 				if (!n_new)
2972 					goto alloc_new;
2973 
2974 				*mpol_new = *n->policy;
2975 				atomic_set(&mpol_new->refcnt, 1);
2976 				sp_node_init(n_new, end, n->end, mpol_new);
2977 				n->end = start;
2978 				sp_insert(sp, n_new);
2979 				n_new = NULL;
2980 				mpol_new = NULL;
2981 				break;
2982 			} else
2983 				n->end = start;
2984 		}
2985 		if (!next)
2986 			break;
2987 		n = rb_entry(next, struct sp_node, nd);
2988 	}
2989 	if (new)
2990 		sp_insert(sp, new);
2991 	write_unlock(&sp->lock);
2992 	ret = 0;
2993 
2994 err_out:
2995 	if (mpol_new)
2996 		mpol_put(mpol_new);
2997 	if (n_new)
2998 		kmem_cache_free(sn_cache, n_new);
2999 
3000 	return ret;
3001 
3002 alloc_new:
3003 	write_unlock(&sp->lock);
3004 	ret = -ENOMEM;
3005 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3006 	if (!n_new)
3007 		goto err_out;
3008 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3009 	if (!mpol_new)
3010 		goto err_out;
3011 	atomic_set(&mpol_new->refcnt, 1);
3012 	goto restart;
3013 }
3014 
3015 /**
3016  * mpol_shared_policy_init - initialize shared policy for inode
3017  * @sp: pointer to inode shared policy
3018  * @mpol:  struct mempolicy to install
3019  *
3020  * Install non-NULL @mpol in inode's shared policy rb-tree.
3021  * On entry, the current task has a reference on a non-NULL @mpol.
3022  * This must be released on exit.
3023  * This is called during get_inode() calls, so we can use GFP_KERNEL.
3024  */
3025 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3026 {
3027 	int ret;
3028 
3029 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3030 	rwlock_init(&sp->lock);
3031 
3032 	if (mpol) {
3033 		struct sp_node *sn;
3034 		struct mempolicy *npol;
3035 		NODEMASK_SCRATCH(scratch);
3036 
3037 		if (!scratch)
3038 			goto put_mpol;
3039 
3040 		/* contextualize the tmpfs mount point mempolicy to this file */
3041 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3042 		if (IS_ERR(npol))
3043 			goto free_scratch; /* no valid nodemask intersection */
3044 
3045 		task_lock(current);
3046 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3047 		task_unlock(current);
3048 		if (ret)
3049 			goto put_npol;
3050 
3051 		/* alloc node covering entire file; adds ref to file's npol */
3052 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3053 		if (sn)
3054 			sp_insert(sp, sn);
3055 put_npol:
3056 		mpol_put(npol);	/* drop initial ref on file's npol */
3057 free_scratch:
3058 		NODEMASK_SCRATCH_FREE(scratch);
3059 put_mpol:
3060 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3061 	}
3062 }
3063 
3064 int mpol_set_shared_policy(struct shared_policy *sp,
3065 			struct vm_area_struct *vma, struct mempolicy *pol)
3066 {
3067 	int err;
3068 	struct sp_node *new = NULL;
3069 	unsigned long sz = vma_pages(vma);
3070 
3071 	if (pol) {
3072 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3073 		if (!new)
3074 			return -ENOMEM;
3075 	}
3076 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3077 	if (err && new)
3078 		sp_free(new);
3079 	return err;
3080 }
3081 
3082 /* Free a backing policy store on inode delete. */
3083 void mpol_free_shared_policy(struct shared_policy *sp)
3084 {
3085 	struct sp_node *n;
3086 	struct rb_node *next;
3087 
3088 	if (!sp->root.rb_node)
3089 		return;
3090 	write_lock(&sp->lock);
3091 	next = rb_first(&sp->root);
3092 	while (next) {
3093 		n = rb_entry(next, struct sp_node, nd);
3094 		next = rb_next(&n->nd);
3095 		sp_delete(sp, n);
3096 	}
3097 	write_unlock(&sp->lock);
3098 }
3099 
3100 #ifdef CONFIG_NUMA_BALANCING
3101 static int __initdata numabalancing_override;
3102 
3103 static void __init check_numabalancing_enable(void)
3104 {
3105 	bool numabalancing_default = false;
3106 
3107 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3108 		numabalancing_default = true;
3109 
3110 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3111 	if (numabalancing_override)
3112 		set_numabalancing_state(numabalancing_override == 1);
3113 
3114 	if (num_online_nodes() > 1 && !numabalancing_override) {
3115 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3116 			numabalancing_default ? "Enabling" : "Disabling");
3117 		set_numabalancing_state(numabalancing_default);
3118 	}
3119 }
3120 
3121 static int __init setup_numabalancing(char *str)
3122 {
3123 	int ret = 0;
3124 	if (!str)
3125 		goto out;
3126 
3127 	if (!strcmp(str, "enable")) {
3128 		numabalancing_override = 1;
3129 		ret = 1;
3130 	} else if (!strcmp(str, "disable")) {
3131 		numabalancing_override = -1;
3132 		ret = 1;
3133 	}
3134 out:
3135 	if (!ret)
3136 		pr_warn("Unable to parse numa_balancing=\n");
3137 
3138 	return ret;
3139 }
3140 __setup("numa_balancing=", setup_numabalancing);
3141 #else
3142 static inline void __init check_numabalancing_enable(void)
3143 {
3144 }
3145 #endif /* CONFIG_NUMA_BALANCING */
3146 
3147 void __init numa_policy_init(void)
3148 {
3149 	nodemask_t interleave_nodes;
3150 	unsigned long largest = 0;
3151 	int nid, prefer = 0;
3152 
3153 	policy_cache = kmem_cache_create("numa_policy",
3154 					 sizeof(struct mempolicy),
3155 					 0, SLAB_PANIC, NULL);
3156 
3157 	sn_cache = kmem_cache_create("shared_policy_node",
3158 				     sizeof(struct sp_node),
3159 				     0, SLAB_PANIC, NULL);
3160 
3161 	for_each_node(nid) {
3162 		preferred_node_policy[nid] = (struct mempolicy) {
3163 			.refcnt = ATOMIC_INIT(1),
3164 			.mode = MPOL_PREFERRED,
3165 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3166 			.nodes = nodemask_of_node(nid),
3167 		};
3168 	}
3169 
3170 	/*
3171 	 * Set interleaving policy for system init. Interleaving is only
3172 	 * enabled across suitably sized nodes (default is >= 16MB), falling
3173 	 * back to the largest node if they're all smaller.
3174 	 */
3175 	nodes_clear(interleave_nodes);
3176 	for_each_node_state(nid, N_MEMORY) {
3177 		unsigned long total_pages = node_present_pages(nid);
3178 
3179 		/* Preserve the largest node */
3180 		if (largest < total_pages) {
3181 			largest = total_pages;
3182 			prefer = nid;
3183 		}
3184 
3185 		/* Interleave this node? */
3186 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3187 			node_set(nid, interleave_nodes);
3188 	}
3189 
3190 	/* All too small, use the largest */
3191 	if (unlikely(nodes_empty(interleave_nodes)))
3192 		node_set(prefer, interleave_nodes);
3193 
3194 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3195 		pr_err("%s: interleaving failed\n", __func__);
3196 
3197 	check_numabalancing_enable();
3198 }
3199 
3200 /* Reset policy of current process to default */
3201 void numa_default_policy(void)
3202 {
3203 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3204 }
3205 
3206 /*
3207  * Parse and format mempolicy from/to strings
3208  */
3209 static const char * const policy_modes[] =
3210 {
3211 	[MPOL_DEFAULT]    = "default",
3212 	[MPOL_PREFERRED]  = "prefer",
3213 	[MPOL_BIND]       = "bind",
3214 	[MPOL_INTERLEAVE] = "interleave",
3215 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3216 	[MPOL_LOCAL]      = "local",
3217 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
3218 };
3219 
3220 #ifdef CONFIG_TMPFS
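/*
 * Illustrative mpol_parse_str() inputs (nodes must be present in
 * N_MEMORY):
 *	"interleave:0-3"
 *	"bind=static:0,2"
 *	"local"
 */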
3221 /**
3222  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3223  * @str:  string containing mempolicy to parse
3224  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3225  *
3226  * Format of input:
3227  *	<mode>[=<flags>][:<nodelist>]
3228  *
3229  * Return: %0 on success, else %1
3230  */
3231 int mpol_parse_str(char *str, struct mempolicy **mpol)
3232 {
3233 	struct mempolicy *new = NULL;
3234 	unsigned short mode_flags;
3235 	nodemask_t nodes;
3236 	char *nodelist = strchr(str, ':');
3237 	char *flags = strchr(str, '=');
3238 	int err = 1, mode;
3239 
3240 	if (flags)
3241 		*flags++ = '\0';	/* terminate mode string */
3242 
3243 	if (nodelist) {
3244 		/* NUL-terminate mode or flags string */
3245 		*nodelist++ = '\0';
3246 		if (nodelist_parse(nodelist, nodes))
3247 			goto out;
3248 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3249 			goto out;
3250 	} else
3251 		nodes_clear(nodes);
3252 
3253 	mode = match_string(policy_modes, MPOL_MAX, str);
3254 	if (mode < 0)
3255 		goto out;
3256 
3257 	switch (mode) {
3258 	case MPOL_PREFERRED:
3259 		/*
3260 		 * Insist on a nodelist of one node only; later we use
3261 		 * first_node(nodes) to grab the single node, so the
3262 		 * nodelist (or nodes) cannot be empty here.
3263 		 */
3264 		if (nodelist) {
3265 			char *rest = nodelist;
3266 			while (isdigit(*rest))
3267 				rest++;
3268 			if (*rest)
3269 				goto out;
3270 			if (nodes_empty(nodes))
3271 				goto out;
3272 		}
3273 		break;
3274 	case MPOL_INTERLEAVE:
3275 	case MPOL_WEIGHTED_INTERLEAVE:
3276 		/*
3277 		 * Default to online nodes with memory if no nodelist
3278 		 */
3279 		if (!nodelist)
3280 			nodes = node_states[N_MEMORY];
3281 		break;
3282 	case MPOL_LOCAL:
3283 		/*
3284 		 * Don't allow a nodelist;  mpol_new() checks flags
3285 		 */
3286 		if (nodelist)
3287 			goto out;
3288 		break;
3289 	case MPOL_DEFAULT:
3290 		/*
3291 		 * Insist on an empty nodelist
3292 		 */
3293 		if (!nodelist)
3294 			err = 0;
3295 		goto out;
3296 	case MPOL_PREFERRED_MANY:
3297 	case MPOL_BIND:
3298 		/*
3299 		 * Insist on a nodelist
3300 		 */
3301 		if (!nodelist)
3302 			goto out;
3303 	}
3304 
3305 	mode_flags = 0;
3306 	if (flags) {
3307 		/*
3308 		 * Currently, we only support two mutually exclusive
3309 		 * mode flags.
3310 		 */
3311 		if (!strcmp(flags, "static"))
3312 			mode_flags |= MPOL_F_STATIC_NODES;
3313 		else if (!strcmp(flags, "relative"))
3314 			mode_flags |= MPOL_F_RELATIVE_NODES;
3315 		else
3316 			goto out;
3317 	}
3318 
3319 	new = mpol_new(mode, mode_flags, &nodes);
3320 	if (IS_ERR(new))
3321 		goto out;
3322 
3323 	/*
3324 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3325 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3326 	 */
3327 	if (mode != MPOL_PREFERRED) {
3328 		new->nodes = nodes;
3329 	} else if (nodelist) {
3330 		nodes_clear(new->nodes);
3331 		node_set(first_node(nodes), new->nodes);
3332 	} else {
3333 		new->mode = MPOL_LOCAL;
3334 	}
3335 
3336 	/*
3337 	 * Save nodes for contextualization: this will be used to "clone"
3338 	 * the mempolicy in a specific context [cpuset] at a later time.
3339 	 */
3340 	new->w.user_nodemask = nodes;
3341 
3342 	err = 0;
3343 
3344 out:
3345 	/* Restore string for error message */
3346 	if (nodelist)
3347 		*--nodelist = ':';
3348 	if (flags)
3349 		*--flags = '=';
3350 	if (!err)
3351 		*mpol = new;
3352 	return err;
3353 }
3354 #endif /* CONFIG_TMPFS */
3355 
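/*
 * Illustrative mpol_to_str() outputs: "interleave:0-3",
 * "bind=static:0,2", "local".
 */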
3356 /**
3357  * mpol_to_str - format a mempolicy structure for printing
3358  * @buffer:  to contain formatted mempolicy string
3359  * @maxlen:  length of @buffer
3360  * @pol:  pointer to mempolicy to be formatted
3361  *
3362  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3363  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3364  * interleave", plus the longest flags, "relative|balancing", and to
3365  * display at least a few node ids.
3366  */
3367 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3368 {
3369 	char *p = buffer;
3370 	nodemask_t nodes = NODE_MASK_NONE;
3371 	unsigned short mode = MPOL_DEFAULT;
3372 	unsigned short flags = 0;
3373 
3374 	if (pol &&
3375 	    pol != &default_policy &&
3376 	    !(pol >= &preferred_node_policy[0] &&
3377 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3378 		mode = pol->mode;
3379 		flags = pol->flags;
3380 	}
3381 
3382 	switch (mode) {
3383 	case MPOL_DEFAULT:
3384 	case MPOL_LOCAL:
3385 		break;
3386 	case MPOL_PREFERRED:
3387 	case MPOL_PREFERRED_MANY:
3388 	case MPOL_BIND:
3389 	case MPOL_INTERLEAVE:
3390 	case MPOL_WEIGHTED_INTERLEAVE:
3391 		nodes = pol->nodes;
3392 		break;
3393 	default:
3394 		WARN_ON_ONCE(1);
3395 		snprintf(p, maxlen, "unknown");
3396 		return;
3397 	}
3398 
3399 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3400 
3401 	if (flags & MPOL_MODE_FLAGS) {
3402 		p += snprintf(p, buffer + maxlen - p, "=");
3403 
3404 		/*
3405 		 * Static and relative are mutually exclusive.
3406 		 */
3407 		if (flags & MPOL_F_STATIC_NODES)
3408 			p += snprintf(p, buffer + maxlen - p, "static");
3409 		else if (flags & MPOL_F_RELATIVE_NODES)
3410 			p += snprintf(p, buffer + maxlen - p, "relative");
3411 
3412 		if (flags & MPOL_F_NUMA_BALANCING) {
3413 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3414 				p += snprintf(p, buffer + maxlen - p, "|");
3415 			p += snprintf(p, buffer + maxlen - p, "balancing");
3416 		}
3417 	}
3418 
3419 	if (!nodes_empty(nodes))
3420 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3421 			       nodemask_pr_args(&nodes));
3422 }
3423 
3424 #ifdef CONFIG_SYSFS
3425 struct iw_node_attr {
3426 	struct kobj_attribute kobj_attr;
3427 	int nid;
3428 };
3429 
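/*
 * Illustrative usage: writing "4" to
 * /sys/kernel/mm/mempolicy/weighted_interleave/nodeN sets node N's
 * interleave weight to 4; an empty write clears the entry so the
 * system-default weight is used.
 */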
3430 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3431 			 char *buf)
3432 {
3433 	struct iw_node_attr *node_attr;
3434 	u8 weight;
3435 
3436 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3437 	weight = get_il_weight(node_attr->nid);
3438 	return sysfs_emit(buf, "%d\n", weight);
3439 }
3440 
3441 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3442 			  const char *buf, size_t count)
3443 {
3444 	struct iw_node_attr *node_attr;
3445 	u8 *new;
3446 	u8 *old;
3447 	u8 weight = 0;
3448 
3449 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3450 	if (count == 0 || sysfs_streq(buf, ""))
3451 		weight = 0;
3452 	else if (kstrtou8(buf, 0, &weight))
3453 		return -EINVAL;
3454 
3455 	new = kzalloc(nr_node_ids, GFP_KERNEL);
3456 	if (!new)
3457 		return -ENOMEM;
3458 
3459 	mutex_lock(&iw_table_lock);
3460 	old = rcu_dereference_protected(iw_table,
3461 					lockdep_is_held(&iw_table_lock));
3462 	if (old)
3463 		memcpy(new, old, nr_node_ids);
3464 	new[node_attr->nid] = weight;
3465 	rcu_assign_pointer(iw_table, new);
3466 	mutex_unlock(&iw_table_lock);
3467 	synchronize_rcu();
3468 	kfree(old);
3469 	return count;
3470 }
3471 
3472 static struct iw_node_attr **node_attrs;
3473 
3474 static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
3475 				  struct kobject *parent)
3476 {
3477 	if (!node_attr)
3478 		return;
3479 	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
3480 	kfree(node_attr->kobj_attr.attr.name);
3481 	kfree(node_attr);
3482 }
3483 
3484 static void sysfs_wi_release(struct kobject *wi_kobj)
3485 {
3486 	int i;
3487 
3488 	for (i = 0; i < nr_node_ids; i++)
3489 		sysfs_wi_node_release(node_attrs[i], wi_kobj);
3490 	kobject_put(wi_kobj);
3491 }
3492 
3493 static const struct kobj_type wi_ktype = {
3494 	.sysfs_ops = &kobj_sysfs_ops,
3495 	.release = sysfs_wi_release,
3496 };
3497 
3498 static int add_weight_node(int nid, struct kobject *wi_kobj)
3499 {
3500 	struct iw_node_attr *node_attr;
3501 	char *name;
3502 
3503 	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
3504 	if (!node_attr)
3505 		return -ENOMEM;
3506 
3507 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3508 	if (!name) {
3509 		kfree(node_attr);
3510 		return -ENOMEM;
3511 	}
3512 
3513 	sysfs_attr_init(&node_attr->kobj_attr.attr);
3514 	node_attr->kobj_attr.attr.name = name;
3515 	node_attr->kobj_attr.attr.mode = 0644;
3516 	node_attr->kobj_attr.show = node_show;
3517 	node_attr->kobj_attr.store = node_store;
3518 	node_attr->nid = nid;
3519 
3520 	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
3521 		kfree(node_attr->kobj_attr.attr.name);
3522 		kfree(node_attr);
3523 		pr_err("failed to add attribute to weighted_interleave\n");
3524 		return -ENOMEM;
3525 	}
3526 
3527 	node_attrs[nid] = node_attr;
3528 	return 0;
3529 }
3530 
3531 static int add_weighted_interleave_group(struct kobject *root_kobj)
3532 {
3533 	struct kobject *wi_kobj;
3534 	int nid, err;
3535 
3536 	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
3537 	if (!wi_kobj)
3538 		return -ENOMEM;
3539 
3540 	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
3541 				   "weighted_interleave");
3542 	if (err) {
3543 		kfree(wi_kobj);
3544 		return err;
3545 	}
3546 
3547 	for_each_node_state(nid, N_POSSIBLE) {
3548 		err = add_weight_node(nid, wi_kobj);
3549 		if (err) {
3550 			pr_err("failed to add sysfs [node%d]\n", nid);
3551 			break;
3552 		}
3553 	}
3554 	if (err)
3555 		kobject_put(wi_kobj);
3556 	return 0;
3557 }
3558 
3559 static void mempolicy_kobj_release(struct kobject *kobj)
3560 {
3561 	u8 *old;
3562 
3563 	mutex_lock(&iw_table_lock);
3564 	old = rcu_dereference_protected(iw_table,
3565 					lockdep_is_held(&iw_table_lock));
3566 	rcu_assign_pointer(iw_table, NULL);
3567 	mutex_unlock(&iw_table_lock);
3568 	synchronize_rcu();
3569 	kfree(old);
3570 	kfree(node_attrs);
3571 	kfree(kobj);
3572 }
3573 
3574 static const struct kobj_type mempolicy_ktype = {
3575 	.release = mempolicy_kobj_release
3576 };
3577 
3578 static int __init mempolicy_sysfs_init(void)
3579 {
3580 	int err;
3581 	static struct kobject *mempolicy_kobj;
3582 
3583 	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
3584 	if (!mempolicy_kobj) {
3585 		err = -ENOMEM;
3586 		goto err_out;
3587 	}
3588 
3589 	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
3590 			     GFP_KERNEL);
3591 	if (!node_attrs) {
3592 		err = -ENOMEM;
3593 		goto mempol_out;
3594 	}
3595 
3596 	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
3597 				   "mempolicy");
3598 	if (err)
3599 		goto node_out;
3600 
3601 	err = add_weighted_interleave_group(mempolicy_kobj);
3602 	if (err) {
3603 		pr_err("mempolicy sysfs structure failed to initialize\n");
3604 		kobject_put(mempolicy_kobj);
3605 		return err;
3606 	}
3607 
3608 	return err;
3609 node_out:
3610 	kfree(node_attrs);
3611 mempol_out:
3612 	kfree(mempolicy_kobj);
3613 err_out:
3614 	pr_err("failed to add mempolicy kobject to the system\n");
3615 	return err;
3616 }
3617 
3618 late_initcall(mempolicy_sysfs_init);
3619 #endif /* CONFIG_SYSFS */
3620