xref: /linux/mm/mempolicy.c (revision 82a08bde3cf7bbbe90de57baa181bebf676582c7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process counter
20  *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind truly restricted
33  *                the allocation to the specified memory nodes instead.
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non-interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem, kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
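
/*
 * Illustrative userspace sketch (not part of this file): one way a process
 * can ask for the interleave policy described above, using the
 * set_mempolicy(2) wrapper from <numaif.h> (link with -lnuma).  The node
 * numbers are assumptions for the example only.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long nodemask = (1UL << 0) | (1UL << 1); // nodes 0 and 1
 *
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *				  sizeof(nodemask) * 8) != 0) {
 *			perror("set_mempolicy");
 *			return 1;
 *		}
 *		// later anonymous allocations are interleaved over nodes 0 and 1
 *		return 0;
 *	}
 */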
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel does not always handle that gracefully.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112 
113 #include <asm/tlbflush.h>
114 #include <asm/tlb.h>
115 #include <linux/uaccess.h>
116 #include <linux/memory.h>
117 
118 #include "internal.h"
119 
120 /* Internal flags */
121 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
122 #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
123 #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
124 
125 static struct kmem_cache *policy_cache;
126 static struct kmem_cache *sn_cache;
127 
128 /* Highest zone. A specific allocation for a zone below that is not
129    policied. */
130 enum zone_type policy_zone = 0;
131 
132 /*
133  * run-time system-wide default policy => local allocation
134  */
135 static struct mempolicy default_policy = {
136 	.refcnt = ATOMIC_INIT(1), /* never free it */
137 	.mode = MPOL_LOCAL,
138 };
139 
140 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
141 
142 /*
143  * iw_table is the sysfs-set interleave weight table; a value of 0 denotes that
144  * the system-default value should be used. A NULL iw_table also denotes that
145  * system-default values should be used. Until the system-default table
146  * is implemented, the system-default is always 1.
147  *
148  * iw_table is RCU protected
149  */
150 static u8 __rcu *iw_table;
151 static DEFINE_MUTEX(iw_table_lock);
152 
153 static u8 get_il_weight(int node)
154 {
155 	u8 *table;
156 	u8 weight;
157 
158 	rcu_read_lock();
159 	table = rcu_dereference(iw_table);
160 	/* if no iw_table, use system default */
161 	weight = table ? table[node] : 1;
162 	/* if value in iw_table is 0, use system default */
163 	weight = weight ? weight : 1;
164 	rcu_read_unlock();
165 	return weight;
166 }
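
/*
 * Worked example (illustrative): with iw_table[] = { [0] = 3, [1] = 1 } and an
 * MPOL_WEIGHTED_INTERLEAVE policy over nodes 0-1, allocations are handed out
 * as 0,0,0,1, 0,0,0,1, ...  A NULL table or a 0 entry falls back to the
 * system-default weight of 1, which degenerates to plain interleave.  (On
 * kernels that provide it, the weights are typically set from userspace via
 * /sys/kernel/mm/mempolicy/weighted_interleave/nodeN.)
 */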
167 
168 /**
169  * numa_nearest_node - Find nearest node by state
170  * @node: Node id to start the search
171  * @state: State to filter the search
172  *
173  * Look up the closest node by distance if @node is not in @state.
174  *
175  * Return: this @node if it is in @state, otherwise the closest node found by distance.
176  */
177 int numa_nearest_node(int node, unsigned int state)
178 {
179 	int min_dist = INT_MAX, dist, n, min_node;
180 
181 	if (state >= NR_NODE_STATES)
182 		return -EINVAL;
183 
184 	if (node == NUMA_NO_NODE || node_state(node, state))
185 		return node;
186 
187 	min_node = node;
188 	for_each_node_state(n, state) {
189 		dist = node_distance(node, n);
190 		if (dist < min_dist) {
191 			min_dist = dist;
192 			min_node = n;
193 		}
194 	}
195 
196 	return min_node;
197 }
198 EXPORT_SYMBOL_GPL(numa_nearest_node);
199 
200 /**
201  * nearest_node_nodemask - Find the node in @mask at the nearest distance
202  *			   from @node.
203  *
204  * @node: a valid node ID to start the search from.
205  * @mask: a pointer to a nodemask representing the allowed nodes.
206  *
207  * This function iterates over all nodes in @mask and calculates the
208  * distance from the starting @node. It returns the node ID that is
209  * closest to @node, or MAX_NUMNODES if @mask is empty.
210  *
211  * Note that @node must be a valid node ID usable with node_distance(),
212  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
213  * or unexpected behavior.
214  */
215 int nearest_node_nodemask(int node, nodemask_t *mask)
216 {
217 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
218 
219 	for_each_node_mask(n, *mask) {
220 		dist = node_distance(node, n);
221 		if (dist < min_dist) {
222 			min_dist = dist;
223 			min_node = n;
224 		}
225 	}
226 
227 	return min_node;
228 }
229 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
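
/*
 * Illustrative sketch of how a caller might combine the two helpers above to
 * pick an allocation node (dev and allowed_nodes are hypothetical):
 *
 *	// closest node that actually has memory, starting from the device's node
 *	int nid = numa_nearest_node(dev_to_node(dev), N_MEMORY);
 *
 *	// closest node within a caller-supplied set of allowed nodes
 *	int best = nearest_node_nodemask(nid, &allowed_nodes);
 *	if (best == MAX_NUMNODES)
 *		best = nid;	// allowed_nodes was empty, fall back
 */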
230 
231 struct mempolicy *get_task_policy(struct task_struct *p)
232 {
233 	struct mempolicy *pol = p->mempolicy;
234 	int node;
235 
236 	if (pol)
237 		return pol;
238 
239 	node = numa_node_id();
240 	if (node != NUMA_NO_NODE) {
241 		pol = &preferred_node_policy[node];
242 		/* preferred_node_policy is not initialised early in boot */
243 		if (pol->mode)
244 			return pol;
245 	}
246 
247 	return &default_policy;
248 }
249 
250 static const struct mempolicy_operations {
251 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
252 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
253 } mpol_ops[MPOL_MAX];
254 
255 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
256 {
257 	return pol->flags & MPOL_MODE_FLAGS;
258 }
259 
260 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
261 				   const nodemask_t *rel)
262 {
263 	nodemask_t tmp;
264 	nodes_fold(tmp, *orig, nodes_weight(*rel));
265 	nodes_onto(*ret, tmp, *rel);
266 }
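
/*
 * Worked example (illustrative): with a relative user nodemask of {0,2} and a
 * current allowed mask of {4,5,6}, nodes_fold() first wraps the user bits into
 * the 3 available slots (here {0,2} already fit), and nodes_onto() then maps
 * slot 0 to the 1st allowed node and slot 2 to the 3rd, yielding {4,6}.
 */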
267 
268 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
269 {
270 	if (nodes_empty(*nodes))
271 		return -EINVAL;
272 	pol->nodes = *nodes;
273 	return 0;
274 }
275 
276 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
277 {
278 	if (nodes_empty(*nodes))
279 		return -EINVAL;
280 
281 	nodes_clear(pol->nodes);
282 	node_set(first_node(*nodes), pol->nodes);
283 	return 0;
284 }
285 
286 /*
287  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
288  * any, for the new policy.  mpol_new() has already validated the nodes
289  * parameter with respect to the policy mode and flags.
290  *
291  * Must be called holding task's alloc_lock to protect task's mems_allowed
292  * and mempolicy.  May also be called holding the mmap_lock for write.
293  */
294 static int mpol_set_nodemask(struct mempolicy *pol,
295 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
296 {
297 	int ret;
298 
299 	/*
300 	 * Default (pol==NULL) and local memory policies are not subject
301 	 * to any remapping. They also do not need any special
302 	 * constructor.
303 	 */
304 	if (!pol || pol->mode == MPOL_LOCAL)
305 		return 0;
306 
307 	/* Check N_MEMORY */
308 	nodes_and(nsc->mask1,
309 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
310 
311 	VM_BUG_ON(!nodes);
312 
313 	if (pol->flags & MPOL_F_RELATIVE_NODES)
314 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
315 	else
316 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
317 
318 	if (mpol_store_user_nodemask(pol))
319 		pol->w.user_nodemask = *nodes;
320 	else
321 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
322 
323 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
324 	return ret;
325 }
326 
327 /*
328  * This function just creates a new policy and does some basic checking and
329  * simple initialization. You must invoke mpol_set_nodemask() to set the nodes.
330  */
331 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
332 				  nodemask_t *nodes)
333 {
334 	struct mempolicy *policy;
335 
336 	if (mode == MPOL_DEFAULT) {
337 		if (nodes && !nodes_empty(*nodes))
338 			return ERR_PTR(-EINVAL);
339 		return NULL;
340 	}
341 	VM_BUG_ON(!nodes);
342 
343 	/*
344 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
345 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
346 	 * All other modes require a valid pointer to a non-empty nodemask.
347 	 */
348 	if (mode == MPOL_PREFERRED) {
349 		if (nodes_empty(*nodes)) {
350 			if (((flags & MPOL_F_STATIC_NODES) ||
351 			     (flags & MPOL_F_RELATIVE_NODES)))
352 				return ERR_PTR(-EINVAL);
353 
354 			mode = MPOL_LOCAL;
355 		}
356 	} else if (mode == MPOL_LOCAL) {
357 		if (!nodes_empty(*nodes) ||
358 		    (flags & MPOL_F_STATIC_NODES) ||
359 		    (flags & MPOL_F_RELATIVE_NODES))
360 			return ERR_PTR(-EINVAL);
361 	} else if (nodes_empty(*nodes))
362 		return ERR_PTR(-EINVAL);
363 
364 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
365 	if (!policy)
366 		return ERR_PTR(-ENOMEM);
367 	atomic_set(&policy->refcnt, 1);
368 	policy->mode = mode;
369 	policy->flags = flags;
370 	policy->home_node = NUMA_NO_NODE;
371 
372 	return policy;
373 }
374 
375 /* Slow path of a mpol destructor. */
376 void __mpol_put(struct mempolicy *pol)
377 {
378 	if (!atomic_dec_and_test(&pol->refcnt))
379 		return;
380 	kmem_cache_free(policy_cache, pol);
381 }
382 
383 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
384 {
385 }
386 
387 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
388 {
389 	nodemask_t tmp;
390 
391 	if (pol->flags & MPOL_F_STATIC_NODES)
392 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
393 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
394 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
395 	else {
396 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
397 								*nodes);
398 		pol->w.cpuset_mems_allowed = *nodes;
399 	}
400 
401 	if (nodes_empty(tmp))
402 		tmp = *nodes;
403 
404 	pol->nodes = tmp;
405 }
406 
407 static void mpol_rebind_preferred(struct mempolicy *pol,
408 						const nodemask_t *nodes)
409 {
410 	pol->w.cpuset_mems_allowed = *nodes;
411 }
412 
413 /*
414  * mpol_rebind_policy - Migrate a policy to a different set of nodes
415  *
416  * Per-vma policies are protected by mmap_lock. Allocations using per-task
417  * policies are protected by task->mems_allowed_seq to prevent a premature
418  * OOM/allocation failure due to parallel nodemask modification.
419  */
420 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
421 {
422 	if (!pol || pol->mode == MPOL_LOCAL)
423 		return;
424 	if (!mpol_store_user_nodemask(pol) &&
425 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
426 		return;
427 
428 	mpol_ops[pol->mode].rebind(pol, newmask);
429 }
430 
431 /*
432  * Wrapper for mpol_rebind_policy() that just requires a task
433  * pointer, and updates the task's mempolicy.
434  *
435  * Called with task's alloc_lock held.
436  */
437 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
438 {
439 	mpol_rebind_policy(tsk->mempolicy, new);
440 }
441 
442 /*
443  * Rebind each vma in mm to the new nodemask.
444  *
445  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
446  */
447 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
448 {
449 	struct vm_area_struct *vma;
450 	VMA_ITERATOR(vmi, mm, 0);
451 
452 	mmap_write_lock(mm);
453 	for_each_vma(vmi, vma) {
454 		vma_start_write(vma);
455 		mpol_rebind_policy(vma->vm_policy, new);
456 	}
457 	mmap_write_unlock(mm);
458 }
459 
460 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
461 	[MPOL_DEFAULT] = {
462 		.rebind = mpol_rebind_default,
463 	},
464 	[MPOL_INTERLEAVE] = {
465 		.create = mpol_new_nodemask,
466 		.rebind = mpol_rebind_nodemask,
467 	},
468 	[MPOL_PREFERRED] = {
469 		.create = mpol_new_preferred,
470 		.rebind = mpol_rebind_preferred,
471 	},
472 	[MPOL_BIND] = {
473 		.create = mpol_new_nodemask,
474 		.rebind = mpol_rebind_nodemask,
475 	},
476 	[MPOL_LOCAL] = {
477 		.rebind = mpol_rebind_default,
478 	},
479 	[MPOL_PREFERRED_MANY] = {
480 		.create = mpol_new_nodemask,
481 		.rebind = mpol_rebind_preferred,
482 	},
483 	[MPOL_WEIGHTED_INTERLEAVE] = {
484 		.create = mpol_new_nodemask,
485 		.rebind = mpol_rebind_nodemask,
486 	},
487 };
488 
489 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
490 				unsigned long flags);
491 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
492 				pgoff_t ilx, int *nid);
493 
494 static bool strictly_unmovable(unsigned long flags)
495 {
496 	/*
497 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
498 	 * if any misplaced page is found.
499 	 */
500 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
501 			 MPOL_MF_STRICT;
502 }
503 
504 struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
505 	struct mempolicy *pol;
506 	pgoff_t ilx;
507 };
508 
509 struct queue_pages {
510 	struct list_head *pagelist;
511 	unsigned long flags;
512 	nodemask_t *nmask;
513 	unsigned long start;
514 	unsigned long end;
515 	struct vm_area_struct *first;
516 	struct folio *large;		/* note last large folio encountered */
517 	long nr_failed;			/* could not be isolated at this time */
518 };
519 
520 /*
521  * Check if the folio's nid is in qp->nmask.
522  *
523  * If MPOL_MF_INVERT is set in qp->flags, check instead whether the
524  * nid is *not* in qp->nmask.
525  */
526 static inline bool queue_folio_required(struct folio *folio,
527 					struct queue_pages *qp)
528 {
529 	int nid = folio_nid(folio);
530 	unsigned long flags = qp->flags;
531 
532 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
533 }
534 
535 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
536 {
537 	struct folio *folio;
538 	struct queue_pages *qp = walk->private;
539 
540 	if (unlikely(is_pmd_migration_entry(*pmd))) {
541 		qp->nr_failed++;
542 		return;
543 	}
544 	folio = pmd_folio(*pmd);
545 	if (is_huge_zero_folio(folio)) {
546 		walk->action = ACTION_CONTINUE;
547 		return;
548 	}
549 	if (!queue_folio_required(folio, qp))
550 		return;
551 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
552 	    !vma_migratable(walk->vma) ||
553 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
554 		qp->nr_failed++;
555 }
556 
557 /*
558  * Scan through folios, checking if they satisfy the required conditions,
559  * moving them from the LRU to a local pagelist for migration if they do.
560  *
561  * queue_folios_pte_range() has two possible return values:
562  * 0 - continue walking to scan for more, even if an existing folio on the
563  *     wrong node could not be isolated and queued for migration.
564  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
565  *        and an existing folio was on a node that does not follow the policy.
566  */
567 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
568 			unsigned long end, struct mm_walk *walk)
569 {
570 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
571 	struct vm_area_struct *vma = walk->vma;
572 	struct folio *folio;
573 	struct queue_pages *qp = walk->private;
574 	unsigned long flags = qp->flags;
575 	pte_t *pte, *mapped_pte;
576 	pte_t ptent;
577 	spinlock_t *ptl;
578 	int max_nr, nr;
579 
580 	ptl = pmd_trans_huge_lock(pmd, vma);
581 	if (ptl) {
582 		queue_folios_pmd(pmd, walk);
583 		spin_unlock(ptl);
584 		goto out;
585 	}
586 
587 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
588 	if (!pte) {
589 		walk->action = ACTION_AGAIN;
590 		return 0;
591 	}
592 	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
593 		max_nr = (end - addr) >> PAGE_SHIFT;
594 		nr = 1;
595 		ptent = ptep_get(pte);
596 		if (pte_none(ptent))
597 			continue;
598 		if (!pte_present(ptent)) {
599 			if (is_migration_entry(pte_to_swp_entry(ptent)))
600 				qp->nr_failed++;
601 			continue;
602 		}
603 		folio = vm_normal_folio(vma, addr, ptent);
604 		if (!folio || folio_is_zone_device(folio))
605 			continue;
606 		if (folio_test_large(folio) && max_nr != 1)
607 			nr = folio_pte_batch(folio, addr, pte, ptent,
608 					     max_nr, fpb_flags,
609 					     NULL, NULL, NULL);
610 		/*
611 		 * vm_normal_folio() filters out zero pages, but there might
612 		 * still be reserved folios to skip, perhaps in a VDSO.
613 		 */
614 		if (folio_test_reserved(folio))
615 			continue;
616 		if (!queue_folio_required(folio, qp))
617 			continue;
618 		if (folio_test_large(folio)) {
619 			/*
620 			 * A large folio can only be isolated from LRU once,
621 			 * but may be mapped by many PTEs (and Copy-On-Write may
622 			 * intersperse PTEs of other, order 0, folios).  This is
623 			 * a common case, so don't mistake it for failure (but
624 			 * there can be other cases of multi-mapped pages which
625 			 * this quick check does not help to filter out - and a
626 			 * search of the pagelist might grow to be prohibitive).
627 			 *
628 			 * migrate_pages(&pagelist) returns nr_failed folios, so
629 			 * check "large" now so that queue_pages_range() returns
630 			 * a comparable nr_failed folios.  This does imply that
631 			 * if folio could not be isolated for some racy reason
632 			 * at its first PTE, later PTEs will not give it another
633 			 * chance of isolation; but keeps the accounting simple.
634 			 */
635 			if (folio == qp->large)
636 				continue;
637 			qp->large = folio;
638 		}
639 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
640 		    !vma_migratable(vma) ||
641 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
642 			qp->nr_failed += nr;
643 			if (strictly_unmovable(flags))
644 				break;
645 		}
646 	}
647 	pte_unmap_unlock(mapped_pte, ptl);
648 	cond_resched();
649 out:
650 	if (qp->nr_failed && strictly_unmovable(flags))
651 		return -EIO;
652 	return 0;
653 }
654 
655 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
656 			       unsigned long addr, unsigned long end,
657 			       struct mm_walk *walk)
658 {
659 #ifdef CONFIG_HUGETLB_PAGE
660 	struct queue_pages *qp = walk->private;
661 	unsigned long flags = qp->flags;
662 	struct folio *folio;
663 	spinlock_t *ptl;
664 	pte_t entry;
665 
666 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
667 	entry = huge_ptep_get(walk->mm, addr, pte);
668 	if (!pte_present(entry)) {
669 		if (unlikely(is_hugetlb_entry_migration(entry)))
670 			qp->nr_failed++;
671 		goto unlock;
672 	}
673 	folio = pfn_folio(pte_pfn(entry));
674 	if (!queue_folio_required(folio, qp))
675 		goto unlock;
676 	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
677 	    !vma_migratable(walk->vma)) {
678 		qp->nr_failed++;
679 		goto unlock;
680 	}
681 	/*
682 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
683 	 * Choosing not to migrate a shared folio is not counted as a failure.
684 	 *
685 	 * See folio_maybe_mapped_shared() on possible imprecision when we
686 	 * cannot easily detect if a folio is shared.
687 	 */
688 	if ((flags & MPOL_MF_MOVE_ALL) ||
689 	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
690 		if (!folio_isolate_hugetlb(folio, qp->pagelist))
691 			qp->nr_failed++;
692 unlock:
693 	spin_unlock(ptl);
694 	if (qp->nr_failed && strictly_unmovable(flags))
695 		return -EIO;
696 #endif
697 	return 0;
698 }
699 
700 #ifdef CONFIG_NUMA_BALANCING
701 /*
702  * This is used to mark a range of virtual addresses to be inaccessible.
703  * These are later cleared by a NUMA hinting fault. Depending on these
704  * faults, pages may be migrated for better NUMA placement.
705  *
706  * This is assuming that NUMA faults are handled using PROT_NONE. If
707  * an architecture makes a different choice, it will need further
708  * changes to the core.
709  */
710 unsigned long change_prot_numa(struct vm_area_struct *vma,
711 			unsigned long addr, unsigned long end)
712 {
713 	struct mmu_gather tlb;
714 	long nr_updated;
715 
716 	tlb_gather_mmu(&tlb, vma->vm_mm);
717 
718 	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
719 	if (nr_updated > 0) {
720 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
721 		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
722 	}
723 
724 	tlb_finish_mmu(&tlb);
725 
726 	return nr_updated;
727 }
728 #endif /* CONFIG_NUMA_BALANCING */
729 
730 static int queue_pages_test_walk(unsigned long start, unsigned long end,
731 				struct mm_walk *walk)
732 {
733 	struct vm_area_struct *next, *vma = walk->vma;
734 	struct queue_pages *qp = walk->private;
735 	unsigned long flags = qp->flags;
736 
737 	/* range check first */
738 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
739 
740 	if (!qp->first) {
741 		qp->first = vma;
742 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
743 			(qp->start < vma->vm_start))
744 			/* hole at head side of range */
745 			return -EFAULT;
746 	}
747 	next = find_vma(vma->vm_mm, vma->vm_end);
748 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
749 		((vma->vm_end < qp->end) &&
750 		(!next || vma->vm_end < next->vm_start)))
751 		/* hole at middle or tail of range */
752 		return -EFAULT;
753 
754 	/*
755 	 * We need to check MPOL_MF_STRICT so that -EIO can be returned where
756 	 * appropriate, regardless of vma_migratable().
757 	 */
758 	if (!vma_migratable(vma) &&
759 	    !(flags & MPOL_MF_STRICT))
760 		return 1;
761 
762 	/*
763 	 * Check page nodes, and queue pages to move, in the current vma.
764 	 * But if neither moving nor strict checking is requested, the scan can be skipped.
765 	 */
766 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
767 		return 0;
768 	return 1;
769 }
770 
771 static const struct mm_walk_ops queue_pages_walk_ops = {
772 	.hugetlb_entry		= queue_folios_hugetlb,
773 	.pmd_entry		= queue_folios_pte_range,
774 	.test_walk		= queue_pages_test_walk,
775 	.walk_lock		= PGWALK_RDLOCK,
776 };
777 
778 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
779 	.hugetlb_entry		= queue_folios_hugetlb,
780 	.pmd_entry		= queue_folios_pte_range,
781 	.test_walk		= queue_pages_test_walk,
782 	.walk_lock		= PGWALK_WRLOCK,
783 };
784 
785 /*
786  * Walk through page tables and collect pages to be migrated.
787  *
788  * If pages found in a given range are not on the required set of @nodes,
789  * and migration is allowed, they are isolated and queued to @pagelist.
790  *
791  * queue_pages_range() may return:
792  * 0 - all pages already on the right node, or successfully queued for moving
793  *     (or neither strict checking nor moving requested: only range checking).
794  * >0 - this number of misplaced folios could not be queued for moving
795  *      (a hugetlbfs page or a transparent huge page being counted as 1).
796  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
797  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
798  */
799 static long
800 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
801 		nodemask_t *nodes, unsigned long flags,
802 		struct list_head *pagelist)
803 {
804 	int err;
805 	struct queue_pages qp = {
806 		.pagelist = pagelist,
807 		.flags = flags,
808 		.nmask = nodes,
809 		.start = start,
810 		.end = end,
811 		.first = NULL,
812 	};
813 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
814 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
815 
816 	err = walk_page_range(mm, start, end, ops, &qp);
817 
818 	if (!qp.first)
819 		/* whole range in hole */
820 		err = -EFAULT;
821 
822 	return err ? : qp.nr_failed;
823 }
824 
825 /*
826  * Apply policy to a single VMA
827  * This must be called with the mmap_lock held for writing.
828  */
829 static int vma_replace_policy(struct vm_area_struct *vma,
830 				struct mempolicy *pol)
831 {
832 	int err;
833 	struct mempolicy *old;
834 	struct mempolicy *new;
835 
836 	vma_assert_write_locked(vma);
837 
838 	new = mpol_dup(pol);
839 	if (IS_ERR(new))
840 		return PTR_ERR(new);
841 
842 	if (vma->vm_ops && vma->vm_ops->set_policy) {
843 		err = vma->vm_ops->set_policy(vma, new);
844 		if (err)
845 			goto err_out;
846 	}
847 
848 	old = vma->vm_policy;
849 	vma->vm_policy = new; /* protected by mmap_lock */
850 	mpol_put(old);
851 
852 	return 0;
853  err_out:
854 	mpol_put(new);
855 	return err;
856 }
857 
858 /* Split or merge the VMA (if required) and apply the new policy */
859 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
860 		struct vm_area_struct **prev, unsigned long start,
861 		unsigned long end, struct mempolicy *new_pol)
862 {
863 	unsigned long vmstart, vmend;
864 
865 	vmend = min(end, vma->vm_end);
866 	if (start > vma->vm_start) {
867 		*prev = vma;
868 		vmstart = start;
869 	} else {
870 		vmstart = vma->vm_start;
871 	}
872 
873 	if (mpol_equal(vma->vm_policy, new_pol)) {
874 		*prev = vma;
875 		return 0;
876 	}
877 
878 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
879 	if (IS_ERR(vma))
880 		return PTR_ERR(vma);
881 
882 	*prev = vma;
883 	return vma_replace_policy(vma, new_pol);
884 }
885 
886 /* Set the process memory policy */
887 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
888 			     nodemask_t *nodes)
889 {
890 	struct mempolicy *new, *old;
891 	NODEMASK_SCRATCH(scratch);
892 	int ret;
893 
894 	if (!scratch)
895 		return -ENOMEM;
896 
897 	new = mpol_new(mode, flags, nodes);
898 	if (IS_ERR(new)) {
899 		ret = PTR_ERR(new);
900 		goto out;
901 	}
902 
903 	task_lock(current);
904 	ret = mpol_set_nodemask(new, nodes, scratch);
905 	if (ret) {
906 		task_unlock(current);
907 		mpol_put(new);
908 		goto out;
909 	}
910 
911 	old = current->mempolicy;
912 	current->mempolicy = new;
913 	if (new && (new->mode == MPOL_INTERLEAVE ||
914 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
915 		current->il_prev = MAX_NUMNODES-1;
916 		current->il_weight = 0;
917 	}
918 	task_unlock(current);
919 	mpol_put(old);
920 	ret = 0;
921 out:
922 	NODEMASK_SCRATCH_FREE(scratch);
923 	return ret;
924 }
925 
926 /*
927  * Return the nodemask of a policy, for a get_mempolicy() query
928  *
929  * Called with task's alloc_lock held
930  */
931 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
932 {
933 	nodes_clear(*nodes);
934 	if (pol == &default_policy)
935 		return;
936 
937 	switch (pol->mode) {
938 	case MPOL_BIND:
939 	case MPOL_INTERLEAVE:
940 	case MPOL_PREFERRED:
941 	case MPOL_PREFERRED_MANY:
942 	case MPOL_WEIGHTED_INTERLEAVE:
943 		*nodes = pol->nodes;
944 		break;
945 	case MPOL_LOCAL:
946 		/* return empty node mask for local allocation */
947 		break;
948 	default:
949 		BUG();
950 	}
951 }
952 
953 static int lookup_node(struct mm_struct *mm, unsigned long addr)
954 {
955 	struct page *p = NULL;
956 	int ret;
957 
958 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
959 	if (ret > 0) {
960 		ret = page_to_nid(p);
961 		put_page(p);
962 	}
963 	return ret;
964 }
965 
966 /* Retrieve NUMA policy */
967 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
968 			     unsigned long addr, unsigned long flags)
969 {
970 	int err;
971 	struct mm_struct *mm = current->mm;
972 	struct vm_area_struct *vma = NULL;
973 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
974 
975 	if (flags &
976 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
977 		return -EINVAL;
978 
979 	if (flags & MPOL_F_MEMS_ALLOWED) {
980 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
981 			return -EINVAL;
982 		*policy = 0;	/* just so it's initialized */
983 		task_lock(current);
984 		*nmask  = cpuset_current_mems_allowed;
985 		task_unlock(current);
986 		return 0;
987 	}
988 
989 	if (flags & MPOL_F_ADDR) {
990 		pgoff_t ilx;		/* ignored here */
991 		/*
992 		 * Do NOT fall back to task policy if the
993 		 * vma/shared policy at addr is NULL.  We
994 		 * want to return MPOL_DEFAULT in this case.
995 		 */
996 		mmap_read_lock(mm);
997 		vma = vma_lookup(mm, addr);
998 		if (!vma) {
999 			mmap_read_unlock(mm);
1000 			return -EFAULT;
1001 		}
1002 		pol = __get_vma_policy(vma, addr, &ilx);
1003 	} else if (addr)
1004 		return -EINVAL;
1005 
1006 	if (!pol)
1007 		pol = &default_policy;	/* indicates default behavior */
1008 
1009 	if (flags & MPOL_F_NODE) {
1010 		if (flags & MPOL_F_ADDR) {
1011 			/*
1012 			 * Take a refcount on the mpol, because we are about to
1013 			 * drop the mmap_lock, after which only "pol" remains
1014 			 * valid, "vma" is stale.
1015 			 */
1016 			pol_refcount = pol;
1017 			vma = NULL;
1018 			mpol_get(pol);
1019 			mmap_read_unlock(mm);
1020 			err = lookup_node(mm, addr);
1021 			if (err < 0)
1022 				goto out;
1023 			*policy = err;
1024 		} else if (pol == current->mempolicy &&
1025 				pol->mode == MPOL_INTERLEAVE) {
1026 			*policy = next_node_in(current->il_prev, pol->nodes);
1027 		} else if (pol == current->mempolicy &&
1028 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1029 			if (current->il_weight)
1030 				*policy = current->il_prev;
1031 			else
1032 				*policy = next_node_in(current->il_prev,
1033 						       pol->nodes);
1034 		} else {
1035 			err = -EINVAL;
1036 			goto out;
1037 		}
1038 	} else {
1039 		*policy = pol == &default_policy ? MPOL_DEFAULT :
1040 						pol->mode;
1041 		/*
1042 		 * Internal mempolicy flags must be masked off before exposing
1043 		 * the policy to userspace.
1044 		 */
1045 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1046 	}
1047 
1048 	err = 0;
1049 	if (nmask) {
1050 		if (mpol_store_user_nodemask(pol)) {
1051 			*nmask = pol->w.user_nodemask;
1052 		} else {
1053 			task_lock(current);
1054 			get_policy_nodemask(pol, nmask);
1055 			task_unlock(current);
1056 		}
1057 	}
1058 
1059  out:
1060 	mpol_cond_put(pol);
1061 	if (vma)
1062 		mmap_read_unlock(mm);
1063 	if (pol_refcount)
1064 		mpol_put(pol_refcount);
1065 	return err;
1066 }
1067 
1068 #ifdef CONFIG_MIGRATION
1069 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1070 				unsigned long flags)
1071 {
1072 	/*
1073 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1074 	 * Choosing not to migrate a shared folio is not counted as a failure.
1075 	 *
1076 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1077 	 * cannot easily detect if a folio is shared.
1078 	 */
1079 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1080 		if (folio_isolate_lru(folio)) {
1081 			list_add_tail(&folio->lru, foliolist);
1082 			node_stat_mod_folio(folio,
1083 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1084 				folio_nr_pages(folio));
1085 		} else {
1086 			/*
1087 			 * A non-movable folio may reach here.  There may also be
1088 			 * folios temporarily off the LRU, or non-LRU movable folios.
1089 			 * Treat them all as unmovable folios since they can't be
1090 			 * isolated, so they can't be moved at the moment.
1091 			 */
1092 			return false;
1093 		}
1094 	}
1095 	return true;
1096 }
1097 
1098 /*
1099  * Migrate pages from one node to a target node.
1100  * Returns error or the number of pages not migrated.
1101  */
1102 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1103 			    int flags)
1104 {
1105 	nodemask_t nmask;
1106 	struct vm_area_struct *vma;
1107 	LIST_HEAD(pagelist);
1108 	long nr_failed;
1109 	long err = 0;
1110 	struct migration_target_control mtc = {
1111 		.nid = dest,
1112 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1113 		.reason = MR_SYSCALL,
1114 	};
1115 
1116 	nodes_clear(nmask);
1117 	node_set(source, nmask);
1118 
1119 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1120 
1121 	mmap_read_lock(mm);
1122 	vma = find_vma(mm, 0);
1123 	if (unlikely(!vma)) {
1124 		mmap_read_unlock(mm);
1125 		return 0;
1126 	}
1127 
1128 	/*
1129 	 * This does not migrate the range, but isolates all pages that
1130 	 * need migration.  Between passing in the full user address
1131 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1132 	 * but passes back the count of pages which could not be isolated.
1133 	 */
1134 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1135 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1136 	mmap_read_unlock(mm);
1137 
1138 	if (!list_empty(&pagelist)) {
1139 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1140 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1141 		if (err)
1142 			putback_movable_pages(&pagelist);
1143 	}
1144 
1145 	if (err >= 0)
1146 		err += nr_failed;
1147 	return err;
1148 }
1149 
1150 /*
1151  * Move pages between the two nodesets so as to preserve the physical
1152  * layout as much as possible.
1153  *
1154  * Returns the number of pages that could not be moved.
1155  */
1156 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1157 		     const nodemask_t *to, int flags)
1158 {
1159 	long nr_failed = 0;
1160 	long err = 0;
1161 	nodemask_t tmp;
1162 
1163 	lru_cache_disable();
1164 
1165 	/*
1166 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1167 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1168 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1169 	 * The pair of nodemasks 'to' and 'from' define the map.
1170 	 *
1171 	 * If no pair of bits is found that way, fall back to picking some
1172 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1173 	 * 'source' and 'dest' bits are the same, this represents a node
1174 	 * that will be migrating to itself, so no pages need to move.
1175 	 *
1176 	 * If no bits are left in 'tmp', or if all remaining bits left
1177 	 * in 'tmp' correspond to the same bit in 'to', return false
1178 	 * (nothing left to migrate).
1179 	 *
1180 	 * This lets us pick a pair of nodes to migrate between, such that
1181 	 * if possible the dest node is not already occupied by some other
1182 	 * source node, minimizing the risk of overloading the memory on a
1183 	 * node, which would happen if we migrated incoming memory to a node
1184 	 * before migrating outgoing memory away from that same node.
1185 	 *
1186 	 * A single scan of tmp is sufficient.  As we go, we remember the
1187 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1188 	 * that not only moved, but what's better, moved to an empty slot
1189 	 * (d is not set in tmp), then we break out then, with that pair.
1190 	 * Otherwise, when we finish scanning tmp, we at least have the
1191 	 * most recent <s, d> pair that moved.  If we get all the way through
1192 	 * the scan of tmp without finding any node that moved, much less
1193 	 * moved to an empty node, then there is nothing left worth migrating.
1194 	 */
1195 
1196 	tmp = *from;
1197 	while (!nodes_empty(tmp)) {
1198 		int s, d;
1199 		int source = NUMA_NO_NODE;
1200 		int dest = 0;
1201 
1202 		for_each_node_mask(s, tmp) {
1203 
1204 			/*
1205 			 * do_migrate_pages() tries to maintain the relative
1206 			 * node relationship of the pages established between
1207 			 * threads and memory areas.
1208 			 *
1209 			 * However, if the number of source nodes is not equal to
1210 			 * the number of destination nodes, we cannot preserve
1211 			 * this relative node relationship.  In that case, skip
1212 			 * copying memory from a node that is in the destination
1213 			 * mask.
1214 			 *
1215 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1216 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1217 			 */
1218 
1219 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1220 						(node_isset(s, *to)))
1221 				continue;
1222 
1223 			d = node_remap(s, *from, *to);
1224 			if (s == d)
1225 				continue;
1226 
1227 			source = s;	/* Node moved. Memorize */
1228 			dest = d;
1229 
1230 			/* dest not in remaining from nodes? */
1231 			if (!node_isset(dest, tmp))
1232 				break;
1233 		}
1234 		if (source == NUMA_NO_NODE)
1235 			break;
1236 
1237 		node_clear(source, tmp);
1238 		err = migrate_to_node(mm, source, dest, flags);
1239 		if (err > 0)
1240 			nr_failed += err;
1241 		if (err < 0)
1242 			break;
1243 	}
1244 
1245 	lru_cache_enable();
1246 	if (err < 0)
1247 		return err;
1248 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1249 }
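
/*
 * Worked example (illustrative) of the pair-picking loop above: with
 * from = {0,1} and to = {1,2}, the first scan finds 0 -> 1 but keeps looking
 * because node 1 is itself still a pending source; 1 -> 2 targets an "empty
 * slot", so node 1 is drained to node 2 first, and only then is node 0 moved
 * to node 1.  This avoids piling new pages onto node 1 before its own pages
 * have been migrated away.
 */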
1250 
1251 /*
1252  * Allocate a new folio for page migration, according to NUMA mempolicy.
1253  */
1254 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1255 						    unsigned long private)
1256 {
1257 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1258 	struct mempolicy *pol = mmpol->pol;
1259 	pgoff_t ilx = mmpol->ilx;
1260 	unsigned int order;
1261 	int nid = numa_node_id();
1262 	gfp_t gfp;
1263 
1264 	order = folio_order(src);
1265 	ilx += src->index >> order;
1266 
1267 	if (folio_test_hugetlb(src)) {
1268 		nodemask_t *nodemask;
1269 		struct hstate *h;
1270 
1271 		h = folio_hstate(src);
1272 		gfp = htlb_alloc_mask(h);
1273 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1274 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1275 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1276 	}
1277 
1278 	if (folio_test_large(src))
1279 		gfp = GFP_TRANSHUGE;
1280 	else
1281 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1282 
1283 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1284 }
1285 #else
1286 
1287 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1288 				unsigned long flags)
1289 {
1290 	return false;
1291 }
1292 
1293 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1294 		     const nodemask_t *to, int flags)
1295 {
1296 	return -ENOSYS;
1297 }
1298 
1299 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1300 						    unsigned long private)
1301 {
1302 	return NULL;
1303 }
1304 #endif
1305 
1306 static long do_mbind(unsigned long start, unsigned long len,
1307 		     unsigned short mode, unsigned short mode_flags,
1308 		     nodemask_t *nmask, unsigned long flags)
1309 {
1310 	struct mm_struct *mm = current->mm;
1311 	struct vm_area_struct *vma, *prev;
1312 	struct vma_iterator vmi;
1313 	struct migration_mpol mmpol;
1314 	struct mempolicy *new;
1315 	unsigned long end;
1316 	long err;
1317 	long nr_failed;
1318 	LIST_HEAD(pagelist);
1319 
1320 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1321 		return -EINVAL;
1322 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1323 		return -EPERM;
1324 
1325 	if (start & ~PAGE_MASK)
1326 		return -EINVAL;
1327 
1328 	if (mode == MPOL_DEFAULT)
1329 		flags &= ~MPOL_MF_STRICT;
1330 
1331 	len = PAGE_ALIGN(len);
1332 	end = start + len;
1333 
1334 	if (end < start)
1335 		return -EINVAL;
1336 	if (end == start)
1337 		return 0;
1338 
1339 	new = mpol_new(mode, mode_flags, nmask);
1340 	if (IS_ERR(new))
1341 		return PTR_ERR(new);
1342 
1343 	/*
1344 	 * If we are using the default policy then operating
1345 	 * on discontiguous address ranges is okay after all.
1346 	 */
1347 	if (!new)
1348 		flags |= MPOL_MF_DISCONTIG_OK;
1349 
1350 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1351 		lru_cache_disable();
1352 	{
1353 		NODEMASK_SCRATCH(scratch);
1354 		if (scratch) {
1355 			mmap_write_lock(mm);
1356 			err = mpol_set_nodemask(new, nmask, scratch);
1357 			if (err)
1358 				mmap_write_unlock(mm);
1359 		} else
1360 			err = -ENOMEM;
1361 		NODEMASK_SCRATCH_FREE(scratch);
1362 	}
1363 	if (err)
1364 		goto mpol_out;
1365 
1366 	/*
1367 	 * Lock the VMAs before scanning for pages to migrate,
1368 	 * to ensure we don't miss a concurrently inserted page.
1369 	 */
1370 	nr_failed = queue_pages_range(mm, start, end, nmask,
1371 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1372 
1373 	if (nr_failed < 0) {
1374 		err = nr_failed;
1375 		nr_failed = 0;
1376 	} else {
1377 		vma_iter_init(&vmi, mm, start);
1378 		prev = vma_prev(&vmi);
1379 		for_each_vma_range(vmi, vma, end) {
1380 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1381 			if (err)
1382 				break;
1383 		}
1384 	}
1385 
1386 	if (!err && !list_empty(&pagelist)) {
1387 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
1388 		if (!new) {
1389 			new = get_task_policy(current);
1390 			mpol_get(new);
1391 		}
1392 		mmpol.pol = new;
1393 		mmpol.ilx = 0;
1394 
1395 		/*
1396 		 * In the interleaved case, attempt to allocate on exactly the
1397 		 * targeted nodes, for the first VMA to be migrated; for later
1398 		 * VMAs, the nodes will still be interleaved from the targeted
1399 		 * nodemask, but one by one may be selected differently.
1400 		 */
1401 		if (new->mode == MPOL_INTERLEAVE ||
1402 		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1403 			struct folio *folio;
1404 			unsigned int order;
1405 			unsigned long addr = -EFAULT;
1406 
1407 			list_for_each_entry(folio, &pagelist, lru) {
1408 				if (!folio_test_ksm(folio))
1409 					break;
1410 			}
1411 			if (!list_entry_is_head(folio, &pagelist, lru)) {
1412 				vma_iter_init(&vmi, mm, start);
1413 				for_each_vma_range(vmi, vma, end) {
1414 					addr = page_address_in_vma(folio,
1415 						folio_page(folio, 0), vma);
1416 					if (addr != -EFAULT)
1417 						break;
1418 				}
1419 			}
1420 			if (addr != -EFAULT) {
1421 				order = folio_order(folio);
1422 				/* We already know the pol, but not the ilx */
1423 				mpol_cond_put(get_vma_policy(vma, addr, order,
1424 							     &mmpol.ilx));
1425 				/* Set base from which to increment by index */
1426 				mmpol.ilx -= folio->index >> order;
1427 			}
1428 		}
1429 	}
1430 
1431 	mmap_write_unlock(mm);
1432 
1433 	if (!err && !list_empty(&pagelist)) {
1434 		nr_failed |= migrate_pages(&pagelist,
1435 				alloc_migration_target_by_mpol, NULL,
1436 				(unsigned long)&mmpol, MIGRATE_SYNC,
1437 				MR_MEMPOLICY_MBIND, NULL);
1438 	}
1439 
1440 	if (nr_failed && (flags & MPOL_MF_STRICT))
1441 		err = -EIO;
1442 	if (!list_empty(&pagelist))
1443 		putback_movable_pages(&pagelist);
1444 mpol_out:
1445 	mpol_put(new);
1446 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1447 		lru_cache_enable();
1448 	return err;
1449 }
1450 
1451 /*
1452  * User space interface with variable-sized bitmaps for nodelists.
1453  */
1454 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1455 		      unsigned long maxnode)
1456 {
1457 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1458 	int ret;
1459 
1460 	if (in_compat_syscall())
1461 		ret = compat_get_bitmap(mask,
1462 					(const compat_ulong_t __user *)nmask,
1463 					maxnode);
1464 	else
1465 		ret = copy_from_user(mask, nmask,
1466 				     nlongs * sizeof(unsigned long));
1467 
1468 	if (ret)
1469 		return -EFAULT;
1470 
1471 	if (maxnode % BITS_PER_LONG)
1472 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1473 
1474 	return 0;
1475 }
1476 
1477 /* Copy a node mask from user space. */
1478 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1479 		     unsigned long maxnode)
1480 {
1481 	--maxnode;
1482 	nodes_clear(*nodes);
1483 	if (maxnode == 0 || !nmask)
1484 		return 0;
1485 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1486 		return -EINVAL;
1487 
1488 	/*
1489 	 * When the user specifies more nodes than supported, just check
1490 	 * if the unsupported part is all zero, one word at a time,
1491 	 * starting at the end.
1492 	 */
1493 	while (maxnode > MAX_NUMNODES) {
1494 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1495 		unsigned long t;
1496 
1497 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1498 			return -EFAULT;
1499 
1500 		if (maxnode - bits >= MAX_NUMNODES) {
1501 			maxnode -= bits;
1502 		} else {
1503 			maxnode = MAX_NUMNODES;
1504 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1505 		}
1506 		if (t)
1507 			return -EINVAL;
1508 	}
1509 
1510 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1511 }
1512 
1513 /* Copy a kernel node mask to user space */
1514 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1515 			      nodemask_t *nodes)
1516 {
1517 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1518 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1519 	bool compat = in_compat_syscall();
1520 
1521 	if (compat)
1522 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1523 
1524 	if (copy > nbytes) {
1525 		if (copy > PAGE_SIZE)
1526 			return -EINVAL;
1527 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1528 			return -EFAULT;
1529 		copy = nbytes;
1530 		maxnode = nr_node_ids;
1531 	}
1532 
1533 	if (compat)
1534 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1535 					 nodes_addr(*nodes), maxnode);
1536 
1537 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1538 }
1539 
1540 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1541 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1542 {
1543 	*flags = *mode & MPOL_MODE_FLAGS;
1544 	*mode &= ~MPOL_MODE_FLAGS;
1545 
1546 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1547 		return -EINVAL;
1548 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1549 		return -EINVAL;
1550 	if (*flags & MPOL_F_NUMA_BALANCING) {
1551 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1552 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1553 		else
1554 			return -EINVAL;
1555 	}
1556 	return 0;
1557 }
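
/*
 * Illustrative sketch (assumed values): from userspace the mode flags arrive
 * OR-ed into the mode argument, e.g. using the <numaif.h> wrapper:
 *
 *	unsigned long nodemask = 1UL << 0;	// node 0 only
 *
 *	// bind to node 0, and keep meaning *physical* node 0 even if the
 *	// task's cpuset mems_allowed is later rebound
 *	set_mempolicy(MPOL_BIND | MPOL_F_STATIC_NODES,
 *		      &nodemask, sizeof(nodemask) * 8);
 *
 * sanitize_mpol_flags() splits this back into *mode == MPOL_BIND and
 * *flags == MPOL_F_STATIC_NODES before the policy is constructed.
 */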
1558 
1559 static long kernel_mbind(unsigned long start, unsigned long len,
1560 			 unsigned long mode, const unsigned long __user *nmask,
1561 			 unsigned long maxnode, unsigned int flags)
1562 {
1563 	unsigned short mode_flags;
1564 	nodemask_t nodes;
1565 	int lmode = mode;
1566 	int err;
1567 
1568 	start = untagged_addr(start);
1569 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1570 	if (err)
1571 		return err;
1572 
1573 	err = get_nodes(&nodes, nmask, maxnode);
1574 	if (err)
1575 		return err;
1576 
1577 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1578 }
1579 
1580 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1581 		unsigned long, home_node, unsigned long, flags)
1582 {
1583 	struct mm_struct *mm = current->mm;
1584 	struct vm_area_struct *vma, *prev;
1585 	struct mempolicy *new, *old;
1586 	unsigned long end;
1587 	int err = -ENOENT;
1588 	VMA_ITERATOR(vmi, mm, start);
1589 
1590 	start = untagged_addr(start);
1591 	if (start & ~PAGE_MASK)
1592 		return -EINVAL;
1593 	/*
1594 	 * flags is reserved for future extensions, if any.
1595 	 */
1596 	if (flags != 0)
1597 		return -EINVAL;
1598 
1599 	/*
1600 	 * Check that home_node is online to avoid accessing uninitialized
1601 	 * NODE_DATA.
1602 	 */
1603 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1604 		return -EINVAL;
1605 
1606 	len = PAGE_ALIGN(len);
1607 	end = start + len;
1608 
1609 	if (end < start)
1610 		return -EINVAL;
1611 	if (end == start)
1612 		return 0;
1613 	mmap_write_lock(mm);
1614 	prev = vma_prev(&vmi);
1615 	for_each_vma_range(vmi, vma, end) {
1616 		/*
1617 		 * If any vma in the range has a policy other than MPOL_BIND
1618 		 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1619 		 * the home node for vmas we have already updated.
1620 		 */
1621 		old = vma_policy(vma);
1622 		if (!old) {
1623 			prev = vma;
1624 			continue;
1625 		}
1626 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1627 			err = -EOPNOTSUPP;
1628 			break;
1629 		}
1630 		new = mpol_dup(old);
1631 		if (IS_ERR(new)) {
1632 			err = PTR_ERR(new);
1633 			break;
1634 		}
1635 
1636 		vma_start_write(vma);
1637 		new->home_node = home_node;
1638 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1639 		mpol_put(new);
1640 		if (err)
1641 			break;
1642 	}
1643 	mmap_write_unlock(mm);
1644 	return err;
1645 }
1646 
1647 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1648 		unsigned long, mode, const unsigned long __user *, nmask,
1649 		unsigned long, maxnode, unsigned int, flags)
1650 {
1651 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1652 }
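
/*
 * Illustrative userspace sketch (assumed values) of the mbind(2) wrapper from
 * <numaif.h>: bind a freshly mapped region to node 0 and ask for any
 * misplaced pages to be migrated there:
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1UL << 20;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long nodemask = 1UL << 0;
 *
 *	mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
 *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
 */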
1653 
1654 /* Set the process memory policy */
1655 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1656 				 unsigned long maxnode)
1657 {
1658 	unsigned short mode_flags;
1659 	nodemask_t nodes;
1660 	int lmode = mode;
1661 	int err;
1662 
1663 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1664 	if (err)
1665 		return err;
1666 
1667 	err = get_nodes(&nodes, nmask, maxnode);
1668 	if (err)
1669 		return err;
1670 
1671 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1672 }
1673 
1674 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1675 		unsigned long, maxnode)
1676 {
1677 	return kernel_set_mempolicy(mode, nmask, maxnode);
1678 }
1679 
1680 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1681 				const unsigned long __user *old_nodes,
1682 				const unsigned long __user *new_nodes)
1683 {
1684 	struct mm_struct *mm = NULL;
1685 	struct task_struct *task;
1686 	nodemask_t task_nodes;
1687 	int err;
1688 	nodemask_t *old;
1689 	nodemask_t *new;
1690 	NODEMASK_SCRATCH(scratch);
1691 
1692 	if (!scratch)
1693 		return -ENOMEM;
1694 
1695 	old = &scratch->mask1;
1696 	new = &scratch->mask2;
1697 
1698 	err = get_nodes(old, old_nodes, maxnode);
1699 	if (err)
1700 		goto out;
1701 
1702 	err = get_nodes(new, new_nodes, maxnode);
1703 	if (err)
1704 		goto out;
1705 
1706 	/* Find the mm_struct */
1707 	rcu_read_lock();
1708 	task = pid ? find_task_by_vpid(pid) : current;
1709 	if (!task) {
1710 		rcu_read_unlock();
1711 		err = -ESRCH;
1712 		goto out;
1713 	}
1714 	get_task_struct(task);
1715 
1716 	err = -EINVAL;
1717 
1718 	/*
1719 	 * Check if this process has the right to modify the specified process.
1720 	 * Use the regular "ptrace_may_access()" checks.
1721 	 */
1722 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1723 		rcu_read_unlock();
1724 		err = -EPERM;
1725 		goto out_put;
1726 	}
1727 	rcu_read_unlock();
1728 
1729 	task_nodes = cpuset_mems_allowed(task);
1730 	/* Is the user allowed to access the target nodes? */
1731 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1732 		err = -EPERM;
1733 		goto out_put;
1734 	}
1735 
1736 	task_nodes = cpuset_mems_allowed(current);
1737 	nodes_and(*new, *new, task_nodes);
1738 	if (nodes_empty(*new))
1739 		goto out_put;
1740 
1741 	err = security_task_movememory(task);
1742 	if (err)
1743 		goto out_put;
1744 
1745 	mm = get_task_mm(task);
1746 	put_task_struct(task);
1747 
1748 	if (!mm) {
1749 		err = -EINVAL;
1750 		goto out;
1751 	}
1752 
1753 	err = do_migrate_pages(mm, old, new,
1754 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1755 
1756 	mmput(mm);
1757 out:
1758 	NODEMASK_SCRATCH_FREE(scratch);
1759 
1760 	return err;
1761 
1762 out_put:
1763 	put_task_struct(task);
1764 	goto out;
1765 }
1766 
1767 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1768 		const unsigned long __user *, old_nodes,
1769 		const unsigned long __user *, new_nodes)
1770 {
1771 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1772 }
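
/*
 * Illustrative userspace sketch (assumed node numbers) of the migrate_pages(2)
 * wrapper from <numaif.h>: move the calling process's pages from node 0 to
 * node 1, preserving the relative layout as described above:
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *
 *	long ret = migrate_pages(0, sizeof(old_nodes) * 8,	// pid 0 == self
 *				 &old_nodes, &new_nodes);
 *	// ret < 0 is an error; otherwise it is the number of pages not moved
 */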
1773 
1774 /* Retrieve NUMA policy */
1775 static int kernel_get_mempolicy(int __user *policy,
1776 				unsigned long __user *nmask,
1777 				unsigned long maxnode,
1778 				unsigned long addr,
1779 				unsigned long flags)
1780 {
1781 	int err;
1782 	int pval;
1783 	nodemask_t nodes;
1784 
1785 	if (nmask != NULL && maxnode < nr_node_ids)
1786 		return -EINVAL;
1787 
1788 	addr = untagged_addr(addr);
1789 
1790 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1791 
1792 	if (err)
1793 		return err;
1794 
1795 	if (policy && put_user(pval, policy))
1796 		return -EFAULT;
1797 
1798 	if (nmask)
1799 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1800 
1801 	return err;
1802 }
1803 
1804 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1805 		unsigned long __user *, nmask, unsigned long, maxnode,
1806 		unsigned long, addr, unsigned long, flags)
1807 {
1808 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1809 }
1810 
1811 bool vma_migratable(struct vm_area_struct *vma)
1812 {
1813 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1814 		return false;
1815 
1816 	/*
1817 	 * DAX device mappings require predictable access latency, so avoid
1818 	 * incurring periodic faults.
1819 	 */
1820 	if (vma_is_dax(vma))
1821 		return false;
1822 
1823 	if (is_vm_hugetlb_page(vma) &&
1824 		!hugepage_migration_supported(hstate_vma(vma)))
1825 		return false;
1826 
1827 	/*
1828 	 * Migration allocates pages in the highest zone. If we cannot
1829 	 * do so then migration (at least from node to node) is not
1830 	 * possible.
1831 	 */
1832 	if (vma->vm_file &&
1833 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1834 			< policy_zone)
1835 		return false;
1836 	return true;
1837 }
1838 
1839 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1840 				   unsigned long addr, pgoff_t *ilx)
1841 {
1842 	*ilx = 0;
1843 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
1844 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1845 }
1846 
1847 /*
1848  * get_vma_policy(@vma, @addr, @order, @ilx)
1849  * @vma: virtual memory area whose policy is sought
1850  * @addr: address in @vma for shared policy lookup
1851  * @order: 0, or appropriate huge_page_order for interleaving
1852  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1853  *       MPOL_WEIGHTED_INTERLEAVE
1854  *
1855  * Returns effective policy for a VMA at specified address.
1856  * Falls back to current->mempolicy or system default policy, as necessary.
1857  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1858  * count--added by the get_policy() vm_op, as appropriate--to protect against
1859  * freeing by another task.  It is the caller's responsibility to free the
1860  * extra reference for shared policies.
1861  */
1862 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1863 				 unsigned long addr, int order, pgoff_t *ilx)
1864 {
1865 	struct mempolicy *pol;
1866 
1867 	pol = __get_vma_policy(vma, addr, ilx);
1868 	if (!pol)
1869 		pol = get_task_policy(current);
1870 	if (pol->mode == MPOL_INTERLEAVE ||
1871 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1872 		*ilx += vma->vm_pgoff >> order;
1873 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1874 	}
1875 	return pol;
1876 }
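/*
 * Illustrative in-kernel caller pattern (a sketch mirroring
 * vma_alloc_folio_noprof() below; gfp, order, vma and addr are assumed to
 * come from the caller):
 *
 *	pgoff_t ilx;
 *	struct mempolicy *pol;
 *
 *	pol = get_vma_policy(vma, addr, order, &ilx);
 *	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
 *	mpol_cond_put(pol);	// drops the extra ref only for shared policies
 */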
1877 
1878 bool vma_policy_mof(struct vm_area_struct *vma)
1879 {
1880 	struct mempolicy *pol;
1881 
1882 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1883 		bool ret = false;
1884 		pgoff_t ilx;		/* ignored here */
1885 
1886 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1887 		if (pol && (pol->flags & MPOL_F_MOF))
1888 			ret = true;
1889 		mpol_cond_put(pol);
1890 
1891 		return ret;
1892 	}
1893 
1894 	pol = vma->vm_policy;
1895 	if (!pol)
1896 		pol = get_task_policy(current);
1897 
1898 	return pol->flags & MPOL_F_MOF;
1899 }
1900 
1901 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1902 {
1903 	enum zone_type dynamic_policy_zone = policy_zone;
1904 
1905 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1906 
1907 	/*
1908 	 * If policy->nodes has movable memory only, apply the policy only
1909 	 * when gfp_zone(gfp) == ZONE_MOVABLE.
1910 	 *
1911 	 * policy->nodes is intersected with node_states[N_MEMORY], so if
1912 	 * the following test fails, it implies that policy->nodes has
1913 	 * movable memory only.
1914 	 */
1915 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1916 		dynamic_policy_zone = ZONE_MOVABLE;
1917 
1918 	return zone >= dynamic_policy_zone;
1919 }
1920 
1921 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
1922 {
1923 	unsigned int node;
1924 	unsigned int cpuset_mems_cookie;
1925 
1926 retry:
1927 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1928 	cpuset_mems_cookie = read_mems_allowed_begin();
1929 	node = current->il_prev;
1930 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
1931 		node = next_node_in(node, policy->nodes);
1932 		if (read_mems_allowed_retry(cpuset_mems_cookie))
1933 			goto retry;
1934 		if (node == MAX_NUMNODES)
1935 			return node;
1936 		current->il_prev = node;
1937 		current->il_weight = get_il_weight(node);
1938 	}
1939 	current->il_weight--;
1940 	return node;
1941 }
1942 
1943 /* Do dynamic interleaving for a process */
1944 static unsigned int interleave_nodes(struct mempolicy *policy)
1945 {
1946 	unsigned int nid;
1947 	unsigned int cpuset_mems_cookie;
1948 
1949 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1950 	do {
1951 		cpuset_mems_cookie = read_mems_allowed_begin();
1952 		nid = next_node_in(current->il_prev, policy->nodes);
1953 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1954 
1955 	if (nid < MAX_NUMNODES)
1956 		current->il_prev = nid;
1957 	return nid;
1958 }
1959 
1960 /*
1961  * Depending on the memory policy provide a node from which to allocate the
1962  * next slab entry.
1963  */
1964 unsigned int mempolicy_slab_node(void)
1965 {
1966 	struct mempolicy *policy;
1967 	int node = numa_mem_id();
1968 
1969 	if (!in_task())
1970 		return node;
1971 
1972 	policy = current->mempolicy;
1973 	if (!policy)
1974 		return node;
1975 
1976 	switch (policy->mode) {
1977 	case MPOL_PREFERRED:
1978 		return first_node(policy->nodes);
1979 
1980 	case MPOL_INTERLEAVE:
1981 		return interleave_nodes(policy);
1982 
1983 	case MPOL_WEIGHTED_INTERLEAVE:
1984 		return weighted_interleave_nodes(policy);
1985 
1986 	case MPOL_BIND:
1987 	case MPOL_PREFERRED_MANY:
1988 	{
1989 		struct zoneref *z;
1990 
1991 		/*
1992 		 * Follow bind policy behavior and start allocation at the
1993 		 * first node.
1994 		 */
1995 		struct zonelist *zonelist;
1996 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1997 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1998 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1999 							&policy->nodes);
2000 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2001 	}
2002 	case MPOL_LOCAL:
2003 		return node;
2004 
2005 	default:
2006 		BUG();
2007 	}
2008 }
2009 
2010 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2011 					      nodemask_t *mask)
2012 {
2013 	/*
2014 	 * barrier stabilizes the nodemask locally so that it can be iterated
2015 	 * over safely without concern for changes. Allocators validate node
2016 	 * selection does not violate mems_allowed, so this is safe.
2017 	 */
2018 	barrier();
2019 	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2020 	barrier();
2021 	return nodes_weight(*mask);
2022 }
2023 
2024 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2025 {
2026 	nodemask_t nodemask;
2027 	unsigned int target, nr_nodes;
2028 	u8 *table;
2029 	unsigned int weight_total = 0;
2030 	u8 weight;
2031 	int nid;
2032 
2033 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2034 	if (!nr_nodes)
2035 		return numa_node_id();
2036 
2037 	rcu_read_lock();
2038 	table = rcu_dereference(iw_table);
2039 	/* calculate the total weight */
2040 	for_each_node_mask(nid, nodemask) {
2041 		/* detect system default usage */
2042 		weight = table ? table[nid] : 1;
2043 		weight = weight ? weight : 1;
2044 		weight_total += weight;
2045 	}
2046 
2047 	/* Calculate the node offset based on totals */
2048 	target = ilx % weight_total;
2049 	nid = first_node(nodemask);
2050 	while (target) {
2051 		/* detect system default usage */
2052 		weight = table ? table[nid] : 1;
2053 		weight = weight ? weight : 1;
2054 		if (target < weight)
2055 			break;
2056 		target -= weight;
2057 		nid = next_node_in(nid, nodemask);
2058 	}
2059 	rcu_read_unlock();
2060 	return nid;
2061 }
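/*
 * Worked example (illustrative): pol->nodes = {0,1} with iw_table weights
 * 3 and 1 gives weight_total = 4, so target = ilx % 4; targets 0-2 fall
 * within node 0's weight and return node 0, while target 3 skips past it
 * and returns node 1 -- the expected 3:1 page ratio.
 */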
2062 
2063 /*
2064  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2065  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2066  * exceeds the number of present nodes.
2067  */
2068 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2069 {
2070 	nodemask_t nodemask;
2071 	unsigned int target, nnodes;
2072 	int i;
2073 	int nid;
2074 
2075 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2076 	if (!nnodes)
2077 		return numa_node_id();
2078 	target = ilx % nnodes;
2079 	nid = first_node(nodemask);
2080 	for (i = 0; i < target; i++)
2081 		nid = next_node(nid, nodemask);
2082 	return nid;
2083 }
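/*
 * Worked example (illustrative): pol->nodes = {0,2,5} and ilx = 4 give
 * target = 4 % 3 = 1, so we step once from first_node() and return node 2.
 */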
2084 
2085 /*
2086  * Return a nodemask representing a mempolicy for filtering nodes for
2087  * page allocation, together with preferred node id (or the input node id).
2088  */
2089 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2090 				   pgoff_t ilx, int *nid)
2091 {
2092 	nodemask_t *nodemask = NULL;
2093 
2094 	switch (pol->mode) {
2095 	case MPOL_PREFERRED:
2096 		/* Override input node id */
2097 		*nid = first_node(pol->nodes);
2098 		break;
2099 	case MPOL_PREFERRED_MANY:
2100 		nodemask = &pol->nodes;
2101 		if (pol->home_node != NUMA_NO_NODE)
2102 			*nid = pol->home_node;
2103 		break;
2104 	case MPOL_BIND:
2105 		/* Restrict to nodemask (but not on lower zones) */
2106 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2107 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2108 			nodemask = &pol->nodes;
2109 		if (pol->home_node != NUMA_NO_NODE)
2110 			*nid = pol->home_node;
2111 		/*
2112 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2113 		 * because we might easily break the expectation to stay on the
2114 		 * requested node and not break the policy.
2115 		 */
2116 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2117 		break;
2118 	case MPOL_INTERLEAVE:
2119 		/* Override input node id */
2120 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2121 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2122 		break;
2123 	case MPOL_WEIGHTED_INTERLEAVE:
2124 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2125 			weighted_interleave_nodes(pol) :
2126 			weighted_interleave_nid(pol, ilx);
2127 		break;
2128 	}
2129 
2130 	return nodemask;
2131 }
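/*
 * Illustrative outcomes (assuming the stated policies): MPOL_PREFERRED over
 * {3} sets *nid to 3 and returns no nodemask; MPOL_BIND over {0,1} with
 * home_node 1 returns &pol->nodes (when apply_policy_zone() passes) and
 * sets *nid to 1; MPOL_INTERLEAVE with a valid ilx sets *nid via
 * interleave_nid() and returns no nodemask.
 */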
2132 
2133 #ifdef CONFIG_HUGETLBFS
2134 /*
2135  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2136  * @vma: virtual memory area whose policy is sought
2137  * @addr: address in @vma for shared policy lookup and interleave policy
2138  * @gfp_flags: for requested zone
2139  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2140  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2141  *
2142  * Returns a nid suitable for a huge page allocation and a pointer
2143  * to the struct mempolicy for conditional unref after allocation.
2144  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2145  * to the mempolicy's @nodemask for filtering the zonelist.
2146  */
2147 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2148 		struct mempolicy **mpol, nodemask_t **nodemask)
2149 {
2150 	pgoff_t ilx;
2151 	int nid;
2152 
2153 	nid = numa_node_id();
2154 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2155 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2156 	return nid;
2157 }
2158 
2159 /*
2160  * init_nodemask_of_mempolicy
2161  *
2162  * If the current task's mempolicy is "default" [NULL], return 'false'
2163  * to indicate default policy.  Otherwise, extract the policy nodemask
2164  * for 'bind' or 'interleave' policy into the argument nodemask, or
2165  * initialize the argument nodemask to contain the single node for
2166  * 'preferred' or 'local' policy and return 'true' to indicate presence
2167  * of non-default mempolicy.
2168  *
2169  * We don't bother with reference counting the mempolicy [mpol_get/put]
2170  * because the current task is examining its own mempolicy and a task's
2171  * mempolicy is only ever changed by the task itself.
2172  *
2173  * N.B., it is the caller's responsibility to free a returned nodemask.
2174  */
2175 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2176 {
2177 	struct mempolicy *mempolicy;
2178 
2179 	if (!(mask && current->mempolicy))
2180 		return false;
2181 
2182 	task_lock(current);
2183 	mempolicy = current->mempolicy;
2184 	switch (mempolicy->mode) {
2185 	case MPOL_PREFERRED:
2186 	case MPOL_PREFERRED_MANY:
2187 	case MPOL_BIND:
2188 	case MPOL_INTERLEAVE:
2189 	case MPOL_WEIGHTED_INTERLEAVE:
2190 		*mask = mempolicy->nodes;
2191 		break;
2192 
2193 	case MPOL_LOCAL:
2194 		init_nodemask_of_node(mask, numa_node_id());
2195 		break;
2196 
2197 	default:
2198 		BUG();
2199 	}
2200 	task_unlock(current);
2201 
2202 	return true;
2203 }
2204 #endif
2205 
2206 /*
2207  * mempolicy_in_oom_domain
2208  *
2209  * If tsk's mempolicy is "bind", check for intersection between mask and
2210  * the policy nodemask. Otherwise, return true for all other policies
2211  * including "interleave", as a tsk with "interleave" policy may have
2212  * memory allocated from all nodes in system.
2213  *
2214  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2215  */
2216 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2217 					const nodemask_t *mask)
2218 {
2219 	struct mempolicy *mempolicy;
2220 	bool ret = true;
2221 
2222 	if (!mask)
2223 		return ret;
2224 
2225 	task_lock(tsk);
2226 	mempolicy = tsk->mempolicy;
2227 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2228 		ret = nodes_intersects(mempolicy->nodes, *mask);
2229 	task_unlock(tsk);
2230 
2231 	return ret;
2232 }
2233 
2234 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2235 						int nid, nodemask_t *nodemask)
2236 {
2237 	struct page *page;
2238 	gfp_t preferred_gfp;
2239 
2240 	/*
2241 	 * This is a two-pass approach. The first pass only tries the
2242 	 * preferred nodes, skipping direct reclaim and allowing the
2243 	 * allocation to fail, while the second pass tries all the
2244 	 * nodes in the system.
2245 	 */
2246 	preferred_gfp = gfp | __GFP_NOWARN;
2247 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2248 	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2249 	if (!page)
2250 		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2251 
2252 	return page;
2253 }
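/*
 * Illustrative gfp transformation (a sketch): with gfp == GFP_KERNEL the
 * first pass uses (GFP_KERNEL | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM, so
 * it may fail quickly while confined to the preferred nodemask; only the
 * second pass, with the caller's original gfp and a NULL nodemask, may
 * reclaim and fall back to any node in the system.
 */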
2254 
2255 /**
2256  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2257  * @gfp: GFP flags.
2258  * @order: Order of the page allocation.
2259  * @pol: Pointer to the NUMA mempolicy.
2260  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2261  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2262  *
2263  * Return: The page on success or NULL if allocation fails.
2264  */
2265 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2266 		struct mempolicy *pol, pgoff_t ilx, int nid)
2267 {
2268 	nodemask_t *nodemask;
2269 	struct page *page;
2270 
2271 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2272 
2273 	if (pol->mode == MPOL_PREFERRED_MANY)
2274 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2275 
2276 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2277 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2278 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2279 		/*
2280 		 * For hugepage allocation and non-interleave policy which
2281 		 * allows the current node (or other explicitly preferred
2282 		 * node) we only try to allocate from the current/preferred
2283 		 * node and don't fall back to other nodes, as the cost of
2284 		 * remote accesses would likely offset THP benefits.
2285 		 *
2286 		 * If the policy is interleave or does not allow the current
2287 		 * node in its nodemask, we allocate the standard way.
2288 		 */
2289 		if (pol->mode != MPOL_INTERLEAVE &&
2290 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2291 		    (!nodemask || node_isset(nid, *nodemask))) {
2292 			/*
2293 			 * First, try to allocate THP only on local node, but
2294 			 * don't reclaim unnecessarily, just compact.
2295 			 */
2296 			page = __alloc_frozen_pages_noprof(
2297 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2298 				nid, NULL);
2299 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2300 				return page;
2301 			/*
2302 			 * If hugepage allocations are configured to always
2303 			 * synchronous compact or the vma has been madvised
2304 			 * to prefer hugepage backing, retry allowing remote
2305 			 * memory with both reclaim and compact as well.
2306 			 */
2307 		}
2308 	}
2309 
2310 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2311 
2312 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2313 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2314 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2315 		if (static_branch_likely(&vm_numa_stat_key) &&
2316 		    page_to_nid(page) == nid) {
2317 			preempt_disable();
2318 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2319 			preempt_enable();
2320 		}
2321 	}
2322 
2323 	return page;
2324 }
2325 
2326 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2327 		struct mempolicy *pol, pgoff_t ilx, int nid)
2328 {
2329 	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2330 			ilx, nid);
2331 	if (!page)
2332 		return NULL;
2333 
2334 	set_page_refcounted(page);
2335 	return page_rmappable_folio(page);
2336 }
2337 
2338 /**
2339  * vma_alloc_folio - Allocate a folio for a VMA.
2340  * @gfp: GFP flags.
2341  * @order: Order of the folio.
2342  * @vma: Pointer to VMA.
2343  * @addr: Virtual address of the allocation.  Must be inside @vma.
2344  *
2345  * Allocate a folio for a specific address in @vma, using the appropriate
2346  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2347  * VMA to prevent it from going away.  Should be used for all allocations
2348  * for folios that will be mapped into user space, excepting hugetlbfs, and
2349  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2350  *
2351  * Return: The folio on success or NULL if allocation fails.
2352  */
2353 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2354 		unsigned long addr)
2355 {
2356 	struct mempolicy *pol;
2357 	pgoff_t ilx;
2358 	struct folio *folio;
2359 
2360 	if (vma->vm_flags & VM_DROPPABLE)
2361 		gfp |= __GFP_NOWARN;
2362 
2363 	pol = get_vma_policy(vma, addr, order, &ilx);
2364 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2365 	mpol_cond_put(pol);
2366 	return folio;
2367 }
2368 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2369 
2370 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2371 {
2372 	struct mempolicy *pol = &default_policy;
2373 
2374 	/*
2375 	 * No reference counting needed for current->mempolicy
2376 	 * nor system default_policy
2377 	 */
2378 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2379 		pol = get_task_policy(current);
2380 
2381 	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2382 				       numa_node_id());
2383 }
2384 
2385 /**
2386  * alloc_pages - Allocate pages.
2387  * @gfp: GFP flags.
2388  * @order: Power of two of number of pages to allocate.
2389  *
2390  * Allocate 1 << @order contiguous pages.  The physical address of the
2391  * first page is naturally aligned (eg an order-3 allocation will be aligned
2392  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2393  * process is honoured when in process context.
2394  *
2395  * Context: Can be called from any context, provided the appropriate GFP
2396  * flags are used.
2397  * Return: The page on success or NULL if allocation fails.
2398  */
2399 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2400 {
2401 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2402 
2403 	if (page)
2404 		set_page_refcounted(page);
2405 	return page;
2406 }
2407 EXPORT_SYMBOL(alloc_pages_noprof);
2408 
2409 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2410 {
2411 	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2412 }
2413 EXPORT_SYMBOL(folio_alloc_noprof);
2414 
2415 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2416 		struct mempolicy *pol, unsigned long nr_pages,
2417 		struct page **page_array)
2418 {
2419 	int nodes;
2420 	unsigned long nr_pages_per_node;
2421 	int delta;
2422 	int i;
2423 	unsigned long nr_allocated;
2424 	unsigned long total_allocated = 0;
2425 
2426 	nodes = nodes_weight(pol->nodes);
2427 	nr_pages_per_node = nr_pages / nodes;
2428 	delta = nr_pages - nodes * nr_pages_per_node;
2429 
2430 	for (i = 0; i < nodes; i++) {
2431 		if (delta) {
2432 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2433 					interleave_nodes(pol), NULL,
2434 					nr_pages_per_node + 1,
2435 					page_array);
2436 			delta--;
2437 		} else {
2438 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2439 					interleave_nodes(pol), NULL,
2440 					nr_pages_per_node, page_array);
2441 		}
2442 
2443 		page_array += nr_allocated;
2444 		total_allocated += nr_allocated;
2445 	}
2446 
2447 	return total_allocated;
2448 }
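/*
 * Worked example (illustrative): nr_pages = 10 over 3 interleave nodes
 * gives nr_pages_per_node = 3 and delta = 1, so the first node visited
 * receives 4 pages and the remaining two receive 3 pages each.
 */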
2449 
2450 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2451 		struct mempolicy *pol, unsigned long nr_pages,
2452 		struct page **page_array)
2453 {
2454 	struct task_struct *me = current;
2455 	unsigned int cpuset_mems_cookie;
2456 	unsigned long total_allocated = 0;
2457 	unsigned long nr_allocated = 0;
2458 	unsigned long rounds;
2459 	unsigned long node_pages, delta;
2460 	u8 *table, *weights, weight;
2461 	unsigned int weight_total = 0;
2462 	unsigned long rem_pages = nr_pages;
2463 	nodemask_t nodes;
2464 	int nnodes, node;
2465 	int resume_node = MAX_NUMNODES - 1;
2466 	u8 resume_weight = 0;
2467 	int prev_node;
2468 	int i;
2469 
2470 	if (!nr_pages)
2471 		return 0;
2472 
2473 	/* read the nodes onto the stack, retry if done during rebind */
2474 	do {
2475 		cpuset_mems_cookie = read_mems_allowed_begin();
2476 		nnodes = read_once_policy_nodemask(pol, &nodes);
2477 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2478 
2479 	/* if the nodemask has become invalid, we cannot do anything */
2480 	if (!nnodes)
2481 		return 0;
2482 
2483 	/* Continue allocating from most recent node and adjust the nr_pages */
2484 	node = me->il_prev;
2485 	weight = me->il_weight;
2486 	if (weight && node_isset(node, nodes)) {
2487 		node_pages = min(rem_pages, weight);
2488 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2489 						  page_array);
2490 		page_array += nr_allocated;
2491 		total_allocated += nr_allocated;
2492 		/* if that's all the pages, no need to interleave */
2493 		if (rem_pages <= weight) {
2494 			me->il_weight -= rem_pages;
2495 			return total_allocated;
2496 		}
2497 		/* Otherwise we adjust remaining pages, continue from there */
2498 		rem_pages -= weight;
2499 	}
2500 	/* clear active weight in case of an allocation failure */
2501 	me->il_weight = 0;
2502 	prev_node = node;
2503 
2504 	/* create a local copy of node weights to operate on outside rcu */
2505 	weights = kzalloc(nr_node_ids, GFP_KERNEL);
2506 	if (!weights)
2507 		return total_allocated;
2508 
2509 	rcu_read_lock();
2510 	table = rcu_dereference(iw_table);
2511 	if (table)
2512 		memcpy(weights, table, nr_node_ids);
2513 	rcu_read_unlock();
2514 
2515 	/* calculate total, detect system default usage */
2516 	for_each_node_mask(node, nodes) {
2517 		if (!weights[node])
2518 			weights[node] = 1;
2519 		weight_total += weights[node];
2520 	}
2521 
2522 	/*
2523 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2524 	 * Track which node weighted interleave should resume from.
2525 	 *
2526 	 * If (rounds > 0) and (delta == 0), resume_node will always be the
2527 	 * node following prev_node, and resume_weight that node's full weight.
2528 	 */
2529 	rounds = rem_pages / weight_total;
2530 	delta = rem_pages % weight_total;
2531 	resume_node = next_node_in(prev_node, nodes);
2532 	resume_weight = weights[resume_node];
2533 	for (i = 0; i < nnodes; i++) {
2534 		node = next_node_in(prev_node, nodes);
2535 		weight = weights[node];
2536 		node_pages = weight * rounds;
2537 		/* If a delta exists, add this node's portion of the delta */
2538 		if (delta > weight) {
2539 			node_pages += weight;
2540 			delta -= weight;
2541 		} else if (delta) {
2542 			/* when delta is depleted, resume from that node */
2543 			node_pages += delta;
2544 			resume_node = node;
2545 			resume_weight = weight - delta;
2546 			delta = 0;
2547 		}
2548 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
2549 		if (!node_pages)
2550 			break;
2551 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2552 						  page_array);
2553 		page_array += nr_allocated;
2554 		total_allocated += nr_allocated;
2555 		if (total_allocated == nr_pages)
2556 			break;
2557 		prev_node = node;
2558 	}
2559 	me->il_prev = resume_node;
2560 	me->il_weight = resume_weight;
2561 	kfree(weights);
2562 	return total_allocated;
2563 }
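/*
 * Worked example (illustrative, assuming il_weight is 0 on entry and the
 * scan starts at node 0): nodes {0,1} with weights 3 and 1 and nr_pages = 9
 * give weight_total = 4, rounds = 2, delta = 1.  Node 0 gets 3 * 2 + 1 = 7
 * pages, node 1 gets 1 * 2 = 2 pages, and the task resumes at node 0 with
 * il_weight = 2 for the next allocation.
 */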
2564 
2565 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2566 		struct mempolicy *pol, unsigned long nr_pages,
2567 		struct page **page_array)
2568 {
2569 	gfp_t preferred_gfp;
2570 	unsigned long nr_allocated = 0;
2571 
2572 	preferred_gfp = gfp | __GFP_NOWARN;
2573 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2574 
2575 	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2576 					   nr_pages, page_array);
2577 
2578 	if (nr_allocated < nr_pages)
2579 		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2580 				nr_pages - nr_allocated,
2581 				page_array + nr_allocated);
2582 	return nr_allocated;
2583 }
2584 
2585 /* Bulk page allocation and the mempolicy should be considered at the
2586  * same time in some situations, such as vmalloc.
2587  *
2588  * This can accelerate memory allocation, especially for interleaved
2589  * allocations.
2590  */
2591 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2592 		unsigned long nr_pages, struct page **page_array)
2593 {
2594 	struct mempolicy *pol = &default_policy;
2595 	nodemask_t *nodemask;
2596 	int nid;
2597 
2598 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2599 		pol = get_task_policy(current);
2600 
2601 	if (pol->mode == MPOL_INTERLEAVE)
2602 		return alloc_pages_bulk_interleave(gfp, pol,
2603 							 nr_pages, page_array);
2604 
2605 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2606 		return alloc_pages_bulk_weighted_interleave(
2607 				  gfp, pol, nr_pages, page_array);
2608 
2609 	if (pol->mode == MPOL_PREFERRED_MANY)
2610 		return alloc_pages_bulk_preferred_many(gfp,
2611 				numa_node_id(), pol, nr_pages, page_array);
2612 
2613 	nid = numa_node_id();
2614 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2615 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2616 				       nr_pages, page_array);
2617 }
2618 
2619 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2620 {
2621 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2622 
2623 	if (IS_ERR(pol))
2624 		return PTR_ERR(pol);
2625 	dst->vm_policy = pol;
2626 	return 0;
2627 }
2628 
2629 /*
2630  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2631  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2632  * with the mems_allowed returned by cpuset_mems_allowed().  This
2633  * keeps mempolicies cpuset relative after its cpuset moves.  See
2634  * further kernel/cpuset.c update_nodemask().
2635  *
2636  * current's mempolicy may be rebound by another task (the task that changes
2637  * the cpuset's mems), so we needn't do the rebind work for the current task.
2638  */
2639 
2640 /* Slow path of a mempolicy duplicate */
2641 struct mempolicy *__mpol_dup(struct mempolicy *old)
2642 {
2643 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2644 
2645 	if (!new)
2646 		return ERR_PTR(-ENOMEM);
2647 
2648 	/* task's mempolicy is protected by alloc_lock */
2649 	if (old == current->mempolicy) {
2650 		task_lock(current);
2651 		*new = *old;
2652 		task_unlock(current);
2653 	} else
2654 		*new = *old;
2655 
2656 	if (current_cpuset_is_being_rebound()) {
2657 		nodemask_t mems = cpuset_mems_allowed(current);
2658 		mpol_rebind_policy(new, &mems);
2659 	}
2660 	atomic_set(&new->refcnt, 1);
2661 	return new;
2662 }
2663 
2664 /* Slow path of a mempolicy comparison */
2665 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2666 {
2667 	if (!a || !b)
2668 		return false;
2669 	if (a->mode != b->mode)
2670 		return false;
2671 	if (a->flags != b->flags)
2672 		return false;
2673 	if (a->home_node != b->home_node)
2674 		return false;
2675 	if (mpol_store_user_nodemask(a))
2676 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2677 			return false;
2678 
2679 	switch (a->mode) {
2680 	case MPOL_BIND:
2681 	case MPOL_INTERLEAVE:
2682 	case MPOL_PREFERRED:
2683 	case MPOL_PREFERRED_MANY:
2684 	case MPOL_WEIGHTED_INTERLEAVE:
2685 		return !!nodes_equal(a->nodes, b->nodes);
2686 	case MPOL_LOCAL:
2687 		return true;
2688 	default:
2689 		BUG();
2690 		return false;
2691 	}
2692 }
2693 
2694 /*
2695  * Shared memory backing store policy support.
2696  *
2697  * Remember policies even when nobody has shared memory mapped.
2698  * The policies are kept in Red-Black tree linked from the inode.
2699  * They are protected by the sp->lock rwlock, which should be held
2700  * for any accesses to the tree.
2701  */
2702 
2703 /*
2704  * Lookup the first element intersecting [start, end).  Caller holds
2705  * sp->lock for reading or for writing.
2706  */
2707 static struct sp_node *sp_lookup(struct shared_policy *sp,
2708 					pgoff_t start, pgoff_t end)
2709 {
2710 	struct rb_node *n = sp->root.rb_node;
2711 
2712 	while (n) {
2713 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2714 
2715 		if (start >= p->end)
2716 			n = n->rb_right;
2717 		else if (end <= p->start)
2718 			n = n->rb_left;
2719 		else
2720 			break;
2721 	}
2722 	if (!n)
2723 		return NULL;
2724 	for (;;) {
2725 		struct sp_node *w = NULL;
2726 		struct rb_node *prev = rb_prev(n);
2727 		if (!prev)
2728 			break;
2729 		w = rb_entry(prev, struct sp_node, nd);
2730 		if (w->end <= start)
2731 			break;
2732 		n = prev;
2733 	}
2734 	return rb_entry(n, struct sp_node, nd);
2735 }
2736 
2737 /*
2738  * Insert a new shared policy into the list.  Caller holds sp->lock for
2739  * writing.
2740  */
2741 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2742 {
2743 	struct rb_node **p = &sp->root.rb_node;
2744 	struct rb_node *parent = NULL;
2745 	struct sp_node *nd;
2746 
2747 	while (*p) {
2748 		parent = *p;
2749 		nd = rb_entry(parent, struct sp_node, nd);
2750 		if (new->start < nd->start)
2751 			p = &(*p)->rb_left;
2752 		else if (new->end > nd->end)
2753 			p = &(*p)->rb_right;
2754 		else
2755 			BUG();
2756 	}
2757 	rb_link_node(&new->nd, parent, p);
2758 	rb_insert_color(&new->nd, &sp->root);
2759 }
2760 
2761 /* Find shared policy intersecting idx */
2762 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2763 						pgoff_t idx)
2764 {
2765 	struct mempolicy *pol = NULL;
2766 	struct sp_node *sn;
2767 
2768 	if (!sp->root.rb_node)
2769 		return NULL;
2770 	read_lock(&sp->lock);
2771 	sn = sp_lookup(sp, idx, idx+1);
2772 	if (sn) {
2773 		mpol_get(sn->policy);
2774 		pol = sn->policy;
2775 	}
2776 	read_unlock(&sp->lock);
2777 	return pol;
2778 }
2779 
2780 static void sp_free(struct sp_node *n)
2781 {
2782 	mpol_put(n->policy);
2783 	kmem_cache_free(sn_cache, n);
2784 }
2785 
2786 /**
2787  * mpol_misplaced - check whether current folio node is valid in policy
2788  *
2789  * @folio: folio to be checked
2790  * @vmf: structure describing the fault
2791  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2792  *
2793  * Lookup current policy node id for vma,addr and "compare to" folio's
2794  * node id.  Policy determination "mimics" alloc_page_vma().
2795  * Called from fault path where we know the vma and faulting address.
2796  *
2797  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2798  * policy, or a suitable node ID to allocate a replacement folio from.
2799  */
2800 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2801 		   unsigned long addr)
2802 {
2803 	struct mempolicy *pol;
2804 	pgoff_t ilx;
2805 	struct zoneref *z;
2806 	int curnid = folio_nid(folio);
2807 	struct vm_area_struct *vma = vmf->vma;
2808 	int thiscpu = raw_smp_processor_id();
2809 	int thisnid = numa_node_id();
2810 	int polnid = NUMA_NO_NODE;
2811 	int ret = NUMA_NO_NODE;
2812 
2813 	/*
2814 	 * Make sure ptl is held so that we don't preempt and we
2815 	 * have a stable smp processor id
2816 	 */
2817 	lockdep_assert_held(vmf->ptl);
2818 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2819 	if (!(pol->flags & MPOL_F_MOF))
2820 		goto out;
2821 
2822 	switch (pol->mode) {
2823 	case MPOL_INTERLEAVE:
2824 		polnid = interleave_nid(pol, ilx);
2825 		break;
2826 
2827 	case MPOL_WEIGHTED_INTERLEAVE:
2828 		polnid = weighted_interleave_nid(pol, ilx);
2829 		break;
2830 
2831 	case MPOL_PREFERRED:
2832 		if (node_isset(curnid, pol->nodes))
2833 			goto out;
2834 		polnid = first_node(pol->nodes);
2835 		break;
2836 
2837 	case MPOL_LOCAL:
2838 		polnid = numa_node_id();
2839 		break;
2840 
2841 	case MPOL_BIND:
2842 	case MPOL_PREFERRED_MANY:
2843 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
2844 		 * the policy nodemask, we don't allow NUMA migration to nodes
2845 		 * outside the policy nodemask for now. This is done so that if
2846 		 * we want demotion to slow memory to happen, before allocating
2847 		 * from some DRAM node, say 'x', we will end up using an
2848 		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such a
2849 		 * scenario we should not promote to node 'x' from a slow memory node.
2850 		 * we should not promote to node 'x' from slow memory node.
2851 		 */
2852 		if (pol->flags & MPOL_F_MORON) {
2853 			/*
2854 			 * Optimize placement among multiple nodes
2855 			 * via NUMA balancing
2856 			 */
2857 			if (node_isset(thisnid, pol->nodes))
2858 				break;
2859 			goto out;
2860 		}
2861 
2862 		/*
2863 		 * use current page if in policy nodemask,
2864 		 * else select nearest allowed node, if any.
2865 		 * If no allowed nodes, use current [!misplaced].
2866 		 */
2867 		if (node_isset(curnid, pol->nodes))
2868 			goto out;
2869 		z = first_zones_zonelist(
2870 				node_zonelist(thisnid, GFP_HIGHUSER),
2871 				gfp_zone(GFP_HIGHUSER),
2872 				&pol->nodes);
2873 		polnid = zonelist_node_idx(z);
2874 		break;
2875 
2876 	default:
2877 		BUG();
2878 	}
2879 
2880 	/* Migrate the folio towards the node whose CPU is referencing it */
2881 	if (pol->flags & MPOL_F_MORON) {
2882 		polnid = thisnid;
2883 
2884 		if (!should_numa_migrate_memory(current, folio, curnid,
2885 						thiscpu))
2886 			goto out;
2887 	}
2888 
2889 	if (curnid != polnid)
2890 		ret = polnid;
2891 out:
2892 	mpol_cond_put(pol);
2893 
2894 	return ret;
2895 }
2896 
2897 /*
2898  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2899  * dropped after task->mempolicy is set to NULL so that any allocation done as
2900  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2901  * policy.
2902  */
2903 void mpol_put_task_policy(struct task_struct *task)
2904 {
2905 	struct mempolicy *pol;
2906 
2907 	task_lock(task);
2908 	pol = task->mempolicy;
2909 	task->mempolicy = NULL;
2910 	task_unlock(task);
2911 	mpol_put(pol);
2912 }
2913 
2914 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2915 {
2916 	rb_erase(&n->nd, &sp->root);
2917 	sp_free(n);
2918 }
2919 
2920 static void sp_node_init(struct sp_node *node, unsigned long start,
2921 			unsigned long end, struct mempolicy *pol)
2922 {
2923 	node->start = start;
2924 	node->end = end;
2925 	node->policy = pol;
2926 }
2927 
2928 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2929 				struct mempolicy *pol)
2930 {
2931 	struct sp_node *n;
2932 	struct mempolicy *newpol;
2933 
2934 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2935 	if (!n)
2936 		return NULL;
2937 
2938 	newpol = mpol_dup(pol);
2939 	if (IS_ERR(newpol)) {
2940 		kmem_cache_free(sn_cache, n);
2941 		return NULL;
2942 	}
2943 	newpol->flags |= MPOL_F_SHARED;
2944 	sp_node_init(n, start, end, newpol);
2945 
2946 	return n;
2947 }
2948 
2949 /* Replace a policy range. */
2950 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2951 				 pgoff_t end, struct sp_node *new)
2952 {
2953 	struct sp_node *n;
2954 	struct sp_node *n_new = NULL;
2955 	struct mempolicy *mpol_new = NULL;
2956 	int ret = 0;
2957 
2958 restart:
2959 	write_lock(&sp->lock);
2960 	n = sp_lookup(sp, start, end);
2961 	/* Take care of old policies in the same range. */
2962 	while (n && n->start < end) {
2963 		struct rb_node *next = rb_next(&n->nd);
2964 		if (n->start >= start) {
2965 			if (n->end <= end)
2966 				sp_delete(sp, n);
2967 			else
2968 				n->start = end;
2969 		} else {
2970 			/* Old policy spanning whole new range. */
2971 			if (n->end > end) {
2972 				if (!n_new)
2973 					goto alloc_new;
2974 
2975 				*mpol_new = *n->policy;
2976 				atomic_set(&mpol_new->refcnt, 1);
2977 				sp_node_init(n_new, end, n->end, mpol_new);
2978 				n->end = start;
2979 				sp_insert(sp, n_new);
2980 				n_new = NULL;
2981 				mpol_new = NULL;
2982 				break;
2983 			} else
2984 				n->end = start;
2985 		}
2986 		if (!next)
2987 			break;
2988 		n = rb_entry(next, struct sp_node, nd);
2989 	}
2990 	if (new)
2991 		sp_insert(sp, new);
2992 	write_unlock(&sp->lock);
2993 	ret = 0;
2994 
2995 err_out:
2996 	if (mpol_new)
2997 		mpol_put(mpol_new);
2998 	if (n_new)
2999 		kmem_cache_free(sn_cache, n_new);
3000 
3001 	return ret;
3002 
3003 alloc_new:
3004 	write_unlock(&sp->lock);
3005 	ret = -ENOMEM;
3006 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3007 	if (!n_new)
3008 		goto err_out;
3009 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3010 	if (!mpol_new)
3011 		goto err_out;
3012 	atomic_set(&mpol_new->refcnt, 1);
3013 	goto restart;
3014 }
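/*
 * Worked example (illustrative): with an existing node covering pgoff
 * [0, 10) and a replacement for [3, 7), the old node is trimmed to [0, 3),
 * a copy of its policy is inserted for [7, 10) via the n_new/mpol_new pair
 * allocated above, and the new node is inserted for [3, 7).
 */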
3015 
3016 /**
3017  * mpol_shared_policy_init - initialize shared policy for inode
3018  * @sp: pointer to inode shared policy
3019  * @mpol:  struct mempolicy to install
3020  *
3021  * Install non-NULL @mpol in inode's shared policy rb-tree.
3022  * On entry, the current task has a reference on a non-NULL @mpol.
3023  * This must be released on exit.
3024  * This is called during get_inode() calls, so we can use GFP_KERNEL.
3025  */
3026 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3027 {
3028 	int ret;
3029 
3030 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3031 	rwlock_init(&sp->lock);
3032 
3033 	if (mpol) {
3034 		struct sp_node *sn;
3035 		struct mempolicy *npol;
3036 		NODEMASK_SCRATCH(scratch);
3037 
3038 		if (!scratch)
3039 			goto put_mpol;
3040 
3041 		/* contextualize the tmpfs mount point mempolicy to this file */
3042 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3043 		if (IS_ERR(npol))
3044 			goto free_scratch; /* no valid nodemask intersection */
3045 
3046 		task_lock(current);
3047 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3048 		task_unlock(current);
3049 		if (ret)
3050 			goto put_npol;
3051 
3052 		/* alloc node covering entire file; adds ref to file's npol */
3053 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3054 		if (sn)
3055 			sp_insert(sp, sn);
3056 put_npol:
3057 		mpol_put(npol);	/* drop initial ref on file's npol */
3058 free_scratch:
3059 		NODEMASK_SCRATCH_FREE(scratch);
3060 put_mpol:
3061 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3062 	}
3063 }
3064 
3065 int mpol_set_shared_policy(struct shared_policy *sp,
3066 			struct vm_area_struct *vma, struct mempolicy *pol)
3067 {
3068 	int err;
3069 	struct sp_node *new = NULL;
3070 	unsigned long sz = vma_pages(vma);
3071 
3072 	if (pol) {
3073 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3074 		if (!new)
3075 			return -ENOMEM;
3076 	}
3077 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3078 	if (err && new)
3079 		sp_free(new);
3080 	return err;
3081 }
3082 
3083 /* Free a backing policy store on inode delete. */
3084 void mpol_free_shared_policy(struct shared_policy *sp)
3085 {
3086 	struct sp_node *n;
3087 	struct rb_node *next;
3088 
3089 	if (!sp->root.rb_node)
3090 		return;
3091 	write_lock(&sp->lock);
3092 	next = rb_first(&sp->root);
3093 	while (next) {
3094 		n = rb_entry(next, struct sp_node, nd);
3095 		next = rb_next(&n->nd);
3096 		sp_delete(sp, n);
3097 	}
3098 	write_unlock(&sp->lock);
3099 }
3100 
3101 #ifdef CONFIG_NUMA_BALANCING
3102 static int __initdata numabalancing_override;
3103 
3104 static void __init check_numabalancing_enable(void)
3105 {
3106 	bool numabalancing_default = false;
3107 
3108 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3109 		numabalancing_default = true;
3110 
3111 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3112 	if (numabalancing_override)
3113 		set_numabalancing_state(numabalancing_override == 1);
3114 
3115 	if (num_online_nodes() > 1 && !numabalancing_override) {
3116 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3117 			numabalancing_default ? "Enabling" : "Disabling");
3118 		set_numabalancing_state(numabalancing_default);
3119 	}
3120 }
3121 
3122 static int __init setup_numabalancing(char *str)
3123 {
3124 	int ret = 0;
3125 	if (!str)
3126 		goto out;
3127 
3128 	if (!strcmp(str, "enable")) {
3129 		numabalancing_override = 1;
3130 		ret = 1;
3131 	} else if (!strcmp(str, "disable")) {
3132 		numabalancing_override = -1;
3133 		ret = 1;
3134 	}
3135 out:
3136 	if (!ret)
3137 		pr_warn("Unable to parse numa_balancing=\n");
3138 
3139 	return ret;
3140 }
3141 __setup("numa_balancing=", setup_numabalancing);
3142 #else
3143 static inline void __init check_numabalancing_enable(void)
3144 {
3145 }
3146 #endif /* CONFIG_NUMA_BALANCING */
3147 
3148 void __init numa_policy_init(void)
3149 {
3150 	nodemask_t interleave_nodes;
3151 	unsigned long largest = 0;
3152 	int nid, prefer = 0;
3153 
3154 	policy_cache = kmem_cache_create("numa_policy",
3155 					 sizeof(struct mempolicy),
3156 					 0, SLAB_PANIC, NULL);
3157 
3158 	sn_cache = kmem_cache_create("shared_policy_node",
3159 				     sizeof(struct sp_node),
3160 				     0, SLAB_PANIC, NULL);
3161 
3162 	for_each_node(nid) {
3163 		preferred_node_policy[nid] = (struct mempolicy) {
3164 			.refcnt = ATOMIC_INIT(1),
3165 			.mode = MPOL_PREFERRED,
3166 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3167 			.nodes = nodemask_of_node(nid),
3168 		};
3169 	}
3170 
3171 	/*
3172 	 * Set interleaving policy for system init. Interleaving is only
3173 	 * enabled across suitably sized nodes (default is >= 16MB), falling
3174 	 * back to the largest node if they're all smaller.
3175 	 */
3176 	nodes_clear(interleave_nodes);
3177 	for_each_node_state(nid, N_MEMORY) {
3178 		unsigned long total_pages = node_present_pages(nid);
3179 
3180 		/* Preserve the largest node */
3181 		if (largest < total_pages) {
3182 			largest = total_pages;
3183 			prefer = nid;
3184 		}
3185 
3186 		/* Interleave this node? */
3187 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3188 			node_set(nid, interleave_nodes);
3189 	}
3190 
3191 	/* All too small, use the largest */
3192 	if (unlikely(nodes_empty(interleave_nodes)))
3193 		node_set(prefer, interleave_nodes);
3194 
3195 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3196 		pr_err("%s: interleaving failed\n", __func__);
3197 
3198 	check_numabalancing_enable();
3199 }
3200 
3201 /* Reset policy of current process to default */
3202 void numa_default_policy(void)
3203 {
3204 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3205 }
3206 
3207 /*
3208  * Parse and format mempolicy from/to strings
3209  */
3210 static const char * const policy_modes[] =
3211 {
3212 	[MPOL_DEFAULT]    = "default",
3213 	[MPOL_PREFERRED]  = "prefer",
3214 	[MPOL_BIND]       = "bind",
3215 	[MPOL_INTERLEAVE] = "interleave",
3216 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3217 	[MPOL_LOCAL]      = "local",
3218 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
3219 };
3220 
3221 #ifdef CONFIG_TMPFS
3222 /**
3223  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3224  * @str:  string containing mempolicy to parse
3225  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3226  *
3227  * Format of input:
3228  *	<mode>[=<flags>][:<nodelist>]
3229  *
3230  * Return: %0 on success, else %1
3231  */
3232 int mpol_parse_str(char *str, struct mempolicy **mpol)
3233 {
3234 	struct mempolicy *new = NULL;
3235 	unsigned short mode_flags;
3236 	nodemask_t nodes;
3237 	char *nodelist = strchr(str, ':');
3238 	char *flags = strchr(str, '=');
3239 	int err = 1, mode;
3240 
3241 	if (flags)
3242 		*flags++ = '\0';	/* terminate mode string */
3243 
3244 	if (nodelist) {
3245 		/* NUL-terminate mode or flags string */
3246 		*nodelist++ = '\0';
3247 		if (nodelist_parse(nodelist, nodes))
3248 			goto out;
3249 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3250 			goto out;
3251 	} else
3252 		nodes_clear(nodes);
3253 
3254 	mode = match_string(policy_modes, MPOL_MAX, str);
3255 	if (mode < 0)
3256 		goto out;
3257 
3258 	switch (mode) {
3259 	case MPOL_PREFERRED:
3260 		/*
3261 		 * Insist on a nodelist of one node only; later we use
3262 		 * first_node(nodes) to grab the single node, so the nodelist
3263 		 * (or nodes) cannot be empty here.
3264 		 */
3265 		if (nodelist) {
3266 			char *rest = nodelist;
3267 			while (isdigit(*rest))
3268 				rest++;
3269 			if (*rest)
3270 				goto out;
3271 			if (nodes_empty(nodes))
3272 				goto out;
3273 		}
3274 		break;
3275 	case MPOL_INTERLEAVE:
3276 	case MPOL_WEIGHTED_INTERLEAVE:
3277 		/*
3278 		 * Default to online nodes with memory if no nodelist
3279 		 */
3280 		if (!nodelist)
3281 			nodes = node_states[N_MEMORY];
3282 		break;
3283 	case MPOL_LOCAL:
3284 		/*
3285 		 * Don't allow a nodelist;  mpol_new() checks flags
3286 		 */
3287 		if (nodelist)
3288 			goto out;
3289 		break;
3290 	case MPOL_DEFAULT:
3291 		/*
3292 		 * Insist on an empty nodelist
3293 		 */
3294 		if (!nodelist)
3295 			err = 0;
3296 		goto out;
3297 	case MPOL_PREFERRED_MANY:
3298 	case MPOL_BIND:
3299 		/*
3300 		 * Insist on a nodelist
3301 		 */
3302 		if (!nodelist)
3303 			goto out;
3304 	}
3305 
3306 	mode_flags = 0;
3307 	if (flags) {
3308 		/*
3309 		 * Currently, we only support two mutually exclusive
3310 		 * mode flags.
3311 		 */
3312 		if (!strcmp(flags, "static"))
3313 			mode_flags |= MPOL_F_STATIC_NODES;
3314 		else if (!strcmp(flags, "relative"))
3315 			mode_flags |= MPOL_F_RELATIVE_NODES;
3316 		else
3317 			goto out;
3318 	}
3319 
3320 	new = mpol_new(mode, mode_flags, &nodes);
3321 	if (IS_ERR(new))
3322 		goto out;
3323 
3324 	/*
3325 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3326 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3327 	 */
3328 	if (mode != MPOL_PREFERRED) {
3329 		new->nodes = nodes;
3330 	} else if (nodelist) {
3331 		nodes_clear(new->nodes);
3332 		node_set(first_node(nodes), new->nodes);
3333 	} else {
3334 		new->mode = MPOL_LOCAL;
3335 	}
3336 
3337 	/*
3338 	 * Save nodes for contextualization: this will be used to "clone"
3339 	 * the mempolicy in a specific context [cpuset] at a later time.
3340 	 */
3341 	new->w.user_nodemask = nodes;
3342 
3343 	err = 0;
3344 
3345 out:
3346 	/* Restore string for error message */
3347 	if (nodelist)
3348 		*--nodelist = ':';
3349 	if (flags)
3350 		*--flags = '=';
3351 	if (!err)
3352 		*mpol = new;
3353 	return err;
3354 }
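/*
 * Illustrative mount-option strings accepted above (assuming the named
 * nodes have memory): "interleave:0-3", "bind=static:0,2", "prefer:1"
 * (exactly one node) and "local".  A bare "prefer" with no nodelist is
 * rewritten to MPOL_LOCAL.
 */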
3355 #endif /* CONFIG_TMPFS */
3356 
3357 /**
3358  * mpol_to_str - format a mempolicy structure for printing
3359  * @buffer:  to contain formatted mempolicy string
3360  * @maxlen:  length of @buffer
3361  * @pol:  pointer to mempolicy to be formatted
3362  *
3363  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3364  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3365  * interleave", plus the longest flag flags, "relative|balancing", and to
3366  * interleave", plus the longest flags, "relative|balancing", and to
3367  */
3368 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3369 {
3370 	char *p = buffer;
3371 	nodemask_t nodes = NODE_MASK_NONE;
3372 	unsigned short mode = MPOL_DEFAULT;
3373 	unsigned short flags = 0;
3374 
3375 	if (pol &&
3376 	    pol != &default_policy &&
3377 	    !(pol >= &preferred_node_policy[0] &&
3378 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3379 		mode = pol->mode;
3380 		flags = pol->flags;
3381 	}
3382 
3383 	switch (mode) {
3384 	case MPOL_DEFAULT:
3385 	case MPOL_LOCAL:
3386 		break;
3387 	case MPOL_PREFERRED:
3388 	case MPOL_PREFERRED_MANY:
3389 	case MPOL_BIND:
3390 	case MPOL_INTERLEAVE:
3391 	case MPOL_WEIGHTED_INTERLEAVE:
3392 		nodes = pol->nodes;
3393 		break;
3394 	default:
3395 		WARN_ON_ONCE(1);
3396 		snprintf(p, maxlen, "unknown");
3397 		return;
3398 	}
3399 
3400 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3401 
3402 	if (flags & MPOL_MODE_FLAGS) {
3403 		p += snprintf(p, buffer + maxlen - p, "=");
3404 
3405 		/*
3406 		 * Static and relative are mutually exclusive.
3407 		 */
3408 		if (flags & MPOL_F_STATIC_NODES)
3409 			p += snprintf(p, buffer + maxlen - p, "static");
3410 		else if (flags & MPOL_F_RELATIVE_NODES)
3411 			p += snprintf(p, buffer + maxlen - p, "relative");
3412 
3413 		if (flags & MPOL_F_NUMA_BALANCING) {
3414 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3415 				p += snprintf(p, buffer + maxlen - p, "|");
3416 			p += snprintf(p, buffer + maxlen - p, "balancing");
3417 		}
3418 	}
3419 
3420 	if (!nodes_empty(nodes))
3421 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3422 			       nodemask_pr_args(&nodes));
3423 }
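/*
 * Illustrative output (assuming the stated policies): MPOL_INTERLEAVE over
 * nodes 0-3 formats as "interleave:0-3", MPOL_BIND with MPOL_F_STATIC_NODES
 * over nodes 0 and 2 as "bind=static:0,2", and MPOL_LOCAL simply as "local".
 */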
3424 
3425 #ifdef CONFIG_SYSFS
3426 struct iw_node_attr {
3427 	struct kobj_attribute kobj_attr;
3428 	int nid;
3429 };
3430 
3431 struct sysfs_wi_group {
3432 	struct kobject wi_kobj;
3433 	struct mutex kobj_lock;
3434 	struct iw_node_attr *nattrs[];
3435 };
3436 
3437 static struct sysfs_wi_group *wi_group;
3438 
3439 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3440 			 char *buf)
3441 {
3442 	struct iw_node_attr *node_attr;
3443 	u8 weight;
3444 
3445 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3446 	weight = get_il_weight(node_attr->nid);
3447 	return sysfs_emit(buf, "%d\n", weight);
3448 }
3449 
3450 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3451 			  const char *buf, size_t count)
3452 {
3453 	struct iw_node_attr *node_attr;
3454 	u8 *new;
3455 	u8 *old;
3456 	u8 weight = 0;
3457 
3458 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3459 	if (count == 0 || sysfs_streq(buf, ""))
3460 		weight = 0;
3461 	else if (kstrtou8(buf, 0, &weight))
3462 		return -EINVAL;
3463 
3464 	new = kzalloc(nr_node_ids, GFP_KERNEL);
3465 	if (!new)
3466 		return -ENOMEM;
3467 
3468 	mutex_lock(&iw_table_lock);
3469 	old = rcu_dereference_protected(iw_table,
3470 					lockdep_is_held(&iw_table_lock));
3471 	if (old)
3472 		memcpy(new, old, nr_node_ids);
3473 	new[node_attr->nid] = weight;
3474 	rcu_assign_pointer(iw_table, new);
3475 	mutex_unlock(&iw_table_lock);
3476 	synchronize_rcu();
3477 	kfree(old);
3478 	return count;
3479 }
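/*
 * Illustrative usage (a sketch; the attribute is created below under
 * /sys/kernel/mm/mempolicy/weighted_interleave/):
 *
 *	echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *
 * sets node 0's interleave weight to 4.  Writing an empty string or 0
 * clears the entry, and the allocation paths above then fall back to the
 * default weight of 1 for that node.
 */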
3480 
3481 static void sysfs_wi_node_delete(int nid)
3482 {
3483 	struct iw_node_attr *attr;
3484 
3485 	if (nid < 0 || nid >= nr_node_ids)
3486 		return;
3487 
3488 	mutex_lock(&wi_group->kobj_lock);
3489 	attr = wi_group->nattrs[nid];
3490 	if (!attr) {
3491 		mutex_unlock(&wi_group->kobj_lock);
3492 		return;
3493 	}
3494 
3495 	wi_group->nattrs[nid] = NULL;
3496 	mutex_unlock(&wi_group->kobj_lock);
3497 
3498 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3499 	kfree(attr->kobj_attr.attr.name);
3500 	kfree(attr);
3501 }
3502 
3503 static void sysfs_wi_node_delete_all(void)
3504 {
3505 	int nid;
3506 
3507 	for (nid = 0; nid < nr_node_ids; nid++)
3508 		sysfs_wi_node_delete(nid);
3509 }
3510 
3511 static void iw_table_free(void)
3512 {
3513 	u8 *old;
3514 
3515 	mutex_lock(&iw_table_lock);
3516 	old = rcu_dereference_protected(iw_table,
3517 					lockdep_is_held(&iw_table_lock));
3518 	rcu_assign_pointer(iw_table, NULL);
3519 	mutex_unlock(&iw_table_lock);
3520 
3521 	synchronize_rcu();
3522 	kfree(old);
3523 }
3524 
3525 static void wi_cleanup(void) {
3526 	sysfs_wi_node_delete_all();
3527 	iw_table_free();
3528 }
3529 
3530 static void wi_kobj_release(struct kobject *wi_kobj)
3531 {
3532 	kfree(wi_group);
3533 }
3534 
3535 static const struct kobj_type wi_ktype = {
3536 	.sysfs_ops = &kobj_sysfs_ops,
3537 	.release = wi_kobj_release,
3538 };
3539 
3540 static int sysfs_wi_node_add(int nid)
3541 {
3542 	int ret;
3543 	char *name;
3544 	struct iw_node_attr *new_attr;
3545 
3546 	if (nid < 0 || nid >= nr_node_ids) {
3547 		pr_err("invalid node id: %d\n", nid);
3548 		return -EINVAL;
3549 	}
3550 
3551 	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3552 	if (!new_attr)
3553 		return -ENOMEM;
3554 
3555 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3556 	if (!name) {
3557 		kfree(new_attr);
3558 		return -ENOMEM;
3559 	}
3560 
3561 	sysfs_attr_init(&new_attr->kobj_attr.attr);
3562 	new_attr->kobj_attr.attr.name = name;
3563 	new_attr->kobj_attr.attr.mode = 0644;
3564 	new_attr->kobj_attr.show = node_show;
3565 	new_attr->kobj_attr.store = node_store;
3566 	new_attr->nid = nid;
3567 
3568 	mutex_lock(&wi_group->kobj_lock);
3569 	if (wi_group->nattrs[nid]) {
3570 		mutex_unlock(&wi_group->kobj_lock);
3571 		ret = -EEXIST;
3572 		goto out;
3573 	}
3574 
3575 	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3576 	if (ret) {
3577 		mutex_unlock(&wi_group->kobj_lock);
3578 		goto out;
3579 	}
3580 	wi_group->nattrs[nid] = new_attr;
3581 	mutex_unlock(&wi_group->kobj_lock);
3582 	return 0;
3583 
3584 out:
3585 	kfree(new_attr->kobj_attr.attr.name);
3586 	kfree(new_attr);
3587 	return ret;
3588 }
3589 
3590 static int wi_node_notifier(struct notifier_block *nb,
3591 			       unsigned long action, void *data)
3592 {
3593 	int err;
3594 	struct memory_notify *arg = data;
3595 	int nid = arg->status_change_nid;
3596 
3597 	if (nid < 0)
3598 		return NOTIFY_OK;
3599 
3600 	switch (action) {
3601 	case MEM_ONLINE:
3602 		err = sysfs_wi_node_add(nid);
3603 		if (err)
3604 			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3605 			       nid, err);
3606 		break;
3607 	case MEM_OFFLINE:
3608 		sysfs_wi_node_delete(nid);
3609 		break;
3610 	}
3611 
3612 	return NOTIFY_OK;
3613 }
3614 
3615 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3616 {
3617 	int nid, err;
3618 
3619 	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
3620 			   GFP_KERNEL);
3621 	if (!wi_group)
3622 		return -ENOMEM;
3623 	mutex_init(&wi_group->kobj_lock);
3624 
3625 	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
3626 				   "weighted_interleave");
3627 	if (err)
3628 		goto err_put_kobj;
3629 
3630 	for_each_online_node(nid) {
3631 		if (!node_state(nid, N_MEMORY))
3632 			continue;
3633 
3634 		err = sysfs_wi_node_add(nid);
3635 		if (err) {
3636 			pr_err("failed to add sysfs for node%d during init: %d\n",
3637 			       nid, err);
3638 			goto err_cleanup_kobj;
3639 		}
3640 	}
3641 
3642 	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3643 	return 0;
3644 
3645 err_cleanup_kobj:
3646 	wi_cleanup();
3647 	kobject_del(&wi_group->wi_kobj);
3648 err_put_kobj:
3649 	kobject_put(&wi_group->wi_kobj);
3650 	return err;
3651 }
3652 
3653 static int __init mempolicy_sysfs_init(void)
3654 {
3655 	int err;
3656 	static struct kobject *mempolicy_kobj;
3657 
3658 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3659 	if (!mempolicy_kobj)
3660 		return -ENOMEM;
3661 
3662 	err = add_weighted_interleave_group(mempolicy_kobj);
3663 	if (err)
3664 		goto err_kobj;
3665 
3666 	return 0;
3667 
3668 err_kobj:
3669 	kobject_del(mempolicy_kobj);
3670 	kobject_put(mempolicy_kobj);
3671 	return err;
3672 }
3673 
3674 late_initcall(mempolicy_sysfs_init);
3675 #endif /* CONFIG_SYSFS */
3676