xref: /linux/mm/mempolicy.c (revision 86941382508850d58c11bdafe0fec646dfd31b09)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use the default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always grateful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112 #include <linux/gcd.h>
113 
114 #include <asm/tlbflush.h>
115 #include <asm/tlb.h>
116 #include <linux/uaccess.h>
117 #include <linux/memory.h>
118 
119 #include "internal.h"
120 
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

/* Slab cache that struct mempolicy objects are allocated from and freed to. */
static struct kmem_cache *policy_cache;
/* NOTE(review): not used in this part of the file; purpose to be confirmed. */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/*
 * One policy per node id, looked up by get_task_policy() for tasks that
 * have no policy of their own.  An entry whose .mode is still zero is
 * treated as not yet initialised and default_policy is used instead.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
142 
/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	/* false means manual mode: bandwidth updates leave iw_table untouched */
	bool mode_auto;
	/* interleave weight of each node, indexed by node id */
	u8 iw_table[];
};
/* Currently published weights; readers dereference it under RCU. */
static struct weighted_interleave_state __rcu *wi_state;
/* Per-node bandwidth values that automatic weights are derived from. */
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);
167 
168 static u8 get_il_weight(int node)
169 {
170 	struct weighted_interleave_state *state;
171 	u8 weight = 1;
172 
173 	rcu_read_lock();
174 	state = rcu_dereference(wi_state);
175 	if (state)
176 		weight = state->iw_table[node];
177 	rcu_read_unlock();
178 	return weight;
179 }
180 
181 /*
182  * Convert bandwidth values into weighted interleave weights.
183  * Call with wi_state_lock.
184  */
185 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
186 {
187 	u64 sum_bw = 0;
188 	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
189 	int nid;
190 
191 	for_each_node_state(nid, N_MEMORY)
192 		sum_bw += bw[nid];
193 
194 	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
195 	for_each_node_state(nid, N_MEMORY) {
196 		/*
197 		 * Try not to perform 64-bit division.
198 		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
199 		 * If sum_bw > scaling_factor, then round the weight up to 1.
200 		 */
201 		scaling_factor = weightiness * bw[nid];
202 		if (bw[nid] && sum_bw < scaling_factor) {
203 			cast_sum_bw = (unsigned int)sum_bw;
204 			new_iw[nid] = scaling_factor / cast_sum_bw;
205 		} else {
206 			new_iw[nid] = 1;
207 		}
208 		if (!iw_gcd)
209 			iw_gcd = new_iw[nid];
210 		iw_gcd = gcd(iw_gcd, new_iw[nid]);
211 	}
212 
213 	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
214 	for_each_node_state(nid, N_MEMORY)
215 		new_iw[nid] /= iw_gcd;
216 }
217 
/*
 * mempolicy_set_node_perf - record a node's bandwidth and refresh auto weights
 * @node:   node id whose bandwidth entry is updated
 * @coords: access coordinates; the smaller of the read and write bandwidth
 *          is the value recorded
 *
 * Replaces node_bw_table with a copy in which @node's entry is updated.
 * Unless the current wi_state is in manual mode, a new wi_state with weights
 * recomputed from the new table is published as well.
 *
 * NOTE(review): @node indexes the table without a range check here;
 * presumably callers only pass valid node ids - confirm.
 *
 * Return: 0 on success, -ENOMEM if either allocation fails.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	/* Both allocations happen before wi_state_lock is taken. */
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	/* Start from auto mode with every weight at 1. */
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	/* Carry over the previous bandwidths, then install the new table. */
	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	/* Wait for RCU readers of the old state before freeing it. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	/* The old bandwidth table was replaced under the lock; free it now. */
	kfree(old_bw);
	return 0;
}
275 
276 /**
277  * numa_nearest_node - Find nearest node by state
278  * @node: Node id to start the search
279  * @state: State to filter the search
280  *
281  * Lookup the closest node by distance if @nid is not in state.
282  *
283  * Return: this @node if it is in state, otherwise the closest node by distance
284  */
285 int numa_nearest_node(int node, unsigned int state)
286 {
287 	int min_dist = INT_MAX, dist, n, min_node;
288 
289 	if (state >= NR_NODE_STATES)
290 		return -EINVAL;
291 
292 	if (node == NUMA_NO_NODE || node_state(node, state))
293 		return node;
294 
295 	min_node = node;
296 	for_each_node_state(n, state) {
297 		dist = node_distance(node, n);
298 		if (dist < min_dist) {
299 			min_dist = dist;
300 			min_node = n;
301 		}
302 	}
303 
304 	return min_node;
305 }
306 EXPORT_SYMBOL_GPL(numa_nearest_node);
307 
308 /**
309  * nearest_node_nodemask - Find the node in @mask at the nearest distance
310  *			   from @node.
311  *
312  * @node: a valid node ID to start the search from.
313  * @mask: a pointer to a nodemask representing the allowed nodes.
314  *
315  * This function iterates over all nodes in @mask and calculates the
316  * distance from the starting @node, then it returns the node ID that is
317  * the closest to @node, or MAX_NUMNODES if no node is found.
318  *
319  * Note that @node must be a valid node ID usable with node_distance(),
320  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
321  * or unexpected behavior.
322  */
323 int nearest_node_nodemask(int node, nodemask_t *mask)
324 {
325 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
326 
327 	for_each_node_mask(n, *mask) {
328 		dist = node_distance(node, n);
329 		if (dist < min_dist) {
330 			min_dist = dist;
331 			min_node = n;
332 		}
333 	}
334 
335 	return min_node;
336 }
337 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
338 
339 struct mempolicy *get_task_policy(struct task_struct *p)
340 {
341 	struct mempolicy *pol = p->mempolicy;
342 	int node;
343 
344 	if (pol)
345 		return pol;
346 
347 	node = numa_node_id();
348 	if (node != NUMA_NO_NODE) {
349 		pol = &preferred_node_policy[node];
350 		/* preferred_node_policy is not initialised early in boot */
351 		if (pol->mode)
352 			return pol;
353 	}
354 
355 	return &default_policy;
356 }
357 
/*
 * Per-mode callbacks, indexed by policy mode.  This is only the type and a
 * forward declaration of the table; the entries are filled in further down.
 *
 * @create: install the node set of a new policy from an already computed
 *          nodemask; returns 0 or a negative errno.
 * @rebind: adjust an existing policy to a new set of nodes.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
362 
363 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
364 {
365 	return pol->flags & MPOL_MODE_FLAGS;
366 }
367 
368 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
369 				   const nodemask_t *rel)
370 {
371 	nodemask_t tmp;
372 	nodes_fold(tmp, *orig, nodes_weight(*rel));
373 	nodes_onto(*ret, tmp, *rel);
374 }
375 
376 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
377 {
378 	if (nodes_empty(*nodes))
379 		return -EINVAL;
380 	pol->nodes = *nodes;
381 	return 0;
382 }
383 
384 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
385 {
386 	if (nodes_empty(*nodes))
387 		return -EINVAL;
388 
389 	nodes_clear(pol->nodes);
390 	node_set(first_node(*nodes), pol->nodes);
391 	return 0;
392 }
393 
394 /*
395  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
396  * any, for the new policy.  mpol_new() has already validated the nodes
397  * parameter with respect to the policy mode and flags.
398  *
399  * Must be called holding task's alloc_lock to protect task's mems_allowed
400  * and mempolicy.  May also be called holding the mmap_lock for write.
401  */
402 static int mpol_set_nodemask(struct mempolicy *pol,
403 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
404 {
405 	int ret;
406 
407 	/*
408 	 * Default (pol==NULL) resp. local memory policies are not a
409 	 * subject of any remapping. They also do not need any special
410 	 * constructor.
411 	 */
412 	if (!pol || pol->mode == MPOL_LOCAL)
413 		return 0;
414 
415 	/* Check N_MEMORY */
416 	nodes_and(nsc->mask1,
417 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
418 
419 	VM_BUG_ON(!nodes);
420 
421 	if (pol->flags & MPOL_F_RELATIVE_NODES)
422 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
423 	else
424 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
425 
426 	if (mpol_store_user_nodemask(pol))
427 		pol->w.user_nodemask = *nodes;
428 	else
429 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
430 
431 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
432 	return ret;
433 }
434 
435 /*
436  * This function just creates a new policy, does some check and simple
437  * initialization. You must invoke mpol_set_nodemask() to set nodes.
438  */
439 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
440 				  nodemask_t *nodes)
441 {
442 	struct mempolicy *policy;
443 
444 	if (mode == MPOL_DEFAULT) {
445 		if (nodes && !nodes_empty(*nodes))
446 			return ERR_PTR(-EINVAL);
447 		return NULL;
448 	}
449 	VM_BUG_ON(!nodes);
450 
451 	/*
452 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
453 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
454 	 * All other modes require a valid pointer to a non-empty nodemask.
455 	 */
456 	if (mode == MPOL_PREFERRED) {
457 		if (nodes_empty(*nodes)) {
458 			if (((flags & MPOL_F_STATIC_NODES) ||
459 			     (flags & MPOL_F_RELATIVE_NODES)))
460 				return ERR_PTR(-EINVAL);
461 
462 			mode = MPOL_LOCAL;
463 		}
464 	} else if (mode == MPOL_LOCAL) {
465 		if (!nodes_empty(*nodes) ||
466 		    (flags & MPOL_F_STATIC_NODES) ||
467 		    (flags & MPOL_F_RELATIVE_NODES))
468 			return ERR_PTR(-EINVAL);
469 	} else if (nodes_empty(*nodes))
470 		return ERR_PTR(-EINVAL);
471 
472 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
473 	if (!policy)
474 		return ERR_PTR(-ENOMEM);
475 	atomic_set(&policy->refcnt, 1);
476 	policy->mode = mode;
477 	policy->flags = flags;
478 	policy->home_node = NUMA_NO_NODE;
479 
480 	return policy;
481 }
482 
483 /* Slow path of a mpol destructor. */
484 void __mpol_put(struct mempolicy *pol)
485 {
486 	if (!atomic_dec_and_test(&pol->refcnt))
487 		return;
488 	kmem_cache_free(policy_cache, pol);
489 }
490 
/* .rebind hook for modes with nothing to rebind: intentionally a no-op. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
494 
495 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
496 {
497 	nodemask_t tmp;
498 
499 	if (pol->flags & MPOL_F_STATIC_NODES)
500 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
501 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
502 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
503 	else {
504 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
505 								*nodes);
506 		pol->w.cpuset_mems_allowed = *nodes;
507 	}
508 
509 	if (nodes_empty(tmp))
510 		tmp = *nodes;
511 
512 	pol->nodes = tmp;
513 }
514 
/*
 * .rebind hook for the preferred modes: only the recorded cpuset mask is
 * updated; pol->nodes is left as it is.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
520 
521 /*
522  * mpol_rebind_policy - Migrate a policy to a different set of nodes
523  *
524  * Per-vma policies are protected by mmap_lock. Allocations using per-task
525  * policies are protected by task->mems_allowed_seq to prevent a premature
526  * OOM/allocation failure due to parallel nodemask modification.
527  */
528 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
529 {
530 	if (!pol || pol->mode == MPOL_LOCAL)
531 		return;
532 	if (!mpol_store_user_nodemask(pol) &&
533 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
534 		return;
535 
536 	mpol_ops[pol->mode].rebind(pol, newmask);
537 }
538 
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * @tsk: task whose ->mempolicy is rebound (a NULL policy is a no-op)
 * @new: the new set of nodes
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
549 
/*
 * Rebind each vma in mm to new nodemask.
 *
 * @mm:  address space whose VMA policies are rebound
 * @new: the new set of nodes
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		/* Write-lock the VMA itself before touching its policy. */
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}
567 
/*
 * Per-mode callbacks: .create is dispatched from mpol_set_nodemask() and
 * .rebind from mpol_rebind_policy().
 *
 * MPOL_DEFAULT and MPOL_LOCAL have no .create: mpol_new() returns NULL for
 * MPOL_DEFAULT, and mpol_set_nodemask() returns before dispatching for a
 * NULL or MPOL_LOCAL policy.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};
596 
/* Forward declarations of helpers used before their definitions. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				pgoff_t ilx, int *nid);
601 
602 static bool strictly_unmovable(unsigned long flags)
603 {
604 	/*
605 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
606 	 * if any misplaced page is found.
607 	 */
608 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
609 			 MPOL_MF_STRICT;
610 }
611 
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;	/* policy to allocate the target under */
	pgoff_t ilx;		/* NOTE(review): presumably an interleave index - confirm */
};
616 
/* State shared by the page-walk callbacks driven from queue_pages_range(). */
struct queue_pages {
	struct list_head *pagelist;	/* folios isolated for migration */
	unsigned long flags;		/* MPOL_MF_* flags, internal ones included */
	nodemask_t *nmask;		/* nodes tested by queue_folio_required() */
	unsigned long start;		/* start of the walked range */
	unsigned long end;		/* end of the walked range */
	struct vm_area_struct *first;	/* first vma visited; NULL if none yet */
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
627 
628 /*
629  * Check if the folio's nid is in qp->nmask.
630  *
631  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
632  * in the invert of qp->nmask.
633  */
634 static inline bool queue_folio_required(struct folio *folio,
635 					struct queue_pages *qp)
636 {
637 	int nid = folio_nid(folio);
638 	unsigned long flags = qp->flags;
639 
640 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
641 }
642 
/*
 * Handle one huge pmd for queue_folios_pte_range(), which calls this with
 * the pmd lock held.
 *
 * A pmd migration entry counts as a failure.  The huge zero folio is
 * skipped.  A folio that queue_folio_required() selects is queued for
 * migration; if moving was not requested, the vma is not migratable, or
 * migrate_folio_add() fails, qp->nr_failed is incremented instead.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	/* Order matters: migrate_folio_add() is only tried as a last step. */
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}
664 
/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * This is the .pmd_entry callback of the queue_pages walk ops; walk->private
 * is the struct queue_pages holding flags, the pagelist and the failure
 * count.
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A huge pmd is handled as a single folio under the pmd lock. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* Could not map the page table: ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	/* nr is the number of PTEs consumed by the current iteration. */
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			/* A folio already under migration counts as a failure. */
			if (is_migration_entry(pte_to_swp_entry(ptent)))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* Batch the PTEs that map the same large folio. */
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, addr, pte, ptent,
					     max_nr, fpb_flags,
					     NULL, NULL, NULL);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			/* Strict-only request: one misplaced folio is enough. */
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
762 
/*
 * .hugetlb_entry callback of the queue_pages walk ops: check one hugetlb
 * entry and isolate its folio onto qp->pagelist when it has to move.
 *
 * Returns -EIO when a failure has been counted and only MPOL_MF_STRICT was
 * requested (no MOVE flags), 0 otherwise.  Without CONFIG_HUGETLB_PAGE the
 * function does nothing and returns 0.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(entry)) {
		/* A folio already under migration counts as a failure. */
		if (unlikely(is_hugetlb_entry_migration(entry)))
			qp->nr_failed++;
		goto unlock;
	}
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	/* Misplaced, but moving was not requested or is not possible. */
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
807 
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * @vma:  the vma containing the range
 * @addr: start of the range
 * @end:  end of the range
 *
 * Returns the value reported by change_protection(); a positive value is
 * also added to the NUMA_PTE_UPDATES vm and memcg event counters.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */
837 
838 static int queue_pages_test_walk(unsigned long start, unsigned long end,
839 				struct mm_walk *walk)
840 {
841 	struct vm_area_struct *next, *vma = walk->vma;
842 	struct queue_pages *qp = walk->private;
843 	unsigned long flags = qp->flags;
844 
845 	/* range check first */
846 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
847 
848 	if (!qp->first) {
849 		qp->first = vma;
850 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
851 			(qp->start < vma->vm_start))
852 			/* hole at head side of range */
853 			return -EFAULT;
854 	}
855 	next = find_vma(vma->vm_mm, vma->vm_end);
856 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
857 		((vma->vm_end < qp->end) &&
858 		(!next || vma->vm_end < next->vm_start)))
859 		/* hole at middle or tail of range */
860 		return -EFAULT;
861 
862 	/*
863 	 * Need check MPOL_MF_STRICT to return -EIO if possible
864 	 * regardless of vma_migratable
865 	 */
866 	if (!vma_migratable(vma) &&
867 	    !(flags & MPOL_MF_STRICT))
868 		return 1;
869 
870 	/*
871 	 * Check page nodes, and queue pages to move, in the current vma.
872 	 * But if no moving, and no strict checking, the scan can be skipped.
873 	 */
874 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
875 		return 0;
876 	return 1;
877 }
878 
/*
 * Two walk-op sets with identical callbacks; they differ only in walk_lock.
 * queue_pages_range() uses the PGWALK_WRLOCK variant when MPOL_MF_WRLOCK is
 * set and the PGWALK_RDLOCK variant otherwise.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
892 
893 /*
894  * Walk through page tables and collect pages to be migrated.
895  *
896  * If pages found in a given range are not on the required set of @nodes,
897  * and migration is allowed, they are isolated and queued to @pagelist.
898  *
899  * queue_pages_range() may return:
900  * 0 - all pages already on the right node, or successfully queued for moving
901  *     (or neither strict checking nor moving requested: only range checking).
902  * >0 - this number of misplaced folios could not be queued for moving
903  *      (a hugetlbfs page or a transparent huge page being counted as 1).
904  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
905  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
906  */
907 static long
908 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
909 		nodemask_t *nodes, unsigned long flags,
910 		struct list_head *pagelist)
911 {
912 	int err;
913 	struct queue_pages qp = {
914 		.pagelist = pagelist,
915 		.flags = flags,
916 		.nmask = nodes,
917 		.start = start,
918 		.end = end,
919 		.first = NULL,
920 	};
921 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
922 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
923 
924 	err = walk_page_range(mm, start, end, ops, &qp);
925 
926 	if (!qp.first)
927 		/* whole range in hole */
928 		err = -EFAULT;
929 
930 	return err ? : qp.nr_failed;
931 }
932 
933 /*
934  * Apply policy to a single VMA
935  * This must be called with the mmap_lock held for writing.
936  */
937 static int vma_replace_policy(struct vm_area_struct *vma,
938 				struct mempolicy *pol)
939 {
940 	int err;
941 	struct mempolicy *old;
942 	struct mempolicy *new;
943 
944 	vma_assert_write_locked(vma);
945 
946 	new = mpol_dup(pol);
947 	if (IS_ERR(new))
948 		return PTR_ERR(new);
949 
950 	if (vma->vm_ops && vma->vm_ops->set_policy) {
951 		err = vma->vm_ops->set_policy(vma, new);
952 		if (err)
953 			goto err_out;
954 	}
955 
956 	old = vma->vm_policy;
957 	vma->vm_policy = new; /* protected by mmap_lock */
958 	mpol_put(old);
959 
960 	return 0;
961  err_out:
962 	mpol_put(new);
963 	return err;
964 }
965 
966 /* Split or merge the VMA (if required) and apply the new policy */
967 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
968 		struct vm_area_struct **prev, unsigned long start,
969 		unsigned long end, struct mempolicy *new_pol)
970 {
971 	unsigned long vmstart, vmend;
972 
973 	vmend = min(end, vma->vm_end);
974 	if (start > vma->vm_start) {
975 		*prev = vma;
976 		vmstart = start;
977 	} else {
978 		vmstart = vma->vm_start;
979 	}
980 
981 	if (mpol_equal(vma->vm_policy, new_pol)) {
982 		*prev = vma;
983 		return 0;
984 	}
985 
986 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
987 	if (IS_ERR(vma))
988 		return PTR_ERR(vma);
989 
990 	*prev = vma;
991 	return vma_replace_policy(vma, new_pol);
992 }
993 
994 /* Set the process memory policy */
995 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
996 			     nodemask_t *nodes)
997 {
998 	struct mempolicy *new, *old;
999 	NODEMASK_SCRATCH(scratch);
1000 	int ret;
1001 
1002 	if (!scratch)
1003 		return -ENOMEM;
1004 
1005 	new = mpol_new(mode, flags, nodes);
1006 	if (IS_ERR(new)) {
1007 		ret = PTR_ERR(new);
1008 		goto out;
1009 	}
1010 
1011 	task_lock(current);
1012 	ret = mpol_set_nodemask(new, nodes, scratch);
1013 	if (ret) {
1014 		task_unlock(current);
1015 		mpol_put(new);
1016 		goto out;
1017 	}
1018 
1019 	old = current->mempolicy;
1020 	current->mempolicy = new;
1021 	if (new && (new->mode == MPOL_INTERLEAVE ||
1022 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
1023 		current->il_prev = MAX_NUMNODES-1;
1024 		current->il_weight = 0;
1025 	}
1026 	task_unlock(current);
1027 	mpol_put(old);
1028 	ret = 0;
1029 out:
1030 	NODEMASK_SCRATCH_FREE(scratch);
1031 	return ret;
1032 }
1033 
1034 /*
1035  * Return nodemask for policy for get_mempolicy() query
1036  *
1037  * Called with task's alloc_lock held
1038  */
1039 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1040 {
1041 	nodes_clear(*nodes);
1042 	if (pol == &default_policy)
1043 		return;
1044 
1045 	switch (pol->mode) {
1046 	case MPOL_BIND:
1047 	case MPOL_INTERLEAVE:
1048 	case MPOL_PREFERRED:
1049 	case MPOL_PREFERRED_MANY:
1050 	case MPOL_WEIGHTED_INTERLEAVE:
1051 		*nodes = pol->nodes;
1052 		break;
1053 	case MPOL_LOCAL:
1054 		/* return empty node mask for local allocation */
1055 		break;
1056 	default:
1057 		BUG();
1058 	}
1059 }
1060 
1061 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1062 {
1063 	struct page *p = NULL;
1064 	int ret;
1065 
1066 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1067 	if (ret > 0) {
1068 		ret = page_to_nid(p);
1069 		put_page(p);
1070 	}
1071 	return ret;
1072 }
1073 
1074 /* Retrieve NUMA policy */
/*
 * do_get_mempolicy - look up a memory policy (or a node id) for get_mempolicy()
 * @policy: output; receives the policy mode ORed with its MPOL_MODE_FLAGS
 *          bits, or a node id when MPOL_F_NODE is set
 * @nmask:  output nodemask; written unconditionally for MPOL_F_MEMS_ALLOWED,
 *          otherwise only when non-NULL
 * @addr:   address to query with MPOL_F_ADDR; must be 0 without it
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Without MPOL_F_ADDR the task policy (current->mempolicy) is reported; with
 * it, the policy returned by __get_vma_policy() for the VMA containing @addr.
 * A NULL policy is reported through &default_policy / MPOL_DEFAULT.
 *
 * Returns 0 on success, -EINVAL for an unsupported flag combination (a
 * non-zero @addr without MPOL_F_ADDR, or MPOL_F_NODE without MPOL_F_ADDR when
 * the task policy is not one of the interleave modes), -EFAULT when no VMA
 * contains @addr, or the negative value returned by lookup_node().
 */
1075 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
1076 			     unsigned long addr, unsigned long flags)
1077 {
1078 	int err;
1079 	struct mm_struct *mm = current->mm;
1080 	struct vm_area_struct *vma = NULL;
1081 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1082 
1083 	if (flags &
1084 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1085 		return -EINVAL;
1086 
	/* MPOL_F_MEMS_ALLOWED is a standalone query: report the cpuset mask */
1087 	if (flags & MPOL_F_MEMS_ALLOWED) {
1088 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
1089 			return -EINVAL;
1090 		*policy = 0;	/* just so it's initialized */
1091 		task_lock(current);
1092 		*nmask  = cpuset_current_mems_allowed;
1093 		task_unlock(current);
1094 		return 0;
1095 	}
1096 
1097 	if (flags & MPOL_F_ADDR) {
1098 		pgoff_t ilx;		/* ignored here */
1099 		/*
1100 		 * Do NOT fall back to task policy if the
1101 		 * vma/shared policy at addr is NULL.  We
1102 		 * want to return MPOL_DEFAULT in this case.
1103 		 */
1104 		mmap_read_lock(mm);
1105 		vma = vma_lookup(mm, addr);
1106 		if (!vma) {
1107 			mmap_read_unlock(mm);
1108 			return -EFAULT;
1109 		}
1110 		pol = __get_vma_policy(vma, addr, &ilx);
1111 	} else if (addr)
1112 		return -EINVAL;
1113 
1114 	if (!pol)
1115 		pol = &default_policy;	/* indicates default behavior */
1116 
1117 	if (flags & MPOL_F_NODE) {
1118 		if (flags & MPOL_F_ADDR) {
1119 			/*
1120 			 * Take a refcount on the mpol, because we are about to
1121 			 * drop the mmap_lock, after which only "pol" remains
1122 			 * valid, "vma" is stale.
1123 			 */
1124 			pol_refcount = pol;
1125 			vma = NULL;
1126 			mpol_get(pol);
1127 			mmap_read_unlock(mm);
1128 			err = lookup_node(mm, addr);
1129 			if (err < 0)
1130 				goto out;
1131 			*policy = err;
1132 		} else if (pol == current->mempolicy &&
1133 				pol->mode == MPOL_INTERLEAVE) {
			/* Task interleave: report the node after il_prev */
1134 			*policy = next_node_in(current->il_prev, pol->nodes);
1135 		} else if (pol == current->mempolicy &&
1136 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/* Weight left on il_prev: report it, else the next node */
1137 			if (current->il_weight)
1138 				*policy = current->il_prev;
1139 			else
1140 				*policy = next_node_in(current->il_prev,
1141 						       pol->nodes);
1142 		} else {
1143 			err = -EINVAL;
1144 			goto out;
1145 		}
1146 	} else {
1147 		*policy = pol == &default_policy ? MPOL_DEFAULT :
1148 						pol->mode;
1149 		/*
1150 		 * Internal mempolicy flags must be masked off before exposing
1151 		 * the policy to userspace.
1152 		 */
1153 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1154 	}
1155 
1156 	err = 0;
	/*
	 * Report the stored user nodemask when mpol_store_user_nodemask() says
	 * the policy keeps one, otherwise the mask from get_policy_nodemask().
	 */
1157 	if (nmask) {
1158 		if (mpol_store_user_nodemask(pol)) {
1159 			*nmask = pol->w.user_nodemask;
1160 		} else {
1161 			task_lock(current);
1162 			get_policy_nodemask(pol, nmask);
1163 			task_unlock(current);
1164 		}
1165 	}
1166 
	/*
	 * Common exit: "vma" is still non-NULL only if the mmap_lock taken for
	 * MPOL_F_ADDR has not been dropped yet, and "pol_refcount" is set only
	 * when an extra reference was taken above.
	 */
1167  out:
1168 	mpol_cond_put(pol);
1169 	if (vma)
1170 		mmap_read_unlock(mm);
1171 	if (pol_refcount)
1172 		mpol_put(pol_refcount);
1173 	return err;
1174 }
1175 
1176 #ifdef CONFIG_MIGRATION
1177 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1178 				unsigned long flags)
1179 {
1180 	/*
1181 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1182 	 * Choosing not to migrate a shared folio is not counted as a failure.
1183 	 *
1184 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1185 	 * cannot easily detect if a folio is shared.
1186 	 */
1187 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1188 		if (folio_isolate_lru(folio)) {
1189 			list_add_tail(&folio->lru, foliolist);
1190 			node_stat_mod_folio(folio,
1191 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1192 				folio_nr_pages(folio));
1193 		} else {
1194 			/*
1195 			 * Non-movable folio may reach here.  And, there may be
1196 			 * temporary off LRU folios or non-LRU movable folios.
1197 			 * Treat them as unmovable folios since they can't be
1198 			 * isolated, so they can't be moved at the moment.
1199 			 */
1200 			return false;
1201 		}
1202 	}
1203 	return true;
1204 }
1205 
1206 /*
1207  * Migrate pages from one node to a target node.
1208  * Returns error or the number of pages not migrated.
1209  */
1210 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1211 			    int flags)
1212 {
1213 	nodemask_t nmask;
1214 	struct vm_area_struct *vma;
1215 	LIST_HEAD(pagelist);
1216 	long nr_failed;
1217 	long err = 0;
1218 	struct migration_target_control mtc = {
1219 		.nid = dest,
1220 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1221 		.reason = MR_SYSCALL,
1222 	};
1223 
1224 	nodes_clear(nmask);
1225 	node_set(source, nmask);
1226 
1227 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1228 
1229 	mmap_read_lock(mm);
1230 	vma = find_vma(mm, 0);
1231 	if (unlikely(!vma)) {
1232 		mmap_read_unlock(mm);
1233 		return 0;
1234 	}
1235 
1236 	/*
1237 	 * This does not migrate the range, but isolates all pages that
1238 	 * need migration.  Between passing in the full user address
1239 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1240 	 * but passes back the count of pages which could not be isolated.
1241 	 */
1242 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1243 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1244 	mmap_read_unlock(mm);
1245 
1246 	if (!list_empty(&pagelist)) {
1247 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1248 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1249 		if (err)
1250 			putback_movable_pages(&pagelist);
1251 	}
1252 
1253 	if (err >= 0)
1254 		err += nr_failed;
1255 	return err;
1256 }
1257 
1258 /*
1259  * Move pages between the two nodesets so as to preserve the physical
1260  * layout as much as possible.
1261  *
 * @mm:    address space whose pages are migrated
 * @from:  set of source nodes
 * @to:    set of destination nodes; each source is mapped onto a destination
 *         with node_remap()
 * @flags: passed through to migrate_to_node()
 *
1262  * Returns the number of pages that could not be moved (clamped to INT_MAX),
 * or the negative error returned by migrate_to_node().
1263  */
1264 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1265 		     const nodemask_t *to, int flags)
1266 {
1267 	long nr_failed = 0;
1268 	long err = 0;
1269 	nodemask_t tmp;
1270 
1271 	lru_cache_disable();
1272 
1273 	/*
1274 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1275 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1276 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1277 	 * The pair of nodemasks 'to' and 'from' define the map.
1278 	 *
1279 	 * If no pair of bits is found that way, fallback to picking some
1280 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1281 	 * 'source' and 'dest' bits are the same, this represents a node
1282 	 * that will be migrating to itself, so no pages need move.
1283 	 *
1284 	 * If no bits are left in 'tmp', or if all remaining bits left
1285 	 * in 'tmp' correspond to the same bit in 'to', return false
1286 	 * (nothing left to migrate).
1287 	 *
1288 	 * This lets us pick a pair of nodes to migrate between, such that
1289 	 * if possible the dest node is not already occupied by some other
1290 	 * source node, minimizing the risk of overloading the memory on a
1291 	 * node that would happen if we migrated incoming memory to a node
1292 	 * before migrating outgoing memory from that same node.
1293 	 *
1294 	 * A single scan of tmp is sufficient.  As we go, we remember the
1295 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1296 	 * that not only moved, but what's better, moved to an empty slot
1297 	 * (d is not set in tmp), then we break out then, with that pair.
1298 	 * Otherwise when we finish scanning tmp, we at least have the
1299 	 * most recent <s, d> pair that moved.  If we get all the way through
1300 	 * the scan of tmp without finding any node that moved, much less
1301 	 * moved to an empty node, then there is nothing left worth migrating.
1302 	 */
1303 
1304 	tmp = *from;
1305 	while (!nodes_empty(tmp)) {
1306 		int s, d;
1307 		int source = NUMA_NO_NODE;
1308 		int dest = 0;
1309 
1310 		for_each_node_mask(s, tmp) {
1311 
1312 			/*
1313 			 * do_migrate_pages() tries to maintain the relative
1314 			 * node relationship of the pages established between
1315 			 * threads and memory areas.
1316 			 *
1317 			 * However if the number of source nodes is not equal to
1318 			 * the number of destination nodes we can not preserve
1319 			 * this node relative relationship.  In that case, skip
1320 			 * copying memory from a node that is in the destination
1321 			 * mask.
1322 			 *
1323 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1324 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1325 			 */
1326 
1327 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1328 						(node_isset(s, *to)))
1329 				continue;
1330 
1331 			d = node_remap(s, *from, *to);
1332 			if (s == d)
1333 				continue;
1334 
1335 			source = s;	/* Node moved. Memorize */
1336 			dest = d;
1337 
1338 			/* dest not in remaining from nodes? */
1339 			if (!node_isset(dest, tmp))
1340 				break;
1341 		}
		/* No remaining source node maps to a different node: done */
1342 		if (source == NUMA_NO_NODE)
1343 			break;
1344 
1345 		node_clear(source, tmp);
1346 		err = migrate_to_node(mm, source, dest, flags);
		/* Positive results are per-pair counts of pages left behind */
1347 		if (err > 0)
1348 			nr_failed += err;
1349 		if (err < 0)
1350 			break;
1351 	}
1352 
1353 	lru_cache_enable();
1354 	if (err < 0)
1355 		return err;
	/* The accumulated count is a long; clamp it to fit the int return */
1356 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1357 }
1358 
1359 /*
1360  * Allocate a new folio for page migration, according to NUMA mempolicy.
1361  */
1362 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1363 						    unsigned long private)
1364 {
1365 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1366 	struct mempolicy *pol = mmpol->pol;
1367 	pgoff_t ilx = mmpol->ilx;
1368 	unsigned int order;
1369 	int nid = numa_node_id();
1370 	gfp_t gfp;
1371 
1372 	order = folio_order(src);
1373 	ilx += src->index >> order;
1374 
1375 	if (folio_test_hugetlb(src)) {
1376 		nodemask_t *nodemask;
1377 		struct hstate *h;
1378 
1379 		h = folio_hstate(src);
1380 		gfp = htlb_alloc_mask(h);
1381 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1382 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1383 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1384 	}
1385 
1386 	if (folio_test_large(src))
1387 		gfp = GFP_TRANSHUGE;
1388 	else
1389 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1390 
1391 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1392 }
1393 #else
1394 
/*
 * Stub used when CONFIG_MIGRATION is not set: no folio is ever queued for
 * migration, so this always returns false.
 */
1395 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1396 				unsigned long flags)
1397 {
1398 	return false;
1399 }
1400 
/*
 * Stub used when CONFIG_MIGRATION is not set: page migration is unavailable,
 * so this always fails with -ENOSYS.
 */
1401 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1402 		     const nodemask_t *to, int flags)
1403 {
1404 	return -ENOSYS;
1405 }
1406 
/*
 * Stub used when CONFIG_MIGRATION is not set: never allocates a migration
 * target and always returns NULL.
 */
1407 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1408 						    unsigned long private)
1409 {
1410 	return NULL;
1411 }
1412 #endif
1413 
/*
 * do_mbind - apply a memory policy to [start, start + len) of the current mm
 * @start:      first address of the range; must be page aligned
 * @len:        length of the range; rounded up to a page multiple
 * @mode:       policy mode handed to mpol_new()
 * @mode_flags: policy mode flags handed to mpol_new()
 * @nmask:      nodes for the policy
 * @flags:      MPOL_MF_* flags; bits outside MPOL_MF_VALID are rejected and
 *              MPOL_MF_MOVE_ALL additionally requires CAP_SYS_NICE
 *
 * The policy is applied to every VMA in the range with mbind_range(); pages
 * queued by queue_pages_range() are migrated after the mmap_lock has been
 * dropped.
 *
 * Returns 0 on success (including an empty range), -EINVAL for bad flags,
 * an unaligned @start or a range that wraps, -EPERM for MPOL_MF_MOVE_ALL
 * without CAP_SYS_NICE, -ENOMEM when the nodemask scratch space cannot be
 * obtained, -EIO when MPOL_MF_STRICT is set and something was left unmoved,
 * or an error from mpol_new(), mpol_set_nodemask(), queue_pages_range() or
 * mbind_range().
 */
1414 static long do_mbind(unsigned long start, unsigned long len,
1415 		     unsigned short mode, unsigned short mode_flags,
1416 		     nodemask_t *nmask, unsigned long flags)
1417 {
1418 	struct mm_struct *mm = current->mm;
1419 	struct vm_area_struct *vma, *prev;
1420 	struct vma_iterator vmi;
1421 	struct migration_mpol mmpol;
1422 	struct mempolicy *new;
1423 	unsigned long end;
1424 	long err;
1425 	long nr_failed;
1426 	LIST_HEAD(pagelist);
1427 
1428 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1429 		return -EINVAL;
1430 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1431 		return -EPERM;
1432 
1433 	if (start & ~PAGE_MASK)
1434 		return -EINVAL;
1435 
	/* MPOL_MF_STRICT is ignored when resetting to the default policy */
1436 	if (mode == MPOL_DEFAULT)
1437 		flags &= ~MPOL_MF_STRICT;
1438 
1439 	len = PAGE_ALIGN(len);
1440 	end = start + len;
1441 
	/* Reject a range that wraps around; an empty range is a no-op */
1442 	if (end < start)
1443 		return -EINVAL;
1444 	if (end == start)
1445 		return 0;
1446 
1447 	new = mpol_new(mode, mode_flags, nmask);
1448 	if (IS_ERR(new))
1449 		return PTR_ERR(new);
1450 
1451 	/*
1452 	 * If we are using the default policy then operation
1453 	 * on discontinuous address spaces is okay after all
1454 	 */
1455 	if (!new)
1456 		flags |= MPOL_MF_DISCONTIG_OK;
1457 
1458 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1459 		lru_cache_disable();
	/*
	 * On success the mmap_lock stays held for writing past this block;
	 * on failure it has either been released again or was never taken.
	 */
1460 	{
1461 		NODEMASK_SCRATCH(scratch);
1462 		if (scratch) {
1463 			mmap_write_lock(mm);
1464 			err = mpol_set_nodemask(new, nmask, scratch);
1465 			if (err)
1466 				mmap_write_unlock(mm);
1467 		} else
1468 			err = -ENOMEM;
1469 		NODEMASK_SCRATCH_FREE(scratch);
1470 	}
1471 	if (err)
1472 		goto mpol_out;
1473 
1474 	/*
1475 	 * Lock the VMAs before scanning for pages to migrate,
1476 	 * to ensure we don't miss a concurrently inserted page.
1477 	 */
1478 	nr_failed = queue_pages_range(mm, start, end, nmask,
1479 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1480 
	/* A negative result is an error code, not a count of failures */
1481 	if (nr_failed < 0) {
1482 		err = nr_failed;
1483 		nr_failed = 0;
1484 	} else {
1485 		vma_iter_init(&vmi, mm, start);
1486 		prev = vma_prev(&vmi);
1487 		for_each_vma_range(vmi, vma, end) {
1488 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1489 			if (err)
1490 				break;
1491 		}
1492 	}
1493 
	/* Prepare the allocation context used by the migration below */
1494 	if (!err && !list_empty(&pagelist)) {
1495 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
1496 		if (!new) {
1497 			new = get_task_policy(current);
1498 			mpol_get(new);
1499 		}
1500 		mmpol.pol = new;
1501 		mmpol.ilx = 0;
1502 
1503 		/*
1504 		 * In the interleaved case, attempt to allocate on exactly the
1505 		 * targeted nodes, for the first VMA to be migrated; for later
1506 		 * VMAs, the nodes will still be interleaved from the targeted
1507 		 * nodemask, but one by one may be selected differently.
1508 		 */
1509 		if (new->mode == MPOL_INTERLEAVE ||
1510 		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1511 			struct folio *folio;
1512 			unsigned int order;
1513 			unsigned long addr = -EFAULT;
1514 
			/* Pick the first queued folio that is not a KSM folio */
1515 			list_for_each_entry(folio, &pagelist, lru) {
1516 				if (!folio_test_ksm(folio))
1517 					break;
1518 			}
			/* Find a VMA in the range that maps that folio */
1519 			if (!list_entry_is_head(folio, &pagelist, lru)) {
1520 				vma_iter_init(&vmi, mm, start);
1521 				for_each_vma_range(vmi, vma, end) {
1522 					addr = page_address_in_vma(folio,
1523 						folio_page(folio, 0), vma);
1524 					if (addr != -EFAULT)
1525 						break;
1526 				}
1527 			}
1528 			if (addr != -EFAULT) {
1529 				order = folio_order(folio);
1530 				/* We already know the pol, but not the ilx */
1531 				mpol_cond_put(get_vma_policy(vma, addr, order,
1532 							     &mmpol.ilx));
1533 				/* Set base from which to increment by index */
1534 				mmpol.ilx -= folio->index >> order;
1535 			}
1536 		}
1537 	}
1538 
1539 	mmap_write_unlock(mm);
1540 
	/*
	 * The result is ORed in rather than added: from here on nr_failed is
	 * only tested for being non-zero.
	 */
1541 	if (!err && !list_empty(&pagelist)) {
1542 		nr_failed |= migrate_pages(&pagelist,
1543 				alloc_migration_target_by_mpol, NULL,
1544 				(unsigned long)&mmpol, MIGRATE_SYNC,
1545 				MR_MEMPOLICY_MBIND, NULL);
1546 	}
1547 
1548 	if (nr_failed && (flags & MPOL_MF_STRICT))
1549 		err = -EIO;
	/* Hand back whatever is still sitting on the isolation list */
1550 	if (!list_empty(&pagelist))
1551 		putback_movable_pages(&pagelist);
1552 mpol_out:
1553 	mpol_put(new);
1554 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1555 		lru_cache_enable();
1556 	return err;
1557 }
1558 
1559 /*
1560  * User space interface with variable sized bitmaps for nodelists.
1561  */
1562 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1563 		      unsigned long maxnode)
1564 {
1565 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1566 	int ret;
1567 
1568 	if (in_compat_syscall())
1569 		ret = compat_get_bitmap(mask,
1570 					(const compat_ulong_t __user *)nmask,
1571 					maxnode);
1572 	else
1573 		ret = copy_from_user(mask, nmask,
1574 				     nlongs * sizeof(unsigned long));
1575 
1576 	if (ret)
1577 		return -EFAULT;
1578 
1579 	if (maxnode % BITS_PER_LONG)
1580 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1581 
1582 	return 0;
1583 }
1584 
1585 /* Copy a node mask from user space. */
1586 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1587 		     unsigned long maxnode)
1588 {
1589 	--maxnode;
1590 	nodes_clear(*nodes);
1591 	if (maxnode == 0 || !nmask)
1592 		return 0;
1593 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1594 		return -EINVAL;
1595 
1596 	/*
1597 	 * When the user specified more nodes than supported just check
1598 	 * if the non supported part is all zero, one word at a time,
1599 	 * starting at the end.
1600 	 */
1601 	while (maxnode > MAX_NUMNODES) {
1602 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1603 		unsigned long t;
1604 
1605 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1606 			return -EFAULT;
1607 
1608 		if (maxnode - bits >= MAX_NUMNODES) {
1609 			maxnode -= bits;
1610 		} else {
1611 			maxnode = MAX_NUMNODES;
1612 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1613 		}
1614 		if (t)
1615 			return -EINVAL;
1616 	}
1617 
1618 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1619 }
1620 
1621 /* Copy a kernel node mask to user space */
1622 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1623 			      nodemask_t *nodes)
1624 {
1625 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1626 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1627 	bool compat = in_compat_syscall();
1628 
1629 	if (compat)
1630 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1631 
1632 	if (copy > nbytes) {
1633 		if (copy > PAGE_SIZE)
1634 			return -EINVAL;
1635 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1636 			return -EFAULT;
1637 		copy = nbytes;
1638 		maxnode = nr_node_ids;
1639 	}
1640 
1641 	if (compat)
1642 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1643 					 nodes_addr(*nodes), maxnode);
1644 
1645 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1646 }
1647 
1648 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1649 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1650 {
1651 	*flags = *mode & MPOL_MODE_FLAGS;
1652 	*mode &= ~MPOL_MODE_FLAGS;
1653 
1654 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1655 		return -EINVAL;
1656 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1657 		return -EINVAL;
1658 	if (*flags & MPOL_F_NUMA_BALANCING) {
1659 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1660 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1661 		else
1662 			return -EINVAL;
1663 	}
1664 	return 0;
1665 }
1666 
1667 static long kernel_mbind(unsigned long start, unsigned long len,
1668 			 unsigned long mode, const unsigned long __user *nmask,
1669 			 unsigned long maxnode, unsigned int flags)
1670 {
1671 	unsigned short mode_flags;
1672 	nodemask_t nodes;
1673 	int lmode = mode;
1674 	int err;
1675 
1676 	start = untagged_addr(start);
1677 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1678 	if (err)
1679 		return err;
1680 
1681 	err = get_nodes(&nodes, nmask, maxnode);
1682 	if (err)
1683 		return err;
1684 
1685 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1686 }
1687 
1688 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1689 		unsigned long, home_node, unsigned long, flags)
1690 {
1691 	struct mm_struct *mm = current->mm;
1692 	struct vm_area_struct *vma, *prev;
1693 	struct mempolicy *new, *old;
1694 	unsigned long end;
1695 	int err = -ENOENT;
1696 	VMA_ITERATOR(vmi, mm, start);
1697 
1698 	start = untagged_addr(start);
1699 	if (start & ~PAGE_MASK)
1700 		return -EINVAL;
1701 	/*
1702 	 * flags is used for future extension if any.
1703 	 */
1704 	if (flags != 0)
1705 		return -EINVAL;
1706 
1707 	/*
1708 	 * Check home_node is online to avoid accessing uninitialized
1709 	 * NODE_DATA.
1710 	 */
1711 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1712 		return -EINVAL;
1713 
1714 	len = PAGE_ALIGN(len);
1715 	end = start + len;
1716 
1717 	if (end < start)
1718 		return -EINVAL;
1719 	if (end == start)
1720 		return 0;
1721 	mmap_write_lock(mm);
1722 	prev = vma_prev(&vmi);
1723 	for_each_vma_range(vmi, vma, end) {
1724 		/*
1725 		 * If any vma in the range got policy other than MPOL_BIND
1726 		 * or MPOL_PREFERRED_MANY we return error. We don't reset
1727 		 * the home node for vmas we already updated before.
1728 		 */
1729 		old = vma_policy(vma);
1730 		if (!old) {
1731 			prev = vma;
1732 			continue;
1733 		}
1734 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1735 			err = -EOPNOTSUPP;
1736 			break;
1737 		}
1738 		new = mpol_dup(old);
1739 		if (IS_ERR(new)) {
1740 			err = PTR_ERR(new);
1741 			break;
1742 		}
1743 
1744 		vma_start_write(vma);
1745 		new->home_node = home_node;
1746 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1747 		mpol_put(new);
1748 		if (err)
1749 			break;
1750 	}
1751 	mmap_write_unlock(mm);
1752 	return err;
1753 }
1754 
/*
 * mbind() system call entry point: a thin wrapper that passes its arguments
 * unchanged to kernel_mbind() and returns its result.
 */
1755 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1756 		unsigned long, mode, const unsigned long __user *, nmask,
1757 		unsigned long, maxnode, unsigned int, flags)
1758 {
1759 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1760 }
1761 
1762 /* Set the process memory policy */
1763 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1764 				 unsigned long maxnode)
1765 {
1766 	unsigned short mode_flags;
1767 	nodemask_t nodes;
1768 	int lmode = mode;
1769 	int err;
1770 
1771 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1772 	if (err)
1773 		return err;
1774 
1775 	err = get_nodes(&nodes, nmask, maxnode);
1776 	if (err)
1777 		return err;
1778 
1779 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1780 }
1781 
/*
 * set_mempolicy() system call entry point: a thin wrapper that passes its
 * arguments unchanged to kernel_set_mempolicy() and returns its result.
 */
1782 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1783 		unsigned long, maxnode)
1784 {
1785 	return kernel_set_mempolicy(mode, nmask, maxnode);
1786 }
1787 
/*
 * kernel_migrate_pages - common implementation behind migrate_pages()
 * @pid:       target process, or 0 for the calling task
 * @maxnode:   size of the two user node masks, as understood by get_nodes()
 * @old_nodes: user node mask of the nodes to migrate from
 * @new_nodes: user node mask of the nodes to migrate to
 *
 * Returns the result of do_migrate_pages() on success.  Errors: -ENOMEM when
 * the nodemask scratch space cannot be obtained, an error from get_nodes(),
 * -ESRCH when @pid is not found, -EPERM when the ptrace access check fails or
 * when @new_nodes is not within the target's cpuset and the caller lacks
 * CAP_SYS_NICE, -EINVAL when no requested destination node is allowed for the
 * caller or the target has no mm, or an error from security_task_movememory().
 */
1788 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1789 				const unsigned long __user *old_nodes,
1790 				const unsigned long __user *new_nodes)
1791 {
1792 	struct mm_struct *mm = NULL;
1793 	struct task_struct *task;
1794 	nodemask_t task_nodes;
1795 	int err;
1796 	nodemask_t *old;
1797 	nodemask_t *new;
1798 	NODEMASK_SCRATCH(scratch);
1799 
1800 	if (!scratch)
1801 		return -ENOMEM;
1802 
	/* Both node masks live in the scratch area rather than on the stack */
1803 	old = &scratch->mask1;
1804 	new = &scratch->mask2;
1805 
1806 	err = get_nodes(old, old_nodes, maxnode);
1807 	if (err)
1808 		goto out;
1809 
1810 	err = get_nodes(new, new_nodes, maxnode);
1811 	if (err)
1812 		goto out;
1813 
1814 	/* Find the mm_struct */
1815 	rcu_read_lock();
1816 	task = pid ? find_task_by_vpid(pid) : current;
1817 	if (!task) {
1818 		rcu_read_unlock();
1819 		err = -ESRCH;
1820 		goto out;
1821 	}
	/* Hold a task reference; every later exit must drop it exactly once */
1822 	get_task_struct(task);
1823 
	/* Default error for the "no usable destination node" exit below */
1824 	err = -EINVAL;
1825 
1826 	/*
1827 	 * Check if this process has the right to modify the specified process.
1828 	 * Use the regular "ptrace_may_access()" checks.
1829 	 */
1830 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1831 		rcu_read_unlock();
1832 		err = -EPERM;
1833 		goto out_put;
1834 	}
1835 	rcu_read_unlock();
1836 
1837 	task_nodes = cpuset_mems_allowed(task);
1838 	/* Is the user allowed to access the target nodes? */
1839 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1840 		err = -EPERM;
1841 		goto out_put;
1842 	}
1843 
	/* Restrict the destinations to nodes the caller itself may use */
1844 	task_nodes = cpuset_mems_allowed(current);
1845 	nodes_and(*new, *new, task_nodes);
1846 	if (nodes_empty(*new))
1847 		goto out_put;
1848 
1849 	err = security_task_movememory(task);
1850 	if (err)
1851 		goto out_put;
1852 
	/* The task reference is dropped here, so from now on exit via "out" */
1853 	mm = get_task_mm(task);
1854 	put_task_struct(task);
1855 
1856 	if (!mm) {
1857 		err = -EINVAL;
1858 		goto out;
1859 	}
1860 
1861 	err = do_migrate_pages(mm, old, new,
1862 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1863 
1864 	mmput(mm);
1865 out:
1866 	NODEMASK_SCRATCH_FREE(scratch);
1867 
1868 	return err;
1869 
	/* Error exit for paths that still hold the task reference */
1870 out_put:
1871 	put_task_struct(task);
1872 	goto out;
1873 }
1874 
/*
 * migrate_pages() system call entry point: a thin wrapper that passes its
 * arguments unchanged to kernel_migrate_pages() and returns its result.
 */
1875 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1876 		const unsigned long __user *, old_nodes,
1877 		const unsigned long __user *, new_nodes)
1878 {
1879 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1880 }
1881 
1882 /* Retrieve NUMA policy */
1883 static int kernel_get_mempolicy(int __user *policy,
1884 				unsigned long __user *nmask,
1885 				unsigned long maxnode,
1886 				unsigned long addr,
1887 				unsigned long flags)
1888 {
1889 	int err;
1890 	int pval;
1891 	nodemask_t nodes;
1892 
1893 	if (nmask != NULL && maxnode < nr_node_ids)
1894 		return -EINVAL;
1895 
1896 	addr = untagged_addr(addr);
1897 
1898 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1899 
1900 	if (err)
1901 		return err;
1902 
1903 	if (policy && put_user(pval, policy))
1904 		return -EFAULT;
1905 
1906 	if (nmask)
1907 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1908 
1909 	return err;
1910 }
1911 
/*
 * get_mempolicy() system call entry point: a thin wrapper that passes its
 * arguments unchanged to kernel_get_mempolicy() and returns its result.
 */
1912 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1913 		unsigned long __user *, nmask, unsigned long, maxnode,
1914 		unsigned long, addr, unsigned long, flags)
1915 {
1916 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1917 }
1918 
1919 bool vma_migratable(struct vm_area_struct *vma)
1920 {
1921 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1922 		return false;
1923 
1924 	/*
1925 	 * DAX device mappings require predictable access latency, so avoid
1926 	 * incurring periodic faults.
1927 	 */
1928 	if (vma_is_dax(vma))
1929 		return false;
1930 
1931 	if (is_vm_hugetlb_page(vma) &&
1932 		!hugepage_migration_supported(hstate_vma(vma)))
1933 		return false;
1934 
1935 	/*
1936 	 * Migration allocates pages in the highest zone. If we cannot
1937 	 * do so then migration (at least from node to node) is not
1938 	 * possible.
1939 	 */
1940 	if (vma->vm_file &&
1941 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1942 			< policy_zone)
1943 		return false;
1944 	return true;
1945 }
1946 
1947 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1948 				   unsigned long addr, pgoff_t *ilx)
1949 {
1950 	*ilx = 0;
1951 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
1952 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1953 }
1954 
1955 /*
1956  * get_vma_policy(@vma, @addr, @order, @ilx)
1957  * @vma: virtual memory area whose policy is sought
1958  * @addr: address in @vma for shared policy lookup
1959  * @order: 0, or appropriate huge_page_order for interleaving
1960  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1961  *       MPOL_WEIGHTED_INTERLEAVE
1962  *
1963  * Returns effective policy for a VMA at specified address.
1964  * Falls back to current->mempolicy or system default policy, as necessary.
1965  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1966  * count--added by the get_policy() vm_op, as appropriate--to protect against
1967  * freeing by another task.  It is the caller's responsibility to free the
1968  * extra reference for shared policies.
1969  */
1970 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1971 				 unsigned long addr, int order, pgoff_t *ilx)
1972 {
1973 	struct mempolicy *pol;
1974 
1975 	pol = __get_vma_policy(vma, addr, ilx);
1976 	if (!pol)
1977 		pol = get_task_policy(current);
1978 	if (pol->mode == MPOL_INTERLEAVE ||
1979 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1980 		*ilx += vma->vm_pgoff >> order;
1981 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1982 	}
1983 	return pol;
1984 }
1985 
1986 bool vma_policy_mof(struct vm_area_struct *vma)
1987 {
1988 	struct mempolicy *pol;
1989 
1990 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1991 		bool ret = false;
1992 		pgoff_t ilx;		/* ignored here */
1993 
1994 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1995 		if (pol && (pol->flags & MPOL_F_MOF))
1996 			ret = true;
1997 		mpol_cond_put(pol);
1998 
1999 		return ret;
2000 	}
2001 
2002 	pol = vma->vm_policy;
2003 	if (!pol)
2004 		pol = get_task_policy(current);
2005 
2006 	return pol->flags & MPOL_F_MOF;
2007 }
2008 
2009 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2010 {
2011 	enum zone_type dynamic_policy_zone = policy_zone;
2012 
2013 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2014 
2015 	/*
2016 	 * if policy->nodes has movable memory only,
2017 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
2018 	 *
2019 	 * policy->nodes is intersect with node_states[N_MEMORY].
2020 	 * so if the following test fails, it implies
2021 	 * policy->nodes has movable memory only.
2022 	 */
2023 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2024 		dynamic_policy_zone = ZONE_MOVABLE;
2025 
2026 	return zone >= dynamic_policy_zone;
2027 }
2028 
2029 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2030 {
2031 	unsigned int node;
2032 	unsigned int cpuset_mems_cookie;
2033 
2034 retry:
2035 	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
2036 	cpuset_mems_cookie = read_mems_allowed_begin();
2037 	node = current->il_prev;
2038 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
2039 		node = next_node_in(node, policy->nodes);
2040 		if (read_mems_allowed_retry(cpuset_mems_cookie))
2041 			goto retry;
2042 		if (node == MAX_NUMNODES)
2043 			return node;
2044 		current->il_prev = node;
2045 		current->il_weight = get_il_weight(node);
2046 	}
2047 	current->il_weight--;
2048 	return node;
2049 }
2050 
2051 /* Do dynamic interleaving for a process */
2052 static unsigned int interleave_nodes(struct mempolicy *policy)
2053 {
2054 	unsigned int nid;
2055 	unsigned int cpuset_mems_cookie;
2056 
2057 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2058 	do {
2059 		cpuset_mems_cookie = read_mems_allowed_begin();
2060 		nid = next_node_in(current->il_prev, policy->nodes);
2061 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2062 
2063 	if (nid < MAX_NUMNODES)
2064 		current->il_prev = nid;
2065 	return nid;
2066 }
2067 
2068 /*
2069  * Depending on the memory policy provide a node from which to allocate the
2070  * next slab entry.
2071  */
2072 unsigned int mempolicy_slab_node(void)
2073 {
2074 	struct mempolicy *policy;
2075 	int node = numa_mem_id();
2076 
2077 	if (!in_task())
2078 		return node;
2079 
2080 	policy = current->mempolicy;
2081 	if (!policy)
2082 		return node;
2083 
2084 	switch (policy->mode) {
2085 	case MPOL_PREFERRED:
2086 		return first_node(policy->nodes);
2087 
2088 	case MPOL_INTERLEAVE:
2089 		return interleave_nodes(policy);
2090 
2091 	case MPOL_WEIGHTED_INTERLEAVE:
2092 		return weighted_interleave_nodes(policy);
2093 
2094 	case MPOL_BIND:
2095 	case MPOL_PREFERRED_MANY:
2096 	{
2097 		struct zoneref *z;
2098 
2099 		/*
2100 		 * Follow bind policy behavior and start allocation at the
2101 		 * first node.
2102 		 */
2103 		struct zonelist *zonelist;
2104 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2105 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2106 		z = first_zones_zonelist(zonelist, highest_zoneidx,
2107 							&policy->nodes);
2108 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2109 	}
2110 	case MPOL_LOCAL:
2111 		return node;
2112 
2113 	default:
2114 		BUG();
2115 	}
2116 }
2117 
2118 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2119 					      nodemask_t *mask)
2120 {
2121 	/*
2122 	 * barrier stabilizes the nodemask locally so that it can be iterated
2123 	 * over safely without concern for changes. Allocators validate node
2124 	 * selection does not violate mems_allowed, so this is safe.
2125 	 */
2126 	barrier();
2127 	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2128 	barrier();
2129 	return nodes_weight(*mask);
2130 }
2131 
2132 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2133 {
2134 	struct weighted_interleave_state *state;
2135 	nodemask_t nodemask;
2136 	unsigned int target, nr_nodes;
2137 	u8 *table = NULL;
2138 	unsigned int weight_total = 0;
2139 	u8 weight;
2140 	int nid = 0;
2141 
2142 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2143 	if (!nr_nodes)
2144 		return numa_node_id();
2145 
2146 	rcu_read_lock();
2147 
2148 	state = rcu_dereference(wi_state);
2149 	/* Uninitialized wi_state means we should assume all weights are 1 */
2150 	if (state)
2151 		table = state->iw_table;
2152 
2153 	/* calculate the total weight */
2154 	for_each_node_mask(nid, nodemask)
2155 		weight_total += table ? table[nid] : 1;
2156 
2157 	/* Calculate the node offset based on totals */
2158 	target = ilx % weight_total;
2159 	nid = first_node(nodemask);
2160 	while (target) {
2161 		/* detect system default usage */
2162 		weight = table ? table[nid] : 1;
2163 		if (target < weight)
2164 			break;
2165 		target -= weight;
2166 		nid = next_node_in(nid, nodemask);
2167 	}
2168 	rcu_read_unlock();
2169 	return nid;
2170 }
2171 
2172 /*
2173  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2174  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2175  * exceeds the number of present nodes.
2176  */
2177 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2178 {
2179 	nodemask_t nodemask;
2180 	unsigned int target, nnodes;
2181 	int i;
2182 	int nid;
2183 
2184 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2185 	if (!nnodes)
2186 		return numa_node_id();
2187 	target = ilx % nnodes;
2188 	nid = first_node(nodemask);
2189 	for (i = 0; i < target; i++)
2190 		nid = next_node(nid, nodemask);
2191 	return nid;
2192 }
2193 
2194 /*
2195  * Return a nodemask representing a mempolicy for filtering nodes for
2196  * page allocation, together with preferred node id (or the input node id).
2197  */
2198 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2199 				   pgoff_t ilx, int *nid)
2200 {
2201 	nodemask_t *nodemask = NULL;
2202 
2203 	switch (pol->mode) {
2204 	case MPOL_PREFERRED:
2205 		/* Override input node id */
2206 		*nid = first_node(pol->nodes);
2207 		break;
2208 	case MPOL_PREFERRED_MANY:
2209 		nodemask = &pol->nodes;
2210 		if (pol->home_node != NUMA_NO_NODE)
2211 			*nid = pol->home_node;
2212 		break;
2213 	case MPOL_BIND:
2214 		/* Restrict to nodemask (but not on lower zones) */
2215 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2216 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2217 			nodemask = &pol->nodes;
2218 		if (pol->home_node != NUMA_NO_NODE)
2219 			*nid = pol->home_node;
2220 		/*
2221 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2222 		 * because we might easily break the expectation to stay on the
2223 		 * requested node and not break the policy.
2224 		 */
2225 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2226 		break;
2227 	case MPOL_INTERLEAVE:
2228 		/* Override input node id */
2229 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2230 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2231 		break;
2232 	case MPOL_WEIGHTED_INTERLEAVE:
2233 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2234 			weighted_interleave_nodes(pol) :
2235 			weighted_interleave_nid(pol, ilx);
2236 		break;
2237 	}
2238 
2239 	return nodemask;
2240 }
2241 
2242 #ifdef CONFIG_HUGETLBFS
2243 /*
2244  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2245  * @vma: virtual memory area whose policy is sought
2246  * @addr: address in @vma for shared policy lookup and interleave policy
2247  * @gfp_flags: for requested zone
2248  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2249  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2250  *
2251  * Returns a nid suitable for a huge page allocation and a pointer
2252  * to the struct mempolicy for conditional unref after allocation.
2253  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2254  * to the mempolicy's @nodemask for filtering the zonelist.
2255  */
2256 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2257 		struct mempolicy **mpol, nodemask_t **nodemask)
2258 {
2259 	pgoff_t ilx;
2260 	int nid;
2261 
2262 	nid = numa_node_id();
2263 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2264 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2265 	return nid;
2266 }
2267 
2268 /*
2269  * init_nodemask_of_mempolicy
2270  *
2271  * If the current task's mempolicy is "default" [NULL], return 'false'
2272  * to indicate default policy.  Otherwise, extract the policy nodemask
2273  * for 'bind' or 'interleave' policy into the argument nodemask, or
2274  * initialize the argument nodemask to contain the single node for
2275  * 'preferred' or 'local' policy and return 'true' to indicate presence
2276  * of non-default mempolicy.
2277  *
2278  * We don't bother with reference counting the mempolicy [mpol_get/put]
2279  * because the current task is examining it's own mempolicy and a task's
2280  * mempolicy is only ever changed by the task itself.
2281  *
2282  * N.B., it is the caller's responsibility to free a returned nodemask.
2283  */
2284 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2285 {
2286 	struct mempolicy *mempolicy;
2287 
2288 	if (!(mask && current->mempolicy))
2289 		return false;
2290 
2291 	task_lock(current);
2292 	mempolicy = current->mempolicy;
2293 	switch (mempolicy->mode) {
2294 	case MPOL_PREFERRED:
2295 	case MPOL_PREFERRED_MANY:
2296 	case MPOL_BIND:
2297 	case MPOL_INTERLEAVE:
2298 	case MPOL_WEIGHTED_INTERLEAVE:
2299 		*mask = mempolicy->nodes;
2300 		break;
2301 
2302 	case MPOL_LOCAL:
2303 		init_nodemask_of_node(mask, numa_node_id());
2304 		break;
2305 
2306 	default:
2307 		BUG();
2308 	}
2309 	task_unlock(current);
2310 
2311 	return true;
2312 }
2313 #endif
2314 
2315 /*
2316  * mempolicy_in_oom_domain
2317  *
2318  * If tsk's mempolicy is "bind", check for intersection between mask and
2319  * the policy nodemask. Otherwise, return true for all other policies
2320  * including "interleave", as a tsk with "interleave" policy may have
2321  * memory allocated from all nodes in system.
2322  *
2323  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2324  */
2325 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2326 					const nodemask_t *mask)
2327 {
2328 	struct mempolicy *mempolicy;
2329 	bool ret = true;
2330 
2331 	if (!mask)
2332 		return ret;
2333 
2334 	task_lock(tsk);
2335 	mempolicy = tsk->mempolicy;
2336 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2337 		ret = nodes_intersects(mempolicy->nodes, *mask);
2338 	task_unlock(tsk);
2339 
2340 	return ret;
2341 }
2342 
2343 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2344 						int nid, nodemask_t *nodemask)
2345 {
2346 	struct page *page;
2347 	gfp_t preferred_gfp;
2348 
2349 	/*
2350 	 * This is a two pass approach. The first pass will only try the
2351 	 * preferred nodes but skip the direct reclaim and allow the
2352 	 * allocation to fail, while the second pass will try all the
2353 	 * nodes in system.
2354 	 */
2355 	preferred_gfp = gfp | __GFP_NOWARN;
2356 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2357 	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2358 	if (!page)
2359 		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2360 
2361 	return page;
2362 }
2363 
2364 /**
2365  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2366  * @gfp: GFP flags.
2367  * @order: Order of the page allocation.
2368  * @pol: Pointer to the NUMA mempolicy.
2369  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2370  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2371  *
2372  * Return: The page on success or NULL if allocation fails.
2373  */
2374 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2375 		struct mempolicy *pol, pgoff_t ilx, int nid)
2376 {
2377 	nodemask_t *nodemask;
2378 	struct page *page;
2379 
2380 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2381 
2382 	if (pol->mode == MPOL_PREFERRED_MANY)
2383 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2384 
2385 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2386 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2387 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2388 		/*
2389 		 * For hugepage allocation and non-interleave policy which
2390 		 * allows the current node (or other explicitly preferred
2391 		 * node) we only try to allocate from the current/preferred
2392 		 * node and don't fall back to other nodes, as the cost of
2393 		 * remote accesses would likely offset THP benefits.
2394 		 *
2395 		 * If the policy is interleave or does not allow the current
2396 		 * node in its nodemask, we allocate the standard way.
2397 		 */
2398 		if (pol->mode != MPOL_INTERLEAVE &&
2399 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2400 		    (!nodemask || node_isset(nid, *nodemask))) {
2401 			/*
2402 			 * First, try to allocate THP only on local node, but
2403 			 * don't reclaim unnecessarily, just compact.
2404 			 */
2405 			page = __alloc_frozen_pages_noprof(
2406 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2407 				nid, NULL);
2408 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2409 				return page;
2410 			/*
2411 			 * If hugepage allocations are configured to always
2412 			 * synchronous compact or the vma has been madvised
2413 			 * to prefer hugepage backing, retry allowing remote
2414 			 * memory with both reclaim and compact as well.
2415 			 */
2416 		}
2417 	}
2418 
2419 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2420 
2421 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2422 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2423 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2424 		if (static_branch_likely(&vm_numa_stat_key) &&
2425 		    page_to_nid(page) == nid) {
2426 			preempt_disable();
2427 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2428 			preempt_enable();
2429 		}
2430 	}
2431 
2432 	return page;
2433 }
2434 
2435 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2436 		struct mempolicy *pol, pgoff_t ilx, int nid)
2437 {
2438 	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2439 			ilx, nid);
2440 	if (!page)
2441 		return NULL;
2442 
2443 	set_page_refcounted(page);
2444 	return page_rmappable_folio(page);
2445 }
2446 
2447 /**
2448  * vma_alloc_folio - Allocate a folio for a VMA.
2449  * @gfp: GFP flags.
2450  * @order: Order of the folio.
2451  * @vma: Pointer to VMA.
2452  * @addr: Virtual address of the allocation.  Must be inside @vma.
2453  *
2454  * Allocate a folio for a specific address in @vma, using the appropriate
2455  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2456  * VMA to prevent it from going away.  Should be used for all allocations
2457  * for folios that will be mapped into user space, excepting hugetlbfs, and
2458  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2459  *
2460  * Return: The folio on success or NULL if allocation fails.
2461  */
2462 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2463 		unsigned long addr)
2464 {
2465 	struct mempolicy *pol;
2466 	pgoff_t ilx;
2467 	struct folio *folio;
2468 
2469 	if (vma->vm_flags & VM_DROPPABLE)
2470 		gfp |= __GFP_NOWARN;
2471 
2472 	pol = get_vma_policy(vma, addr, order, &ilx);
2473 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2474 	mpol_cond_put(pol);
2475 	return folio;
2476 }
2477 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2478 
2479 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2480 {
2481 	struct mempolicy *pol = &default_policy;
2482 
2483 	/*
2484 	 * No reference counting needed for current->mempolicy
2485 	 * nor system default_policy
2486 	 */
2487 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2488 		pol = get_task_policy(current);
2489 
2490 	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2491 				       numa_node_id());
2492 }
2493 
2494 /**
2495  * alloc_pages - Allocate pages.
2496  * @gfp: GFP flags.
2497  * @order: Power of two of number of pages to allocate.
2498  *
2499  * Allocate 1 << @order contiguous pages.  The physical address of the
2500  * first page is naturally aligned (eg an order-3 allocation will be aligned
2501  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2502  * process is honoured when in process context.
2503  *
2504  * Context: Can be called from any context, providing the appropriate GFP
2505  * flags are used.
2506  * Return: The page on success or NULL if allocation fails.
2507  */
2508 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2509 {
2510 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2511 
2512 	if (page)
2513 		set_page_refcounted(page);
2514 	return page;
2515 }
2516 EXPORT_SYMBOL(alloc_pages_noprof);
2517 
2518 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2519 {
2520 	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2521 }
2522 EXPORT_SYMBOL(folio_alloc_noprof);
2523 
2524 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2525 		struct mempolicy *pol, unsigned long nr_pages,
2526 		struct page **page_array)
2527 {
2528 	int nodes;
2529 	unsigned long nr_pages_per_node;
2530 	int delta;
2531 	int i;
2532 	unsigned long nr_allocated;
2533 	unsigned long total_allocated = 0;
2534 
2535 	nodes = nodes_weight(pol->nodes);
2536 	nr_pages_per_node = nr_pages / nodes;
2537 	delta = nr_pages - nodes * nr_pages_per_node;
2538 
2539 	for (i = 0; i < nodes; i++) {
2540 		if (delta) {
2541 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2542 					interleave_nodes(pol), NULL,
2543 					nr_pages_per_node + 1,
2544 					page_array);
2545 			delta--;
2546 		} else {
2547 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2548 					interleave_nodes(pol), NULL,
2549 					nr_pages_per_node, page_array);
2550 		}
2551 
2552 		page_array += nr_allocated;
2553 		total_allocated += nr_allocated;
2554 	}
2555 
2556 	return total_allocated;
2557 }
2558 
/*
 * Bulk allocation for MPOL_WEIGHTED_INTERLEAVE.
 *
 * Fills @page_array with up to @nr_pages pages, distributing them over a
 * stack snapshot of pol->nodes in proportion to the per-node weights
 * (all 1 when wi_state is not set).  Any weight left on the task's
 * current interleave node (current->il_prev / current->il_weight) is
 * used up first.  The remainder is split into whole rounds over all
 * nodes plus a partial round, so each node is asked for its pages in a
 * single __alloc_pages_bulk() call.  After the round loop,
 * il_prev/il_weight are set to the position the next allocation should
 * resume from.
 *
 * Returns the number of pages actually stored in @page_array, which may
 * be fewer than @nr_pages; 0 if @nr_pages is 0 or the snapshot is empty.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	/* the round loop below starts with the node after this one */
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* no weight table installed: every node weighs 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		/* this node's share of the whole rounds */
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	/* publish the resume position for the task's next allocation */
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2676 
2677 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2678 		struct mempolicy *pol, unsigned long nr_pages,
2679 		struct page **page_array)
2680 {
2681 	gfp_t preferred_gfp;
2682 	unsigned long nr_allocated = 0;
2683 
2684 	preferred_gfp = gfp | __GFP_NOWARN;
2685 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2686 
2687 	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2688 					   nr_pages, page_array);
2689 
2690 	if (nr_allocated < nr_pages)
2691 		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2692 				nr_pages - nr_allocated,
2693 				page_array + nr_allocated);
2694 	return nr_allocated;
2695 }
2696 
2697 /* alloc pages bulk and mempolicy should be considered at the
2698  * same time in some situation such as vmalloc.
2699  *
2700  * It can accelerate memory allocation especially interleaving
2701  * allocate memory.
2702  */
2703 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2704 		unsigned long nr_pages, struct page **page_array)
2705 {
2706 	struct mempolicy *pol = &default_policy;
2707 	nodemask_t *nodemask;
2708 	int nid;
2709 
2710 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2711 		pol = get_task_policy(current);
2712 
2713 	if (pol->mode == MPOL_INTERLEAVE)
2714 		return alloc_pages_bulk_interleave(gfp, pol,
2715 							 nr_pages, page_array);
2716 
2717 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2718 		return alloc_pages_bulk_weighted_interleave(
2719 				  gfp, pol, nr_pages, page_array);
2720 
2721 	if (pol->mode == MPOL_PREFERRED_MANY)
2722 		return alloc_pages_bulk_preferred_many(gfp,
2723 				numa_node_id(), pol, nr_pages, page_array);
2724 
2725 	nid = numa_node_id();
2726 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2727 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2728 				       nr_pages, page_array);
2729 }
2730 
2731 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2732 {
2733 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2734 
2735 	if (IS_ERR(pol))
2736 		return PTR_ERR(pol);
2737 	dst->vm_policy = pol;
2738 	return 0;
2739 }
2740 
2741 /*
2742  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2743  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2744  * with the mems_allowed returned by cpuset_mems_allowed().  This
2745  * keeps mempolicies cpuset relative after its cpuset moves.  See
2746  * further kernel/cpuset.c update_nodemask().
2747  *
2748  * current's mempolicy may be rebinded by the other task(the task that changes
2749  * cpuset's mems), so we needn't do rebind work for current task.
2750  */
2751 
2752 /* Slow path of a mempolicy duplicate */
2753 struct mempolicy *__mpol_dup(struct mempolicy *old)
2754 {
2755 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2756 
2757 	if (!new)
2758 		return ERR_PTR(-ENOMEM);
2759 
2760 	/* task's mempolicy is protected by alloc_lock */
2761 	if (old == current->mempolicy) {
2762 		task_lock(current);
2763 		*new = *old;
2764 		task_unlock(current);
2765 	} else
2766 		*new = *old;
2767 
2768 	if (current_cpuset_is_being_rebound()) {
2769 		nodemask_t mems = cpuset_mems_allowed(current);
2770 		mpol_rebind_policy(new, &mems);
2771 	}
2772 	atomic_set(&new->refcnt, 1);
2773 	return new;
2774 }
2775 
2776 /* Slow path of a mempolicy comparison */
2777 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2778 {
2779 	if (!a || !b)
2780 		return false;
2781 	if (a->mode != b->mode)
2782 		return false;
2783 	if (a->flags != b->flags)
2784 		return false;
2785 	if (a->home_node != b->home_node)
2786 		return false;
2787 	if (mpol_store_user_nodemask(a))
2788 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2789 			return false;
2790 
2791 	switch (a->mode) {
2792 	case MPOL_BIND:
2793 	case MPOL_INTERLEAVE:
2794 	case MPOL_PREFERRED:
2795 	case MPOL_PREFERRED_MANY:
2796 	case MPOL_WEIGHTED_INTERLEAVE:
2797 		return !!nodes_equal(a->nodes, b->nodes);
2798 	case MPOL_LOCAL:
2799 		return true;
2800 	default:
2801 		BUG();
2802 		return false;
2803 	}
2804 }
2805 
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * The tree is protected by the sp->lock rwlock, which must be held
 * for any access to it.
 */
2814 
2815 /*
2816  * lookup first element intersecting start-end.  Caller holds sp->lock for
2817  * reading or for writing
2818  */
2819 static struct sp_node *sp_lookup(struct shared_policy *sp,
2820 					pgoff_t start, pgoff_t end)
2821 {
2822 	struct rb_node *n = sp->root.rb_node;
2823 
2824 	while (n) {
2825 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2826 
2827 		if (start >= p->end)
2828 			n = n->rb_right;
2829 		else if (end <= p->start)
2830 			n = n->rb_left;
2831 		else
2832 			break;
2833 	}
2834 	if (!n)
2835 		return NULL;
2836 	for (;;) {
2837 		struct sp_node *w = NULL;
2838 		struct rb_node *prev = rb_prev(n);
2839 		if (!prev)
2840 			break;
2841 		w = rb_entry(prev, struct sp_node, nd);
2842 		if (w->end <= start)
2843 			break;
2844 		n = prev;
2845 	}
2846 	return rb_entry(n, struct sp_node, nd);
2847 }
2848 
2849 /*
2850  * Insert a new shared policy into the list.  Caller holds sp->lock for
2851  * writing.
2852  */
2853 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2854 {
2855 	struct rb_node **p = &sp->root.rb_node;
2856 	struct rb_node *parent = NULL;
2857 	struct sp_node *nd;
2858 
2859 	while (*p) {
2860 		parent = *p;
2861 		nd = rb_entry(parent, struct sp_node, nd);
2862 		if (new->start < nd->start)
2863 			p = &(*p)->rb_left;
2864 		else if (new->end > nd->end)
2865 			p = &(*p)->rb_right;
2866 		else
2867 			BUG();
2868 	}
2869 	rb_link_node(&new->nd, parent, p);
2870 	rb_insert_color(&new->nd, &sp->root);
2871 }
2872 
2873 /* Find shared policy intersecting idx */
2874 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2875 						pgoff_t idx)
2876 {
2877 	struct mempolicy *pol = NULL;
2878 	struct sp_node *sn;
2879 
2880 	if (!sp->root.rb_node)
2881 		return NULL;
2882 	read_lock(&sp->lock);
2883 	sn = sp_lookup(sp, idx, idx+1);
2884 	if (sn) {
2885 		mpol_get(sn->policy);
2886 		pol = sn->policy;
2887 	}
2888 	read_unlock(&sp->lock);
2889 	return pol;
2890 }
2891 
2892 static void sp_free(struct sp_node *n)
2893 {
2894 	mpol_put(n->policy);
2895 	kmem_cache_free(sn_cache, n);
2896 }
2897 
2898 /**
2899  * mpol_misplaced - check whether current folio node is valid in policy
2900  *
2901  * @folio: folio to be checked
2902  * @vmf: structure describing the fault
2903  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2904  *
2905  * Lookup current policy node id for vma,addr and "compare to" folio's
2906  * node id.  Policy determination "mimics" alloc_page_vma().
2907  * Called from fault path where we know the vma and faulting address.
2908  *
2909  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2910  * policy, or a suitable node ID to allocate a replacement folio from.
2911  */
2912 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2913 		   unsigned long addr)
2914 {
2915 	struct mempolicy *pol;
2916 	pgoff_t ilx;
2917 	struct zoneref *z;
2918 	int curnid = folio_nid(folio);
2919 	struct vm_area_struct *vma = vmf->vma;
2920 	int thiscpu = raw_smp_processor_id();
2921 	int thisnid = numa_node_id();
2922 	int polnid = NUMA_NO_NODE;
2923 	int ret = NUMA_NO_NODE;
2924 
2925 	/*
2926 	 * Make sure ptl is held so that we don't preempt and we
2927 	 * have a stable smp processor id
2928 	 */
2929 	lockdep_assert_held(vmf->ptl);
2930 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2931 	if (!(pol->flags & MPOL_F_MOF))
2932 		goto out;
2933 
2934 	switch (pol->mode) {
2935 	case MPOL_INTERLEAVE:
2936 		polnid = interleave_nid(pol, ilx);
2937 		break;
2938 
2939 	case MPOL_WEIGHTED_INTERLEAVE:
2940 		polnid = weighted_interleave_nid(pol, ilx);
2941 		break;
2942 
2943 	case MPOL_PREFERRED:
2944 		if (node_isset(curnid, pol->nodes))
2945 			goto out;
2946 		polnid = first_node(pol->nodes);
2947 		break;
2948 
2949 	case MPOL_LOCAL:
2950 		polnid = numa_node_id();
2951 		break;
2952 
2953 	case MPOL_BIND:
2954 	case MPOL_PREFERRED_MANY:
2955 		/*
2956 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
2957 		 * policy nodemask we don't allow numa migration to nodes
2958 		 * outside policy nodemask for now. This is done so that if we
2959 		 * want demotion to slow memory to happen, before allocating
2960 		 * from some DRAM node say 'x', we will end up using a
2961 		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
2962 		 * we should not promote to node 'x' from slow memory node.
2963 		 */
2964 		if (pol->flags & MPOL_F_MORON) {
2965 			/*
2966 			 * Optimize placement among multiple nodes
2967 			 * via NUMA balancing
2968 			 */
2969 			if (node_isset(thisnid, pol->nodes))
2970 				break;
2971 			goto out;
2972 		}
2973 
2974 		/*
2975 		 * use current page if in policy nodemask,
2976 		 * else select nearest allowed node, if any.
2977 		 * If no allowed nodes, use current [!misplaced].
2978 		 */
2979 		if (node_isset(curnid, pol->nodes))
2980 			goto out;
2981 		z = first_zones_zonelist(
2982 				node_zonelist(thisnid, GFP_HIGHUSER),
2983 				gfp_zone(GFP_HIGHUSER),
2984 				&pol->nodes);
2985 		polnid = zonelist_node_idx(z);
2986 		break;
2987 
2988 	default:
2989 		BUG();
2990 	}
2991 
2992 	/* Migrate the folio towards the node whose CPU is referencing it */
2993 	if (pol->flags & MPOL_F_MORON) {
2994 		polnid = thisnid;
2995 
2996 		if (!should_numa_migrate_memory(current, folio, curnid,
2997 						thiscpu))
2998 			goto out;
2999 	}
3000 
3001 	if (curnid != polnid)
3002 		ret = polnid;
3003 out:
3004 	mpol_cond_put(pol);
3005 
3006 	return ret;
3007 }
3008 
3009 /*
3010  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
3011  * dropped after task->mempolicy is set to NULL so that any allocation done as
3012  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3013  * policy.
3014  */
3015 void mpol_put_task_policy(struct task_struct *task)
3016 {
3017 	struct mempolicy *pol;
3018 
3019 	task_lock(task);
3020 	pol = task->mempolicy;
3021 	task->mempolicy = NULL;
3022 	task_unlock(task);
3023 	mpol_put(pol);
3024 }
3025 
3026 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
3027 {
3028 	rb_erase(&n->nd, &sp->root);
3029 	sp_free(n);
3030 }
3031 
3032 static void sp_node_init(struct sp_node *node, unsigned long start,
3033 			unsigned long end, struct mempolicy *pol)
3034 {
3035 	node->start = start;
3036 	node->end = end;
3037 	node->policy = pol;
3038 }
3039 
3040 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3041 				struct mempolicy *pol)
3042 {
3043 	struct sp_node *n;
3044 	struct mempolicy *newpol;
3045 
3046 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3047 	if (!n)
3048 		return NULL;
3049 
3050 	newpol = mpol_dup(pol);
3051 	if (IS_ERR(newpol)) {
3052 		kmem_cache_free(sn_cache, n);
3053 		return NULL;
3054 	}
3055 	newpol->flags |= MPOL_F_SHARED;
3056 	sp_node_init(n, start, end, newpol);
3057 
3058 	return n;
3059 }
3060 
/*
 * Replace a policy range.
 *
 * Clears [start, end) of existing policy nodes and then inserts @new
 * (if non-NULL).  Existing nodes overlapping the range are deleted when
 * fully covered, trimmed when they overlap one edge, or split in two
 * when a single old node spans the whole range.  A split needs an extra
 * sp_node and mempolicy; these are allocated with GFP_KERNEL after
 * dropping sp->lock, and the scan is then restarted from the top.
 *
 * Returns 0 on success or -ENOMEM if the allocations for a split fail.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		/* fetch the successor first: n may be deleted below */
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* old node begins inside the range: drop or trim front */
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/* need spare objects; allocate unlocked, retry */
				if (!n_new)
					goto alloc_new;

				/* tail piece [end, n->end) gets a policy copy */
				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				/* ownership moved into the tree */
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* release spare objects that ended up unused */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* GFP_KERNEL allocations are done with sp->lock released */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
3127 
3128 /**
3129  * mpol_shared_policy_init - initialize shared policy for inode
3130  * @sp: pointer to inode shared policy
3131  * @mpol:  struct mempolicy to install
3132  *
3133  * Install non-NULL @mpol in inode's shared policy rb-tree.
3134  * On entry, the current task has a reference on a non-NULL @mpol.
3135  * This must be released on exit.
3136  * This is called at get_inode() calls and we can use GFP_KERNEL.
3137  */
3138 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3139 {
3140 	int ret;
3141 
3142 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3143 	rwlock_init(&sp->lock);
3144 
3145 	if (mpol) {
3146 		struct sp_node *sn;
3147 		struct mempolicy *npol;
3148 		NODEMASK_SCRATCH(scratch);
3149 
3150 		if (!scratch)
3151 			goto put_mpol;
3152 
3153 		/* contextualize the tmpfs mount point mempolicy to this file */
3154 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3155 		if (IS_ERR(npol))
3156 			goto free_scratch; /* no valid nodemask intersection */
3157 
3158 		task_lock(current);
3159 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3160 		task_unlock(current);
3161 		if (ret)
3162 			goto put_npol;
3163 
3164 		/* alloc node covering entire file; adds ref to file's npol */
3165 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3166 		if (sn)
3167 			sp_insert(sp, sn);
3168 put_npol:
3169 		mpol_put(npol);	/* drop initial ref on file's npol */
3170 free_scratch:
3171 		NODEMASK_SCRATCH_FREE(scratch);
3172 put_mpol:
3173 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3174 	}
3175 }
3176 
3177 int mpol_set_shared_policy(struct shared_policy *sp,
3178 			struct vm_area_struct *vma, struct mempolicy *pol)
3179 {
3180 	int err;
3181 	struct sp_node *new = NULL;
3182 	unsigned long sz = vma_pages(vma);
3183 
3184 	if (pol) {
3185 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3186 		if (!new)
3187 			return -ENOMEM;
3188 	}
3189 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3190 	if (err && new)
3191 		sp_free(new);
3192 	return err;
3193 }
3194 
3195 /* Free a backing policy store on inode delete. */
3196 void mpol_free_shared_policy(struct shared_policy *sp)
3197 {
3198 	struct sp_node *n;
3199 	struct rb_node *next;
3200 
3201 	if (!sp->root.rb_node)
3202 		return;
3203 	write_lock(&sp->lock);
3204 	next = rb_first(&sp->root);
3205 	while (next) {
3206 		n = rb_entry(next, struct sp_node, nd);
3207 		next = rb_next(&n->nd);
3208 		sp_delete(sp, n);
3209 	}
3210 	write_unlock(&sp->lock);
3211 }
3212 
3213 #ifdef CONFIG_NUMA_BALANCING
3214 static int __initdata numabalancing_override;
3215 
3216 static void __init check_numabalancing_enable(void)
3217 {
3218 	bool numabalancing_default = false;
3219 
3220 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3221 		numabalancing_default = true;
3222 
3223 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3224 	if (numabalancing_override)
3225 		set_numabalancing_state(numabalancing_override == 1);
3226 
3227 	if (num_online_nodes() > 1 && !numabalancing_override) {
3228 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3229 			numabalancing_default ? "Enabling" : "Disabling");
3230 		set_numabalancing_state(numabalancing_default);
3231 	}
3232 }
3233 
3234 static int __init setup_numabalancing(char *str)
3235 {
3236 	int ret = 0;
3237 	if (!str)
3238 		goto out;
3239 
3240 	if (!strcmp(str, "enable")) {
3241 		numabalancing_override = 1;
3242 		ret = 1;
3243 	} else if (!strcmp(str, "disable")) {
3244 		numabalancing_override = -1;
3245 		ret = 1;
3246 	}
3247 out:
3248 	if (!ret)
3249 		pr_warn("Unable to parse numa_balancing=\n");
3250 
3251 	return ret;
3252 }
3253 __setup("numa_balancing=", setup_numabalancing);
3254 #else
/*
 * Without CONFIG_NUMA_BALANCING there is no state to set up; this empty stub
 * lets numa_policy_init() call check_numabalancing_enable() unconditionally.
 */
static inline void __init check_numabalancing_enable(void)
{
}
3258 #endif /* CONFIG_NUMA_BALANCING */
3259 
3260 void __init numa_policy_init(void)
3261 {
3262 	nodemask_t interleave_nodes;
3263 	unsigned long largest = 0;
3264 	int nid, prefer = 0;
3265 
3266 	policy_cache = kmem_cache_create("numa_policy",
3267 					 sizeof(struct mempolicy),
3268 					 0, SLAB_PANIC, NULL);
3269 
3270 	sn_cache = kmem_cache_create("shared_policy_node",
3271 				     sizeof(struct sp_node),
3272 				     0, SLAB_PANIC, NULL);
3273 
3274 	for_each_node(nid) {
3275 		preferred_node_policy[nid] = (struct mempolicy) {
3276 			.refcnt = ATOMIC_INIT(1),
3277 			.mode = MPOL_PREFERRED,
3278 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3279 			.nodes = nodemask_of_node(nid),
3280 		};
3281 	}
3282 
3283 	/*
3284 	 * Set interleaving policy for system init. Interleaving is only
3285 	 * enabled across suitably sized nodes (default is >= 16MB), or
3286 	 * fall back to the largest node if they're all smaller.
3287 	 */
3288 	nodes_clear(interleave_nodes);
3289 	for_each_node_state(nid, N_MEMORY) {
3290 		unsigned long total_pages = node_present_pages(nid);
3291 
3292 		/* Preserve the largest node */
3293 		if (largest < total_pages) {
3294 			largest = total_pages;
3295 			prefer = nid;
3296 		}
3297 
3298 		/* Interleave this node? */
3299 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3300 			node_set(nid, interleave_nodes);
3301 	}
3302 
3303 	/* All too small, use the largest */
3304 	if (unlikely(nodes_empty(interleave_nodes)))
3305 		node_set(prefer, interleave_nodes);
3306 
3307 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3308 		pr_err("%s: interleaving failed\n", __func__);
3309 
3310 	check_numabalancing_enable();
3311 }
3312 
/*
 * Reset policy of current process to default.
 *
 * Calls do_set_mempolicy() with MPOL_DEFAULT, no mode flags and no nodemask;
 * its return value is deliberately not propagated.
 */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
3318 
/*
 * Parse and format mempolicy from/to strings
 *
 * Mode names, indexed by MPOL_* mode value.  mpol_parse_str() looks a
 * user-supplied mode name up in this table with match_string(), and
 * mpol_to_str() prints policy_modes[mode], so each string is both accepted
 * as input and produced as output.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
3332 
3333 #ifdef CONFIG_TMPFS
3334 /**
3335  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3336  * @str:  string containing mempolicy to parse
3337  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3338  *
3339  * Format of input:
3340  *	<mode>[=<flags>][:<nodelist>]
3341  *
3342  * Return: %0 on success, else %1
3343  */
3344 int mpol_parse_str(char *str, struct mempolicy **mpol)
3345 {
3346 	struct mempolicy *new = NULL;
3347 	unsigned short mode_flags;
3348 	nodemask_t nodes;
3349 	char *nodelist = strchr(str, ':');
3350 	char *flags = strchr(str, '=');
3351 	int err = 1, mode;
3352 
3353 	if (flags)
3354 		*flags++ = '\0';	/* terminate mode string */
3355 
3356 	if (nodelist) {
3357 		/* NUL-terminate mode or flags string */
3358 		*nodelist++ = '\0';
3359 		if (nodelist_parse(nodelist, nodes))
3360 			goto out;
3361 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3362 			goto out;
3363 	} else
3364 		nodes_clear(nodes);
3365 
3366 	mode = match_string(policy_modes, MPOL_MAX, str);
3367 	if (mode < 0)
3368 		goto out;
3369 
3370 	switch (mode) {
3371 	case MPOL_PREFERRED:
3372 		/*
3373 		 * Insist on a nodelist of one node only, although later
3374 		 * we use first_node(nodes) to grab a single node, so here
3375 		 * nodelist (or nodes) cannot be empty.
3376 		 */
3377 		if (nodelist) {
3378 			char *rest = nodelist;
3379 			while (isdigit(*rest))
3380 				rest++;
3381 			if (*rest)
3382 				goto out;
3383 			if (nodes_empty(nodes))
3384 				goto out;
3385 		}
3386 		break;
3387 	case MPOL_INTERLEAVE:
3388 	case MPOL_WEIGHTED_INTERLEAVE:
3389 		/*
3390 		 * Default to online nodes with memory if no nodelist
3391 		 */
3392 		if (!nodelist)
3393 			nodes = node_states[N_MEMORY];
3394 		break;
3395 	case MPOL_LOCAL:
3396 		/*
3397 		 * Don't allow a nodelist;  mpol_new() checks flags
3398 		 */
3399 		if (nodelist)
3400 			goto out;
3401 		break;
3402 	case MPOL_DEFAULT:
3403 		/*
3404 		 * Insist on a empty nodelist
3405 		 */
3406 		if (!nodelist)
3407 			err = 0;
3408 		goto out;
3409 	case MPOL_PREFERRED_MANY:
3410 	case MPOL_BIND:
3411 		/*
3412 		 * Insist on a nodelist
3413 		 */
3414 		if (!nodelist)
3415 			goto out;
3416 	}
3417 
3418 	mode_flags = 0;
3419 	if (flags) {
3420 		/*
3421 		 * Currently, we only support two mutually exclusive
3422 		 * mode flags.
3423 		 */
3424 		if (!strcmp(flags, "static"))
3425 			mode_flags |= MPOL_F_STATIC_NODES;
3426 		else if (!strcmp(flags, "relative"))
3427 			mode_flags |= MPOL_F_RELATIVE_NODES;
3428 		else
3429 			goto out;
3430 	}
3431 
3432 	new = mpol_new(mode, mode_flags, &nodes);
3433 	if (IS_ERR(new))
3434 		goto out;
3435 
3436 	/*
3437 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3438 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3439 	 */
3440 	if (mode != MPOL_PREFERRED) {
3441 		new->nodes = nodes;
3442 	} else if (nodelist) {
3443 		nodes_clear(new->nodes);
3444 		node_set(first_node(nodes), new->nodes);
3445 	} else {
3446 		new->mode = MPOL_LOCAL;
3447 	}
3448 
3449 	/*
3450 	 * Save nodes for contextualization: this will be used to "clone"
3451 	 * the mempolicy in a specific context [cpuset] at a later time.
3452 	 */
3453 	new->w.user_nodemask = nodes;
3454 
3455 	err = 0;
3456 
3457 out:
3458 	/* Restore string for error message */
3459 	if (nodelist)
3460 		*--nodelist = ':';
3461 	if (flags)
3462 		*--flags = '=';
3463 	if (!err)
3464 		*mpol = new;
3465 	return err;
3466 }
3467 #endif /* CONFIG_TMPFS */
3468 
3469 /**
3470  * mpol_to_str - format a mempolicy structure for printing
3471  * @buffer:  to contain formatted mempolicy string
3472  * @maxlen:  length of @buffer
3473  * @pol:  pointer to mempolicy to be formatted
3474  *
3475  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3476  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3477  * interleave", plus the longest flag flags, "relative|balancing", and to
3478  * display at least a few node ids.
3479  */
3480 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3481 {
3482 	char *p = buffer;
3483 	nodemask_t nodes = NODE_MASK_NONE;
3484 	unsigned short mode = MPOL_DEFAULT;
3485 	unsigned short flags = 0;
3486 
3487 	if (pol &&
3488 	    pol != &default_policy &&
3489 	    !(pol >= &preferred_node_policy[0] &&
3490 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3491 		mode = pol->mode;
3492 		flags = pol->flags;
3493 	}
3494 
3495 	switch (mode) {
3496 	case MPOL_DEFAULT:
3497 	case MPOL_LOCAL:
3498 		break;
3499 	case MPOL_PREFERRED:
3500 	case MPOL_PREFERRED_MANY:
3501 	case MPOL_BIND:
3502 	case MPOL_INTERLEAVE:
3503 	case MPOL_WEIGHTED_INTERLEAVE:
3504 		nodes = pol->nodes;
3505 		break;
3506 	default:
3507 		WARN_ON_ONCE(1);
3508 		snprintf(p, maxlen, "unknown");
3509 		return;
3510 	}
3511 
3512 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3513 
3514 	if (flags & MPOL_MODE_FLAGS) {
3515 		p += snprintf(p, buffer + maxlen - p, "=");
3516 
3517 		/*
3518 		 * Static and relative are mutually exclusive.
3519 		 */
3520 		if (flags & MPOL_F_STATIC_NODES)
3521 			p += snprintf(p, buffer + maxlen - p, "static");
3522 		else if (flags & MPOL_F_RELATIVE_NODES)
3523 			p += snprintf(p, buffer + maxlen - p, "relative");
3524 
3525 		if (flags & MPOL_F_NUMA_BALANCING) {
3526 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3527 				p += snprintf(p, buffer + maxlen - p, "|");
3528 			p += snprintf(p, buffer + maxlen - p, "balancing");
3529 		}
3530 	}
3531 
3532 	if (!nodes_empty(nodes))
3533 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3534 			       nodemask_pr_args(&nodes));
3535 }
3536 
3537 #ifdef CONFIG_SYSFS
/*
 * Per-node sysfs attribute for weighted interleave.  The embedded
 * kobj_attribute is what gets registered with sysfs; node_show() and
 * node_store() use container_of() on it to get back to @nid.
 */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;	/* node id this attribute reads/writes the weight of */
};
3542 
/*
 * State behind the "weighted_interleave" sysfs directory.  Allocated in
 * add_weighted_interleave_group() with one nattrs[] slot per possible node
 * id (nr_node_ids) and freed from the kobject release callback.
 */
struct sysfs_wi_group {
	struct kobject wi_kobj;		/* the directory's kobject */
	struct mutex kobj_lock;		/* held while reading/updating nattrs[] */
	struct iw_node_attr *nattrs[];	/* per-node attribute, NULL if absent */
};

/* The single instance, set up by add_weighted_interleave_group(). */
static struct sysfs_wi_group *wi_group;
3550 
3551 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3552 			 char *buf)
3553 {
3554 	struct iw_node_attr *node_attr;
3555 	u8 weight;
3556 
3557 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3558 	weight = get_il_weight(node_attr->nid);
3559 	return sysfs_emit(buf, "%d\n", weight);
3560 }
3561 
3562 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3563 			  const char *buf, size_t count)
3564 {
3565 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3566 	struct iw_node_attr *node_attr;
3567 	u8 weight = 0;
3568 	int i;
3569 
3570 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3571 	if (count == 0 || sysfs_streq(buf, "") ||
3572 	    kstrtou8(buf, 0, &weight) || weight == 0)
3573 		return -EINVAL;
3574 
3575 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3576 			       GFP_KERNEL);
3577 	if (!new_wi_state)
3578 		return -ENOMEM;
3579 
3580 	mutex_lock(&wi_state_lock);
3581 	old_wi_state = rcu_dereference_protected(wi_state,
3582 					lockdep_is_held(&wi_state_lock));
3583 	if (old_wi_state) {
3584 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3585 					nr_node_ids * sizeof(u8));
3586 	} else {
3587 		for (i = 0; i < nr_node_ids; i++)
3588 			new_wi_state->iw_table[i] = 1;
3589 	}
3590 	new_wi_state->iw_table[node_attr->nid] = weight;
3591 	new_wi_state->mode_auto = false;
3592 
3593 	rcu_assign_pointer(wi_state, new_wi_state);
3594 	mutex_unlock(&wi_state_lock);
3595 	if (old_wi_state) {
3596 		synchronize_rcu();
3597 		kfree(old_wi_state);
3598 	}
3599 	return count;
3600 }
3601 
3602 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3603 		struct kobj_attribute *attr, char *buf)
3604 {
3605 	struct weighted_interleave_state *state;
3606 	bool wi_auto = true;
3607 
3608 	rcu_read_lock();
3609 	state = rcu_dereference(wi_state);
3610 	if (state)
3611 		wi_auto = state->mode_auto;
3612 	rcu_read_unlock();
3613 
3614 	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3615 }
3616 
3617 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3618 		struct kobj_attribute *attr, const char *buf, size_t count)
3619 {
3620 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3621 	unsigned int *bw;
3622 	bool input;
3623 	int i;
3624 
3625 	if (kstrtobool(buf, &input))
3626 		return -EINVAL;
3627 
3628 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3629 			       GFP_KERNEL);
3630 	if (!new_wi_state)
3631 		return -ENOMEM;
3632 	for (i = 0; i < nr_node_ids; i++)
3633 		new_wi_state->iw_table[i] = 1;
3634 
3635 	mutex_lock(&wi_state_lock);
3636 	if (!input) {
3637 		old_wi_state = rcu_dereference_protected(wi_state,
3638 					lockdep_is_held(&wi_state_lock));
3639 		if (!old_wi_state)
3640 			goto update_wi_state;
3641 		if (input == old_wi_state->mode_auto) {
3642 			mutex_unlock(&wi_state_lock);
3643 			return count;
3644 		}
3645 
3646 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3647 					       nr_node_ids * sizeof(u8));
3648 		goto update_wi_state;
3649 	}
3650 
3651 	bw = node_bw_table;
3652 	if (!bw) {
3653 		mutex_unlock(&wi_state_lock);
3654 		kfree(new_wi_state);
3655 		return -ENODEV;
3656 	}
3657 
3658 	new_wi_state->mode_auto = true;
3659 	reduce_interleave_weights(bw, new_wi_state->iw_table);
3660 
3661 update_wi_state:
3662 	rcu_assign_pointer(wi_state, new_wi_state);
3663 	mutex_unlock(&wi_state_lock);
3664 	if (old_wi_state) {
3665 		synchronize_rcu();
3666 		kfree(old_wi_state);
3667 	}
3668 	return count;
3669 }
3670 
3671 static void sysfs_wi_node_delete(int nid)
3672 {
3673 	struct iw_node_attr *attr;
3674 
3675 	if (nid < 0 || nid >= nr_node_ids)
3676 		return;
3677 
3678 	mutex_lock(&wi_group->kobj_lock);
3679 	attr = wi_group->nattrs[nid];
3680 	if (!attr) {
3681 		mutex_unlock(&wi_group->kobj_lock);
3682 		return;
3683 	}
3684 
3685 	wi_group->nattrs[nid] = NULL;
3686 	mutex_unlock(&wi_group->kobj_lock);
3687 
3688 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3689 	kfree(attr->kobj_attr.attr.name);
3690 	kfree(attr);
3691 }
3692 
3693 static void sysfs_wi_node_delete_all(void)
3694 {
3695 	int nid;
3696 
3697 	for (nid = 0; nid < nr_node_ids; nid++)
3698 		sysfs_wi_node_delete(nid);
3699 }
3700 
3701 static void wi_state_free(void)
3702 {
3703 	struct weighted_interleave_state *old_wi_state;
3704 
3705 	mutex_lock(&wi_state_lock);
3706 
3707 	old_wi_state = rcu_dereference_protected(wi_state,
3708 			lockdep_is_held(&wi_state_lock));
3709 	if (!old_wi_state) {
3710 		mutex_unlock(&wi_state_lock);
3711 		return;
3712 	}
3713 
3714 	rcu_assign_pointer(wi_state, NULL);
3715 	mutex_unlock(&wi_state_lock);
3716 	synchronize_rcu();
3717 	kfree(old_wi_state);
3718 }
3719 
/*
 * The "auto" file in the weighted_interleave directory (mode 0664), backed
 * by weighted_interleave_auto_show() and weighted_interleave_auto_store().
 */
static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
			   weighted_interleave_auto_store);
3723 
3724 static void wi_cleanup(void) {
3725 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3726 	sysfs_wi_node_delete_all();
3727 	wi_state_free();
3728 }
3729 
/*
 * Release callback of the weighted_interleave kobject (installed through
 * wi_ktype.release): frees the sysfs_wi_group that embeds the kobject.
 */
static void wi_kobj_release(struct kobject *wi_kobj)
{
	kfree(wi_group);
}
3734 
/*
 * kobj_type of the weighted_interleave kobject: attribute access goes
 * through the generic kobj_sysfs_ops, and wi_kobj_release() runs as the
 * release callback.
 */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};
3739 
3740 static int sysfs_wi_node_add(int nid)
3741 {
3742 	int ret;
3743 	char *name;
3744 	struct iw_node_attr *new_attr;
3745 
3746 	if (nid < 0 || nid >= nr_node_ids) {
3747 		pr_err("invalid node id: %d\n", nid);
3748 		return -EINVAL;
3749 	}
3750 
3751 	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3752 	if (!new_attr)
3753 		return -ENOMEM;
3754 
3755 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3756 	if (!name) {
3757 		kfree(new_attr);
3758 		return -ENOMEM;
3759 	}
3760 
3761 	sysfs_attr_init(&new_attr->kobj_attr.attr);
3762 	new_attr->kobj_attr.attr.name = name;
3763 	new_attr->kobj_attr.attr.mode = 0644;
3764 	new_attr->kobj_attr.show = node_show;
3765 	new_attr->kobj_attr.store = node_store;
3766 	new_attr->nid = nid;
3767 
3768 	mutex_lock(&wi_group->kobj_lock);
3769 	if (wi_group->nattrs[nid]) {
3770 		mutex_unlock(&wi_group->kobj_lock);
3771 		ret = -EEXIST;
3772 		goto out;
3773 	}
3774 
3775 	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3776 	if (ret) {
3777 		mutex_unlock(&wi_group->kobj_lock);
3778 		goto out;
3779 	}
3780 	wi_group->nattrs[nid] = new_attr;
3781 	mutex_unlock(&wi_group->kobj_lock);
3782 	return 0;
3783 
3784 out:
3785 	kfree(new_attr->kobj_attr.attr.name);
3786 	kfree(new_attr);
3787 	return ret;
3788 }
3789 
3790 static int wi_node_notifier(struct notifier_block *nb,
3791 			       unsigned long action, void *data)
3792 {
3793 	int err;
3794 	struct memory_notify *arg = data;
3795 	int nid = arg->status_change_nid;
3796 
3797 	if (nid < 0)
3798 		return NOTIFY_OK;
3799 
3800 	switch (action) {
3801 	case MEM_ONLINE:
3802 		err = sysfs_wi_node_add(nid);
3803 		if (err)
3804 			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3805 			       nid, err);
3806 		break;
3807 	case MEM_OFFLINE:
3808 		sysfs_wi_node_delete(nid);
3809 		break;
3810 	}
3811 
3812 	return NOTIFY_OK;
3813 }
3814 
3815 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3816 {
3817 	int nid, err;
3818 
3819 	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
3820 			   GFP_KERNEL);
3821 	if (!wi_group)
3822 		return -ENOMEM;
3823 	mutex_init(&wi_group->kobj_lock);
3824 
3825 	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
3826 				   "weighted_interleave");
3827 	if (err)
3828 		goto err_put_kobj;
3829 
3830 	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3831 	if (err)
3832 		goto err_put_kobj;
3833 
3834 	for_each_online_node(nid) {
3835 		if (!node_state(nid, N_MEMORY))
3836 			continue;
3837 
3838 		err = sysfs_wi_node_add(nid);
3839 		if (err) {
3840 			pr_err("failed to add sysfs for node%d during init: %d\n",
3841 			       nid, err);
3842 			goto err_cleanup_kobj;
3843 		}
3844 	}
3845 
3846 	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3847 	return 0;
3848 
3849 err_cleanup_kobj:
3850 	wi_cleanup();
3851 	kobject_del(&wi_group->wi_kobj);
3852 err_put_kobj:
3853 	kobject_put(&wi_group->wi_kobj);
3854 	return err;
3855 }
3856 
3857 static int __init mempolicy_sysfs_init(void)
3858 {
3859 	int err;
3860 	static struct kobject *mempolicy_kobj;
3861 
3862 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3863 	if (!mempolicy_kobj)
3864 		return -ENOMEM;
3865 
3866 	err = add_weighted_interleave_group(mempolicy_kobj);
3867 	if (err)
3868 		goto err_kobj;
3869 
3870 	return 0;
3871 
3872 err_kobj:
3873 	kobject_del(mempolicy_kobj);
3874 	kobject_put(mempolicy_kobj);
3875 	return err;
3876 }
3877 
3878 late_initcall(mempolicy_sysfs_init);
3879 #endif /* CONFIG_SYSFS */
3880