xref: /linux/mm/mempolicy.c (revision 7203ca412fc8e8a0588e9adc0f777d3163f8dff3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always grateful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115 
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120 
121 #include "internal.h"
122 
123 /* Internal flags */
124 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
125 #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
126 #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
127 
128 static struct kmem_cache *policy_cache;
129 static struct kmem_cache *sn_cache;
130 
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
133 enum zone_type policy_zone = 0;
134 
135 /*
136  * run-time system-wide default policy => local allocation
137  */
138 static struct mempolicy default_policy = {
139 	.refcnt = ATOMIC_INIT(1), /* never free it */
140 	.mode = MPOL_LOCAL,
141 };
142 
143 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
144 
145 /*
146  * weightiness balances the tradeoff between small weights (cycles through nodes
147  * faster, more fair/even distribution) and large weights (smaller errors
148  * between actual bandwidth ratios and weight ratios). 32 is a number that has
149  * been found to perform at a reasonable compromise between the two goals.
150  */
151 static const int weightiness = 32;
152 
153 /*
154  * A null weighted_interleave_state is interpreted as having .mode="auto",
155  * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156  */
struct weighted_interleave_state {
	bool mode_auto;	/* true: weights auto-derived from node bandwidth data */
	u8 iw_table[];	/* per-node interleave weights, nr_node_ids entries */
};
161 static struct weighted_interleave_state __rcu *wi_state;
162 static unsigned int *node_bw_table;
163 
164 /*
165  * wi_state_lock protects both wi_state and node_bw_table.
166  * node_bw_table is only used by writers to update wi_state.
167  */
168 static DEFINE_MUTEX(wi_state_lock);
169 
170 static u8 get_il_weight(int node)
171 {
172 	struct weighted_interleave_state *state;
173 	u8 weight = 1;
174 
175 	rcu_read_lock();
176 	state = rcu_dereference(wi_state);
177 	if (state)
178 		weight = state->iw_table[node];
179 	rcu_read_unlock();
180 	return weight;
181 }
182 
183 /*
184  * Convert bandwidth values into weighted interleave weights.
185  * Call with wi_state_lock.
186  */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	/* Total bandwidth over all memory nodes; each weight is a share of it */
	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			new_iw[nid] = 1;
		}
		/* Track the running GCD of all weights computed so far */
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}
219 
/**
 * mempolicy_set_node_perf - record a node's bandwidth; refresh auto weights
 * @node: node whose performance data changed
 * @coords: access coordinates; min(read, write) bandwidth is used
 *
 * Always records the node's bandwidth in node_bw_table.  Unless weighted
 * interleave is in manual mode, also recomputes the weight table from the
 * new bandwidth data and publishes it via RCU.
 *
 * Return: 0 on success, -ENOMEM on allocation failure.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		/* Wait for lockless readers of the old table before freeing */
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	kfree(old_bw);
	return 0;
}
277 
278 /**
279  * numa_nearest_node - Find nearest node by state
280  * @node: Node id to start the search
281  * @state: State to filter the search
282  *
283  * Lookup the closest node by distance if @nid is not in state.
284  *
285  * Return: this @node if it is in state, otherwise the closest node by distance
286  */
287 int numa_nearest_node(int node, unsigned int state)
288 {
289 	int min_dist = INT_MAX, dist, n, min_node;
290 
291 	if (state >= NR_NODE_STATES)
292 		return -EINVAL;
293 
294 	if (node == NUMA_NO_NODE || node_state(node, state))
295 		return node;
296 
297 	min_node = node;
298 	for_each_node_state(n, state) {
299 		dist = node_distance(node, n);
300 		if (dist < min_dist) {
301 			min_dist = dist;
302 			min_node = n;
303 		}
304 	}
305 
306 	return min_node;
307 }
308 EXPORT_SYMBOL_GPL(numa_nearest_node);
309 
310 /**
311  * nearest_node_nodemask - Find the node in @mask at the nearest distance
312  *			   from @node.
313  *
314  * @node: a valid node ID to start the search from.
315  * @mask: a pointer to a nodemask representing the allowed nodes.
316  *
317  * This function iterates over all nodes in @mask and calculates the
318  * distance from the starting @node, then it returns the node ID that is
319  * the closest to @node, or MAX_NUMNODES if no node is found.
320  *
321  * Note that @node must be a valid node ID usable with node_distance(),
322  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
323  * or unexpected behavior.
324  */
325 int nearest_node_nodemask(int node, nodemask_t *mask)
326 {
327 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
328 
329 	for_each_node_mask(n, *mask) {
330 		dist = node_distance(node, n);
331 		if (dist < min_dist) {
332 			min_dist = dist;
333 			min_node = n;
334 		}
335 	}
336 
337 	return min_node;
338 }
339 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
340 
341 struct mempolicy *get_task_policy(struct task_struct *p)
342 {
343 	struct mempolicy *pol = p->mempolicy;
344 	int node;
345 
346 	if (pol)
347 		return pol;
348 
349 	node = numa_node_id();
350 	if (node != NUMA_NO_NODE) {
351 		pol = &preferred_node_policy[node];
352 		/* preferred_node_policy is not initialised early in boot */
353 		if (pol->mode)
354 			return pol;
355 	}
356 
357 	return &default_policy;
358 }
359 
360 static const struct mempolicy_operations {
361 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
362 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
363 } mpol_ops[MPOL_MAX];
364 
/*
 * Nonzero if the policy remembers the user's original nodemask
 * (MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES) for use on rebind.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}
369 
/*
 * Map a MPOL_F_RELATIVE_NODES nodemask @orig onto the allowed set @rel:
 * fold @orig down to the weight of @rel, then remap it onto @rel's bits.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
377 
/* Common .create: store the (non-empty) effective nodemask in the policy */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}
385 
386 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
387 {
388 	if (nodes_empty(*nodes))
389 		return -EINVAL;
390 
391 	nodes_clear(pol->nodes);
392 	node_set(first_node(*nodes), pol->nodes);
393 	return 0;
394 }
395 
396 /*
397  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
398  * any, for the new policy.  mpol_new() has already validated the nodes
399  * parameter with respect to the policy mode and flags.
400  *
401  * Must be called holding task's alloc_lock to protect task's mems_allowed
402  * and mempolicy.  May also be called holding the mmap_lock for write.
403  */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY: only allowed nodes that actually have memory */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	/* Compute the effective nodemask per relative/static semantics */
	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	/* Remember what to rebind against when the allowed set changes */
	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}
436 
437 /*
438  * This function just creates a new policy, does some check and simple
439  * initialization. You must invoke mpol_set_nodemask() to set nodes.
440  */
441 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
442 				  nodemask_t *nodes)
443 {
444 	struct mempolicy *policy;
445 
446 	if (mode == MPOL_DEFAULT) {
447 		if (nodes && !nodes_empty(*nodes))
448 			return ERR_PTR(-EINVAL);
449 		return NULL;
450 	}
451 	VM_BUG_ON(!nodes);
452 
453 	/*
454 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
455 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
456 	 * All other modes require a valid pointer to a non-empty nodemask.
457 	 */
458 	if (mode == MPOL_PREFERRED) {
459 		if (nodes_empty(*nodes)) {
460 			if (((flags & MPOL_F_STATIC_NODES) ||
461 			     (flags & MPOL_F_RELATIVE_NODES)))
462 				return ERR_PTR(-EINVAL);
463 
464 			mode = MPOL_LOCAL;
465 		}
466 	} else if (mode == MPOL_LOCAL) {
467 		if (!nodes_empty(*nodes) ||
468 		    (flags & MPOL_F_STATIC_NODES) ||
469 		    (flags & MPOL_F_RELATIVE_NODES))
470 			return ERR_PTR(-EINVAL);
471 	} else if (nodes_empty(*nodes))
472 		return ERR_PTR(-EINVAL);
473 
474 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
475 	if (!policy)
476 		return ERR_PTR(-ENOMEM);
477 	atomic_set(&policy->refcnt, 1);
478 	policy->mode = mode;
479 	policy->flags = flags;
480 	policy->home_node = NUMA_NO_NODE;
481 
482 	return policy;
483 }
484 
485 /* Slow path of a mpol destructor. */
486 void __mpol_put(struct mempolicy *pol)
487 {
488 	if (!atomic_dec_and_test(&pol->refcnt))
489 		return;
490 	kmem_cache_free(policy_cache, pol);
491 }
492 
/* Modes without a stored nodemask (DEFAULT, LOCAL): rebind is a no-op */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
496 
/* Rebind a bind/interleave/weighted-interleave policy to new allowed nodes */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		/* Static: keep only the user's nodes that are still allowed */
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/* Default: remap current nodes from the old set onto the new */
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	/* Never leave the policy with an empty nodemask */
	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}
516 
/*
 * MPOL_PREFERRED/MPOL_PREFERRED_MANY rebind: only record the new allowed
 * set; the preferred nodes themselves are left as the user chose them.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
522 
523 /*
524  * mpol_rebind_policy - Migrate a policy to a different set of nodes
525  *
526  * Per-vma policies are protected by mmap_lock. Allocations using per-task
527  * policies are protected by task->mems_allowed_seq to prevent a premature
528  * OOM/allocation failure due to parallel nodemask modification.
529  */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	/* Default/local policies carry no nodemask to rebind */
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	/* Nothing to do if the allowed set did not actually change */
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}
540 
541 /*
542  * Wrapper for mpol_rebind_policy() that just requires task
543  * pointer, and updates task mempolicy.
544  *
545  * Called with task's alloc_lock held.
546  */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	/* tsk->mempolicy may be NULL; mpol_rebind_policy() handles that */
	mpol_rebind_policy(tsk->mempolicy, new);
}
551 
552 /*
553  * Rebind each vma in mm to new nodemask.
554  *
555  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
556  */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		/* Exclude per-vma-lock readers while the policy is rebound */
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}
569 
/*
 * Per-mode create/rebind operations.  Modes without a .create callback
 * (MPOL_DEFAULT, MPOL_LOCAL) take no nodemask and need no constructor.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};
598 
599 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
600 				unsigned long flags);
601 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
602 				pgoff_t ilx, int *nid);
603 
604 static bool strictly_unmovable(unsigned long flags)
605 {
606 	/*
607 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
608 	 * if any misplaced page is found.
609 	 */
610 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
611 			 MPOL_MF_STRICT;
612 }
613 
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;
	pgoff_t ilx;
};

/* State threaded through the page-table walk in queue_pages_range() */
struct queue_pages {
	struct list_head *pagelist;	/* isolated folios queued for migration */
	unsigned long flags;		/* MPOL_MF_* control flags */
	nodemask_t *nmask;		/* required (or inverted) node set */
	unsigned long start;		/* range being checked */
	unsigned long end;
	struct vm_area_struct *first;	/* first vma walked; NULL => hole */
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
629 
630 /*
631  * Check if the folio's nid is in qp->nmask.
632  *
633  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
634  * in the invert of qp->nmask.
635  */
636 static inline bool queue_folio_required(struct folio *folio,
637 					struct queue_pages *qp)
638 {
639 	int nid = folio_nid(folio);
640 	unsigned long flags = qp->flags;
641 
642 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
643 }
644 
/*
 * Queue a pmd-mapped THP for migration if it is misplaced.  A migration
 * entry or a failed isolation is counted in qp->nr_failed.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	if (unlikely(pmd_is_migration_entry(*pmd))) {
		/* Already being migrated; cannot queue it again */
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	if (is_huge_zero_folio(folio)) {
		/* Skip the huge zero folio, keep walking */
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}
666 
667 /*
668  * Scan through folios, checking if they satisfy the required conditions,
669  * moving them from LRU to local pagelist for migration if they do (or not).
670  *
671  * queue_folios_pte_range() has two possible return values:
672  * 0 - continue walking to scan for more, even if an existing folio on the
673  *     wrong node could not be isolated and queued for migration.
674  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
675  *        and an existing folio was on a node that does not follow the policy.
676  */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A pmd-mapped THP is handled in one go by queue_folios_pmd() */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* PTE table not mappable here: ask the walker to retry */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	/* nr batches consecutive PTEs that map the same large folio */
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			const softleaf_t entry = softleaf_from_pte(ptent);

			/* A folio already under migration counts as a failure */
			if (softleaf_is_migration(entry))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, pte, ptent, max_nr);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
763 
/*
 * Queue a hugetlb folio for migration if it is misplaced; failures are
 * counted in qp->nr_failed.  Without CONFIG_HUGETLB_PAGE this always
 * returns 0.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptep;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptep = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptep)) {
		if (!huge_pte_none(ptep)) {
			const softleaf_t entry = softleaf_from_pte(ptep);

			/* A folio already under migration counts as a failure */
			if (unlikely(softleaf_is_migration(entry)))
				qp->nr_failed++;
		}

		goto unlock;
	}
	folio = pfn_folio(pte_pfn(ptep));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
813 
814 #ifdef CONFIG_NUMA_BALANCING
815 /**
816  * folio_can_map_prot_numa() - check whether the folio can map prot numa
817  * @folio: The folio whose mapping considered for being made NUMA hintable
818  * @vma: The VMA that the folio belongs to.
819  * @is_private_single_threaded: Is this a single-threaded private VMA or not
820  *
821  * This function checks to see if the folio actually indicates that
822  * we need to make the mapping one which causes a NUMA hinting fault,
823  * as there are cases where it's simply unnecessary, and the folio's
824  * access time is adjusted for memory tiering if prot numa needed.
825  *
826  * Return: True if the mapping of the folio needs to be changed, false otherwise.
827  */
bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
		bool is_private_single_threaded)
{
	int nid;

	/* Zone-device and KSM folios are not candidates for NUMA hinting */
	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
		return false;

	/* Also skip shared copy-on-write folios */
	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
		return false;

	/* Folios are pinned and can't be migrated */
	if (folio_maybe_dma_pinned(folio))
		return false;

	/*
	 * While migration can move some dirty folios,
	 * it cannot move them all from MIGRATE_ASYNC
	 * context.
	 */
	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
		return false;

	/*
	 * Don't mess with PTEs if folio is already on the node
	 * a single-threaded process is running on.
	 */
	nid = folio_nid(folio);
	if (is_private_single_threaded && (nid == numa_node_id()))
		return false;

	/*
	 * Skip scanning top tier node if normal numa
	 * balancing is disabled
	 */
	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	    node_is_toptier(nid))
		return false;

	/* Record the access time for memory tiering before making hintable */
	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	return true;
}
873 
874 /*
875  * This is used to mark a range of virtual addresses to be inaccessible.
876  * These are later cleared by a NUMA hinting fault. Depending on these
877  * faults, pages may be migrated for better NUMA placement.
878  *
879  * This is assuming that NUMA faults are handled using PROT_NONE. If
880  * an architecture makes a different choice, it will need further
881  * changes to the core.
882  */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	/* MM_CP_PROT_NUMA: make the range inaccessible to trigger NUMA faults */
	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
901 #endif /* CONFIG_NUMA_BALANCING */
902 
/*
 * ->test_walk callback: decide whether this vma needs scanning at all.
 * Returns -EFAULT on a hole in the range (unless MPOL_MF_DISCONTIG_OK),
 * 1 to skip the vma, 0 to scan it.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}
943 
/* Walk callbacks for queue_pages_range(); vmas are read-locked */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

/* As above, but write-locks each vma walked (for MPOL_MF_WRLOCK callers) */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
957 
958 /*
959  * Walk through page tables and collect pages to be migrated.
960  *
961  * If pages found in a given range are not on the required set of @nodes,
962  * and migration is allowed, they are isolated and queued to @pagelist.
963  *
964  * queue_pages_range() may return:
965  * 0 - all pages already on the right node, or successfully queued for moving
966  *     (or neither strict checking nor moving requested: only range checking).
967  * >0 - this number of misplaced folios could not be queued for moving
968  *      (a hugetlbfs page or a transparent huge page being counted as 1).
969  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
970  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
971  */
972 static long
973 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
974 		nodemask_t *nodes, unsigned long flags,
975 		struct list_head *pagelist)
976 {
977 	int err;
978 	struct queue_pages qp = {
979 		.pagelist = pagelist,
980 		.flags = flags,
981 		.nmask = nodes,
982 		.start = start,
983 		.end = end,
984 		.first = NULL,
985 	};
986 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
987 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
988 
989 	err = walk_page_range(mm, start, end, ops, &qp);
990 
991 	if (!qp.first)
992 		/* whole range in hole */
993 		err = -EFAULT;
994 
995 	return err ? : qp.nr_failed;
996 }
997 
998 /*
999  * Apply policy to a single VMA
1000  * This must be called with the mmap_lock held for writing.
1001  */
1002 static int vma_replace_policy(struct vm_area_struct *vma,
1003 				struct mempolicy *pol)
1004 {
1005 	int err;
1006 	struct mempolicy *old;
1007 	struct mempolicy *new;
1008 
1009 	vma_assert_write_locked(vma);
1010 
1011 	new = mpol_dup(pol);
1012 	if (IS_ERR(new))
1013 		return PTR_ERR(new);
1014 
1015 	if (vma->vm_ops && vma->vm_ops->set_policy) {
1016 		err = vma->vm_ops->set_policy(vma, new);
1017 		if (err)
1018 			goto err_out;
1019 	}
1020 
1021 	old = vma->vm_policy;
1022 	vma->vm_policy = new; /* protected by mmap_lock */
1023 	mpol_put(old);
1024 
1025 	return 0;
1026  err_out:
1027 	mpol_put(new);
1028 	return err;
1029 }
1030 
1031 /* Split or merge the VMA (if required) and apply the new policy */
1032 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
1033 		struct vm_area_struct **prev, unsigned long start,
1034 		unsigned long end, struct mempolicy *new_pol)
1035 {
1036 	unsigned long vmstart, vmend;
1037 
1038 	vmend = min(end, vma->vm_end);
1039 	if (start > vma->vm_start) {
1040 		*prev = vma;
1041 		vmstart = start;
1042 	} else {
1043 		vmstart = vma->vm_start;
1044 	}
1045 
1046 	if (mpol_equal(vma->vm_policy, new_pol)) {
1047 		*prev = vma;
1048 		return 0;
1049 	}
1050 
1051 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
1052 	if (IS_ERR(vma))
1053 		return PTR_ERR(vma);
1054 
1055 	*prev = vma;
1056 	return vma_replace_policy(vma, new_pol);
1057 }
1058 
/*
 * Set the process memory policy.
 *
 * Builds a new mempolicy from @mode/@flags/@nodes and installs it as
 * current->mempolicy, dropping the reference on the old policy.
 * Returns 0 on success or a negative errno (-ENOMEM, or whatever
 * mpol_new()/mpol_set_nodemask() report).
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* hold the task's alloc_lock across nodemask setup and the swap */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		/*
		 * Reset interleave state: il_prev = MAX_NUMNODES-1 makes the
		 * next next_node_in() wrap to the first node of the mask.
		 */
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
1098 
1099 /*
1100  * Return nodemask for policy for get_mempolicy() query
1101  *
1102  * Called with task's alloc_lock held
1103  */
1104 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1105 {
1106 	nodes_clear(*nodes);
1107 	if (pol == &default_policy)
1108 		return;
1109 
1110 	switch (pol->mode) {
1111 	case MPOL_BIND:
1112 	case MPOL_INTERLEAVE:
1113 	case MPOL_PREFERRED:
1114 	case MPOL_PREFERRED_MANY:
1115 	case MPOL_WEIGHTED_INTERLEAVE:
1116 		*nodes = pol->nodes;
1117 		break;
1118 	case MPOL_LOCAL:
1119 		/* return empty node mask for local allocation */
1120 		break;
1121 	default:
1122 		BUG();
1123 	}
1124 }
1125 
1126 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1127 {
1128 	struct page *p = NULL;
1129 	int ret;
1130 
1131 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1132 	if (ret > 0) {
1133 		ret = page_to_nid(p);
1134 		put_page(p);
1135 	}
1136 	return ret;
1137 }
1138 
/*
 * Retrieve NUMA policy for get_mempolicy(2).
 *
 * @policy: out - mode (plus mode flags) or, with MPOL_F_NODE, a node id
 * @nmask:  out - policy nodemask (or mems_allowed with MPOL_F_MEMS_ALLOWED)
 * @addr:   address to query when MPOL_F_ADDR is set
 * @flags:  MPOL_F_NODE / MPOL_F_ADDR / MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 or a negative errno.
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	/* MPOL_F_MEMS_ALLOWED: report cpuset mems only, exclusive of the rest */
	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* report the node the next interleave would use */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/* weight remaining on il_prev: that node is next */
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1240 
1241 #ifdef CONFIG_MIGRATION
1242 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1243 				unsigned long flags)
1244 {
1245 	/*
1246 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1247 	 * Choosing not to migrate a shared folio is not counted as a failure.
1248 	 *
1249 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1250 	 * cannot easily detect if a folio is shared.
1251 	 */
1252 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1253 		if (folio_isolate_lru(folio)) {
1254 			list_add_tail(&folio->lru, foliolist);
1255 			node_stat_mod_folio(folio,
1256 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1257 				folio_nr_pages(folio));
1258 		} else {
1259 			/*
1260 			 * Non-movable folio may reach here.  And, there may be
1261 			 * temporary off LRU folios or non-LRU movable folios.
1262 			 * Treat them as unmovable folios since they can't be
1263 			 * isolated, so they can't be moved at the moment.
1264 			 */
1265 			return false;
1266 		}
1267 	}
1268 	return true;
1269 }
1270 
1271 /*
1272  * Migrate pages from one node to a target node.
1273  * Returns error or the number of pages not migrated.
1274  */
1275 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1276 			    int flags)
1277 {
1278 	nodemask_t nmask;
1279 	struct vm_area_struct *vma;
1280 	LIST_HEAD(pagelist);
1281 	long nr_failed;
1282 	long err = 0;
1283 	struct migration_target_control mtc = {
1284 		.nid = dest,
1285 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1286 		.reason = MR_SYSCALL,
1287 	};
1288 
1289 	nodes_clear(nmask);
1290 	node_set(source, nmask);
1291 
1292 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1293 
1294 	mmap_read_lock(mm);
1295 	vma = find_vma(mm, 0);
1296 	if (unlikely(!vma)) {
1297 		mmap_read_unlock(mm);
1298 		return 0;
1299 	}
1300 
1301 	/*
1302 	 * This does not migrate the range, but isolates all pages that
1303 	 * need migration.  Between passing in the full user address
1304 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1305 	 * but passes back the count of pages which could not be isolated.
1306 	 */
1307 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1308 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1309 	mmap_read_unlock(mm);
1310 
1311 	if (!list_empty(&pagelist)) {
1312 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1313 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1314 		if (err)
1315 			putback_movable_pages(&pagelist);
1316 	}
1317 
1318 	if (err >= 0)
1319 		err += nr_failed;
1320 	return err;
1321 }
1322 
1323 /*
1324  * Move pages between the two nodesets so as to preserve the physical
1325  * layout as much as possible.
1326  *
1327  * Returns the number of page that could not be moved.
1328  */
1329 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1330 		     const nodemask_t *to, int flags)
1331 {
1332 	long nr_failed = 0;
1333 	long err = 0;
1334 	nodemask_t tmp;
1335 
1336 	lru_cache_disable();
1337 
1338 	/*
1339 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1340 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1341 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1342 	 * The pair of nodemasks 'to' and 'from' define the map.
1343 	 *
1344 	 * If no pair of bits is found that way, fallback to picking some
1345 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1346 	 * 'source' and 'dest' bits are the same, this represents a node
1347 	 * that will be migrating to itself, so no pages need move.
1348 	 *
1349 	 * If no bits are left in 'tmp', or if all remaining bits left
1350 	 * in 'tmp' correspond to the same bit in 'to', return false
1351 	 * (nothing left to migrate).
1352 	 *
1353 	 * This lets us pick a pair of nodes to migrate between, such that
1354 	 * if possible the dest node is not already occupied by some other
1355 	 * source node, minimizing the risk of overloading the memory on a
1356 	 * node that would happen if we migrated incoming memory to a node
1357 	 * before migrating outgoing memory source that same node.
1358 	 *
1359 	 * A single scan of tmp is sufficient.  As we go, we remember the
1360 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1361 	 * that not only moved, but what's better, moved to an empty slot
1362 	 * (d is not set in tmp), then we break out then, with that pair.
1363 	 * Otherwise when we finish scanning from_tmp, we at least have the
1364 	 * most recent <s, d> pair that moved.  If we get all the way through
1365 	 * the scan of tmp without finding any node that moved, much less
1366 	 * moved to an empty node, then there is nothing left worth migrating.
1367 	 */
1368 
1369 	tmp = *from;
1370 	while (!nodes_empty(tmp)) {
1371 		int s, d;
1372 		int source = NUMA_NO_NODE;
1373 		int dest = 0;
1374 
1375 		for_each_node_mask(s, tmp) {
1376 
1377 			/*
1378 			 * do_migrate_pages() tries to maintain the relative
1379 			 * node relationship of the pages established between
1380 			 * threads and memory areas.
1381                          *
1382 			 * However if the number of source nodes is not equal to
1383 			 * the number of destination nodes we can not preserve
1384 			 * this node relative relationship.  In that case, skip
1385 			 * copying memory from a node that is in the destination
1386 			 * mask.
1387 			 *
1388 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1389 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1390 			 */
1391 
1392 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1393 						(node_isset(s, *to)))
1394 				continue;
1395 
1396 			d = node_remap(s, *from, *to);
1397 			if (s == d)
1398 				continue;
1399 
1400 			source = s;	/* Node moved. Memorize */
1401 			dest = d;
1402 
1403 			/* dest not in remaining from nodes? */
1404 			if (!node_isset(dest, tmp))
1405 				break;
1406 		}
1407 		if (source == NUMA_NO_NODE)
1408 			break;
1409 
1410 		node_clear(source, tmp);
1411 		err = migrate_to_node(mm, source, dest, flags);
1412 		if (err > 0)
1413 			nr_failed += err;
1414 		if (err < 0)
1415 			break;
1416 	}
1417 
1418 	lru_cache_enable();
1419 	if (err < 0)
1420 		return err;
1421 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1422 }
1423 
1424 /*
1425  * Allocate a new folio for page migration, according to NUMA mempolicy.
1426  */
1427 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1428 						    unsigned long private)
1429 {
1430 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1431 	struct mempolicy *pol = mmpol->pol;
1432 	pgoff_t ilx = mmpol->ilx;
1433 	unsigned int order;
1434 	int nid = numa_node_id();
1435 	gfp_t gfp;
1436 
1437 	order = folio_order(src);
1438 	ilx += src->index >> order;
1439 
1440 	if (folio_test_hugetlb(src)) {
1441 		nodemask_t *nodemask;
1442 		struct hstate *h;
1443 
1444 		h = folio_hstate(src);
1445 		gfp = htlb_alloc_mask(h);
1446 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1447 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1448 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1449 	}
1450 
1451 	if (folio_test_large(src))
1452 		gfp = GFP_TRANSHUGE;
1453 	else
1454 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1455 
1456 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1457 }
1458 #else
1459 
/* !CONFIG_MIGRATION stub: nothing can be isolated, report unmovable. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}
1465 
/* !CONFIG_MIGRATION stub: migrate_pages(2) is unsupported. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1471 
/* !CONFIG_MIGRATION stub: no migration target can be allocated. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
1477 #endif
1478 
/*
 * Core of mbind(2): apply policy @mode/@mode_flags/@nmask to the range
 * [@start, @start+@len), optionally migrating misplaced pages when
 * MPOL_MF_MOVE / MPOL_MF_MOVE_ALL are set in @flags.
 *
 * Returns 0, or a negative errno (-EINVAL, -EPERM, -ENOMEM, -EFAULT, or
 * -EIO when MPOL_MF_STRICT is set and pages could not be moved).
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	/* moving pages it does not own requires CAP_SYS_NICE */
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* on success the mmap write lock stays held below */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		/* install the new policy on every VMA in the range */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* find the first non-KSM folio queued for migration */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				/* locate the VMA that maps that folio */
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	/* put back any folios migrate_pages() could not move */
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
1623 
1624 /*
1625  * User space interface with variable sized bitmaps for nodelists.
1626  */
1627 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1628 		      unsigned long maxnode)
1629 {
1630 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1631 	int ret;
1632 
1633 	if (in_compat_syscall())
1634 		ret = compat_get_bitmap(mask,
1635 					(const compat_ulong_t __user *)nmask,
1636 					maxnode);
1637 	else
1638 		ret = copy_from_user(mask, nmask,
1639 				     nlongs * sizeof(unsigned long));
1640 
1641 	if (ret)
1642 		return -EFAULT;
1643 
1644 	if (maxnode % BITS_PER_LONG)
1645 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1646 
1647 	return 0;
1648 }
1649 
/*
 * Copy a node mask from user space.
 *
 * @maxnode is the userspace bit count plus one (syscall convention);
 * bits at or above MAX_NUMNODES must all be zero, otherwise -EINVAL.
 * Returns 0, -EINVAL, or -EFAULT.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		/* fetch the word containing bit maxnode-1 */
		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			/* whole word is above the supported range */
			maxnode -= bits;
		} else {
			/* word straddles MAX_NUMNODES: test only the excess bits */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
1685 
/*
 * Copy a kernel node mask to user space.
 *
 * Zero-fills the tail when the user buffer is larger than the kernel's
 * node mask; handles compat callers.  Returns 0, -EINVAL, or -EFAULT.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* user buffer longer than kernel mask: clear the excess */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
1712 
1713 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1714 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1715 {
1716 	*flags = *mode & MPOL_MODE_FLAGS;
1717 	*mode &= ~MPOL_MODE_FLAGS;
1718 
1719 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1720 		return -EINVAL;
1721 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1722 		return -EINVAL;
1723 	if (*flags & MPOL_F_NUMA_BALANCING) {
1724 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1725 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1726 		else
1727 			return -EINVAL;
1728 	}
1729 	return 0;
1730 }
1731 
1732 static long kernel_mbind(unsigned long start, unsigned long len,
1733 			 unsigned long mode, const unsigned long __user *nmask,
1734 			 unsigned long maxnode, unsigned int flags)
1735 {
1736 	unsigned short mode_flags;
1737 	nodemask_t nodes;
1738 	int lmode = mode;
1739 	int err;
1740 
1741 	start = untagged_addr(start);
1742 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1743 	if (err)
1744 		return err;
1745 
1746 	err = get_nodes(&nodes, nmask, maxnode);
1747 	if (err)
1748 		return err;
1749 
1750 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1751 }
1752 
/*
 * set_mempolicy_home_node(2): set the preferred ("home") node for the
 * existing MPOL_BIND / MPOL_PREFERRED_MANY policies of every VMA in
 * [start, start+len).  VMAs without a policy are skipped; any other
 * policy mode yields -EOPNOTSUPP.
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* duplicate so the home node change is private to this VMA */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		/* mbind_range() installed its own copy; drop ours */
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1819 
/* mbind(2) syscall entry point: thin wrapper around kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1826 
1827 /* Set the process memory policy */
1828 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1829 				 unsigned long maxnode)
1830 {
1831 	unsigned short mode_flags;
1832 	nodemask_t nodes;
1833 	int lmode = mode;
1834 	int err;
1835 
1836 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1837 	if (err)
1838 		return err;
1839 
1840 	err = get_nodes(&nodes, nmask, maxnode);
1841 	if (err)
1842 		return err;
1843 
1844 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1845 }
1846 
/* set_mempolicy(2) syscall entry point: thin wrapper around kernel_set_mempolicy(). */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1852 
/*
 * Core of migrate_pages(2): move the pages of task @pid (or the current
 * task when @pid is 0) from the @old_nodes set to the @new_nodes set,
 * after permission and cpuset checks.
 *
 * Returns the number of pages that could not be moved, or a negative
 * errno (-ENOMEM, -EFAULT, -ESRCH, -EPERM, -EINVAL, or a security/LSM
 * error).
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	/* pin the task so it outlives the RCU read section */
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* restrict destinations to the caller's own cpuset */
	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
1939 
/* migrate_pages(2) syscall entry point: thin wrapper around kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1946 
1947 /* Retrieve NUMA policy */
1948 static int kernel_get_mempolicy(int __user *policy,
1949 				unsigned long __user *nmask,
1950 				unsigned long maxnode,
1951 				unsigned long addr,
1952 				unsigned long flags)
1953 {
1954 	int err;
1955 	int pval;
1956 	nodemask_t nodes;
1957 
1958 	if (nmask != NULL && maxnode < nr_node_ids)
1959 		return -EINVAL;
1960 
1961 	addr = untagged_addr(addr);
1962 
1963 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1964 
1965 	if (err)
1966 		return err;
1967 
1968 	if (policy && put_user(pval, policy))
1969 		return -EFAULT;
1970 
1971 	if (nmask)
1972 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1973 
1974 	return err;
1975 }
1976 
/* get_mempolicy(2) syscall entry point: thin wrapper around kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1983 
1984 bool vma_migratable(struct vm_area_struct *vma)
1985 {
1986 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1987 		return false;
1988 
1989 	/*
1990 	 * DAX device mappings require predictable access latency, so avoid
1991 	 * incurring periodic faults.
1992 	 */
1993 	if (vma_is_dax(vma))
1994 		return false;
1995 
1996 	if (is_vm_hugetlb_page(vma) &&
1997 		!hugepage_migration_supported(hstate_vma(vma)))
1998 		return false;
1999 
2000 	/*
2001 	 * Migration allocates pages in the highest zone. If we cannot
2002 	 * do so then migration (at least from node to node) is not
2003 	 * possible.
2004 	 */
2005 	if (vma->vm_file &&
2006 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
2007 			< policy_zone)
2008 		return false;
2009 	return true;
2010 }
2011 
2012 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
2013 				   unsigned long addr, pgoff_t *ilx)
2014 {
2015 	*ilx = 0;
2016 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
2017 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
2018 }
2019 
2020 /*
2021  * get_vma_policy(@vma, @addr, @order, @ilx)
2022  * @vma: virtual memory area whose policy is sought
2023  * @addr: address in @vma for shared policy lookup
2024  * @order: 0, or appropriate huge_page_order for interleaving
2025  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2026  *       MPOL_WEIGHTED_INTERLEAVE
2027  *
2028  * Returns effective policy for a VMA at specified address.
2029  * Falls back to current->mempolicy or system default policy, as necessary.
2030  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2031  * count--added by the get_policy() vm_op, as appropriate--to protect against
2032  * freeing by another task.  It is the caller's responsibility to free the
2033  * extra reference for shared policies.
2034  */
2035 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
2036 				 unsigned long addr, int order, pgoff_t *ilx)
2037 {
2038 	struct mempolicy *pol;
2039 
2040 	pol = __get_vma_policy(vma, addr, ilx);
2041 	if (!pol)
2042 		pol = get_task_policy(current);
2043 	if (pol->mode == MPOL_INTERLEAVE ||
2044 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2045 		*ilx += vma->vm_pgoff >> order;
2046 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2047 	}
2048 	return pol;
2049 }
2050 
2051 bool vma_policy_mof(struct vm_area_struct *vma)
2052 {
2053 	struct mempolicy *pol;
2054 
2055 	if (vma->vm_ops && vma->vm_ops->get_policy) {
2056 		bool ret = false;
2057 		pgoff_t ilx;		/* ignored here */
2058 
2059 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2060 		if (pol && (pol->flags & MPOL_F_MOF))
2061 			ret = true;
2062 		mpol_cond_put(pol);
2063 
2064 		return ret;
2065 	}
2066 
2067 	pol = vma->vm_policy;
2068 	if (!pol)
2069 		pol = get_task_policy(current);
2070 
2071 	return pol->flags & MPOL_F_MOF;
2072 }
2073 
2074 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2075 {
2076 	enum zone_type dynamic_policy_zone = policy_zone;
2077 
2078 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2079 
2080 	/*
2081 	 * if policy->nodes has movable memory only,
2082 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
2083 	 *
2084 	 * policy->nodes is intersect with node_states[N_MEMORY].
2085 	 * so if the following test fails, it implies
2086 	 * policy->nodes has movable memory only.
2087 	 */
2088 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2089 		dynamic_policy_zone = ZONE_MOVABLE;
2090 
2091 	return zone >= dynamic_policy_zone;
2092 }
2093 
/*
 * Pick the next node for MPOL_WEIGHTED_INTERLEAVE task policy.
 *
 * A node is handed out il_weight times before advancing to the next
 * node in the policy nodemask; current->il_prev/il_weight carry this
 * round-robin state between calls.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		/* weight exhausted, or node rebound away: advance */
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		if (node == MAX_NUMNODES)
			return node;	/* empty nodemask */
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}
2115 
/*
 * Do dynamic interleaving for a process: return the next node in the
 * policy's rotation, advancing current->il_prev.
 */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* nid == MAX_NUMNODES means the nodemask was empty; don't record it */
	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}
2132 
2133 /*
2134  * Depending on the memory policy provide a node from which to allocate the
2135  * next slab entry.
2136  */
2137 unsigned int mempolicy_slab_node(void)
2138 {
2139 	struct mempolicy *policy;
2140 	int node = numa_mem_id();
2141 
2142 	if (!in_task())
2143 		return node;
2144 
2145 	policy = current->mempolicy;
2146 	if (!policy)
2147 		return node;
2148 
2149 	switch (policy->mode) {
2150 	case MPOL_PREFERRED:
2151 		return first_node(policy->nodes);
2152 
2153 	case MPOL_INTERLEAVE:
2154 		return interleave_nodes(policy);
2155 
2156 	case MPOL_WEIGHTED_INTERLEAVE:
2157 		return weighted_interleave_nodes(policy);
2158 
2159 	case MPOL_BIND:
2160 	case MPOL_PREFERRED_MANY:
2161 	{
2162 		struct zoneref *z;
2163 
2164 		/*
2165 		 * Follow bind policy behavior and start allocation at the
2166 		 * first node.
2167 		 */
2168 		struct zonelist *zonelist;
2169 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2170 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2171 		z = first_zones_zonelist(zonelist, highest_zoneidx,
2172 							&policy->nodes);
2173 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2174 	}
2175 	case MPOL_LOCAL:
2176 		return node;
2177 
2178 	default:
2179 		BUG();
2180 	}
2181 }
2182 
/*
 * Snapshot pol->nodes into *mask and return the number of nodes set.
 * Callers iterate the snapshot rather than the live (rebindable) mask.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes. Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}
2196 
/*
 * Map interleave index @ilx onto a node for MPOL_WEIGHTED_INTERLEAVE.
 * Each node in the policy nodemask covers a span of index values equal
 * to its weight from wi_state's iw_table (weight 1 per node when
 * wi_state is uninitialized).
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();	/* empty mask: fall back to local */

	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}
2236 
2237 /*
2238  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2239  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2240  * exceeds the number of present nodes.
2241  */
2242 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2243 {
2244 	nodemask_t nodemask;
2245 	unsigned int target, nnodes;
2246 	int i;
2247 	int nid;
2248 
2249 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2250 	if (!nnodes)
2251 		return numa_node_id();
2252 	target = ilx % nnodes;
2253 	nid = first_node(nodemask);
2254 	for (i = 0; i < target; i++)
2255 		nid = next_node(nid, nodemask);
2256 	return nid;
2257 }
2258 
2259 /*
2260  * Return a nodemask representing a mempolicy for filtering nodes for
2261  * page allocation, together with preferred node id (or the input node id).
2262  */
2263 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2264 				   pgoff_t ilx, int *nid)
2265 {
2266 	nodemask_t *nodemask = NULL;
2267 
2268 	switch (pol->mode) {
2269 	case MPOL_PREFERRED:
2270 		/* Override input node id */
2271 		*nid = first_node(pol->nodes);
2272 		break;
2273 	case MPOL_PREFERRED_MANY:
2274 		nodemask = &pol->nodes;
2275 		if (pol->home_node != NUMA_NO_NODE)
2276 			*nid = pol->home_node;
2277 		break;
2278 	case MPOL_BIND:
2279 		/* Restrict to nodemask (but not on lower zones) */
2280 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2281 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2282 			nodemask = &pol->nodes;
2283 		if (pol->home_node != NUMA_NO_NODE)
2284 			*nid = pol->home_node;
2285 		/*
2286 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2287 		 * because we might easily break the expectation to stay on the
2288 		 * requested node and not break the policy.
2289 		 */
2290 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2291 		break;
2292 	case MPOL_INTERLEAVE:
2293 		/* Override input node id */
2294 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2295 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2296 		break;
2297 	case MPOL_WEIGHTED_INTERLEAVE:
2298 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2299 			weighted_interleave_nodes(pol) :
2300 			weighted_interleave_nid(pol, ilx);
2301 		break;
2302 	}
2303 
2304 	return nodemask;
2305 }
2306 
2307 #ifdef CONFIG_HUGETLBFS
2308 /*
2309  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2310  * @vma: virtual memory area whose policy is sought
2311  * @addr: address in @vma for shared policy lookup and interleave policy
2312  * @gfp_flags: for requested zone
2313  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2314  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2315  *
2316  * Returns a nid suitable for a huge page allocation and a pointer
2317  * to the struct mempolicy for conditional unref after allocation.
2318  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2319  * to the mempolicy's @nodemask for filtering the zonelist.
2320  */
2321 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2322 		struct mempolicy **mpol, nodemask_t **nodemask)
2323 {
2324 	pgoff_t ilx;
2325 	int nid;
2326 
2327 	nid = numa_node_id();
2328 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2329 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2330 	return nid;
2331 }
2332 
2333 /*
2334  * init_nodemask_of_mempolicy
2335  *
2336  * If the current task's mempolicy is "default" [NULL], return 'false'
2337  * to indicate default policy.  Otherwise, extract the policy nodemask
2338  * for 'bind' or 'interleave' policy into the argument nodemask, or
2339  * initialize the argument nodemask to contain the single node for
2340  * 'preferred' or 'local' policy and return 'true' to indicate presence
2341  * of non-default mempolicy.
2342  *
2343  * We don't bother with reference counting the mempolicy [mpol_get/put]
2344  * because the current task is examining it's own mempolicy and a task's
2345  * mempolicy is only ever changed by the task itself.
2346  *
2347  * N.B., it is the caller's responsibility to free a returned nodemask.
2348  */
2349 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2350 {
2351 	struct mempolicy *mempolicy;
2352 
2353 	if (!(mask && current->mempolicy))
2354 		return false;
2355 
2356 	task_lock(current);
2357 	mempolicy = current->mempolicy;
2358 	switch (mempolicy->mode) {
2359 	case MPOL_PREFERRED:
2360 	case MPOL_PREFERRED_MANY:
2361 	case MPOL_BIND:
2362 	case MPOL_INTERLEAVE:
2363 	case MPOL_WEIGHTED_INTERLEAVE:
2364 		*mask = mempolicy->nodes;
2365 		break;
2366 
2367 	case MPOL_LOCAL:
2368 		init_nodemask_of_node(mask, numa_node_id());
2369 		break;
2370 
2371 	default:
2372 		BUG();
2373 	}
2374 	task_unlock(current);
2375 
2376 	return true;
2377 }
2378 #endif
2379 
2380 /*
2381  * mempolicy_in_oom_domain
2382  *
2383  * If tsk's mempolicy is "bind", check for intersection between mask and
2384  * the policy nodemask. Otherwise, return true for all other policies
2385  * including "interleave", as a tsk with "interleave" policy may have
2386  * memory allocated from all nodes in system.
2387  *
2388  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2389  */
2390 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2391 					const nodemask_t *mask)
2392 {
2393 	struct mempolicy *mempolicy;
2394 	bool ret = true;
2395 
2396 	if (!mask)
2397 		return ret;
2398 
2399 	task_lock(tsk);
2400 	mempolicy = tsk->mempolicy;
2401 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2402 		ret = nodes_intersects(mempolicy->nodes, *mask);
2403 	task_unlock(tsk);
2404 
2405 	return ret;
2406 }
2407 
2408 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2409 						int nid, nodemask_t *nodemask)
2410 {
2411 	struct page *page;
2412 	gfp_t preferred_gfp;
2413 
2414 	/*
2415 	 * This is a two pass approach. The first pass will only try the
2416 	 * preferred nodes but skip the direct reclaim and allow the
2417 	 * allocation to fail, while the second pass will try all the
2418 	 * nodes in system.
2419 	 */
2420 	preferred_gfp = gfp | __GFP_NOWARN;
2421 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2422 	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2423 	if (!page)
2424 		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2425 
2426 	return page;
2427 }
2428 
2429 /**
2430  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2431  * @gfp: GFP flags.
2432  * @order: Order of the page allocation.
2433  * @pol: Pointer to the NUMA mempolicy.
2434  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2435  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2436  *
2437  * Return: The page on success or NULL if allocation fails.
2438  */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	nodemask_t *nodemask;
	struct page *page;

	/* Resolve policy to a preferred nid plus an optional filter mask */
	nodemask = policy_nodemask(gfp, pol, ilx, &nid);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_preferred_many(gfp, order, nid, nodemask);

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode != MPOL_INTERLEAVE &&
		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
		    (!nodemask || node_isset(nid, *nodemask))) {
			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
			page = __alloc_frozen_pages_noprof(
				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
				nid, NULL);
			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
				return page;
			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
		}
	}

	/* Standard allocation path, honouring the policy's node filter */
	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
		if (static_branch_likely(&vm_numa_stat_key) &&
		    page_to_nid(page) == nid) {
			/* count only pages that landed on the requested node */
			preempt_disable();
			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
			preempt_enable();
		}
	}

	return page;
}
2499 
2500 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2501 		struct mempolicy *pol, pgoff_t ilx, int nid)
2502 {
2503 	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2504 			ilx, nid);
2505 	if (!page)
2506 		return NULL;
2507 
2508 	set_page_refcounted(page);
2509 	return page_rmappable_folio(page);
2510 }
2511 
2512 /**
2513  * vma_alloc_folio - Allocate a folio for a VMA.
2514  * @gfp: GFP flags.
2515  * @order: Order of the folio.
2516  * @vma: Pointer to VMA.
2517  * @addr: Virtual address of the allocation.  Must be inside @vma.
2518  *
2519  * Allocate a folio for a specific address in @vma, using the appropriate
2520  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2521  * VMA to prevent it from going away.  Should be used for all allocations
2522  * for folios that will be mapped into user space, excepting hugetlbfs, and
2523  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2524  *
2525  * Return: The folio on success or NULL if allocation fails.
2526  */
2527 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2528 		unsigned long addr)
2529 {
2530 	struct mempolicy *pol;
2531 	pgoff_t ilx;
2532 	struct folio *folio;
2533 
2534 	if (vma->vm_flags & VM_DROPPABLE)
2535 		gfp |= __GFP_NOWARN;
2536 
2537 	pol = get_vma_policy(vma, addr, order, &ilx);
2538 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2539 	mpol_cond_put(pol);
2540 	return folio;
2541 }
2542 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2543 
2544 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2545 {
2546 	struct mempolicy *pol = &default_policy;
2547 
2548 	/*
2549 	 * No reference counting needed for current->mempolicy
2550 	 * nor system default_policy
2551 	 */
2552 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2553 		pol = get_task_policy(current);
2554 
2555 	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2556 				       numa_node_id());
2557 }
2558 
2559 /**
2560  * alloc_pages - Allocate pages.
2561  * @gfp: GFP flags.
2562  * @order: Power of two of number of pages to allocate.
2563  *
2564  * Allocate 1 << @order contiguous pages.  The physical address of the
2565  * first page is naturally aligned (eg an order-3 allocation will be aligned
2566  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2567  * process is honoured when in process context.
2568  *
2569  * Context: Can be called from any context, providing the appropriate GFP
2570  * flags are used.
2571  * Return: The page on success or NULL if allocation fails.
2572  */
2573 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2574 {
2575 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2576 
2577 	if (page)
2578 		set_page_refcounted(page);
2579 	return page;
2580 }
2581 EXPORT_SYMBOL(alloc_pages_noprof);
2582 
2583 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2584 {
2585 	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2586 }
2587 EXPORT_SYMBOL(folio_alloc_noprof);
2588 
2589 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2590 		struct mempolicy *pol, unsigned long nr_pages,
2591 		struct page **page_array)
2592 {
2593 	int nodes;
2594 	unsigned long nr_pages_per_node;
2595 	int delta;
2596 	int i;
2597 	unsigned long nr_allocated;
2598 	unsigned long total_allocated = 0;
2599 
2600 	nodes = nodes_weight(pol->nodes);
2601 	nr_pages_per_node = nr_pages / nodes;
2602 	delta = nr_pages - nodes * nr_pages_per_node;
2603 
2604 	for (i = 0; i < nodes; i++) {
2605 		if (delta) {
2606 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2607 					interleave_nodes(pol), NULL,
2608 					nr_pages_per_node + 1,
2609 					page_array);
2610 			delta--;
2611 		} else {
2612 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2613 					interleave_nodes(pol), NULL,
2614 					nr_pages_per_node, page_array);
2615 		}
2616 
2617 		page_array += nr_allocated;
2618 		total_allocated += nr_allocated;
2619 	}
2620 
2621 	return total_allocated;
2622 }
2623 
/*
 * Bulk-allocate @nr_pages across the nodes of a weighted-interleave
 * policy, resuming from (and afterwards updating) the per-task
 * current->il_prev/il_weight rotation so that bulk and per-page
 * allocations share the same interleave position.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* uninitialized wi_state: every node weighs 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2741 
2742 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2743 		struct mempolicy *pol, unsigned long nr_pages,
2744 		struct page **page_array)
2745 {
2746 	gfp_t preferred_gfp;
2747 	unsigned long nr_allocated = 0;
2748 
2749 	preferred_gfp = gfp | __GFP_NOWARN;
2750 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2751 
2752 	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2753 					   nr_pages, page_array);
2754 
2755 	if (nr_allocated < nr_pages)
2756 		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2757 				nr_pages - nr_allocated,
2758 				page_array + nr_allocated);
2759 	return nr_allocated;
2760 }
2761 
2762 /* alloc pages bulk and mempolicy should be considered at the
2763  * same time in some situation such as vmalloc.
2764  *
2765  * It can accelerate memory allocation especially interleaving
2766  * allocate memory.
2767  */
2768 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2769 		unsigned long nr_pages, struct page **page_array)
2770 {
2771 	struct mempolicy *pol = &default_policy;
2772 	nodemask_t *nodemask;
2773 	int nid;
2774 
2775 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2776 		pol = get_task_policy(current);
2777 
2778 	if (pol->mode == MPOL_INTERLEAVE)
2779 		return alloc_pages_bulk_interleave(gfp, pol,
2780 							 nr_pages, page_array);
2781 
2782 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2783 		return alloc_pages_bulk_weighted_interleave(
2784 				  gfp, pol, nr_pages, page_array);
2785 
2786 	if (pol->mode == MPOL_PREFERRED_MANY)
2787 		return alloc_pages_bulk_preferred_many(gfp,
2788 				numa_node_id(), pol, nr_pages, page_array);
2789 
2790 	nid = numa_node_id();
2791 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2792 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2793 				       nr_pages, page_array);
2794 }
2795 
2796 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2797 {
2798 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2799 
2800 	if (IS_ERR(pol))
2801 		return PTR_ERR(pol);
2802 	dst->vm_policy = pol;
2803 	return 0;
2804 }
2805 
2806 /*
2807  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2808  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2809  * with the mems_allowed returned by cpuset_mems_allowed().  This
2810  * keeps mempolicies cpuset relative after its cpuset moves.  See
2811  * further kernel/cpuset.c update_nodemask().
2812  *
2813  * current's mempolicy may be rebinded by the other task(the task that changes
2814  * cpuset's mems), so we needn't do rebind work for current task.
2815  */
2816 
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	/* Keep the copy cpuset-relative if our cpuset is being rebound */
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	atomic_set(&new->refcnt, 1);	/* fresh copy starts with one ref */
	return new;
}
2840 
2841 /* Slow path of a mempolicy comparison */
2842 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2843 {
2844 	if (!a || !b)
2845 		return false;
2846 	if (a->mode != b->mode)
2847 		return false;
2848 	if (a->flags != b->flags)
2849 		return false;
2850 	if (a->home_node != b->home_node)
2851 		return false;
2852 	if (mpol_store_user_nodemask(a))
2853 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2854 			return false;
2855 
2856 	switch (a->mode) {
2857 	case MPOL_BIND:
2858 	case MPOL_INTERLEAVE:
2859 	case MPOL_PREFERRED:
2860 	case MPOL_PREFERRED_MANY:
2861 	case MPOL_WEIGHTED_INTERLEAVE:
2862 		return !!nodes_equal(a->nodes, b->nodes);
2863 	case MPOL_LOCAL:
2864 		return true;
2865 	default:
2866 		BUG();
2867 		return false;
2868 	}
2869 }
2870 
2871 /*
2872  * Shared memory backing store policy support.
2873  *
2874  * Remember policies even when nobody has shared memory mapped.
2875  * The policies are kept in Red-Black tree linked from the inode.
2876  * They are protected by the sp->lock rwlock, which should be held
2877  * for any accesses to the tree.
2878  */
2879 
2880 /*
2881  * lookup first element intersecting start-end.  Caller holds sp->lock for
2882  * reading or for writing
2883  */
2884 static struct sp_node *sp_lookup(struct shared_policy *sp,
2885 					pgoff_t start, pgoff_t end)
2886 {
2887 	struct rb_node *n = sp->root.rb_node;
2888 
2889 	while (n) {
2890 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2891 
2892 		if (start >= p->end)
2893 			n = n->rb_right;
2894 		else if (end <= p->start)
2895 			n = n->rb_left;
2896 		else
2897 			break;
2898 	}
2899 	if (!n)
2900 		return NULL;
2901 	for (;;) {
2902 		struct sp_node *w = NULL;
2903 		struct rb_node *prev = rb_prev(n);
2904 		if (!prev)
2905 			break;
2906 		w = rb_entry(prev, struct sp_node, nd);
2907 		if (w->end <= start)
2908 			break;
2909 		n = prev;
2910 	}
2911 	return rb_entry(n, struct sp_node, nd);
2912 }
2913 
2914 /*
2915  * Insert a new shared policy into the list.  Caller holds sp->lock for
2916  * writing.
2917  */
2918 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2919 {
2920 	struct rb_node **p = &sp->root.rb_node;
2921 	struct rb_node *parent = NULL;
2922 	struct sp_node *nd;
2923 
2924 	while (*p) {
2925 		parent = *p;
2926 		nd = rb_entry(parent, struct sp_node, nd);
2927 		if (new->start < nd->start)
2928 			p = &(*p)->rb_left;
2929 		else if (new->end > nd->end)
2930 			p = &(*p)->rb_right;
2931 		else
2932 			BUG();
2933 	}
2934 	rb_link_node(&new->nd, parent, p);
2935 	rb_insert_color(&new->nd, &sp->root);
2936 }
2937 
2938 /* Find shared policy intersecting idx */
2939 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2940 						pgoff_t idx)
2941 {
2942 	struct mempolicy *pol = NULL;
2943 	struct sp_node *sn;
2944 
2945 	if (!sp->root.rb_node)
2946 		return NULL;
2947 	read_lock(&sp->lock);
2948 	sn = sp_lookup(sp, idx, idx+1);
2949 	if (sn) {
2950 		mpol_get(sn->policy);
2951 		pol = sn->policy;
2952 	}
2953 	read_unlock(&sp->lock);
2954 	return pol;
2955 }
2956 
/* Free a shared-policy tree node, dropping its policy reference. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
2962 
2963 /**
2964  * mpol_misplaced - check whether current folio node is valid in policy
2965  *
2966  * @folio: folio to be checked
2967  * @vmf: structure describing the fault
2968  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2969  *
2970  * Lookup current policy node id for vma,addr and "compare to" folio's
2971  * node id.  Policy determination "mimics" alloc_page_vma().
2972  * Called from fault path where we know the vma and faulting address.
2973  *
2974  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2975  * policy, or a suitable node ID to allocate a replacement folio from.
2976  */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	int curnid = folio_nid(folio);
	struct vm_area_struct *vma = vmf->vma;
	int thiscpu = raw_smp_processor_id();
	int thisnid = numa_node_id();
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	/*
	 * Make sure ptl is held so that we don't preempt and we
	 * have a stable smp processor id
	 */
	lockdep_assert_held(vmf->ptl);
	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;	/* policy opted out of migrate-on-fault */

	/* Determine the node this policy would place the folio on */
	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_WEIGHTED_INTERLEAVE:
		polnid = weighted_interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
		/*
		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
		 * policy nodemask we don't allow numa migration to nodes
		 * outside policy nodemask for now. This is done so that if we
		 * want demotion to slow memory to happen, before allocating
		 * from some DRAM node say 'x', we will end up using a
		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
		 * we should not promote to node 'x' from slow memory node.
		 */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Optimize placement among multiple nodes
			 * via NUMA balancing
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}

		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(thisnid, GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zonelist_node_idx(z);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	/* Report misplacement only when the target differs from curnid */
	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
3073 
3074 /*
3075  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
3076  * dropped after task->mempolicy is set to NULL so that any allocation done as
3077  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3078  * policy.
3079  */
3080 void mpol_put_task_policy(struct task_struct *task)
3081 {
3082 	struct mempolicy *pol;
3083 
3084 	task_lock(task);
3085 	pol = task->mempolicy;
3086 	task->mempolicy = NULL;
3087 	task_unlock(task);
3088 	mpol_put(pol);
3089 }
3090 
3091 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
3092 {
3093 	rb_erase(&n->nd, &sp->root);
3094 	sp_free(n);
3095 }
3096 
3097 static void sp_node_init(struct sp_node *node, unsigned long start,
3098 			unsigned long end, struct mempolicy *pol)
3099 {
3100 	node->start = start;
3101 	node->end = end;
3102 	node->policy = pol;
3103 }
3104 
3105 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3106 				struct mempolicy *pol)
3107 {
3108 	struct sp_node *n;
3109 	struct mempolicy *newpol;
3110 
3111 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3112 	if (!n)
3113 		return NULL;
3114 
3115 	newpol = mpol_dup(pol);
3116 	if (IS_ERR(newpol)) {
3117 		kmem_cache_free(sn_cache, n);
3118 		return NULL;
3119 	}
3120 	newpol->flags |= MPOL_F_SHARED;
3121 	sp_node_init(n, start, end, newpol);
3122 
3123 	return n;
3124 }
3125 
/*
 * Replace the policy range [start, end) in @sp with @new (@new == NULL
 * just removes any existing policies in the range).  Returns 0 on
 * success or -ENOMEM.  If an old node spans the entire new range it
 * must be split in two; the extra node and policy are preallocated
 * with sp->lock dropped and the scan restarted.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);	/* fully covered: drop it */
			else
				n->start = end;		/* overlaps tail: trim its front */
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Must split the old node in two; this
				 * needs the preallocated node + policy,
				 * which can only be allocated with the
				 * lock dropped (GFP_KERNEL).
				 */
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;		/* overlaps head: trim its tail */
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Free any unused preallocations (also reached on alloc failure). */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* Drop the lock to allocate, then retry the whole scan. */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
3192 
3193 /**
3194  * mpol_shared_policy_init - initialize shared policy for inode
3195  * @sp: pointer to inode shared policy
3196  * @mpol:  struct mempolicy to install
3197  *
3198  * Install non-NULL @mpol in inode's shared policy rb-tree.
3199  * On entry, the current task has a reference on a non-NULL @mpol.
3200  * This must be released on exit.
3201  * This is called at get_inode() calls and we can use GFP_KERNEL.
3202  */
3203 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3204 {
3205 	int ret;
3206 
3207 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3208 	rwlock_init(&sp->lock);
3209 
3210 	if (mpol) {
3211 		struct sp_node *sn;
3212 		struct mempolicy *npol;
3213 		NODEMASK_SCRATCH(scratch);
3214 
3215 		if (!scratch)
3216 			goto put_mpol;
3217 
3218 		/* contextualize the tmpfs mount point mempolicy to this file */
3219 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3220 		if (IS_ERR(npol))
3221 			goto free_scratch; /* no valid nodemask intersection */
3222 
3223 		task_lock(current);
3224 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3225 		task_unlock(current);
3226 		if (ret)
3227 			goto put_npol;
3228 
3229 		/* alloc node covering entire file; adds ref to file's npol */
3230 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3231 		if (sn)
3232 			sp_insert(sp, sn);
3233 put_npol:
3234 		mpol_put(npol);	/* drop initial ref on file's npol */
3235 free_scratch:
3236 		NODEMASK_SCRATCH_FREE(scratch);
3237 put_mpol:
3238 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3239 	}
3240 }
3241 
3242 int mpol_set_shared_policy(struct shared_policy *sp,
3243 			struct vm_area_struct *vma, struct mempolicy *pol)
3244 {
3245 	int err;
3246 	struct sp_node *new = NULL;
3247 	unsigned long sz = vma_pages(vma);
3248 
3249 	if (pol) {
3250 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3251 		if (!new)
3252 			return -ENOMEM;
3253 	}
3254 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3255 	if (err && new)
3256 		sp_free(new);
3257 	return err;
3258 }
3259 
3260 /* Free a backing policy store on inode delete. */
3261 void mpol_free_shared_policy(struct shared_policy *sp)
3262 {
3263 	struct sp_node *n;
3264 	struct rb_node *next;
3265 
3266 	if (!sp->root.rb_node)
3267 		return;
3268 	write_lock(&sp->lock);
3269 	next = rb_first(&sp->root);
3270 	while (next) {
3271 		n = rb_entry(next, struct sp_node, nd);
3272 		next = rb_next(&n->nd);
3273 		sp_delete(sp, n);
3274 	}
3275 	write_unlock(&sp->lock);
3276 }
3277 
3278 #ifdef CONFIG_NUMA_BALANCING
3279 static int __initdata numabalancing_override;
3280 
3281 static void __init check_numabalancing_enable(void)
3282 {
3283 	bool numabalancing_default = false;
3284 
3285 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3286 		numabalancing_default = true;
3287 
3288 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3289 	if (numabalancing_override)
3290 		set_numabalancing_state(numabalancing_override == 1);
3291 
3292 	if (num_online_nodes() > 1 && !numabalancing_override) {
3293 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3294 			numabalancing_default ? "Enabling" : "Disabling");
3295 		set_numabalancing_state(numabalancing_default);
3296 	}
3297 }
3298 
3299 static int __init setup_numabalancing(char *str)
3300 {
3301 	int ret = 0;
3302 	if (!str)
3303 		goto out;
3304 
3305 	if (!strcmp(str, "enable")) {
3306 		numabalancing_override = 1;
3307 		ret = 1;
3308 	} else if (!strcmp(str, "disable")) {
3309 		numabalancing_override = -1;
3310 		ret = 1;
3311 	}
3312 out:
3313 	if (!ret)
3314 		pr_warn("Unable to parse numa_balancing=\n");
3315 
3316 	return ret;
3317 }
3318 __setup("numa_balancing=", setup_numabalancing);
3319 #else
/* No-op stub when CONFIG_NUMA_BALANCING is disabled. */
static inline void __init check_numabalancing_enable(void)
{
}
3323 #endif /* CONFIG_NUMA_BALANCING */
3324 
3325 void __init numa_policy_init(void)
3326 {
3327 	nodemask_t interleave_nodes;
3328 	unsigned long largest = 0;
3329 	int nid, prefer = 0;
3330 
3331 	policy_cache = kmem_cache_create("numa_policy",
3332 					 sizeof(struct mempolicy),
3333 					 0, SLAB_PANIC, NULL);
3334 
3335 	sn_cache = kmem_cache_create("shared_policy_node",
3336 				     sizeof(struct sp_node),
3337 				     0, SLAB_PANIC, NULL);
3338 
3339 	for_each_node(nid) {
3340 		preferred_node_policy[nid] = (struct mempolicy) {
3341 			.refcnt = ATOMIC_INIT(1),
3342 			.mode = MPOL_PREFERRED,
3343 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3344 			.nodes = nodemask_of_node(nid),
3345 		};
3346 	}
3347 
3348 	/*
3349 	 * Set interleaving policy for system init. Interleaving is only
3350 	 * enabled across suitably sized nodes (default is >= 16MB), or
3351 	 * fall back to the largest node if they're all smaller.
3352 	 */
3353 	nodes_clear(interleave_nodes);
3354 	for_each_node_state(nid, N_MEMORY) {
3355 		unsigned long total_pages = node_present_pages(nid);
3356 
3357 		/* Preserve the largest node */
3358 		if (largest < total_pages) {
3359 			largest = total_pages;
3360 			prefer = nid;
3361 		}
3362 
3363 		/* Interleave this node? */
3364 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3365 			node_set(nid, interleave_nodes);
3366 	}
3367 
3368 	/* All too small, use the largest */
3369 	if (unlikely(nodes_empty(interleave_nodes)))
3370 		node_set(prefer, interleave_nodes);
3371 
3372 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3373 		pr_err("%s: interleaving failed\n", __func__);
3374 
3375 	check_numabalancing_enable();
3376 }
3377 
/* Reset the calling task's mempolicy back to the system default. */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
3383 
3384 /*
3385  * Parse and format mempolicy from/to strings
3386  */
/* Mode names, indexed by MPOL_* value; used by mpol_parse_str()/mpol_to_str(). */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
3397 
3398 #ifdef CONFIG_TMPFS
3399 /**
3400  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3401  * @str:  string containing mempolicy to parse
3402  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3403  *
3404  * Format of input:
3405  *	<mode>[=<flags>][:<nodelist>]
3406  *
3407  * Return: %0 on success, else %1
3408  */
3409 int mpol_parse_str(char *str, struct mempolicy **mpol)
3410 {
3411 	struct mempolicy *new = NULL;
3412 	unsigned short mode_flags;
3413 	nodemask_t nodes;
3414 	char *nodelist = strchr(str, ':');
3415 	char *flags = strchr(str, '=');
3416 	int err = 1, mode;
3417 
3418 	if (flags)
3419 		*flags++ = '\0';	/* terminate mode string */
3420 
3421 	if (nodelist) {
3422 		/* NUL-terminate mode or flags string */
3423 		*nodelist++ = '\0';
3424 		if (nodelist_parse(nodelist, nodes))
3425 			goto out;
3426 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3427 			goto out;
3428 	} else
3429 		nodes_clear(nodes);
3430 
3431 	mode = match_string(policy_modes, MPOL_MAX, str);
3432 	if (mode < 0)
3433 		goto out;
3434 
3435 	switch (mode) {
3436 	case MPOL_PREFERRED:
3437 		/*
3438 		 * Insist on a nodelist of one node only, although later
3439 		 * we use first_node(nodes) to grab a single node, so here
3440 		 * nodelist (or nodes) cannot be empty.
3441 		 */
3442 		if (nodelist) {
3443 			char *rest = nodelist;
3444 			while (isdigit(*rest))
3445 				rest++;
3446 			if (*rest)
3447 				goto out;
3448 			if (nodes_empty(nodes))
3449 				goto out;
3450 		}
3451 		break;
3452 	case MPOL_INTERLEAVE:
3453 	case MPOL_WEIGHTED_INTERLEAVE:
3454 		/*
3455 		 * Default to online nodes with memory if no nodelist
3456 		 */
3457 		if (!nodelist)
3458 			nodes = node_states[N_MEMORY];
3459 		break;
3460 	case MPOL_LOCAL:
3461 		/*
3462 		 * Don't allow a nodelist;  mpol_new() checks flags
3463 		 */
3464 		if (nodelist)
3465 			goto out;
3466 		break;
3467 	case MPOL_DEFAULT:
3468 		/*
3469 		 * Insist on a empty nodelist
3470 		 */
3471 		if (!nodelist)
3472 			err = 0;
3473 		goto out;
3474 	case MPOL_PREFERRED_MANY:
3475 	case MPOL_BIND:
3476 		/*
3477 		 * Insist on a nodelist
3478 		 */
3479 		if (!nodelist)
3480 			goto out;
3481 	}
3482 
3483 	mode_flags = 0;
3484 	if (flags) {
3485 		/*
3486 		 * Currently, we only support two mutually exclusive
3487 		 * mode flags.
3488 		 */
3489 		if (!strcmp(flags, "static"))
3490 			mode_flags |= MPOL_F_STATIC_NODES;
3491 		else if (!strcmp(flags, "relative"))
3492 			mode_flags |= MPOL_F_RELATIVE_NODES;
3493 		else
3494 			goto out;
3495 	}
3496 
3497 	new = mpol_new(mode, mode_flags, &nodes);
3498 	if (IS_ERR(new))
3499 		goto out;
3500 
3501 	/*
3502 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3503 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3504 	 */
3505 	if (mode != MPOL_PREFERRED) {
3506 		new->nodes = nodes;
3507 	} else if (nodelist) {
3508 		nodes_clear(new->nodes);
3509 		node_set(first_node(nodes), new->nodes);
3510 	} else {
3511 		new->mode = MPOL_LOCAL;
3512 	}
3513 
3514 	/*
3515 	 * Save nodes for contextualization: this will be used to "clone"
3516 	 * the mempolicy in a specific context [cpuset] at a later time.
3517 	 */
3518 	new->w.user_nodemask = nodes;
3519 
3520 	err = 0;
3521 
3522 out:
3523 	/* Restore string for error message */
3524 	if (nodelist)
3525 		*--nodelist = ':';
3526 	if (flags)
3527 		*--flags = '=';
3528 	if (!err)
3529 		*mpol = new;
3530 	return err;
3531 }
3532 #endif /* CONFIG_TMPFS */
3533 
3534 /**
3535  * mpol_to_str - format a mempolicy structure for printing
3536  * @buffer:  to contain formatted mempolicy string
3537  * @maxlen:  length of @buffer
3538  * @pol:  pointer to mempolicy to be formatted
3539  *
3540  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3541  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3542  * interleave", plus the longest flag flags, "relative|balancing", and to
3543  * display at least a few node ids.
3544  */
3545 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3546 {
3547 	char *p = buffer;
3548 	nodemask_t nodes = NODE_MASK_NONE;
3549 	unsigned short mode = MPOL_DEFAULT;
3550 	unsigned short flags = 0;
3551 
3552 	if (pol &&
3553 	    pol != &default_policy &&
3554 	    !(pol >= &preferred_node_policy[0] &&
3555 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3556 		mode = pol->mode;
3557 		flags = pol->flags;
3558 	}
3559 
3560 	switch (mode) {
3561 	case MPOL_DEFAULT:
3562 	case MPOL_LOCAL:
3563 		break;
3564 	case MPOL_PREFERRED:
3565 	case MPOL_PREFERRED_MANY:
3566 	case MPOL_BIND:
3567 	case MPOL_INTERLEAVE:
3568 	case MPOL_WEIGHTED_INTERLEAVE:
3569 		nodes = pol->nodes;
3570 		break;
3571 	default:
3572 		WARN_ON_ONCE(1);
3573 		snprintf(p, maxlen, "unknown");
3574 		return;
3575 	}
3576 
3577 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3578 
3579 	if (flags & MPOL_MODE_FLAGS) {
3580 		p += snprintf(p, buffer + maxlen - p, "=");
3581 
3582 		/*
3583 		 * Static and relative are mutually exclusive.
3584 		 */
3585 		if (flags & MPOL_F_STATIC_NODES)
3586 			p += snprintf(p, buffer + maxlen - p, "static");
3587 		else if (flags & MPOL_F_RELATIVE_NODES)
3588 			p += snprintf(p, buffer + maxlen - p, "relative");
3589 
3590 		if (flags & MPOL_F_NUMA_BALANCING) {
3591 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3592 				p += snprintf(p, buffer + maxlen - p, "|");
3593 			p += snprintf(p, buffer + maxlen - p, "balancing");
3594 		}
3595 	}
3596 
3597 	if (!nodes_empty(nodes))
3598 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3599 			       nodemask_pr_args(&nodes));
3600 }
3601 
3602 #ifdef CONFIG_SYSFS
/* Per-node sysfs attribute for a weighted-interleave "nodeN" weight file. */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;	/* node id this attribute controls */
};

/* Backing object for the "weighted_interleave" sysfs directory. */
struct sysfs_wi_group {
	struct kobject wi_kobj;
	struct mutex kobj_lock;		/* serializes updates to nattrs[] */
	struct iw_node_attr *nattrs[];	/* indexed by node id; NULL if absent */
};

/* Singleton; allocated in add_weighted_interleave_group(). */
static struct sysfs_wi_group *wi_group;
3615 
3616 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3617 			 char *buf)
3618 {
3619 	struct iw_node_attr *node_attr;
3620 	u8 weight;
3621 
3622 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3623 	weight = get_il_weight(node_attr->nid);
3624 	return sysfs_emit(buf, "%d\n", weight);
3625 }
3626 
/*
 * sysfs write of a per-node weight file: publish a new weight table via
 * RCU with this node's weight updated.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	struct iw_node_attr *node_attr;
	u8 weight = 0;
	int i;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	/* Only weights in [1, 255] are accepted. */
	if (count == 0 || sysfs_streq(buf, "") ||
	    kstrtou8(buf, 0, &weight) || weight == 0)
		return -EINVAL;

	/* Build a full replacement table; readers see old or new, never a mix. */
	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state)
		return -ENOMEM;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state) {
		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
					nr_node_ids * sizeof(u8));
	} else {
		/* No table yet: start from the default weight of 1. */
		for (i = 0; i < nr_node_ids; i++)
			new_wi_state->iw_table[i] = 1;
	}
	new_wi_state->iw_table[node_attr->nid] = weight;
	/* Any manual write switches off automatic weight derivation. */
	new_wi_state->mode_auto = false;

	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	/* Wait for RCU readers of the old table before freeing it. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3666 
3667 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3668 		struct kobj_attribute *attr, char *buf)
3669 {
3670 	struct weighted_interleave_state *state;
3671 	bool wi_auto = true;
3672 
3673 	rcu_read_lock();
3674 	state = rcu_dereference(wi_state);
3675 	if (state)
3676 		wi_auto = state->mode_auto;
3677 	rcu_read_unlock();
3678 
3679 	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3680 }
3681 
3682 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3683 		struct kobj_attribute *attr, const char *buf, size_t count)
3684 {
3685 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3686 	unsigned int *bw;
3687 	bool input;
3688 	int i;
3689 
3690 	if (kstrtobool(buf, &input))
3691 		return -EINVAL;
3692 
3693 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3694 			       GFP_KERNEL);
3695 	if (!new_wi_state)
3696 		return -ENOMEM;
3697 	for (i = 0; i < nr_node_ids; i++)
3698 		new_wi_state->iw_table[i] = 1;
3699 
3700 	mutex_lock(&wi_state_lock);
3701 	if (!input) {
3702 		old_wi_state = rcu_dereference_protected(wi_state,
3703 					lockdep_is_held(&wi_state_lock));
3704 		if (!old_wi_state)
3705 			goto update_wi_state;
3706 		if (input == old_wi_state->mode_auto) {
3707 			mutex_unlock(&wi_state_lock);
3708 			return count;
3709 		}
3710 
3711 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3712 					       nr_node_ids * sizeof(u8));
3713 		goto update_wi_state;
3714 	}
3715 
3716 	bw = node_bw_table;
3717 	if (!bw) {
3718 		mutex_unlock(&wi_state_lock);
3719 		kfree(new_wi_state);
3720 		return -ENODEV;
3721 	}
3722 
3723 	new_wi_state->mode_auto = true;
3724 	reduce_interleave_weights(bw, new_wi_state->iw_table);
3725 
3726 update_wi_state:
3727 	rcu_assign_pointer(wi_state, new_wi_state);
3728 	mutex_unlock(&wi_state_lock);
3729 	if (old_wi_state) {
3730 		synchronize_rcu();
3731 		kfree(old_wi_state);
3732 	}
3733 	return count;
3734 }
3735 
3736 static void sysfs_wi_node_delete(int nid)
3737 {
3738 	struct iw_node_attr *attr;
3739 
3740 	if (nid < 0 || nid >= nr_node_ids)
3741 		return;
3742 
3743 	mutex_lock(&wi_group->kobj_lock);
3744 	attr = wi_group->nattrs[nid];
3745 	if (!attr) {
3746 		mutex_unlock(&wi_group->kobj_lock);
3747 		return;
3748 	}
3749 
3750 	wi_group->nattrs[nid] = NULL;
3751 	mutex_unlock(&wi_group->kobj_lock);
3752 
3753 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3754 	kfree(attr->kobj_attr.attr.name);
3755 	kfree(attr);
3756 }
3757 
3758 static void sysfs_wi_node_delete_all(void)
3759 {
3760 	int nid;
3761 
3762 	for (nid = 0; nid < nr_node_ids; nid++)
3763 		sysfs_wi_node_delete(nid);
3764 }
3765 
3766 static void wi_state_free(void)
3767 {
3768 	struct weighted_interleave_state *old_wi_state;
3769 
3770 	mutex_lock(&wi_state_lock);
3771 	old_wi_state = rcu_dereference_protected(wi_state,
3772 			lockdep_is_held(&wi_state_lock));
3773 	rcu_assign_pointer(wi_state, NULL);
3774 	mutex_unlock(&wi_state_lock);
3775 
3776 	if (old_wi_state) {
3777 		synchronize_rcu();
3778 		kfree(old_wi_state);
3779 	}
3780 }
3781 
/* /sys/kernel/mm/mempolicy/weighted_interleave/auto */
static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
			   weighted_interleave_auto_store);
3785 
3786 static void wi_cleanup(void) {
3787 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3788 	sysfs_wi_node_delete_all();
3789 	wi_state_free();
3790 }
3791 
/* kobject release: runs once the last reference to wi_kobj is dropped. */
static void wi_kobj_release(struct kobject *wi_kobj)
{
	kfree(wi_group);
}
3796 
/* kobject type for the "weighted_interleave" sysfs directory. */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};
3801 
3802 static int sysfs_wi_node_add(int nid)
3803 {
3804 	int ret;
3805 	char *name;
3806 	struct iw_node_attr *new_attr;
3807 
3808 	if (nid < 0 || nid >= nr_node_ids) {
3809 		pr_err("invalid node id: %d\n", nid);
3810 		return -EINVAL;
3811 	}
3812 
3813 	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3814 	if (!new_attr)
3815 		return -ENOMEM;
3816 
3817 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3818 	if (!name) {
3819 		kfree(new_attr);
3820 		return -ENOMEM;
3821 	}
3822 
3823 	sysfs_attr_init(&new_attr->kobj_attr.attr);
3824 	new_attr->kobj_attr.attr.name = name;
3825 	new_attr->kobj_attr.attr.mode = 0644;
3826 	new_attr->kobj_attr.show = node_show;
3827 	new_attr->kobj_attr.store = node_store;
3828 	new_attr->nid = nid;
3829 
3830 	mutex_lock(&wi_group->kobj_lock);
3831 	if (wi_group->nattrs[nid]) {
3832 		mutex_unlock(&wi_group->kobj_lock);
3833 		ret = -EEXIST;
3834 		goto out;
3835 	}
3836 
3837 	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3838 	if (ret) {
3839 		mutex_unlock(&wi_group->kobj_lock);
3840 		goto out;
3841 	}
3842 	wi_group->nattrs[nid] = new_attr;
3843 	mutex_unlock(&wi_group->kobj_lock);
3844 	return 0;
3845 
3846 out:
3847 	kfree(new_attr->kobj_attr.attr.name);
3848 	kfree(new_attr);
3849 	return ret;
3850 }
3851 
/* Memory hotplug callback: keep the per-node weight files in sync. */
static int wi_node_notifier(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	int err;
	struct node_notify *nn = data;
	int nid = nn->nid;

	switch (action) {
	case NODE_ADDED_FIRST_MEMORY:
		err = sysfs_wi_node_add(nid);
		if (err)
			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
			       nid, err);
		break;
	case NODE_REMOVED_LAST_MEMORY:
		sysfs_wi_node_delete(nid);
		break;
	}

	/* Never veto the hotplug operation itself. */
	return NOTIFY_OK;
}
3873 
/*
 * Create the "weighted_interleave" sysfs directory (with "auto" and one
 * weight file per memory node) under @mempolicy_kobj, and register for
 * memory hotplug notifications.  Returns 0 or a negative errno.
 */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
			   GFP_KERNEL);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	/* One weight file per node that currently has memory. */
	for_each_online_node(nid) {
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	/* Track node hot-add/remove to keep the files in sync. */
	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

err_cleanup_kobj:
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	/* The final put invokes wi_kobj_release(), which frees wi_group. */
	kobject_put(&wi_group->wi_kobj);
	return err;
}
3915 
3916 static int __init mempolicy_sysfs_init(void)
3917 {
3918 	int err;
3919 	static struct kobject *mempolicy_kobj;
3920 
3921 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3922 	if (!mempolicy_kobj)
3923 		return -ENOMEM;
3924 
3925 	err = add_weighted_interleave_group(mempolicy_kobj);
3926 	if (err)
3927 		goto err_kobj;
3928 
3929 	return 0;
3930 
3931 err_kobj:
3932 	kobject_del(mempolicy_kobj);
3933 	kobject_put(mempolicy_kobj);
3934 	return err;
3935 }
3936 
3937 late_initcall(mempolicy_sysfs_init);
3938 #endif /* CONFIG_SYSFS */
3939