xref: /linux/mm/mempolicy.c (revision 16e5ac127d8d18adf85fe5ba847d77b58d1ed418)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
 * Support five policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * preferred many Try a set of nodes first before normal fallback. This is
35  *                similar to preferred without the special case.
36  *
37  * default        Allocate on the local node first, or when on a VMA
38  *                use the process policy. This is what Linux always did
39  *		  in a NUMA aware kernel and still does by, ahem, default.
40  *
41  * The process policy is applied for most non interrupt memory allocations
42  * in that process' context. Interrupts ignore the policies and always
43  * try to allocate on the local CPU. The VMA policy is only applied for memory
44  * allocations for a VMA in the VM.
45  *
46  * Currently there are a few corner cases in swapping where the policy
47  * is not applied, but the majority should be handled. When process policy
48  * is used it is not remembered over swap outs/swap ins.
49  *
50  * Only the highest zone in the zone hierarchy gets policied. Allocations
51  * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
53  * Same with GFP_DMA allocations.
54  *
55  * For shmem/tmpfs shared memory the policy is shared between
56  * all users and remembered even when nobody has memory mapped.
57  */
58 
59 /* Notebook:
60    fix mmap readahead to honour policy and enable policy for any page cache
61    object
62    statistics for bigpages
63    global policy for page cache? currently it uses process policy. Requires
64    first item above.
65    handle mremap for shared memory (currently ignored for the policy)
66    grows down?
67    make bind policy root only? It can trigger oom much faster and the
68    kernel is not always grateful with that.
69 */
70 
71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72 
73 #include <linux/mempolicy.h>
74 #include <linux/pagewalk.h>
75 #include <linux/highmem.h>
76 #include <linux/hugetlb.h>
77 #include <linux/kernel.h>
78 #include <linux/sched.h>
79 #include <linux/sched/mm.h>
80 #include <linux/sched/numa_balancing.h>
81 #include <linux/sched/task.h>
82 #include <linux/nodemask.h>
83 #include <linux/cpuset.h>
84 #include <linux/slab.h>
85 #include <linux/string.h>
86 #include <linux/export.h>
87 #include <linux/nsproxy.h>
88 #include <linux/interrupt.h>
89 #include <linux/init.h>
90 #include <linux/compat.h>
91 #include <linux/ptrace.h>
92 #include <linux/swap.h>
93 #include <linux/seq_file.h>
94 #include <linux/proc_fs.h>
95 #include <linux/migrate.h>
96 #include <linux/ksm.h>
97 #include <linux/rmap.h>
98 #include <linux/security.h>
99 #include <linux/syscalls.h>
100 #include <linux/ctype.h>
101 #include <linux/mm_inline.h>
102 #include <linux/mmu_notifier.h>
103 #include <linux/printk.h>
104 #include <linux/swapops.h>
105 
106 #include <asm/tlbflush.h>
107 #include <asm/tlb.h>
108 #include <linux/uaccess.h>
109 
110 #include "internal.h"
111 
/*
 * Internal flags: additional MPOL_MF_* bits, defined as shifts of
 * MPOL_MF_INTERNAL, that are used by the page-walk code in this file.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

/* Slab cache from which struct mempolicy objects are allocated and freed. */
static struct kmem_cache *policy_cache;
/* NOTE(review): presumably the cache for shared-policy nodes; its users are not visible here. */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/*
 * Per-node policies, indexed by node id.  get_task_policy() returns the
 * entry for the current node once that entry's mode is non-zero.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
133 
134 /**
135  * numa_nearest_node - Find nearest node by state
136  * @node: Node id to start the search
137  * @state: State to filter the search
138  *
139  * Lookup the closest node by distance if @nid is not in state.
140  *
141  * Return: this @node if it is in state, otherwise the closest node by distance
142  */
143 int numa_nearest_node(int node, unsigned int state)
144 {
145 	int min_dist = INT_MAX, dist, n, min_node;
146 
147 	if (state >= NR_NODE_STATES)
148 		return -EINVAL;
149 
150 	if (node == NUMA_NO_NODE || node_state(node, state))
151 		return node;
152 
153 	min_node = node;
154 	for_each_node_state(n, state) {
155 		dist = node_distance(node, n);
156 		if (dist < min_dist) {
157 			min_dist = dist;
158 			min_node = n;
159 		}
160 	}
161 
162 	return min_node;
163 }
164 EXPORT_SYMBOL_GPL(numa_nearest_node);
165 
166 struct mempolicy *get_task_policy(struct task_struct *p)
167 {
168 	struct mempolicy *pol = p->mempolicy;
169 	int node;
170 
171 	if (pol)
172 		return pol;
173 
174 	node = numa_node_id();
175 	if (node != NUMA_NO_NODE) {
176 		pol = &preferred_node_policy[node];
177 		/* preferred_node_policy is not initialised early in boot */
178 		if (pol->mode)
179 			return pol;
180 	}
181 
182 	return &default_policy;
183 }
184 
/*
 * Per-mode operations, indexed by policy mode (MPOL_*):
 *   create - install a nodemask into a freshly allocated policy;
 *            returns 0 or a negative errno.
 *   rebind - adjust an existing policy to a new set of nodes.
 * This is a forward declaration; the initialised table appears later in
 * this file.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
189 
190 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
191 {
192 	return pol->flags & MPOL_MODE_FLAGS;
193 }
194 
195 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
196 				   const nodemask_t *rel)
197 {
198 	nodemask_t tmp;
199 	nodes_fold(tmp, *orig, nodes_weight(*rel));
200 	nodes_onto(*ret, tmp, *rel);
201 }
202 
203 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
204 {
205 	if (nodes_empty(*nodes))
206 		return -EINVAL;
207 	pol->nodes = *nodes;
208 	return 0;
209 }
210 
211 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
212 {
213 	if (nodes_empty(*nodes))
214 		return -EINVAL;
215 
216 	nodes_clear(pol->nodes);
217 	node_set(first_node(*nodes), pol->nodes);
218 	return 0;
219 }
220 
221 /*
222  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
223  * any, for the new policy.  mpol_new() has already validated the nodes
224  * parameter with respect to the policy mode and flags.
225  *
226  * Must be called holding task's alloc_lock to protect task's mems_allowed
227  * and mempolicy.  May also be called holding the mmap_lock for write.
228  */
229 static int mpol_set_nodemask(struct mempolicy *pol,
230 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
231 {
232 	int ret;
233 
234 	/*
235 	 * Default (pol==NULL) resp. local memory policies are not a
236 	 * subject of any remapping. They also do not need any special
237 	 * constructor.
238 	 */
239 	if (!pol || pol->mode == MPOL_LOCAL)
240 		return 0;
241 
242 	/* Check N_MEMORY */
243 	nodes_and(nsc->mask1,
244 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
245 
246 	VM_BUG_ON(!nodes);
247 
248 	if (pol->flags & MPOL_F_RELATIVE_NODES)
249 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
250 	else
251 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
252 
253 	if (mpol_store_user_nodemask(pol))
254 		pol->w.user_nodemask = *nodes;
255 	else
256 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
257 
258 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259 	return ret;
260 }
261 
262 /*
263  * This function just creates a new policy, does some check and simple
264  * initialization. You must invoke mpol_set_nodemask() to set nodes.
265  */
266 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
267 				  nodemask_t *nodes)
268 {
269 	struct mempolicy *policy;
270 
271 	if (mode == MPOL_DEFAULT) {
272 		if (nodes && !nodes_empty(*nodes))
273 			return ERR_PTR(-EINVAL);
274 		return NULL;
275 	}
276 	VM_BUG_ON(!nodes);
277 
278 	/*
279 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
280 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
281 	 * All other modes require a valid pointer to a non-empty nodemask.
282 	 */
283 	if (mode == MPOL_PREFERRED) {
284 		if (nodes_empty(*nodes)) {
285 			if (((flags & MPOL_F_STATIC_NODES) ||
286 			     (flags & MPOL_F_RELATIVE_NODES)))
287 				return ERR_PTR(-EINVAL);
288 
289 			mode = MPOL_LOCAL;
290 		}
291 	} else if (mode == MPOL_LOCAL) {
292 		if (!nodes_empty(*nodes) ||
293 		    (flags & MPOL_F_STATIC_NODES) ||
294 		    (flags & MPOL_F_RELATIVE_NODES))
295 			return ERR_PTR(-EINVAL);
296 	} else if (nodes_empty(*nodes))
297 		return ERR_PTR(-EINVAL);
298 
299 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
300 	if (!policy)
301 		return ERR_PTR(-ENOMEM);
302 	atomic_set(&policy->refcnt, 1);
303 	policy->mode = mode;
304 	policy->flags = flags;
305 	policy->home_node = NUMA_NO_NODE;
306 
307 	return policy;
308 }
309 
310 /* Slow path of a mpol destructor. */
311 void __mpol_put(struct mempolicy *pol)
312 {
313 	if (!atomic_dec_and_test(&pol->refcnt))
314 		return;
315 	kmem_cache_free(policy_cache, pol);
316 }
317 
318 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
319 {
320 }
321 
322 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
323 {
324 	nodemask_t tmp;
325 
326 	if (pol->flags & MPOL_F_STATIC_NODES)
327 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
328 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
329 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
330 	else {
331 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
332 								*nodes);
333 		pol->w.cpuset_mems_allowed = *nodes;
334 	}
335 
336 	if (nodes_empty(tmp))
337 		tmp = *nodes;
338 
339 	pol->nodes = tmp;
340 }
341 
342 static void mpol_rebind_preferred(struct mempolicy *pol,
343 						const nodemask_t *nodes)
344 {
345 	pol->w.cpuset_mems_allowed = *nodes;
346 }
347 
348 /*
349  * mpol_rebind_policy - Migrate a policy to a different set of nodes
350  *
351  * Per-vma policies are protected by mmap_lock. Allocations using per-task
352  * policies are protected by task->mems_allowed_seq to prevent a premature
353  * OOM/allocation failure due to parallel nodemask modification.
354  */
355 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
356 {
357 	if (!pol || pol->mode == MPOL_LOCAL)
358 		return;
359 	if (!mpol_store_user_nodemask(pol) &&
360 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
361 		return;
362 
363 	mpol_ops[pol->mode].rebind(pol, newmask);
364 }
365 
366 /*
367  * Wrapper for mpol_rebind_policy() that just requires task
368  * pointer, and updates task mempolicy.
369  *
370  * Called with task's alloc_lock held.
371  */
372 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
373 {
374 	mpol_rebind_policy(tsk->mempolicy, new);
375 }
376 
377 /*
378  * Rebind each vma in mm to new nodemask.
379  *
380  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
381  */
382 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
383 {
384 	struct vm_area_struct *vma;
385 	VMA_ITERATOR(vmi, mm, 0);
386 
387 	mmap_write_lock(mm);
388 	for_each_vma(vmi, vma) {
389 		vma_start_write(vma);
390 		mpol_rebind_policy(vma->vm_policy, new);
391 	}
392 	mmap_write_unlock(mm);
393 }
394 
/*
 * Per-mode operation table, indexed by policy mode.
 *
 * MPOL_DEFAULT and MPOL_LOCAL have no ->create(); mpol_set_nodemask()
 * returns before using the table for a NULL or MPOL_LOCAL policy.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		/* keeps only the first node of the supplied mask */
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		/* keeps the whole mask, but rebinds like MPOL_PREFERRED */
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};
419 
/* Forward declarations for helpers used before their definitions below. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				pgoff_t ilx, int *nid);
424 
425 static bool strictly_unmovable(unsigned long flags)
426 {
427 	/*
428 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
429 	 * if any misplaced page is found.
430 	 */
431 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
432 			 MPOL_MF_STRICT;
433 }
434 
/*
 * Argument bundle for alloc_migration_target_by_mpol(), which is not visible
 * in this part of the file.
 */
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;	/* policy to allocate the target under */
	pgoff_t ilx;		/* NOTE(review): presumably an interleave index - confirm */
};
439 
/*
 * State shared by the queue_pages_range() page-table walk callbacks; passed
 * to them through mm_walk->private.
 */
struct queue_pages {
	struct list_head *pagelist;	/* isolated folios are queued here */
	unsigned long flags;		/* MPOL_MF_* flags, including internal ones */
	nodemask_t *nmask;		/* nodes tested by queue_folio_required() */
	unsigned long start;		/* start of the range being walked */
	unsigned long end;		/* end of the range being walked */
	struct vm_area_struct *first;	/* first vma seen by the walk, or NULL */
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
450 
451 /*
452  * Check if the folio's nid is in qp->nmask.
453  *
454  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
455  * in the invert of qp->nmask.
456  */
457 static inline bool queue_folio_required(struct folio *folio,
458 					struct queue_pages *qp)
459 {
460 	int nid = folio_nid(folio);
461 	unsigned long flags = qp->flags;
462 
463 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
464 }
465 
466 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
467 {
468 	struct folio *folio;
469 	struct queue_pages *qp = walk->private;
470 
471 	if (unlikely(is_pmd_migration_entry(*pmd))) {
472 		qp->nr_failed++;
473 		return;
474 	}
475 	folio = pfn_folio(pmd_pfn(*pmd));
476 	if (is_huge_zero_page(&folio->page)) {
477 		walk->action = ACTION_CONTINUE;
478 		return;
479 	}
480 	if (!queue_folio_required(folio, qp))
481 		return;
482 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
483 	    !vma_migratable(walk->vma) ||
484 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
485 		qp->nr_failed++;
486 }
487 
/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * This is the ->pmd_entry callback of the queue_pages walk.  A huge pmd is
 * handed to queue_folios_pmd(); otherwise every pte in [addr, end) is
 * inspected under the pte lock.
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;

	/* A non-NULL lock means the pmd is handled as one huge entry. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* Could not map the page table: ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			/* Only migration entries count as failures here. */
			if (is_migration_entry(pte_to_swp_entry(ptent)))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed++;
			/* One misplaced folio is enough to fail a strict check. */
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
577 
/*
 * ->hugetlb_entry callback of the queue_pages walk: examine one hugetlb
 * entry under the huge pte lock.
 *
 * A hugetlb migration entry, or a selected folio that cannot be moved
 * (no MOVE flag, vma not migratable, or isolation failed), bumps
 * qp->nr_failed.
 *
 * Returns -EIO when failures have been recorded and only MPOL_MF_STRICT
 * (without a MOVE flag) was requested, otherwise 0.  Always returns 0 when
 * CONFIG_HUGETLB_PAGE is not set.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry)) {
		if (unlikely(is_hugetlb_entry_migration(entry)))
			qp->nr_failed++;
		goto unlock;
	}
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated sharers of the folio instead.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
		if (!isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
623 
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the value reported by change_protection(); when it is positive it
 * is also added to the NUMA_PTE_UPDATES event counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather gather;
	long updated;

	tlb_gather_mmu(&gather, vma->vm_mm);

	updated = change_protection(&gather, vma, addr, end, MM_CP_PROT_NUMA);
	if (updated > 0)
		count_vm_numa_events(NUMA_PTE_UPDATES, updated);

	tlb_finish_mmu(&gather);

	return updated;
}
#endif /* CONFIG_NUMA_BALANCING */
651 
652 static int queue_pages_test_walk(unsigned long start, unsigned long end,
653 				struct mm_walk *walk)
654 {
655 	struct vm_area_struct *next, *vma = walk->vma;
656 	struct queue_pages *qp = walk->private;
657 	unsigned long endvma = vma->vm_end;
658 	unsigned long flags = qp->flags;
659 
660 	/* range check first */
661 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
662 
663 	if (!qp->first) {
664 		qp->first = vma;
665 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
666 			(qp->start < vma->vm_start))
667 			/* hole at head side of range */
668 			return -EFAULT;
669 	}
670 	next = find_vma(vma->vm_mm, vma->vm_end);
671 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
672 		((vma->vm_end < qp->end) &&
673 		(!next || vma->vm_end < next->vm_start)))
674 		/* hole at middle or tail of range */
675 		return -EFAULT;
676 
677 	/*
678 	 * Need check MPOL_MF_STRICT to return -EIO if possible
679 	 * regardless of vma_migratable
680 	 */
681 	if (!vma_migratable(vma) &&
682 	    !(flags & MPOL_MF_STRICT))
683 		return 1;
684 
685 	if (endvma > end)
686 		endvma = end;
687 
688 	/*
689 	 * Check page nodes, and queue pages to move, in the current vma.
690 	 * But if no moving, and no strict checking, the scan can be skipped.
691 	 */
692 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
693 		return 0;
694 	return 1;
695 }
696 
/*
 * Walk operations used by queue_pages_range() when MPOL_MF_WRLOCK is not
 * set: same callbacks as the write-locking variant, but walk_lock is
 * PGWALK_RDLOCK.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};
703 
/*
 * Walk operations used by queue_pages_range() when MPOL_MF_WRLOCK is set:
 * same callbacks as queue_pages_walk_ops, but walk_lock is PGWALK_WRLOCK.
 */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
710 
711 /*
712  * Walk through page tables and collect pages to be migrated.
713  *
714  * If pages found in a given range are not on the required set of @nodes,
715  * and migration is allowed, they are isolated and queued to @pagelist.
716  *
717  * queue_pages_range() may return:
718  * 0 - all pages already on the right node, or successfully queued for moving
719  *     (or neither strict checking nor moving requested: only range checking).
720  * >0 - this number of misplaced folios could not be queued for moving
721  *      (a hugetlbfs page or a transparent huge page being counted as 1).
722  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
723  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
724  */
725 static long
726 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
727 		nodemask_t *nodes, unsigned long flags,
728 		struct list_head *pagelist)
729 {
730 	int err;
731 	struct queue_pages qp = {
732 		.pagelist = pagelist,
733 		.flags = flags,
734 		.nmask = nodes,
735 		.start = start,
736 		.end = end,
737 		.first = NULL,
738 	};
739 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
740 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
741 
742 	err = walk_page_range(mm, start, end, ops, &qp);
743 
744 	if (!qp.first)
745 		/* whole range in hole */
746 		err = -EFAULT;
747 
748 	return err ? : qp.nr_failed;
749 }
750 
751 /*
752  * Apply policy to a single VMA
753  * This must be called with the mmap_lock held for writing.
754  */
755 static int vma_replace_policy(struct vm_area_struct *vma,
756 				struct mempolicy *pol)
757 {
758 	int err;
759 	struct mempolicy *old;
760 	struct mempolicy *new;
761 
762 	vma_assert_write_locked(vma);
763 
764 	new = mpol_dup(pol);
765 	if (IS_ERR(new))
766 		return PTR_ERR(new);
767 
768 	if (vma->vm_ops && vma->vm_ops->set_policy) {
769 		err = vma->vm_ops->set_policy(vma, new);
770 		if (err)
771 			goto err_out;
772 	}
773 
774 	old = vma->vm_policy;
775 	vma->vm_policy = new; /* protected by mmap_lock */
776 	mpol_put(old);
777 
778 	return 0;
779  err_out:
780 	mpol_put(new);
781 	return err;
782 }
783 
784 /* Split or merge the VMA (if required) and apply the new policy */
785 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
786 		struct vm_area_struct **prev, unsigned long start,
787 		unsigned long end, struct mempolicy *new_pol)
788 {
789 	unsigned long vmstart, vmend;
790 
791 	vmend = min(end, vma->vm_end);
792 	if (start > vma->vm_start) {
793 		*prev = vma;
794 		vmstart = start;
795 	} else {
796 		vmstart = vma->vm_start;
797 	}
798 
799 	if (mpol_equal(vma->vm_policy, new_pol)) {
800 		*prev = vma;
801 		return 0;
802 	}
803 
804 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
805 	if (IS_ERR(vma))
806 		return PTR_ERR(vma);
807 
808 	*prev = vma;
809 	return vma_replace_policy(vma, new_pol);
810 }
811 
812 /* Set the process memory policy */
813 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
814 			     nodemask_t *nodes)
815 {
816 	struct mempolicy *new, *old;
817 	NODEMASK_SCRATCH(scratch);
818 	int ret;
819 
820 	if (!scratch)
821 		return -ENOMEM;
822 
823 	new = mpol_new(mode, flags, nodes);
824 	if (IS_ERR(new)) {
825 		ret = PTR_ERR(new);
826 		goto out;
827 	}
828 
829 	task_lock(current);
830 	ret = mpol_set_nodemask(new, nodes, scratch);
831 	if (ret) {
832 		task_unlock(current);
833 		mpol_put(new);
834 		goto out;
835 	}
836 
837 	old = current->mempolicy;
838 	current->mempolicy = new;
839 	if (new && new->mode == MPOL_INTERLEAVE)
840 		current->il_prev = MAX_NUMNODES-1;
841 	task_unlock(current);
842 	mpol_put(old);
843 	ret = 0;
844 out:
845 	NODEMASK_SCRATCH_FREE(scratch);
846 	return ret;
847 }
848 
849 /*
850  * Return nodemask for policy for get_mempolicy() query
851  *
852  * Called with task's alloc_lock held
853  */
854 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
855 {
856 	nodes_clear(*nodes);
857 	if (pol == &default_policy)
858 		return;
859 
860 	switch (pol->mode) {
861 	case MPOL_BIND:
862 	case MPOL_INTERLEAVE:
863 	case MPOL_PREFERRED:
864 	case MPOL_PREFERRED_MANY:
865 		*nodes = pol->nodes;
866 		break;
867 	case MPOL_LOCAL:
868 		/* return empty node mask for local allocation */
869 		break;
870 	default:
871 		BUG();
872 	}
873 }
874 
875 static int lookup_node(struct mm_struct *mm, unsigned long addr)
876 {
877 	struct page *p = NULL;
878 	int ret;
879 
880 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
881 	if (ret > 0) {
882 		ret = page_to_nid(p);
883 		put_page(p);
884 	}
885 	return ret;
886 }
887 
/*
 * Retrieve NUMA policy
 *
 * @policy: out; receives the policy mode ORed with its MPOL_MODE_FLAGS bits,
 *          or a node id when MPOL_F_NODE is set, or 0 for
 *          MPOL_F_MEMS_ALLOWED.
 * @nmask:  out; receives the policy's nodemask.  May be NULL except in the
 *          MPOL_F_MEMS_ALLOWED case, where it is written unconditionally.
 * @addr:   address to query with MPOL_F_ADDR; must be 0 otherwise.
 * @flags:  combination of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED.
 *
 * Returns 0 on success, -EINVAL for an invalid flag/argument combination,
 * -EFAULT if no vma contains @addr, or the negative error from
 * lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MEMS_ALLOWED is exclusive with the other two flags. */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* Report the node after il_prev in the interleave set. */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	/* vma is still non-NULL only if the mmap_lock was not dropped above. */
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
982 
983 #ifdef CONFIG_MIGRATION
984 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
985 				unsigned long flags)
986 {
987 	/*
988 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
989 	 * Choosing not to migrate a shared folio is not counted as a failure.
990 	 *
991 	 * To check if the folio is shared, ideally we want to make sure
992 	 * every page is mapped to the same process. Doing that is very
993 	 * expensive, so check the estimated sharers of the folio instead.
994 	 */
995 	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
996 		if (folio_isolate_lru(folio)) {
997 			list_add_tail(&folio->lru, foliolist);
998 			node_stat_mod_folio(folio,
999 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1000 				folio_nr_pages(folio));
1001 		} else {
1002 			/*
1003 			 * Non-movable folio may reach here.  And, there may be
1004 			 * temporary off LRU folios or non-LRU movable folios.
1005 			 * Treat them as unmovable folios since they can't be
1006 			 * isolated, so they can't be moved at the moment.
1007 			 */
1008 			return false;
1009 		}
1010 	}
1011 	return true;
1012 }
1013 
1014 /*
1015  * Migrate pages from one node to a target node.
1016  * Returns error or the number of pages not migrated.
1017  */
1018 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1019 			    int flags)
1020 {
1021 	nodemask_t nmask;
1022 	struct vm_area_struct *vma;
1023 	LIST_HEAD(pagelist);
1024 	long nr_failed;
1025 	long err = 0;
1026 	struct migration_target_control mtc = {
1027 		.nid = dest,
1028 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1029 	};
1030 
1031 	nodes_clear(nmask);
1032 	node_set(source, nmask);
1033 
1034 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1035 
1036 	mmap_read_lock(mm);
1037 	vma = find_vma(mm, 0);
1038 
1039 	/*
1040 	 * This does not migrate the range, but isolates all pages that
1041 	 * need migration.  Between passing in the full user address
1042 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1043 	 * but passes back the count of pages which could not be isolated.
1044 	 */
1045 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1046 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1047 	mmap_read_unlock(mm);
1048 
1049 	if (!list_empty(&pagelist)) {
1050 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1051 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1052 		if (err)
1053 			putback_movable_pages(&pagelist);
1054 	}
1055 
1056 	if (err >= 0)
1057 		err += nr_failed;
1058 	return err;
1059 }
1060 
1061 /*
1062  * Move pages between the two nodesets so as to preserve the physical
1063  * layout as much as possible.
1064  *
1065  * Returns the number of page that could not be moved.
1066  */
1067 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1068 		     const nodemask_t *to, int flags)
1069 {
1070 	long nr_failed = 0;
1071 	long err = 0;
1072 	nodemask_t tmp;
1073 
1074 	lru_cache_disable();
1075 
1076 	/*
1077 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1078 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1079 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1080 	 * The pair of nodemasks 'to' and 'from' define the map.
1081 	 *
1082 	 * If no pair of bits is found that way, fallback to picking some
1083 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1084 	 * 'source' and 'dest' bits are the same, this represents a node
1085 	 * that will be migrating to itself, so no pages need move.
1086 	 *
1087 	 * If no bits are left in 'tmp', or if all remaining bits left
1088 	 * in 'tmp' correspond to the same bit in 'to', return false
1089 	 * (nothing left to migrate).
1090 	 *
1091 	 * This lets us pick a pair of nodes to migrate between, such that
1092 	 * if possible the dest node is not already occupied by some other
1093 	 * source node, minimizing the risk of overloading the memory on a
1094 	 * node that would happen if we migrated incoming memory to a node
1095 	 * before migrating outgoing memory source that same node.
1096 	 *
1097 	 * A single scan of tmp is sufficient.  As we go, we remember the
1098 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1099 	 * that not only moved, but what's better, moved to an empty slot
1100 	 * (d is not set in tmp), then we break out then, with that pair.
1101 	 * Otherwise when we finish scanning from_tmp, we at least have the
1102 	 * most recent <s, d> pair that moved.  If we get all the way through
1103 	 * the scan of tmp without finding any node that moved, much less
1104 	 * moved to an empty node, then there is nothing left worth migrating.
1105 	 */
1106 
1107 	tmp = *from;
1108 	while (!nodes_empty(tmp)) {
1109 		int s, d;
1110 		int source = NUMA_NO_NODE;
1111 		int dest = 0;
1112 
1113 		for_each_node_mask(s, tmp) {
1114 
1115 			/*
1116 			 * do_migrate_pages() tries to maintain the relative
1117 			 * node relationship of the pages established between
1118 			 * threads and memory areas.
1119                          *
1120 			 * However if the number of source nodes is not equal to
1121 			 * the number of destination nodes we can not preserve
1122 			 * this node relative relationship.  In that case, skip
1123 			 * copying memory from a node that is in the destination
1124 			 * mask.
1125 			 *
1126 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1127 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1128 			 */
1129 
1130 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1131 						(node_isset(s, *to)))
1132 				continue;
1133 
1134 			d = node_remap(s, *from, *to);
1135 			if (s == d)
1136 				continue;
1137 
1138 			source = s;	/* Node moved. Memorize */
1139 			dest = d;
1140 
1141 			/* dest not in remaining from nodes? */
1142 			if (!node_isset(dest, tmp))
1143 				break;
1144 		}
1145 		if (source == NUMA_NO_NODE)
1146 			break;
1147 
1148 		node_clear(source, tmp);
1149 		err = migrate_to_node(mm, source, dest, flags);
1150 		if (err > 0)
1151 			nr_failed += err;
1152 		if (err < 0)
1153 			break;
1154 	}
1155 
1156 	lru_cache_enable();
1157 	if (err < 0)
1158 		return err;
1159 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1160 }
1161 
1162 /*
1163  * Allocate a new folio for page migration, according to NUMA mempolicy.
1164  */
1165 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1166 						    unsigned long private)
1167 {
1168 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1169 	struct mempolicy *pol = mmpol->pol;
1170 	pgoff_t ilx = mmpol->ilx;
1171 	struct page *page;
1172 	unsigned int order;
1173 	int nid = numa_node_id();
1174 	gfp_t gfp;
1175 
1176 	order = folio_order(src);
1177 	ilx += src->index >> order;
1178 
1179 	if (folio_test_hugetlb(src)) {
1180 		nodemask_t *nodemask;
1181 		struct hstate *h;
1182 
1183 		h = folio_hstate(src);
1184 		gfp = htlb_alloc_mask(h);
1185 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1186 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
1187 	}
1188 
1189 	if (folio_test_large(src))
1190 		gfp = GFP_TRANSHUGE;
1191 	else
1192 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1193 
1194 	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
1195 	return page_rmappable_folio(page);
1196 }
1197 #else
1198 
1199 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1200 				unsigned long flags)
1201 {
1202 	return false;
1203 }
1204 
1205 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1206 		     const nodemask_t *to, int flags)
1207 {
1208 	return -ENOSYS;
1209 }
1210 
1211 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1212 						    unsigned long private)
1213 {
1214 	return NULL;
1215 }
1216 #endif
1217 
1218 static long do_mbind(unsigned long start, unsigned long len,
1219 		     unsigned short mode, unsigned short mode_flags,
1220 		     nodemask_t *nmask, unsigned long flags)
1221 {
1222 	struct mm_struct *mm = current->mm;
1223 	struct vm_area_struct *vma, *prev;
1224 	struct vma_iterator vmi;
1225 	struct migration_mpol mmpol;
1226 	struct mempolicy *new;
1227 	unsigned long end;
1228 	long err;
1229 	long nr_failed;
1230 	LIST_HEAD(pagelist);
1231 
1232 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1233 		return -EINVAL;
1234 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1235 		return -EPERM;
1236 
1237 	if (start & ~PAGE_MASK)
1238 		return -EINVAL;
1239 
1240 	if (mode == MPOL_DEFAULT)
1241 		flags &= ~MPOL_MF_STRICT;
1242 
1243 	len = PAGE_ALIGN(len);
1244 	end = start + len;
1245 
1246 	if (end < start)
1247 		return -EINVAL;
1248 	if (end == start)
1249 		return 0;
1250 
1251 	new = mpol_new(mode, mode_flags, nmask);
1252 	if (IS_ERR(new))
1253 		return PTR_ERR(new);
1254 
1255 	/*
1256 	 * If we are using the default policy then operation
1257 	 * on discontinuous address spaces is okay after all
1258 	 */
1259 	if (!new)
1260 		flags |= MPOL_MF_DISCONTIG_OK;
1261 
1262 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1263 		lru_cache_disable();
1264 	{
1265 		NODEMASK_SCRATCH(scratch);
1266 		if (scratch) {
1267 			mmap_write_lock(mm);
1268 			err = mpol_set_nodemask(new, nmask, scratch);
1269 			if (err)
1270 				mmap_write_unlock(mm);
1271 		} else
1272 			err = -ENOMEM;
1273 		NODEMASK_SCRATCH_FREE(scratch);
1274 	}
1275 	if (err)
1276 		goto mpol_out;
1277 
1278 	/*
1279 	 * Lock the VMAs before scanning for pages to migrate,
1280 	 * to ensure we don't miss a concurrently inserted page.
1281 	 */
1282 	nr_failed = queue_pages_range(mm, start, end, nmask,
1283 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1284 
1285 	if (nr_failed < 0) {
1286 		err = nr_failed;
1287 		nr_failed = 0;
1288 	} else {
1289 		vma_iter_init(&vmi, mm, start);
1290 		prev = vma_prev(&vmi);
1291 		for_each_vma_range(vmi, vma, end) {
1292 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1293 			if (err)
1294 				break;
1295 		}
1296 	}
1297 
1298 	if (!err && !list_empty(&pagelist)) {
1299 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
1300 		if (!new) {
1301 			new = get_task_policy(current);
1302 			mpol_get(new);
1303 		}
1304 		mmpol.pol = new;
1305 		mmpol.ilx = 0;
1306 
1307 		/*
1308 		 * In the interleaved case, attempt to allocate on exactly the
1309 		 * targeted nodes, for the first VMA to be migrated; for later
1310 		 * VMAs, the nodes will still be interleaved from the targeted
1311 		 * nodemask, but one by one may be selected differently.
1312 		 */
1313 		if (new->mode == MPOL_INTERLEAVE) {
1314 			struct page *page;
1315 			unsigned int order;
1316 			unsigned long addr = -EFAULT;
1317 
1318 			list_for_each_entry(page, &pagelist, lru) {
1319 				if (!PageKsm(page))
1320 					break;
1321 			}
1322 			if (!list_entry_is_head(page, &pagelist, lru)) {
1323 				vma_iter_init(&vmi, mm, start);
1324 				for_each_vma_range(vmi, vma, end) {
1325 					addr = page_address_in_vma(page, vma);
1326 					if (addr != -EFAULT)
1327 						break;
1328 				}
1329 			}
1330 			if (addr != -EFAULT) {
1331 				order = compound_order(page);
1332 				/* We already know the pol, but not the ilx */
1333 				mpol_cond_put(get_vma_policy(vma, addr, order,
1334 							     &mmpol.ilx));
1335 				/* Set base from which to increment by index */
1336 				mmpol.ilx -= page->index >> order;
1337 			}
1338 		}
1339 	}
1340 
1341 	mmap_write_unlock(mm);
1342 
1343 	if (!err && !list_empty(&pagelist)) {
1344 		nr_failed |= migrate_pages(&pagelist,
1345 				alloc_migration_target_by_mpol, NULL,
1346 				(unsigned long)&mmpol, MIGRATE_SYNC,
1347 				MR_MEMPOLICY_MBIND, NULL);
1348 	}
1349 
1350 	if (nr_failed && (flags & MPOL_MF_STRICT))
1351 		err = -EIO;
1352 	if (!list_empty(&pagelist))
1353 		putback_movable_pages(&pagelist);
1354 mpol_out:
1355 	mpol_put(new);
1356 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1357 		lru_cache_enable();
1358 	return err;
1359 }
1360 
1361 /*
1362  * User space interface with variable sized bitmaps for nodelists.
1363  */
1364 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1365 		      unsigned long maxnode)
1366 {
1367 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1368 	int ret;
1369 
1370 	if (in_compat_syscall())
1371 		ret = compat_get_bitmap(mask,
1372 					(const compat_ulong_t __user *)nmask,
1373 					maxnode);
1374 	else
1375 		ret = copy_from_user(mask, nmask,
1376 				     nlongs * sizeof(unsigned long));
1377 
1378 	if (ret)
1379 		return -EFAULT;
1380 
1381 	if (maxnode % BITS_PER_LONG)
1382 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1383 
1384 	return 0;
1385 }
1386 
1387 /* Copy a node mask from user space. */
1388 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1389 		     unsigned long maxnode)
1390 {
1391 	--maxnode;
1392 	nodes_clear(*nodes);
1393 	if (maxnode == 0 || !nmask)
1394 		return 0;
1395 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1396 		return -EINVAL;
1397 
1398 	/*
1399 	 * When the user specified more nodes than supported just check
1400 	 * if the non supported part is all zero, one word at a time,
1401 	 * starting at the end.
1402 	 */
1403 	while (maxnode > MAX_NUMNODES) {
1404 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1405 		unsigned long t;
1406 
1407 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1408 			return -EFAULT;
1409 
1410 		if (maxnode - bits >= MAX_NUMNODES) {
1411 			maxnode -= bits;
1412 		} else {
1413 			maxnode = MAX_NUMNODES;
1414 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1415 		}
1416 		if (t)
1417 			return -EINVAL;
1418 	}
1419 
1420 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1421 }
1422 
1423 /* Copy a kernel node mask to user space */
1424 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1425 			      nodemask_t *nodes)
1426 {
1427 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1428 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1429 	bool compat = in_compat_syscall();
1430 
1431 	if (compat)
1432 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1433 
1434 	if (copy > nbytes) {
1435 		if (copy > PAGE_SIZE)
1436 			return -EINVAL;
1437 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1438 			return -EFAULT;
1439 		copy = nbytes;
1440 		maxnode = nr_node_ids;
1441 	}
1442 
1443 	if (compat)
1444 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1445 					 nodes_addr(*nodes), maxnode);
1446 
1447 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1448 }
1449 
1450 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1451 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1452 {
1453 	*flags = *mode & MPOL_MODE_FLAGS;
1454 	*mode &= ~MPOL_MODE_FLAGS;
1455 
1456 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1457 		return -EINVAL;
1458 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1459 		return -EINVAL;
1460 	if (*flags & MPOL_F_NUMA_BALANCING) {
1461 		if (*mode != MPOL_BIND)
1462 			return -EINVAL;
1463 		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1464 	}
1465 	return 0;
1466 }
1467 
1468 static long kernel_mbind(unsigned long start, unsigned long len,
1469 			 unsigned long mode, const unsigned long __user *nmask,
1470 			 unsigned long maxnode, unsigned int flags)
1471 {
1472 	unsigned short mode_flags;
1473 	nodemask_t nodes;
1474 	int lmode = mode;
1475 	int err;
1476 
1477 	start = untagged_addr(start);
1478 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1479 	if (err)
1480 		return err;
1481 
1482 	err = get_nodes(&nodes, nmask, maxnode);
1483 	if (err)
1484 		return err;
1485 
1486 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1487 }
1488 
1489 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1490 		unsigned long, home_node, unsigned long, flags)
1491 {
1492 	struct mm_struct *mm = current->mm;
1493 	struct vm_area_struct *vma, *prev;
1494 	struct mempolicy *new, *old;
1495 	unsigned long end;
1496 	int err = -ENOENT;
1497 	VMA_ITERATOR(vmi, mm, start);
1498 
1499 	start = untagged_addr(start);
1500 	if (start & ~PAGE_MASK)
1501 		return -EINVAL;
1502 	/*
1503 	 * flags is used for future extension if any.
1504 	 */
1505 	if (flags != 0)
1506 		return -EINVAL;
1507 
1508 	/*
1509 	 * Check home_node is online to avoid accessing uninitialized
1510 	 * NODE_DATA.
1511 	 */
1512 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1513 		return -EINVAL;
1514 
1515 	len = PAGE_ALIGN(len);
1516 	end = start + len;
1517 
1518 	if (end < start)
1519 		return -EINVAL;
1520 	if (end == start)
1521 		return 0;
1522 	mmap_write_lock(mm);
1523 	prev = vma_prev(&vmi);
1524 	for_each_vma_range(vmi, vma, end) {
1525 		/*
1526 		 * If any vma in the range got policy other than MPOL_BIND
1527 		 * or MPOL_PREFERRED_MANY we return error. We don't reset
1528 		 * the home node for vmas we already updated before.
1529 		 */
1530 		old = vma_policy(vma);
1531 		if (!old) {
1532 			prev = vma;
1533 			continue;
1534 		}
1535 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1536 			err = -EOPNOTSUPP;
1537 			break;
1538 		}
1539 		new = mpol_dup(old);
1540 		if (IS_ERR(new)) {
1541 			err = PTR_ERR(new);
1542 			break;
1543 		}
1544 
1545 		vma_start_write(vma);
1546 		new->home_node = home_node;
1547 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1548 		mpol_put(new);
1549 		if (err)
1550 			break;
1551 	}
1552 	mmap_write_unlock(mm);
1553 	return err;
1554 }
1555 
1556 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1557 		unsigned long, mode, const unsigned long __user *, nmask,
1558 		unsigned long, maxnode, unsigned int, flags)
1559 {
1560 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1561 }
1562 
1563 /* Set the process memory policy */
1564 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1565 				 unsigned long maxnode)
1566 {
1567 	unsigned short mode_flags;
1568 	nodemask_t nodes;
1569 	int lmode = mode;
1570 	int err;
1571 
1572 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1573 	if (err)
1574 		return err;
1575 
1576 	err = get_nodes(&nodes, nmask, maxnode);
1577 	if (err)
1578 		return err;
1579 
1580 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1581 }
1582 
1583 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1584 		unsigned long, maxnode)
1585 {
1586 	return kernel_set_mempolicy(mode, nmask, maxnode);
1587 }
1588 
1589 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1590 				const unsigned long __user *old_nodes,
1591 				const unsigned long __user *new_nodes)
1592 {
1593 	struct mm_struct *mm = NULL;
1594 	struct task_struct *task;
1595 	nodemask_t task_nodes;
1596 	int err;
1597 	nodemask_t *old;
1598 	nodemask_t *new;
1599 	NODEMASK_SCRATCH(scratch);
1600 
1601 	if (!scratch)
1602 		return -ENOMEM;
1603 
1604 	old = &scratch->mask1;
1605 	new = &scratch->mask2;
1606 
1607 	err = get_nodes(old, old_nodes, maxnode);
1608 	if (err)
1609 		goto out;
1610 
1611 	err = get_nodes(new, new_nodes, maxnode);
1612 	if (err)
1613 		goto out;
1614 
1615 	/* Find the mm_struct */
1616 	rcu_read_lock();
1617 	task = pid ? find_task_by_vpid(pid) : current;
1618 	if (!task) {
1619 		rcu_read_unlock();
1620 		err = -ESRCH;
1621 		goto out;
1622 	}
1623 	get_task_struct(task);
1624 
1625 	err = -EINVAL;
1626 
1627 	/*
1628 	 * Check if this process has the right to modify the specified process.
1629 	 * Use the regular "ptrace_may_access()" checks.
1630 	 */
1631 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1632 		rcu_read_unlock();
1633 		err = -EPERM;
1634 		goto out_put;
1635 	}
1636 	rcu_read_unlock();
1637 
1638 	task_nodes = cpuset_mems_allowed(task);
1639 	/* Is the user allowed to access the target nodes? */
1640 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1641 		err = -EPERM;
1642 		goto out_put;
1643 	}
1644 
1645 	task_nodes = cpuset_mems_allowed(current);
1646 	nodes_and(*new, *new, task_nodes);
1647 	if (nodes_empty(*new))
1648 		goto out_put;
1649 
1650 	err = security_task_movememory(task);
1651 	if (err)
1652 		goto out_put;
1653 
1654 	mm = get_task_mm(task);
1655 	put_task_struct(task);
1656 
1657 	if (!mm) {
1658 		err = -EINVAL;
1659 		goto out;
1660 	}
1661 
1662 	err = do_migrate_pages(mm, old, new,
1663 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1664 
1665 	mmput(mm);
1666 out:
1667 	NODEMASK_SCRATCH_FREE(scratch);
1668 
1669 	return err;
1670 
1671 out_put:
1672 	put_task_struct(task);
1673 	goto out;
1674 }
1675 
1676 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1677 		const unsigned long __user *, old_nodes,
1678 		const unsigned long __user *, new_nodes)
1679 {
1680 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1681 }
1682 
1683 /* Retrieve NUMA policy */
1684 static int kernel_get_mempolicy(int __user *policy,
1685 				unsigned long __user *nmask,
1686 				unsigned long maxnode,
1687 				unsigned long addr,
1688 				unsigned long flags)
1689 {
1690 	int err;
1691 	int pval;
1692 	nodemask_t nodes;
1693 
1694 	if (nmask != NULL && maxnode < nr_node_ids)
1695 		return -EINVAL;
1696 
1697 	addr = untagged_addr(addr);
1698 
1699 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1700 
1701 	if (err)
1702 		return err;
1703 
1704 	if (policy && put_user(pval, policy))
1705 		return -EFAULT;
1706 
1707 	if (nmask)
1708 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1709 
1710 	return err;
1711 }
1712 
1713 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1714 		unsigned long __user *, nmask, unsigned long, maxnode,
1715 		unsigned long, addr, unsigned long, flags)
1716 {
1717 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1718 }
1719 
1720 bool vma_migratable(struct vm_area_struct *vma)
1721 {
1722 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1723 		return false;
1724 
1725 	/*
1726 	 * DAX device mappings require predictable access latency, so avoid
1727 	 * incurring periodic faults.
1728 	 */
1729 	if (vma_is_dax(vma))
1730 		return false;
1731 
1732 	if (is_vm_hugetlb_page(vma) &&
1733 		!hugepage_migration_supported(hstate_vma(vma)))
1734 		return false;
1735 
1736 	/*
1737 	 * Migration allocates pages in the highest zone. If we cannot
1738 	 * do so then migration (at least from node to node) is not
1739 	 * possible.
1740 	 */
1741 	if (vma->vm_file &&
1742 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1743 			< policy_zone)
1744 		return false;
1745 	return true;
1746 }
1747 
1748 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1749 				   unsigned long addr, pgoff_t *ilx)
1750 {
1751 	*ilx = 0;
1752 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
1753 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1754 }
1755 
1756 /*
1757  * get_vma_policy(@vma, @addr, @order, @ilx)
1758  * @vma: virtual memory area whose policy is sought
1759  * @addr: address in @vma for shared policy lookup
1760  * @order: 0, or appropriate huge_page_order for interleaving
1761  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
1762  *
1763  * Returns effective policy for a VMA at specified address.
1764  * Falls back to current->mempolicy or system default policy, as necessary.
1765  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1766  * count--added by the get_policy() vm_op, as appropriate--to protect against
1767  * freeing by another task.  It is the caller's responsibility to free the
1768  * extra reference for shared policies.
1769  */
1770 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1771 				 unsigned long addr, int order, pgoff_t *ilx)
1772 {
1773 	struct mempolicy *pol;
1774 
1775 	pol = __get_vma_policy(vma, addr, ilx);
1776 	if (!pol)
1777 		pol = get_task_policy(current);
1778 	if (pol->mode == MPOL_INTERLEAVE) {
1779 		*ilx += vma->vm_pgoff >> order;
1780 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1781 	}
1782 	return pol;
1783 }
1784 
1785 bool vma_policy_mof(struct vm_area_struct *vma)
1786 {
1787 	struct mempolicy *pol;
1788 
1789 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1790 		bool ret = false;
1791 		pgoff_t ilx;		/* ignored here */
1792 
1793 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1794 		if (pol && (pol->flags & MPOL_F_MOF))
1795 			ret = true;
1796 		mpol_cond_put(pol);
1797 
1798 		return ret;
1799 	}
1800 
1801 	pol = vma->vm_policy;
1802 	if (!pol)
1803 		pol = get_task_policy(current);
1804 
1805 	return pol->flags & MPOL_F_MOF;
1806 }
1807 
1808 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1809 {
1810 	enum zone_type dynamic_policy_zone = policy_zone;
1811 
1812 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1813 
1814 	/*
1815 	 * if policy->nodes has movable memory only,
1816 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1817 	 *
1818 	 * policy->nodes is intersect with node_states[N_MEMORY].
1819 	 * so if the following test fails, it implies
1820 	 * policy->nodes has movable memory only.
1821 	 */
1822 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1823 		dynamic_policy_zone = ZONE_MOVABLE;
1824 
1825 	return zone >= dynamic_policy_zone;
1826 }
1827 
1828 /* Do dynamic interleaving for a process */
1829 static unsigned int interleave_nodes(struct mempolicy *policy)
1830 {
1831 	unsigned int nid;
1832 
1833 	nid = next_node_in(current->il_prev, policy->nodes);
1834 	if (nid < MAX_NUMNODES)
1835 		current->il_prev = nid;
1836 	return nid;
1837 }
1838 
1839 /*
1840  * Depending on the memory policy provide a node from which to allocate the
1841  * next slab entry.
1842  */
1843 unsigned int mempolicy_slab_node(void)
1844 {
1845 	struct mempolicy *policy;
1846 	int node = numa_mem_id();
1847 
1848 	if (!in_task())
1849 		return node;
1850 
1851 	policy = current->mempolicy;
1852 	if (!policy)
1853 		return node;
1854 
1855 	switch (policy->mode) {
1856 	case MPOL_PREFERRED:
1857 		return first_node(policy->nodes);
1858 
1859 	case MPOL_INTERLEAVE:
1860 		return interleave_nodes(policy);
1861 
1862 	case MPOL_BIND:
1863 	case MPOL_PREFERRED_MANY:
1864 	{
1865 		struct zoneref *z;
1866 
1867 		/*
1868 		 * Follow bind policy behavior and start allocation at the
1869 		 * first node.
1870 		 */
1871 		struct zonelist *zonelist;
1872 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1873 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1874 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1875 							&policy->nodes);
1876 		return z->zone ? zone_to_nid(z->zone) : node;
1877 	}
1878 	case MPOL_LOCAL:
1879 		return node;
1880 
1881 	default:
1882 		BUG();
1883 	}
1884 }
1885 
1886 /*
1887  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
1888  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
1889  * exceeds the number of present nodes.
1890  */
1891 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
1892 {
1893 	nodemask_t nodemask = pol->nodes;
1894 	unsigned int target, nnodes;
1895 	int i;
1896 	int nid;
1897 	/*
1898 	 * The barrier will stabilize the nodemask in a register or on
1899 	 * the stack so that it will stop changing under the code.
1900 	 *
1901 	 * Between first_node() and next_node(), pol->nodes could be changed
1902 	 * by other threads. So we put pol->nodes in a local stack.
1903 	 */
1904 	barrier();
1905 
1906 	nnodes = nodes_weight(nodemask);
1907 	if (!nnodes)
1908 		return numa_node_id();
1909 	target = ilx % nnodes;
1910 	nid = first_node(nodemask);
1911 	for (i = 0; i < target; i++)
1912 		nid = next_node(nid, nodemask);
1913 	return nid;
1914 }
1915 
1916 /*
1917  * Return a nodemask representing a mempolicy for filtering nodes for
1918  * page allocation, together with preferred node id (or the input node id).
1919  */
1920 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
1921 				   pgoff_t ilx, int *nid)
1922 {
1923 	nodemask_t *nodemask = NULL;
1924 
1925 	switch (pol->mode) {
1926 	case MPOL_PREFERRED:
1927 		/* Override input node id */
1928 		*nid = first_node(pol->nodes);
1929 		break;
1930 	case MPOL_PREFERRED_MANY:
1931 		nodemask = &pol->nodes;
1932 		if (pol->home_node != NUMA_NO_NODE)
1933 			*nid = pol->home_node;
1934 		break;
1935 	case MPOL_BIND:
1936 		/* Restrict to nodemask (but not on lower zones) */
1937 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
1938 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
1939 			nodemask = &pol->nodes;
1940 		if (pol->home_node != NUMA_NO_NODE)
1941 			*nid = pol->home_node;
1942 		/*
1943 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1944 		 * because we might easily break the expectation to stay on the
1945 		 * requested node and not break the policy.
1946 		 */
1947 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
1948 		break;
1949 	case MPOL_INTERLEAVE:
1950 		/* Override input node id */
1951 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
1952 			interleave_nodes(pol) : interleave_nid(pol, ilx);
1953 		break;
1954 	}
1955 
1956 	return nodemask;
1957 }
1958 
1959 #ifdef CONFIG_HUGETLBFS
1960 /*
1961  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1962  * @vma: virtual memory area whose policy is sought
1963  * @addr: address in @vma for shared policy lookup and interleave policy
1964  * @gfp_flags: for requested zone
1965  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1966  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
1967  *
1968  * Returns a nid suitable for a huge page allocation and a pointer
1969  * to the struct mempolicy for conditional unref after allocation.
1970  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
1971  * to the mempolicy's @nodemask for filtering the zonelist.
1972  */
1973 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1974 		struct mempolicy **mpol, nodemask_t **nodemask)
1975 {
1976 	pgoff_t ilx;
1977 	int nid;
1978 
1979 	nid = numa_node_id();
1980 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
1981 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
1982 	return nid;
1983 }
1984 
1985 /*
1986  * init_nodemask_of_mempolicy
1987  *
1988  * If the current task's mempolicy is "default" [NULL], return 'false'
1989  * to indicate default policy.  Otherwise, extract the policy nodemask
1990  * for 'bind' or 'interleave' policy into the argument nodemask, or
1991  * initialize the argument nodemask to contain the single node for
1992  * 'preferred' or 'local' policy and return 'true' to indicate presence
1993  * of non-default mempolicy.
1994  *
1995  * We don't bother with reference counting the mempolicy [mpol_get/put]
1996  * because the current task is examining it's own mempolicy and a task's
1997  * mempolicy is only ever changed by the task itself.
1998  *
1999  * N.B., it is the caller's responsibility to free a returned nodemask.
2000  */
2001 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2002 {
2003 	struct mempolicy *mempolicy;
2004 
2005 	if (!(mask && current->mempolicy))
2006 		return false;
2007 
2008 	task_lock(current);
2009 	mempolicy = current->mempolicy;
2010 	switch (mempolicy->mode) {
2011 	case MPOL_PREFERRED:
2012 	case MPOL_PREFERRED_MANY:
2013 	case MPOL_BIND:
2014 	case MPOL_INTERLEAVE:
2015 		*mask = mempolicy->nodes;
2016 		break;
2017 
2018 	case MPOL_LOCAL:
2019 		init_nodemask_of_node(mask, numa_node_id());
2020 		break;
2021 
2022 	default:
2023 		BUG();
2024 	}
2025 	task_unlock(current);
2026 
2027 	return true;
2028 }
2029 #endif
2030 
2031 /*
2032  * mempolicy_in_oom_domain
2033  *
2034  * If tsk's mempolicy is "bind", check for intersection between mask and
2035  * the policy nodemask. Otherwise, return true for all other policies
2036  * including "interleave", as a tsk with "interleave" policy may have
2037  * memory allocated from all nodes in system.
2038  *
2039  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2040  */
2041 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2042 					const nodemask_t *mask)
2043 {
2044 	struct mempolicy *mempolicy;
2045 	bool ret = true;
2046 
2047 	if (!mask)
2048 		return ret;
2049 
2050 	task_lock(tsk);
2051 	mempolicy = tsk->mempolicy;
2052 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2053 		ret = nodes_intersects(mempolicy->nodes, *mask);
2054 	task_unlock(tsk);
2055 
2056 	return ret;
2057 }
2058 
2059 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2060 						int nid, nodemask_t *nodemask)
2061 {
2062 	struct page *page;
2063 	gfp_t preferred_gfp;
2064 
2065 	/*
2066 	 * This is a two pass approach. The first pass will only try the
2067 	 * preferred nodes but skip the direct reclaim and allow the
2068 	 * allocation to fail, while the second pass will try all the
2069 	 * nodes in system.
2070 	 */
2071 	preferred_gfp = gfp | __GFP_NOWARN;
2072 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2073 	page = __alloc_pages(preferred_gfp, order, nid, nodemask);
2074 	if (!page)
2075 		page = __alloc_pages(gfp, order, nid, NULL);
2076 
2077 	return page;
2078 }
2079 
2080 /**
2081  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2082  * @gfp: GFP flags.
2083  * @order: Order of the page allocation.
2084  * @pol: Pointer to the NUMA mempolicy.
2085  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2086  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2087  *
2088  * Return: The page on success or NULL if allocation fails.
2089  */
2090 struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2091 		struct mempolicy *pol, pgoff_t ilx, int nid)
2092 {
2093 	nodemask_t *nodemask;
2094 	struct page *page;
2095 
2096 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2097 
2098 	if (pol->mode == MPOL_PREFERRED_MANY)
2099 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2100 
2101 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2102 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2103 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2104 		/*
2105 		 * For hugepage allocation and non-interleave policy which
2106 		 * allows the current node (or other explicitly preferred
2107 		 * node) we only try to allocate from the current/preferred
2108 		 * node and don't fall back to other nodes, as the cost of
2109 		 * remote accesses would likely offset THP benefits.
2110 		 *
2111 		 * If the policy is interleave or does not allow the current
2112 		 * node in its nodemask, we allocate the standard way.
2113 		 */
2114 		if (pol->mode != MPOL_INTERLEAVE &&
2115 		    (!nodemask || node_isset(nid, *nodemask))) {
2116 			/*
2117 			 * First, try to allocate THP only on local node, but
2118 			 * don't reclaim unnecessarily, just compact.
2119 			 */
2120 			page = __alloc_pages_node(nid,
2121 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2122 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2123 				return page;
2124 			/*
2125 			 * If hugepage allocations are configured to always
2126 			 * synchronous compact or the vma has been madvised
2127 			 * to prefer hugepage backing, retry allowing remote
2128 			 * memory with both reclaim and compact as well.
2129 			 */
2130 		}
2131 	}
2132 
2133 	page = __alloc_pages(gfp, order, nid, nodemask);
2134 
2135 	if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
2136 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2137 		if (static_branch_likely(&vm_numa_stat_key) &&
2138 		    page_to_nid(page) == nid) {
2139 			preempt_disable();
2140 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2141 			preempt_enable();
2142 		}
2143 	}
2144 
2145 	return page;
2146 }
2147 
2148 /**
2149  * vma_alloc_folio - Allocate a folio for a VMA.
2150  * @gfp: GFP flags.
2151  * @order: Order of the folio.
2152  * @vma: Pointer to VMA.
2153  * @addr: Virtual address of the allocation.  Must be inside @vma.
2154  * @hugepage: Unused (was: For hugepages try only preferred node if possible).
2155  *
2156  * Allocate a folio for a specific address in @vma, using the appropriate
2157  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2158  * VMA to prevent it from going away.  Should be used for all allocations
2159  * for folios that will be mapped into user space, excepting hugetlbfs, and
2160  * excepting where direct use of alloc_pages_mpol() is more appropriate.
2161  *
2162  * Return: The folio on success or NULL if allocation fails.
2163  */
2164 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2165 		unsigned long addr, bool hugepage)
2166 {
2167 	struct mempolicy *pol;
2168 	pgoff_t ilx;
2169 	struct page *page;
2170 
2171 	pol = get_vma_policy(vma, addr, order, &ilx);
2172 	page = alloc_pages_mpol(gfp | __GFP_COMP, order,
2173 				pol, ilx, numa_node_id());
2174 	mpol_cond_put(pol);
2175 	return page_rmappable_folio(page);
2176 }
2177 EXPORT_SYMBOL(vma_alloc_folio);
2178 
2179 /**
2180  * alloc_pages - Allocate pages.
2181  * @gfp: GFP flags.
2182  * @order: Power of two of number of pages to allocate.
2183  *
2184  * Allocate 1 << @order contiguous pages.  The physical address of the
2185  * first page is naturally aligned (eg an order-3 allocation will be aligned
2186  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2187  * process is honoured when in process context.
2188  *
2189  * Context: Can be called from any context, providing the appropriate GFP
2190  * flags are used.
2191  * Return: The page on success or NULL if allocation fails.
2192  */
2193 struct page *alloc_pages(gfp_t gfp, unsigned int order)
2194 {
2195 	struct mempolicy *pol = &default_policy;
2196 
2197 	/*
2198 	 * No reference counting needed for current->mempolicy
2199 	 * nor system default_policy
2200 	 */
2201 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2202 		pol = get_task_policy(current);
2203 
2204 	return alloc_pages_mpol(gfp, order,
2205 				pol, NO_INTERLEAVE_INDEX, numa_node_id());
2206 }
2207 EXPORT_SYMBOL(alloc_pages);
2208 
2209 struct folio *folio_alloc(gfp_t gfp, unsigned int order)
2210 {
2211 	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
2212 }
2213 EXPORT_SYMBOL(folio_alloc);
2214 
2215 static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2216 		struct mempolicy *pol, unsigned long nr_pages,
2217 		struct page **page_array)
2218 {
2219 	int nodes;
2220 	unsigned long nr_pages_per_node;
2221 	int delta;
2222 	int i;
2223 	unsigned long nr_allocated;
2224 	unsigned long total_allocated = 0;
2225 
2226 	nodes = nodes_weight(pol->nodes);
2227 	nr_pages_per_node = nr_pages / nodes;
2228 	delta = nr_pages - nodes * nr_pages_per_node;
2229 
2230 	for (i = 0; i < nodes; i++) {
2231 		if (delta) {
2232 			nr_allocated = __alloc_pages_bulk(gfp,
2233 					interleave_nodes(pol), NULL,
2234 					nr_pages_per_node + 1, NULL,
2235 					page_array);
2236 			delta--;
2237 		} else {
2238 			nr_allocated = __alloc_pages_bulk(gfp,
2239 					interleave_nodes(pol), NULL,
2240 					nr_pages_per_node, NULL, page_array);
2241 		}
2242 
2243 		page_array += nr_allocated;
2244 		total_allocated += nr_allocated;
2245 	}
2246 
2247 	return total_allocated;
2248 }
2249 
2250 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2251 		struct mempolicy *pol, unsigned long nr_pages,
2252 		struct page **page_array)
2253 {
2254 	gfp_t preferred_gfp;
2255 	unsigned long nr_allocated = 0;
2256 
2257 	preferred_gfp = gfp | __GFP_NOWARN;
2258 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2259 
2260 	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2261 					   nr_pages, NULL, page_array);
2262 
2263 	if (nr_allocated < nr_pages)
2264 		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2265 				nr_pages - nr_allocated, NULL,
2266 				page_array + nr_allocated);
2267 	return nr_allocated;
2268 }
2269 
2270 /* alloc pages bulk and mempolicy should be considered at the
2271  * same time in some situation such as vmalloc.
2272  *
2273  * It can accelerate memory allocation especially interleaving
2274  * allocate memory.
2275  */
2276 unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2277 		unsigned long nr_pages, struct page **page_array)
2278 {
2279 	struct mempolicy *pol = &default_policy;
2280 	nodemask_t *nodemask;
2281 	int nid;
2282 
2283 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2284 		pol = get_task_policy(current);
2285 
2286 	if (pol->mode == MPOL_INTERLEAVE)
2287 		return alloc_pages_bulk_array_interleave(gfp, pol,
2288 							 nr_pages, page_array);
2289 
2290 	if (pol->mode == MPOL_PREFERRED_MANY)
2291 		return alloc_pages_bulk_array_preferred_many(gfp,
2292 				numa_node_id(), pol, nr_pages, page_array);
2293 
2294 	nid = numa_node_id();
2295 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2296 	return __alloc_pages_bulk(gfp, nid, nodemask,
2297 				  nr_pages, NULL, page_array);
2298 }
2299 
2300 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2301 {
2302 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2303 
2304 	if (IS_ERR(pol))
2305 		return PTR_ERR(pol);
2306 	dst->vm_policy = pol;
2307 	return 0;
2308 }
2309 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2320 
2321 /* Slow path of a mempolicy duplicate */
2322 struct mempolicy *__mpol_dup(struct mempolicy *old)
2323 {
2324 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2325 
2326 	if (!new)
2327 		return ERR_PTR(-ENOMEM);
2328 
2329 	/* task's mempolicy is protected by alloc_lock */
2330 	if (old == current->mempolicy) {
2331 		task_lock(current);
2332 		*new = *old;
2333 		task_unlock(current);
2334 	} else
2335 		*new = *old;
2336 
2337 	if (current_cpuset_is_being_rebound()) {
2338 		nodemask_t mems = cpuset_mems_allowed(current);
2339 		mpol_rebind_policy(new, &mems);
2340 	}
2341 	atomic_set(&new->refcnt, 1);
2342 	return new;
2343 }
2344 
2345 /* Slow path of a mempolicy comparison */
2346 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2347 {
2348 	if (!a || !b)
2349 		return false;
2350 	if (a->mode != b->mode)
2351 		return false;
2352 	if (a->flags != b->flags)
2353 		return false;
2354 	if (a->home_node != b->home_node)
2355 		return false;
2356 	if (mpol_store_user_nodemask(a))
2357 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2358 			return false;
2359 
2360 	switch (a->mode) {
2361 	case MPOL_BIND:
2362 	case MPOL_INTERLEAVE:
2363 	case MPOL_PREFERRED:
2364 	case MPOL_PREFERRED_MANY:
2365 		return !!nodes_equal(a->nodes, b->nodes);
2366 	case MPOL_LOCAL:
2367 		return true;
2368 	default:
2369 		BUG();
2370 		return false;
2371 	}
2372 }
2373 
2374 /*
2375  * Shared memory backing store policy support.
2376  *
2377  * Remember policies even when nobody has shared memory mapped.
2378  * The policies are kept in Red-Black tree linked from the inode.
2379  * They are protected by the sp->lock rwlock, which should be held
2380  * for any accesses to the tree.
2381  */
2382 
2383 /*
2384  * lookup first element intersecting start-end.  Caller holds sp->lock for
2385  * reading or for writing
2386  */
2387 static struct sp_node *sp_lookup(struct shared_policy *sp,
2388 					pgoff_t start, pgoff_t end)
2389 {
2390 	struct rb_node *n = sp->root.rb_node;
2391 
2392 	while (n) {
2393 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2394 
2395 		if (start >= p->end)
2396 			n = n->rb_right;
2397 		else if (end <= p->start)
2398 			n = n->rb_left;
2399 		else
2400 			break;
2401 	}
2402 	if (!n)
2403 		return NULL;
2404 	for (;;) {
2405 		struct sp_node *w = NULL;
2406 		struct rb_node *prev = rb_prev(n);
2407 		if (!prev)
2408 			break;
2409 		w = rb_entry(prev, struct sp_node, nd);
2410 		if (w->end <= start)
2411 			break;
2412 		n = prev;
2413 	}
2414 	return rb_entry(n, struct sp_node, nd);
2415 }
2416 
2417 /*
2418  * Insert a new shared policy into the list.  Caller holds sp->lock for
2419  * writing.
2420  */
2421 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2422 {
2423 	struct rb_node **p = &sp->root.rb_node;
2424 	struct rb_node *parent = NULL;
2425 	struct sp_node *nd;
2426 
2427 	while (*p) {
2428 		parent = *p;
2429 		nd = rb_entry(parent, struct sp_node, nd);
2430 		if (new->start < nd->start)
2431 			p = &(*p)->rb_left;
2432 		else if (new->end > nd->end)
2433 			p = &(*p)->rb_right;
2434 		else
2435 			BUG();
2436 	}
2437 	rb_link_node(&new->nd, parent, p);
2438 	rb_insert_color(&new->nd, &sp->root);
2439 }
2440 
2441 /* Find shared policy intersecting idx */
2442 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2443 						pgoff_t idx)
2444 {
2445 	struct mempolicy *pol = NULL;
2446 	struct sp_node *sn;
2447 
2448 	if (!sp->root.rb_node)
2449 		return NULL;
2450 	read_lock(&sp->lock);
2451 	sn = sp_lookup(sp, idx, idx+1);
2452 	if (sn) {
2453 		mpol_get(sn->policy);
2454 		pol = sn->policy;
2455 	}
2456 	read_unlock(&sp->lock);
2457 	return pol;
2458 }
2459 
2460 static void sp_free(struct sp_node *n)
2461 {
2462 	mpol_put(n->policy);
2463 	kmem_cache_free(sn_cache, n);
2464 }
2465 
2466 /**
2467  * mpol_misplaced - check whether current folio node is valid in policy
2468  *
2469  * @folio: folio to be checked
2470  * @vma: vm area where folio mapped
2471  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2472  *
2473  * Lookup current policy node id for vma,addr and "compare to" folio's
2474  * node id.  Policy determination "mimics" alloc_page_vma().
2475  * Called from fault path where we know the vma and faulting address.
2476  *
2477  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2478  * policy, or a suitable node ID to allocate a replacement folio from.
2479  */
int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	/* Node the folio currently resides on */
	int curnid = folio_nid(folio);
	int thiscpu = raw_smp_processor_id();
	/* Node of the CPU this code is running on */
	int thisnid = cpu_to_node(thiscpu);
	/* Node the policy would pick; stays NUMA_NO_NODE until decided */
	int polnid = NUMA_NO_NODE;
	/* Default answer: not misplaced */
	int ret = NUMA_NO_NODE;

	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	/* Only policies flagged MPOL_F_MOF are checked for misplacement */
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		/* Target is the interleave node for this index */
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		/* Already on a preferred node: nothing to do */
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
		/* Optimize placement among multiple nodes via NUMA balancing */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Proceed to the MPOL_F_MORON handling below only if
			 * the referencing CPU's node is in the bind mask;
			 * otherwise leave the folio where it is.
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}
		fallthrough;

	case MPOL_PREFERRED_MANY:
		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zone_to_nid(z->zone);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		/* Overrides whatever node the switch above selected */
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	/* Report a target node only when it differs from the current one */
	if (curnid != polnid)
		ret = polnid;
out:
	/* Pairs with get_vma_policy() above */
	mpol_cond_put(pol);

	return ret;
}
2555 
2556 /*
2557  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2558  * dropped after task->mempolicy is set to NULL so that any allocation done as
2559  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2560  * policy.
2561  */
2562 void mpol_put_task_policy(struct task_struct *task)
2563 {
2564 	struct mempolicy *pol;
2565 
2566 	task_lock(task);
2567 	pol = task->mempolicy;
2568 	task->mempolicy = NULL;
2569 	task_unlock(task);
2570 	mpol_put(pol);
2571 }
2572 
2573 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2574 {
2575 	rb_erase(&n->nd, &sp->root);
2576 	sp_free(n);
2577 }
2578 
2579 static void sp_node_init(struct sp_node *node, unsigned long start,
2580 			unsigned long end, struct mempolicy *pol)
2581 {
2582 	node->start = start;
2583 	node->end = end;
2584 	node->policy = pol;
2585 }
2586 
2587 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2588 				struct mempolicy *pol)
2589 {
2590 	struct sp_node *n;
2591 	struct mempolicy *newpol;
2592 
2593 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2594 	if (!n)
2595 		return NULL;
2596 
2597 	newpol = mpol_dup(pol);
2598 	if (IS_ERR(newpol)) {
2599 		kmem_cache_free(sn_cache, n);
2600 		return NULL;
2601 	}
2602 	newpol->flags |= MPOL_F_SHARED;
2603 	sp_node_init(n, start, end, newpol);
2604 
2605 	return n;
2606 }
2607 
2608 /* Replace a policy range. */
/*
 * Clear the range [start, end) of existing policy nodes in @sp, trimming,
 * deleting or splitting them as needed, then insert @new (if non-NULL).
 *
 * Returns 0 on success, or -ENOMEM if the extra node/policy needed to
 * split an existing node cannot be allocated.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	/* Spare node and policy, allocated on demand for the split case */
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		/* Fetch the successor first: n may be deleted below */
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* Old node begins inside the new range */
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Splitting needs a second node; if we have
				 * none yet, drop the lock, allocate, and
				 * start over.
				 */
				if (!n_new)
					goto alloc_new;

				/* Tail piece [end, n->end) copies n's policy */
				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				/* Head piece keeps n, truncated at start */
				n->end = start;
				sp_insert(sp, n_new);
				/* Ownership moved to the tree */
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Free any spare allocation that ended up unused */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* Reached with sp->lock held; release it before allocating */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	/* Give the spare policy a reference so err_out's mpol_put() can drop it */
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
2674 
2675 /**
2676  * mpol_shared_policy_init - initialize shared policy for inode
2677  * @sp: pointer to inode shared policy
2678  * @mpol:  struct mempolicy to install
2679  *
2680  * Install non-NULL @mpol in inode's shared policy rb-tree.
2681  * On entry, the current task has a reference on a non-NULL @mpol.
2682  * This must be released on exit.
2683  * This is called at get_inode() calls and we can use GFP_KERNEL.
2684  */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	/* With no @mpol the tree simply stays empty */
	if (mpol) {
		struct sp_node *sn;
		struct mempolicy *npol;
		NODEMASK_SCRATCH(scratch);

		/* Scratch space unavailable: leave the tree empty */
		if (!scratch)
			goto put_mpol;

		/* contextualize the tmpfs mount point mempolicy to this file */
		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(npol))
			goto free_scratch; /* no valid nodemask intersection */

		/* mpol_set_nodemask() is called with current's task_lock held */
		task_lock(current);
		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_npol;

		/* alloc node covering entire file; adds ref to file's npol */
		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
		/* On allocation failure the tree is left empty */
		if (sn)
			sp_insert(sp, sn);
put_npol:
		mpol_put(npol);	/* drop initial ref on file's npol */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
2723 
2724 int mpol_set_shared_policy(struct shared_policy *sp,
2725 			struct vm_area_struct *vma, struct mempolicy *pol)
2726 {
2727 	int err;
2728 	struct sp_node *new = NULL;
2729 	unsigned long sz = vma_pages(vma);
2730 
2731 	if (pol) {
2732 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
2733 		if (!new)
2734 			return -ENOMEM;
2735 	}
2736 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
2737 	if (err && new)
2738 		sp_free(new);
2739 	return err;
2740 }
2741 
2742 /* Free a backing policy store on inode delete. */
2743 void mpol_free_shared_policy(struct shared_policy *sp)
2744 {
2745 	struct sp_node *n;
2746 	struct rb_node *next;
2747 
2748 	if (!sp->root.rb_node)
2749 		return;
2750 	write_lock(&sp->lock);
2751 	next = rb_first(&sp->root);
2752 	while (next) {
2753 		n = rb_entry(next, struct sp_node, nd);
2754 		next = rb_next(&n->nd);
2755 		sp_delete(sp, n);
2756 	}
2757 	write_unlock(&sp->lock);
2758 }
2759 
2760 #ifdef CONFIG_NUMA_BALANCING
2761 static int __initdata numabalancing_override;
2762 
2763 static void __init check_numabalancing_enable(void)
2764 {
2765 	bool numabalancing_default = false;
2766 
2767 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2768 		numabalancing_default = true;
2769 
2770 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2771 	if (numabalancing_override)
2772 		set_numabalancing_state(numabalancing_override == 1);
2773 
2774 	if (num_online_nodes() > 1 && !numabalancing_override) {
2775 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2776 			numabalancing_default ? "Enabling" : "Disabling");
2777 		set_numabalancing_state(numabalancing_default);
2778 	}
2779 }
2780 
2781 static int __init setup_numabalancing(char *str)
2782 {
2783 	int ret = 0;
2784 	if (!str)
2785 		goto out;
2786 
2787 	if (!strcmp(str, "enable")) {
2788 		numabalancing_override = 1;
2789 		ret = 1;
2790 	} else if (!strcmp(str, "disable")) {
2791 		numabalancing_override = -1;
2792 		ret = 1;
2793 	}
2794 out:
2795 	if (!ret)
2796 		pr_warn("Unable to parse numa_balancing=\n");
2797 
2798 	return ret;
2799 }
2800 __setup("numa_balancing=", setup_numabalancing);
2801 #else
/* !CONFIG_NUMA_BALANCING: nothing to configure, so this is a no-op stub. */
static inline void __init check_numabalancing_enable(void)
{
}
2805 #endif /* CONFIG_NUMA_BALANCING */
2806 
2807 void __init numa_policy_init(void)
2808 {
2809 	nodemask_t interleave_nodes;
2810 	unsigned long largest = 0;
2811 	int nid, prefer = 0;
2812 
2813 	policy_cache = kmem_cache_create("numa_policy",
2814 					 sizeof(struct mempolicy),
2815 					 0, SLAB_PANIC, NULL);
2816 
2817 	sn_cache = kmem_cache_create("shared_policy_node",
2818 				     sizeof(struct sp_node),
2819 				     0, SLAB_PANIC, NULL);
2820 
2821 	for_each_node(nid) {
2822 		preferred_node_policy[nid] = (struct mempolicy) {
2823 			.refcnt = ATOMIC_INIT(1),
2824 			.mode = MPOL_PREFERRED,
2825 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2826 			.nodes = nodemask_of_node(nid),
2827 		};
2828 	}
2829 
2830 	/*
2831 	 * Set interleaving policy for system init. Interleaving is only
2832 	 * enabled across suitably sized nodes (default is >= 16MB), or
2833 	 * fall back to the largest node if they're all smaller.
2834 	 */
2835 	nodes_clear(interleave_nodes);
2836 	for_each_node_state(nid, N_MEMORY) {
2837 		unsigned long total_pages = node_present_pages(nid);
2838 
2839 		/* Preserve the largest node */
2840 		if (largest < total_pages) {
2841 			largest = total_pages;
2842 			prefer = nid;
2843 		}
2844 
2845 		/* Interleave this node? */
2846 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2847 			node_set(nid, interleave_nodes);
2848 	}
2849 
2850 	/* All too small, use the largest */
2851 	if (unlikely(nodes_empty(interleave_nodes)))
2852 		node_set(prefer, interleave_nodes);
2853 
2854 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2855 		pr_err("%s: interleaving failed\n", __func__);
2856 
2857 	check_numabalancing_enable();
2858 }
2859 
2860 /* Reset policy of current process to default */
void numa_default_policy(void)
{
	/* MPOL_DEFAULT, no flags, no nodemask; the result is not checked. */
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
2865 
2866 /*
2867  * Parse and format mempolicy from/to strings
2868  */
/*
 * Printable name of each policy mode, indexed by MPOL_* value.  Used by
 * mpol_parse_str() to recognise a mode and by mpol_to_str() to print one.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
2878 
2879 #ifdef CONFIG_TMPFS
2880 /**
2881  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2882  * @str:  string containing mempolicy to parse
2883  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2884  *
2885  * Format of input:
2886  *	<mode>[=<flags>][:<nodelist>]
2887  *
2888  * Return: %0 on success, else %1
2889  */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	/* Both separators are located up front, before @str is modified */
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	/* err stays 1 (failure) until parsing has fully succeeded */
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		/* Every requested node must be a node with memory */
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	/* @str now holds only the mode name */
	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			/* Only digits allowed: rejects lists and ranges */
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on a empty nodelist
		 */
		if (!nodelist)
			err = 0;
		/* Succeeds with new == NULL, so *mpol is set to NULL */
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" without a nodelist is stored as MPOL_LOCAL */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	/* *mpol is written only on success */
	if (!err)
		*mpol = new;
	return err;
}
3012 #endif /* CONFIG_TMPFS */
3013 
3014 /**
3015  * mpol_to_str - format a mempolicy structure for printing
3016  * @buffer:  to contain formatted mempolicy string
3017  * @maxlen:  length of @buffer
3018  * @pol:  pointer to mempolicy to be formatted
3019  *
3020  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3021  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3022  * longest flag, "relative", and to display at least a few node ids.
3023  */
3024 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3025 {
3026 	char *p = buffer;
3027 	nodemask_t nodes = NODE_MASK_NONE;
3028 	unsigned short mode = MPOL_DEFAULT;
3029 	unsigned short flags = 0;
3030 
3031 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3032 		mode = pol->mode;
3033 		flags = pol->flags;
3034 	}
3035 
3036 	switch (mode) {
3037 	case MPOL_DEFAULT:
3038 	case MPOL_LOCAL:
3039 		break;
3040 	case MPOL_PREFERRED:
3041 	case MPOL_PREFERRED_MANY:
3042 	case MPOL_BIND:
3043 	case MPOL_INTERLEAVE:
3044 		nodes = pol->nodes;
3045 		break;
3046 	default:
3047 		WARN_ON_ONCE(1);
3048 		snprintf(p, maxlen, "unknown");
3049 		return;
3050 	}
3051 
3052 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3053 
3054 	if (flags & MPOL_MODE_FLAGS) {
3055 		p += snprintf(p, buffer + maxlen - p, "=");
3056 
3057 		/*
3058 		 * Currently, the only defined flags are mutually exclusive
3059 		 */
3060 		if (flags & MPOL_F_STATIC_NODES)
3061 			p += snprintf(p, buffer + maxlen - p, "static");
3062 		else if (flags & MPOL_F_RELATIVE_NODES)
3063 			p += snprintf(p, buffer + maxlen - p, "relative");
3064 	}
3065 
3066 	if (!nodes_empty(nodes))
3067 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3068 			       nodemask_pr_args(&nodes));
3069 }
3070