xref: /linux/mm/mempolicy.c (revision d09560435cb712c9ec1e62b8a43a79b0af69fe77)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use the default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * The same applies to GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/*
 * Slab caches. policy_cache backs every struct mempolicy allocated and freed
 * in this file; sn_cache is not referenced in the code shown here.
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/*
 * One policy object per possible node, indexed by node id.  Consulted by
 * get_task_policy() for tasks that have no policy of their own; an entry
 * whose ->mode is still zero is treated as "not initialised yet".
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
128 
129 /**
130  * numa_map_to_online_node - Find closest online node
131  * @node: Node id to start the search
132  *
133  * Lookup the next closest node by distance if @nid is not online.
134  */
135 int numa_map_to_online_node(int node)
136 {
137 	int min_dist = INT_MAX, dist, n, min_node;
138 
139 	if (node == NUMA_NO_NODE || node_online(node))
140 		return node;
141 
142 	min_node = node;
143 	for_each_online_node(n) {
144 		dist = node_distance(node, n);
145 		if (dist < min_dist) {
146 			min_dist = dist;
147 			min_node = n;
148 		}
149 	}
150 
151 	return min_node;
152 }
153 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
154 
155 struct mempolicy *get_task_policy(struct task_struct *p)
156 {
157 	struct mempolicy *pol = p->mempolicy;
158 	int node;
159 
160 	if (pol)
161 		return pol;
162 
163 	node = numa_node_id();
164 	if (node != NUMA_NO_NODE) {
165 		pol = &preferred_node_policy[node];
166 		/* preferred_node_policy is not initialised early in boot */
167 		if (pol->mode)
168 			return pol;
169 	}
170 
171 	return &default_policy;
172 }
173 
/*
 * Per-mode callbacks; mpol_ops[] is indexed by the policy mode.
 *
 * @create: invoked by mpol_set_nodemask() with the contextualized nodemask
 *          to install it into @pol; returns 0 or a negative errno.
 * @rebind: invoked by mpol_rebind_policy() to adapt @pol to a new set of
 *          nodes.
 *
 * This is only the forward declaration; the table itself is defined
 * further down in this file.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
178 
179 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
180 {
181 	return pol->flags & MPOL_MODE_FLAGS;
182 }
183 
184 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
185 				   const nodemask_t *rel)
186 {
187 	nodemask_t tmp;
188 	nodes_fold(tmp, *orig, nodes_weight(*rel));
189 	nodes_onto(*ret, tmp, *rel);
190 }
191 
192 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
193 {
194 	if (nodes_empty(*nodes))
195 		return -EINVAL;
196 	pol->nodes = *nodes;
197 	return 0;
198 }
199 
200 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
201 {
202 	if (nodes_empty(*nodes))
203 		return -EINVAL;
204 
205 	nodes_clear(pol->nodes);
206 	node_set(first_node(*nodes), pol->nodes);
207 	return 0;
208 }
209 
210 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
211 {
212 	if (nodes_empty(*nodes))
213 		return -EINVAL;
214 	pol->nodes = *nodes;
215 	return 0;
216 }
217 
218 /*
219  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
220  * any, for the new policy.  mpol_new() has already validated the nodes
221  * parameter with respect to the policy mode and flags.
222  *
223  * Must be called holding task's alloc_lock to protect task's mems_allowed
224  * and mempolicy.  May also be called holding the mmap_lock for write.
225  */
226 static int mpol_set_nodemask(struct mempolicy *pol,
227 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
228 {
229 	int ret;
230 
231 	/*
232 	 * Default (pol==NULL) resp. local memory policies are not a
233 	 * subject of any remapping. They also do not need any special
234 	 * constructor.
235 	 */
236 	if (!pol || pol->mode == MPOL_LOCAL)
237 		return 0;
238 
239 	/* Check N_MEMORY */
240 	nodes_and(nsc->mask1,
241 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
242 
243 	VM_BUG_ON(!nodes);
244 
245 	if (pol->flags & MPOL_F_RELATIVE_NODES)
246 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
247 	else
248 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
249 
250 	if (mpol_store_user_nodemask(pol))
251 		pol->w.user_nodemask = *nodes;
252 	else
253 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
254 
255 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
256 	return ret;
257 }
258 
259 /*
260  * This function just creates a new policy, does some check and simple
261  * initialization. You must invoke mpol_set_nodemask() to set nodes.
262  */
263 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
264 				  nodemask_t *nodes)
265 {
266 	struct mempolicy *policy;
267 
268 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
269 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
270 
271 	if (mode == MPOL_DEFAULT) {
272 		if (nodes && !nodes_empty(*nodes))
273 			return ERR_PTR(-EINVAL);
274 		return NULL;
275 	}
276 	VM_BUG_ON(!nodes);
277 
278 	/*
279 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
280 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
281 	 * All other modes require a valid pointer to a non-empty nodemask.
282 	 */
283 	if (mode == MPOL_PREFERRED) {
284 		if (nodes_empty(*nodes)) {
285 			if (((flags & MPOL_F_STATIC_NODES) ||
286 			     (flags & MPOL_F_RELATIVE_NODES)))
287 				return ERR_PTR(-EINVAL);
288 
289 			mode = MPOL_LOCAL;
290 		}
291 	} else if (mode == MPOL_LOCAL) {
292 		if (!nodes_empty(*nodes) ||
293 		    (flags & MPOL_F_STATIC_NODES) ||
294 		    (flags & MPOL_F_RELATIVE_NODES))
295 			return ERR_PTR(-EINVAL);
296 	} else if (nodes_empty(*nodes))
297 		return ERR_PTR(-EINVAL);
298 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
299 	if (!policy)
300 		return ERR_PTR(-ENOMEM);
301 	atomic_set(&policy->refcnt, 1);
302 	policy->mode = mode;
303 	policy->flags = flags;
304 
305 	return policy;
306 }
307 
308 /* Slow path of a mpol destructor. */
309 void __mpol_put(struct mempolicy *p)
310 {
311 	if (!atomic_dec_and_test(&p->refcnt))
312 		return;
313 	kmem_cache_free(policy_cache, p);
314 }
315 
/*
 * ->rebind for modes that carry no nodemask to adjust (used for
 * MPOL_DEFAULT and MPOL_LOCAL in mpol_ops[]): intentionally a no-op.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
319 
320 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
321 {
322 	nodemask_t tmp;
323 
324 	if (pol->flags & MPOL_F_STATIC_NODES)
325 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
326 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
327 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328 	else {
329 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
330 								*nodes);
331 		pol->w.cpuset_mems_allowed = *nodes;
332 	}
333 
334 	if (nodes_empty(tmp))
335 		tmp = *nodes;
336 
337 	pol->nodes = tmp;
338 }
339 
/*
 * ->rebind for MPOL_PREFERRED: only record @nodes as the cpuset's allowed
 * mems; pol->nodes (the preferred node) is left untouched here.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
345 
346 /*
347  * mpol_rebind_policy - Migrate a policy to a different set of nodes
348  *
349  * Per-vma policies are protected by mmap_lock. Allocations using per-task
350  * policies are protected by task->mems_allowed_seq to prevent a premature
351  * OOM/allocation failure due to parallel nodemask modification.
352  */
353 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
354 {
355 	if (!pol)
356 		return;
357 	if (!mpol_store_user_nodemask(pol) &&
358 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
359 		return;
360 
361 	mpol_ops[pol->mode].rebind(pol, newmask);
362 }
363 
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * @tsk: task whose ->mempolicy is rebound (a NULL policy is tolerated by
 *       mpol_rebind_policy())
 * @new: the new set of allowed nodes
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
375 
376 /*
377  * Rebind each vma in mm to new nodemask.
378  *
379  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
380  */
381 
382 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
383 {
384 	struct vm_area_struct *vma;
385 
386 	mmap_write_lock(mm);
387 	for (vma = mm->mmap; vma; vma = vma->vm_next)
388 		mpol_rebind_policy(vma->vm_policy, new);
389 	mmap_write_unlock(mm);
390 }
391 
/*
 * Mode -> callbacks table.
 *
 * MPOL_DEFAULT and MPOL_LOCAL deliberately have no ->create:
 * mpol_set_nodemask() returns before calling ->create for a NULL policy
 * and for MPOL_LOCAL, and mpol_new() never allocates an MPOL_DEFAULT
 * policy.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
};
412 
413 static int migrate_page_add(struct page *page, struct list_head *pagelist,
414 				unsigned long flags);
415 
/*
 * Private state handed to the queue_pages_* page-walk callbacks through
 * walk->private.
 */
struct queue_pages {
	struct list_head *pagelist;	/* isolated pages are queued here */
	unsigned long flags;		/* MPOL_MF_* flags, incl. internal ones */
	nodemask_t *nmask;		/* nodes tested by queue_pages_required() */
	unsigned long start;		/* start of the range being walked */
	unsigned long end;		/* end of the range being walked */
	struct vm_area_struct *first;	/* first vma seen by test_walk, or NULL */
};
424 
425 /*
426  * Check if the page's nid is in qp->nmask.
427  *
428  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
429  * in the invert of qp->nmask.
430  */
431 static inline bool queue_pages_required(struct page *page,
432 					struct queue_pages *qp)
433 {
434 	int nid = page_to_nid(page);
435 	unsigned long flags = qp->flags;
436 
437 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
438 }
439 
440 /*
441  * queue_pages_pmd() has four possible return values:
442  * 0 - pages are placed on the right node or queued successfully, or
443  *     special page is met, i.e. huge zero page.
444  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
445  *     specified.
446  * 2 - THP was split.
447  * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
448  *        existing page was already on a node that does not follow the
449  *        policy.
450  */
451 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
452 				unsigned long end, struct mm_walk *walk)
453 	__releases(ptl)
454 {
455 	int ret = 0;
456 	struct page *page;
457 	struct queue_pages *qp = walk->private;
458 	unsigned long flags;
459 
460 	if (unlikely(is_pmd_migration_entry(*pmd))) {
461 		ret = -EIO;
462 		goto unlock;
463 	}
464 	page = pmd_page(*pmd);
465 	if (is_huge_zero_page(page)) {
466 		spin_unlock(ptl);
467 		walk->action = ACTION_CONTINUE;
468 		goto out;
469 	}
470 	if (!queue_pages_required(page, qp))
471 		goto unlock;
472 
473 	flags = qp->flags;
474 	/* go to thp migration */
475 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
476 		if (!vma_migratable(walk->vma) ||
477 		    migrate_page_add(page, qp->pagelist, flags)) {
478 			ret = 1;
479 			goto unlock;
480 		}
481 	} else
482 		ret = -EIO;
483 unlock:
484 	spin_unlock(ptl);
485 out:
486 	return ret;
487 }
488 
489 /*
490  * Scan through pages checking if pages follow certain conditions,
491  * and move them to the pagelist if they do.
492  *
493  * queue_pages_pte_range() has three possible return values:
494  * 0 - pages are placed on the right node or queued successfully, or
495  *     special page is met, i.e. zero page.
496  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
497  *     specified.
498  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
499  *        on a node that does not follow the policy.
500  */
501 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
502 			unsigned long end, struct mm_walk *walk)
503 {
504 	struct vm_area_struct *vma = walk->vma;
505 	struct page *page;
506 	struct queue_pages *qp = walk->private;
507 	unsigned long flags = qp->flags;
508 	int ret;
509 	bool has_unmovable = false;
510 	pte_t *pte, *mapped_pte;
511 	spinlock_t *ptl;
512 
513 	ptl = pmd_trans_huge_lock(pmd, vma);
514 	if (ptl) {
515 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
516 		if (ret != 2)
517 			return ret;
518 	}
519 	/* THP was split, fall through to pte walk */
520 
521 	if (pmd_trans_unstable(pmd))
522 		return 0;
523 
524 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
525 	for (; addr != end; pte++, addr += PAGE_SIZE) {
526 		if (!pte_present(*pte))
527 			continue;
528 		page = vm_normal_page(vma, addr, *pte);
529 		if (!page)
530 			continue;
531 		/*
532 		 * vm_normal_page() filters out zero pages, but there might
533 		 * still be PageReserved pages to skip, perhaps in a VDSO.
534 		 */
535 		if (PageReserved(page))
536 			continue;
537 		if (!queue_pages_required(page, qp))
538 			continue;
539 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
540 			/* MPOL_MF_STRICT must be specified if we get here */
541 			if (!vma_migratable(vma)) {
542 				has_unmovable = true;
543 				break;
544 			}
545 
546 			/*
547 			 * Do not abort immediately since there may be
548 			 * temporary off LRU pages in the range.  Still
549 			 * need migrate other LRU pages.
550 			 */
551 			if (migrate_page_add(page, qp->pagelist, flags))
552 				has_unmovable = true;
553 		} else
554 			break;
555 	}
556 	pte_unmap_unlock(mapped_pte, ptl);
557 	cond_resched();
558 
559 	if (has_unmovable)
560 		return 1;
561 
562 	return addr != end ? -EIO : 0;
563 }
564 
/*
 * hugetlb_entry callback: examine the huge page mapped by @pte and queue
 * it for migration when the walk's flags ask for that.
 *
 * Return values:
 * 0    - entry not present, page not selected by queue_pages_required(),
 *        or page handled (queued, or skipped because it is shared and only
 *        MPOL_MF_MOVE was given).
 * 1    - a selected page cannot be moved: the vma is not migratable, or
 *        isolation failed while MPOL_MF_STRICT is set.
 * -EIO - MPOL_MF_STRICT is the only MPOL_MF_VALID flag set and a selected
 *        (misplaced) page was found.
 *
 * Must not be reached without CONFIG_HUGETLB_PAGE (BUG()).
 */
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	/* only the user-visible flags matter for the decisions below */
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced page and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() have
		 * stopped walking current vma.
		 * Detecting misplaced page but allow migrating pages which
		 * have been queued.
		 */
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
		if (!isolate_huge_page(page, qp->pagelist) &&
			(flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate page but allow migrating pages
			 * which have been queued.
			 */
			ret = 1;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}
623 
#ifdef CONFIG_NUMA_BALANCING
/*
 * Mark the virtual address range [@addr, @end) of @vma inaccessible so
 * that later accesses raise NUMA hinting faults, which clear the marking
 * again.  Depending on these faults, pages may be migrated for better
 * NUMA placement.
 *
 * This assumes that NUMA faults are handled using PROT_NONE.  An
 * architecture that makes a different choice needs further changes to
 * the core.
 *
 * Returns the number of entries updated, as reported by
 * change_protection(); that number is also accounted as NUMA_PTE_UPDATES.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int updated = change_protection(vma, addr, end, PAGE_NONE,
					MM_CP_PROT_NUMA);

	if (!updated)
		return 0;

	count_vm_numa_events(NUMA_PTE_UPDATES, updated);
	return updated;
}
#else
/* Without NUMA balancing there is nothing to mark: always reports 0. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
652 
653 static int queue_pages_test_walk(unsigned long start, unsigned long end,
654 				struct mm_walk *walk)
655 {
656 	struct vm_area_struct *vma = walk->vma;
657 	struct queue_pages *qp = walk->private;
658 	unsigned long endvma = vma->vm_end;
659 	unsigned long flags = qp->flags;
660 
661 	/* range check first */
662 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
663 
664 	if (!qp->first) {
665 		qp->first = vma;
666 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
667 			(qp->start < vma->vm_start))
668 			/* hole at head side of range */
669 			return -EFAULT;
670 	}
671 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
672 		((vma->vm_end < qp->end) &&
673 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
674 		/* hole at middle or tail of range */
675 		return -EFAULT;
676 
677 	/*
678 	 * Need check MPOL_MF_STRICT to return -EIO if possible
679 	 * regardless of vma_migratable
680 	 */
681 	if (!vma_migratable(vma) &&
682 	    !(flags & MPOL_MF_STRICT))
683 		return 1;
684 
685 	if (endvma > end)
686 		endvma = end;
687 
688 	if (flags & MPOL_MF_LAZY) {
689 		/* Similar to task_numa_work, skip inaccessible VMAs */
690 		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
691 			!(vma->vm_flags & VM_MIXEDMAP))
692 			change_prot_numa(vma, start, endvma);
693 		return 1;
694 	}
695 
696 	/* queue pages from current vma */
697 	if (flags & MPOL_MF_VALID)
698 		return 0;
699 	return 1;
700 }
701 
/*
 * Page-walk callbacks used by queue_pages_range(): test_walk filters vmas,
 * pmd_entry handles regular (and transparent huge) mappings, hugetlb_entry
 * handles hugetlb mappings.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_pages_hugetlb,
	.pmd_entry		= queue_pages_pte_range,
	.test_walk		= queue_pages_test_walk,
};
707 
708 /*
709  * Walk through page tables and collect pages to be migrated.
710  *
711  * If pages found in a given range are on a set of nodes (determined by
712  * @nodes and @flags,) it's isolated and queued to the pagelist which is
713  * passed via @private.
714  *
715  * queue_pages_range() has three possible return values:
716  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
717  *     specified.
718  * 0 - queue pages successfully or no misplaced page.
719  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
720  *         memory range specified by nodemask and maxnode points outside
721  *         your accessible address space (-EFAULT)
722  */
723 static int
724 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
725 		nodemask_t *nodes, unsigned long flags,
726 		struct list_head *pagelist)
727 {
728 	int err;
729 	struct queue_pages qp = {
730 		.pagelist = pagelist,
731 		.flags = flags,
732 		.nmask = nodes,
733 		.start = start,
734 		.end = end,
735 		.first = NULL,
736 	};
737 
738 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
739 
740 	if (!qp.first)
741 		/* whole range in hole */
742 		err = -EFAULT;
743 
744 	return err;
745 }
746 
747 /*
748  * Apply policy to a single VMA
749  * This must be called with the mmap_lock held for writing.
750  */
751 static int vma_replace_policy(struct vm_area_struct *vma,
752 						struct mempolicy *pol)
753 {
754 	int err;
755 	struct mempolicy *old;
756 	struct mempolicy *new;
757 
758 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
759 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
760 		 vma->vm_ops, vma->vm_file,
761 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
762 
763 	new = mpol_dup(pol);
764 	if (IS_ERR(new))
765 		return PTR_ERR(new);
766 
767 	if (vma->vm_ops && vma->vm_ops->set_policy) {
768 		err = vma->vm_ops->set_policy(vma, new);
769 		if (err)
770 			goto err_out;
771 	}
772 
773 	old = vma->vm_policy;
774 	vma->vm_policy = new; /* protected by mmap_lock */
775 	mpol_put(old);
776 
777 	return 0;
778  err_out:
779 	mpol_put(new);
780 	return err;
781 }
782 
/*
 * Step 2: apply policy to a range and do splits.
 *
 * Installs @new_pol on every vma of @mm that intersects [@start, @end).
 * For each such vma whose policy differs from @new_pol, first try to merge
 * the affected part with its neighbours via vma_merge(); if that is not
 * possible, split the vma at the range boundaries so that exactly the
 * affected part gets the new policy.
 *
 * A vma must exist at or after @start (VM_BUG_ON otherwise).
 *
 * Returns 0 on success or the first error from split_vma() /
 * vma_replace_policy(); vmas processed before the failure keep the new
 * policy.
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	VM_BUG_ON(!vma);

	/*
	 * If the range starts inside the first vma, that vma itself is the
	 * "previous" area for merging purposes.
	 */
	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		/* sample next now: merge/split below may change vma */
		next = vma->vm_next;
		/* part of this vma that lies inside the requested range */
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		/* file offset corresponding to vmstart */
		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			/* merged: continue from the resulting vma */
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		/* no merge possible: carve out [vmstart, vmend) */
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
842 
843 /* Set the process memory policy */
844 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
845 			     nodemask_t *nodes)
846 {
847 	struct mempolicy *new, *old;
848 	NODEMASK_SCRATCH(scratch);
849 	int ret;
850 
851 	if (!scratch)
852 		return -ENOMEM;
853 
854 	new = mpol_new(mode, flags, nodes);
855 	if (IS_ERR(new)) {
856 		ret = PTR_ERR(new);
857 		goto out;
858 	}
859 
860 	if (flags & MPOL_F_NUMA_BALANCING) {
861 		if (new && new->mode == MPOL_BIND) {
862 			new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
863 		} else {
864 			ret = -EINVAL;
865 			mpol_put(new);
866 			goto out;
867 		}
868 	}
869 
870 	ret = mpol_set_nodemask(new, nodes, scratch);
871 	if (ret) {
872 		mpol_put(new);
873 		goto out;
874 	}
875 	task_lock(current);
876 	old = current->mempolicy;
877 	current->mempolicy = new;
878 	if (new && new->mode == MPOL_INTERLEAVE)
879 		current->il_prev = MAX_NUMNODES-1;
880 	task_unlock(current);
881 	mpol_put(old);
882 	ret = 0;
883 out:
884 	NODEMASK_SCRATCH_FREE(scratch);
885 	return ret;
886 }
887 
888 /*
889  * Return nodemask for policy for get_mempolicy() query
890  *
891  * Called with task's alloc_lock held
892  */
893 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
894 {
895 	nodes_clear(*nodes);
896 	if (p == &default_policy)
897 		return;
898 
899 	switch (p->mode) {
900 	case MPOL_BIND:
901 	case MPOL_INTERLEAVE:
902 	case MPOL_PREFERRED:
903 		*nodes = p->nodes;
904 		break;
905 	case MPOL_LOCAL:
906 		/* return empty node mask for local allocation */
907 		break;
908 	default:
909 		BUG();
910 	}
911 }
912 
913 static int lookup_node(struct mm_struct *mm, unsigned long addr)
914 {
915 	struct page *p = NULL;
916 	int err;
917 
918 	int locked = 1;
919 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
920 	if (err > 0) {
921 		err = page_to_nid(p);
922 		put_page(p);
923 	}
924 	if (locked)
925 		mmap_read_unlock(mm);
926 	return err;
927 }
928 
/*
 * Retrieve NUMA policy
 *
 * @policy: out - policy mode (plus MPOL_MODE_FLAGS), or a node id when
 *          MPOL_F_NODE is set
 * @nmask:  out, may be NULL - nodemask associated with the policy
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set
 * @flags:  combination of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for invalid flag/addr combinations,
 * -EFAULT if MPOL_F_ADDR is set and no vma contains @addr, or the error
 * from lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	/* MPOL_F_MEMS_ALLOWED is exclusive: just report the allowed mems */
	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_lock, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			/* also tells the exit path not to unlock again */
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			/* report the node the page at addr lives on */
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* report the next node of the task's interleave policy */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			/* hand back the nodemask exactly as the user gave it */
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	/* conditional put, see mpol_cond_put() */
	mpol_cond_put(pol);
	/* vma is non-NULL only while we still hold the mmap_lock */
	if (vma)
		mmap_read_unlock(mm);
	/* drop the extra reference taken around lookup_node() */
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1025 
1026 #ifdef CONFIG_MIGRATION
1027 /*
1028  * page migration, thp tail pages can be passed.
1029  */
1030 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1031 				unsigned long flags)
1032 {
1033 	struct page *head = compound_head(page);
1034 	/*
1035 	 * Avoid migrating a page that is shared with others.
1036 	 */
1037 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1038 		if (!isolate_lru_page(head)) {
1039 			list_add_tail(&head->lru, pagelist);
1040 			mod_node_page_state(page_pgdat(head),
1041 				NR_ISOLATED_ANON + page_is_file_lru(head),
1042 				thp_nr_pages(head));
1043 		} else if (flags & MPOL_MF_STRICT) {
1044 			/*
1045 			 * Non-movable page may reach here.  And, there may be
1046 			 * temporary off LRU pages or non-LRU movable pages.
1047 			 * Treat them as unmovable pages since they can't be
1048 			 * isolated, so they can't be moved at the moment.  It
1049 			 * should return -EIO for this case too.
1050 			 */
1051 			return -EIO;
1052 		}
1053 	}
1054 
1055 	return 0;
1056 }
1057 
1058 /*
1059  * Migrate pages from one node to a target node.
1060  * Returns error or the number of pages not migrated.
1061  */
1062 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1063 			   int flags)
1064 {
1065 	nodemask_t nmask;
1066 	LIST_HEAD(pagelist);
1067 	int err = 0;
1068 	struct migration_target_control mtc = {
1069 		.nid = dest,
1070 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1071 	};
1072 
1073 	nodes_clear(nmask);
1074 	node_set(source, nmask);
1075 
1076 	/*
1077 	 * This does not "check" the range but isolates all pages that
1078 	 * need migration.  Between passing in the full user address
1079 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1080 	 */
1081 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1082 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1083 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1084 
1085 	if (!list_empty(&pagelist)) {
1086 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1087 				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1088 		if (err)
1089 			putback_movable_pages(&pagelist);
1090 	}
1091 
1092 	return err;
1093 }
1094 
1095 /*
1096  * Move pages between the two nodesets so as to preserve the physical
1097  * layout as much as possible.
1098  *
 * Runs with the LRU cache disabled and @mm's mmap lock held for reading;
 * both are released again before returning.
 *
1099  * Returns the number of pages that could not be moved, or the negative
 * error reported by migrate_to_node(), which also stops the loop.
1100  */
1101 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1102 		     const nodemask_t *to, int flags)
1103 {
1104 	int busy = 0;
1105 	int err = 0;
1106 	nodemask_t tmp;
1107 
1108 	lru_cache_disable();
1109 
1110 	mmap_read_lock(mm);
1111 
1112 	/*
1113 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1114 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1115 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1116 	 * The pair of nodemasks 'to' and 'from' define the map.
1117 	 *
1118 	 * If no pair of bits is found that way, fallback to picking some
1119 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1120 	 * 'source' and 'dest' bits are the same, this represents a node
1121 	 * that will be migrating to itself, so no pages need move.
1122 	 *
1123 	 * If no bits are left in 'tmp', or if all remaining bits left
1124 	 * in 'tmp' correspond to the same bit in 'to', return false
1125 	 * (nothing left to migrate).
1126 	 *
1127 	 * This lets us pick a pair of nodes to migrate between, such that
1128 	 * if possible the dest node is not already occupied by some other
1129 	 * source node, minimizing the risk of overloading the memory on a
1130 	 * node that would happen if we migrated incoming memory to a node
1131 	 * before migrating outgoing memory from that same node.
1132 	 *
1133 	 * A single scan of tmp is sufficient.  As we go, we remember the
1134 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1135 	 * that not only moved, but what's better, moved to an empty slot
1136 	 * (d is not set in tmp), then we break out then, with that pair.
1137 	 * Otherwise when we finish scanning tmp, we at least have the
1138 	 * most recent <s, d> pair that moved.  If we get all the way through
1139 	 * the scan of tmp without finding any node that moved, much less
1140 	 * moved to an empty node, then there is nothing left worth migrating.
1141 	 */
1142 
1143 	tmp = *from;
1144 	while (!nodes_empty(tmp)) {
1145 		int s, d;
1146 		int source = NUMA_NO_NODE;
1147 		int dest = 0;
1148 
1149 		for_each_node_mask(s, tmp) {
1150 
1151 			/*
1152 			 * do_migrate_pages() tries to maintain the relative
1153 			 * node relationship of the pages established between
1154 			 * threads and memory areas.
1155 			 *
1156 			 * However if the number of source nodes is not equal to
1157 			 * the number of destination nodes we can not preserve
1158 			 * this node relative relationship.  In that case, skip
1159 			 * copying memory from a node that is in the destination
1160 			 * mask.
1161 			 *
1162 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1163 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1164 			 */
1165 
1166 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1167 						(node_isset(s, *to)))
1168 				continue;
1169 
1170 			d = node_remap(s, *from, *to);
1171 			if (s == d)
1172 				continue;
1173 
1174 			source = s;	/* Node moved. Memorize */
1175 			dest = d;
1176 
1177 			/* dest not in remaining from nodes? */
1178 			if (!node_isset(dest, tmp))
1179 				break;
1180 		}
		/* No remaining node maps to a different node: done. */
1181 		if (source == NUMA_NO_NODE)
1182 			break;
1183 
1184 		node_clear(source, tmp);
1185 		err = migrate_to_node(mm, source, dest, flags);
		/* Positive values are counts of unmoved pages; accumulate them. */
1186 		if (err > 0)
1187 			busy += err;
1188 		if (err < 0)
1189 			break;
1190 	}
1191 	mmap_read_unlock(mm);
1192 
1193 	lru_cache_enable();
1194 	if (err < 0)
1195 		return err;
1196 	return busy;
1197 
1198 }
1199 
1200 /*
1201  * Allocate a new page for page migration based on vma policy.
1202  * Start by assuming the page is mapped by the same vma as contains @start.
1203  * Search forward from there, if not.  N.B., this assumes that the
1204  * list of pages handed to migrate_pages()--which is how we get here--
1205  * is in virtual address order.
 *
 * Hugetlb pages, transparent huge pages and ordinary pages each use their
 * own allocator below.  Returns the new page; the THP branch returns NULL
 * when its allocation fails.
1206  */
1207 static struct page *new_page(struct page *page, unsigned long start)
1208 {
1209 	struct vm_area_struct *vma;
1210 	unsigned long address;
1211 
	/*
	 * Walk the vma list until one is found in which @page has an
	 * address; vma ends up NULL when none does.
	 * NOTE(review): if find_vma() already returns NULL the loop body
	 * never runs and @address is passed on without having been
	 * assigned; presumably the !vma paths ignore it -- confirm.
	 */
1212 	vma = find_vma(current->mm, start);
1213 	while (vma) {
1214 		address = page_address_in_vma(page, vma);
1215 		if (address != -EFAULT)
1216 			break;
1217 		vma = vma->vm_next;
1218 	}
1219 
1220 	if (PageHuge(page)) {
1221 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1222 				vma, address);
1223 	} else if (PageTransHuge(page)) {
1224 		struct page *thp;
1225 
1226 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1227 					 HPAGE_PMD_ORDER);
1228 		if (!thp)
1229 			return NULL;
1230 		prep_transhuge_page(thp);
1231 		return thp;
1232 	}
1233 	/*
1234 	 * if !vma, alloc_page_vma() will use task or system default policy
1235 	 */
1236 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1237 			vma, address);
1238 }
1239 #else
1240 
/*
 * !CONFIG_MIGRATION stub: no page can be queued for migration, so every
 * request fails with -EIO.
 */
1241 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1242 				unsigned long flags)
1243 {
1244 	return -EIO;
1245 }
1246 
/* !CONFIG_MIGRATION stub: migrating between node sets is not available. */
1247 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1248 		     const nodemask_t *to, int flags)
1249 {
1250 	return -ENOSYS;
1251 }
1252 
/* !CONFIG_MIGRATION stub: never provides a replacement page. */
1253 static struct page *new_page(struct page *page, unsigned long start)
1254 {
1255 	return NULL;
1256 }
1257 #endif
1258 
/*
 * Apply a memory policy to the range [@start, @start + @len) of the
 * current task's address space.
 *
 * @start:      page-aligned start address (-EINVAL otherwise)
 * @len:        length in bytes; rounded up to a multiple of PAGE_SIZE
 * @mode:       policy mode handed to mpol_new()
 * @mode_flags: policy mode flags handed to mpol_new()
 * @nmask:      nodes for the policy
 * @flags:      MPOL_MF_* flags; anything outside MPOL_MF_VALID is -EINVAL,
 *              and MPOL_MF_MOVE_ALL needs CAP_SYS_NICE (-EPERM)
 *
 * Returns 0 on success (also for an empty range), -EINVAL when the range
 * wraps, -ENOMEM when no nodemask scratch space is available, -EIO when
 * queue_pages_range() returned a positive value or, under MPOL_MF_STRICT,
 * when migrate_pages() left pages behind, or whatever error mpol_new(),
 * mpol_set_nodemask(), queue_pages_range() or mbind_range() reported.
 */
1259 static long do_mbind(unsigned long start, unsigned long len,
1260 		     unsigned short mode, unsigned short mode_flags,
1261 		     nodemask_t *nmask, unsigned long flags)
1262 {
1263 	struct mm_struct *mm = current->mm;
1264 	struct mempolicy *new;
1265 	unsigned long end;
1266 	int err;
1267 	int ret;
1268 	LIST_HEAD(pagelist);
1269 
1270 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1271 		return -EINVAL;
1272 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1273 		return -EPERM;
1274 
1275 	if (start & ~PAGE_MASK)
1276 		return -EINVAL;
1277 
	/* Strict checking is dropped when resetting to the default policy. */
1278 	if (mode == MPOL_DEFAULT)
1279 		flags &= ~MPOL_MF_STRICT;
1280 
1281 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1282 	end = start + len;
1283 
	/* end < start means the range wrapped around the address space. */
1284 	if (end < start)
1285 		return -EINVAL;
1286 	if (end == start)
1287 		return 0;
1288 
1289 	new = mpol_new(mode, mode_flags, nmask);
1290 	if (IS_ERR(new))
1291 		return PTR_ERR(new);
1292 
	/*
	 * NOTE(review): 'new' can be NULL at this point (see the !new test
	 * just below), so this dereference is only safe if MPOL_MF_LAZY can
	 * never survive the MPOL_MF_VALID check above -- confirm.
	 */
1293 	if (flags & MPOL_MF_LAZY)
1294 		new->flags |= MPOL_F_MOF;
1295 
1296 	/*
1297 	 * If we are using the default policy then operation
1298 	 * on discontinuous address spaces is okay after all
1299 	 */
1300 	if (!new)
1301 		flags |= MPOL_MF_DISCONTIG_OK;
1302 
1303 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1304 		 start, start + len, mode, mode_flags,
1305 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1306 
	/* Paired with the lru_cache_enable() at mpol_out. */
1307 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1308 
1309 		lru_cache_disable();
1310 	}
	/*
	 * On success this block leaves the mmap write lock held; on either
	 * failure (no scratch space, mpol_set_nodemask() error) it is not
	 * held when control reaches mpol_out.
	 */
1311 	{
1312 		NODEMASK_SCRATCH(scratch);
1313 		if (scratch) {
1314 			mmap_write_lock(mm);
1315 			err = mpol_set_nodemask(new, nmask, scratch);
1316 			if (err)
1317 				mmap_write_unlock(mm);
1318 		} else
1319 			err = -ENOMEM;
1320 		NODEMASK_SCRATCH_FREE(scratch);
1321 	}
1322 	if (err)
1323 		goto mpol_out;
1324 
1325 	ret = queue_pages_range(mm, start, end, nmask,
1326 			  flags | MPOL_MF_INVERT, &pagelist);
1327 
1328 	if (ret < 0) {
1329 		err = ret;
1330 		goto up_out;
1331 	}
1332 
1333 	err = mbind_range(mm, start, end, new);
1334 
1335 	if (!err) {
1336 		int nr_failed = 0;
1337 
1338 		if (!list_empty(&pagelist)) {
1339 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1340 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1341 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1342 			if (nr_failed)
1343 				putback_movable_pages(&pagelist);
1344 		}
1345 
1346 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1347 			err = -EIO;
1348 	} else {
		/* Also entered by goto when queue_pages_range() failed. */
1349 up_out:
1350 		if (!list_empty(&pagelist))
1351 			putback_movable_pages(&pagelist);
1352 	}
1353 
1354 	mmap_write_unlock(mm);
1355 mpol_out:
1356 	mpol_put(new);
1357 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1358 		lru_cache_enable();
1359 	return err;
1360 }
1361 
1362 /*
1363  * User space interface with variable sized bitmaps for nodelists.
1364  */
1365 
1366 /*
 * Copy a node mask from user space.
 *
 * @nodes:   kernel nodemask to fill; it is always cleared first
 * @nmask:   user bitmap, may be NULL
 * @maxnode: bit count supplied by the caller; decremented once before use
 *
 * Returns 0 on success, which includes a NULL @nmask or a decremented
 * @maxnode of 0 (both leave @nodes empty); -EINVAL when the decremented
 * @maxnode exceeds PAGE_SIZE * BITS_PER_BYTE or when the user bitmap has a
 * bit set at or above MAX_NUMNODES; -EFAULT when the user memory cannot be
 * read.
 */
1367 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1368 		     unsigned long maxnode)
1369 {
1370 	unsigned long k;
1371 	unsigned long t;
1372 	unsigned long nlongs;
1373 	unsigned long endmask;
1374 
1375 	--maxnode;
1376 	nodes_clear(*nodes);
1377 	if (maxnode == 0 || !nmask)
1378 		return 0;
1379 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1380 		return -EINVAL;
1381 
	/* endmask selects the bits of the last user long that are in range. */
1382 	nlongs = BITS_TO_LONGS(maxnode);
1383 	if ((maxnode % BITS_PER_LONG) == 0)
1384 		endmask = ~0UL;
1385 	else
1386 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1387 
1388 	/*
1389 	 * When the user specified more nodes than supported just check
1390 	 * if the non supported part is all zero.
1391 	 *
1392 	 * If maxnode have more longs than MAX_NUMNODES, check
1393 	 * the bits in that area first. And then go through to
1394 	 * check the rest bits which equal or bigger than MAX_NUMNODES.
1395 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1396 	 */
1397 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1398 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1399 			if (get_user(t, nmask + k))
1400 				return -EFAULT;
1401 			if (k == nlongs - 1) {
1402 				if (t & endmask)
1403 					return -EINVAL;
1404 			} else if (t)
1405 				return -EINVAL;
1406 		}
		/* From here on only the longs the kernel mask can hold matter. */
1407 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1408 		endmask = ~0UL;
1409 	}
1410 
	/*
	 * MAX_NUMNODES ends in the middle of a long: the in-range bits of
	 * that long at or above MAX_NUMNODES must be clear as well.
	 */
1411 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1412 		unsigned long valid_mask = endmask;
1413 
1414 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1415 		if (get_user(t, nmask + nlongs - 1))
1416 			return -EFAULT;
1417 		if (t & valid_mask)
1418 			return -EINVAL;
1419 	}
1420 
1421 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1422 		return -EFAULT;
1423 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1424 	return 0;
1425 }
1426 
1427 /* Copy a kernel node mask to user space */
1428 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1429 			      nodemask_t *nodes)
1430 {
1431 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1432 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1433 
1434 	if (copy > nbytes) {
1435 		if (copy > PAGE_SIZE)
1436 			return -EINVAL;
1437 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1438 			return -EFAULT;
1439 		copy = nbytes;
1440 	}
1441 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1442 }
1443 
1444 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1445 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1446 {
1447 	*flags = *mode & MPOL_MODE_FLAGS;
1448 	*mode &= ~MPOL_MODE_FLAGS;
1449 	if ((unsigned int)(*mode) >= MPOL_MAX)
1450 		return -EINVAL;
1451 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1452 		return -EINVAL;
1453 
1454 	return 0;
1455 }
1456 
1457 static long kernel_mbind(unsigned long start, unsigned long len,
1458 			 unsigned long mode, const unsigned long __user *nmask,
1459 			 unsigned long maxnode, unsigned int flags)
1460 {
1461 	unsigned short mode_flags;
1462 	nodemask_t nodes;
1463 	int lmode = mode;
1464 	int err;
1465 
1466 	start = untagged_addr(start);
1467 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1468 	if (err)
1469 		return err;
1470 
1471 	err = get_nodes(&nodes, nmask, maxnode);
1472 	if (err)
1473 		return err;
1474 
1475 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1476 }
1477 
/* mbind system call entry point: passes its arguments straight to kernel_mbind(). */
1478 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1479 		unsigned long, mode, const unsigned long __user *, nmask,
1480 		unsigned long, maxnode, unsigned int, flags)
1481 {
1482 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1483 }
1484 
1485 /* Set the process memory policy */
1486 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1487 				 unsigned long maxnode)
1488 {
1489 	unsigned short mode_flags;
1490 	nodemask_t nodes;
1491 	int lmode = mode;
1492 	int err;
1493 
1494 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1495 	if (err)
1496 		return err;
1497 
1498 	err = get_nodes(&nodes, nmask, maxnode);
1499 	if (err)
1500 		return err;
1501 
1502 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1503 }
1504 
/* set_mempolicy system call entry point: passes its arguments straight to kernel_set_mempolicy(). */
1505 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1506 		unsigned long, maxnode)
1507 {
1508 	return kernel_set_mempolicy(mode, nmask, maxnode);
1509 }
1510 
/*
 * Common implementation of the migrate_pages system call.
 *
 * @pid:       task whose pages are to be moved; 0 selects the current task
 * @maxnode:   size of both user bitmaps, as interpreted by get_nodes()
 * @old_nodes: user bitmap of the nodes to move pages away from
 * @new_nodes: user bitmap of the nodes to move pages to
 *
 * Returns the result of do_migrate_pages(), or a negative errno:
 * -ENOMEM without nodemask scratch space, -ESRCH when no task matches
 * @pid, -EPERM when ptrace_may_access() refuses or when the new nodes are
 * not a subset of the task's cpuset_mems_allowed() and the caller lacks
 * CAP_SYS_NICE, -EINVAL when the new nodes do not intersect the caller's
 * own cpuset_mems_allowed() or the task has no mm, and otherwise whatever
 * get_nodes() or security_task_movememory() reported.
 */
1511 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1512 				const unsigned long __user *old_nodes,
1513 				const unsigned long __user *new_nodes)
1514 {
1515 	struct mm_struct *mm = NULL;
1516 	struct task_struct *task;
1517 	nodemask_t task_nodes;
1518 	int err;
1519 	nodemask_t *old;
1520 	nodemask_t *new;
1521 	NODEMASK_SCRATCH(scratch);
1522 
1523 	if (!scratch)
1524 		return -ENOMEM;
1525 
1526 	old = &scratch->mask1;
1527 	new = &scratch->mask2;
1528 
1529 	err = get_nodes(old, old_nodes, maxnode);
1530 	if (err)
1531 		goto out;
1532 
1533 	err = get_nodes(new, new_nodes, maxnode);
1534 	if (err)
1535 		goto out;
1536 
1537 	/* Find the mm_struct */
1538 	rcu_read_lock();
1539 	task = pid ? find_task_by_vpid(pid) : current;
1540 	if (!task) {
1541 		rcu_read_unlock();
1542 		err = -ESRCH;
1543 		goto out;
1544 	}
	/* From here on every exit must drop this task reference. */
1545 	get_task_struct(task);
1546 
	/* Default error for the 'goto out_put' below that does not set one. */
1547 	err = -EINVAL;
1548 
1549 	/*
1550 	 * Check if this process has the right to modify the specified process.
1551 	 * Use the regular "ptrace_may_access()" checks.
1552 	 */
1553 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1554 		rcu_read_unlock();
1555 		err = -EPERM;
1556 		goto out_put;
1557 	}
1558 	rcu_read_unlock();
1559 
1560 	task_nodes = cpuset_mems_allowed(task);
1561 	/* Is the user allowed to access the target nodes? */
1562 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1563 		err = -EPERM;
1564 		goto out_put;
1565 	}
1566 
	/* Restrict the targets to nodes the caller itself may use. */
1567 	task_nodes = cpuset_mems_allowed(current);
1568 	nodes_and(*new, *new, task_nodes);
1569 	if (nodes_empty(*new))
1570 		goto out_put;
1571 
1572 	err = security_task_movememory(task);
1573 	if (err)
1574 		goto out_put;
1575 
	/* The task reference is dropped here, hence 'out' and not 'out_put' below. */
1576 	mm = get_task_mm(task);
1577 	put_task_struct(task);
1578 
1579 	if (!mm) {
1580 		err = -EINVAL;
1581 		goto out;
1582 	}
1583 
1584 	err = do_migrate_pages(mm, old, new,
1585 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1586 
1587 	mmput(mm);
1588 out:
1589 	NODEMASK_SCRATCH_FREE(scratch);
1590 
1591 	return err;
1592 
1593 out_put:
1594 	put_task_struct(task);
1595 	goto out;
1596 
1597 }
1598 
/* migrate_pages system call entry point: passes its arguments straight to kernel_migrate_pages(). */
1599 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1600 		const unsigned long __user *, old_nodes,
1601 		const unsigned long __user *, new_nodes)
1602 {
1603 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1604 }
1605 
1606 
1607 /* Retrieve NUMA policy */
1608 static int kernel_get_mempolicy(int __user *policy,
1609 				unsigned long __user *nmask,
1610 				unsigned long maxnode,
1611 				unsigned long addr,
1612 				unsigned long flags)
1613 {
1614 	int err;
1615 	int pval;
1616 	nodemask_t nodes;
1617 
1618 	if (nmask != NULL && maxnode < nr_node_ids)
1619 		return -EINVAL;
1620 
1621 	addr = untagged_addr(addr);
1622 
1623 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1624 
1625 	if (err)
1626 		return err;
1627 
1628 	if (policy && put_user(pval, policy))
1629 		return -EFAULT;
1630 
1631 	if (nmask)
1632 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1633 
1634 	return err;
1635 }
1636 
/* get_mempolicy system call entry point: passes its arguments straight to kernel_get_mempolicy(). */
1637 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1638 		unsigned long __user *, nmask, unsigned long, maxnode,
1639 		unsigned long, addr, unsigned long, flags)
1640 {
1641 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1642 }
1643 
1644 #ifdef CONFIG_COMPAT
1645 
1646 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1647 		       compat_ulong_t __user *, nmask,
1648 		       compat_ulong_t, maxnode,
1649 		       compat_ulong_t, addr, compat_ulong_t, flags)
1650 {
1651 	long err;
1652 	unsigned long __user *nm = NULL;
1653 	unsigned long nr_bits, alloc_size;
1654 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1655 
1656 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1657 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1658 
1659 	if (nmask)
1660 		nm = compat_alloc_user_space(alloc_size);
1661 
1662 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1663 
1664 	if (!err && nmask) {
1665 		unsigned long copy_size;
1666 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1667 		err = copy_from_user(bm, nm, copy_size);
1668 		/* ensure entire bitmap is zeroed */
1669 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1670 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1671 	}
1672 
1673 	return err;
1674 }
1675 
/*
 * Compat set_mempolicy entry point.
 *
 * At most MAX_NUMNODES bits of the caller's compat bitmap are read into a
 * kernel bitmap, written back as native longs into a scratch area obtained
 * from compat_alloc_user_space(), and that scratch area is what
 * kernel_set_mempolicy() is given.  A NULL @nmask is passed on as NULL.
 *
 * Returns -EFAULT when either copy fails, otherwise the result of
 * kernel_set_mempolicy().
 */
1676 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1677 		       compat_ulong_t, maxnode)
1678 {
1679 	unsigned long __user *nm = NULL;
1680 	unsigned long nr_bits, alloc_size;
1681 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1682 
1683 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1684 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1685 
1686 	if (nmask) {
1687 		if (compat_get_bitmap(bm, nmask, nr_bits))
1688 			return -EFAULT;
1689 		nm = compat_alloc_user_space(alloc_size);
1690 		if (copy_to_user(nm, bm, alloc_size))
1691 			return -EFAULT;
1692 	}
1693 
	/* nr_bits + 1 because get_nodes() decrements maxnode before using it. */
1694 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1695 }
1696 
/*
 * Compat mbind entry point.
 *
 * At most MAX_NUMNODES bits of the caller's compat bitmap are read into a
 * kernel nodemask, written back as native longs into a scratch area
 * obtained from compat_alloc_user_space(), and that scratch area is what
 * kernel_mbind() is given.  A NULL @nmask is passed on as NULL.
 *
 * Returns -EFAULT when either copy fails, otherwise the result of
 * kernel_mbind().
 */
1697 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1698 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1699 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1700 {
1701 	unsigned long __user *nm = NULL;
1702 	unsigned long nr_bits, alloc_size;
1703 	nodemask_t bm;
1704 
1705 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1706 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1707 
1708 	if (nmask) {
1709 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1710 			return -EFAULT;
1711 		nm = compat_alloc_user_space(alloc_size);
1712 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1713 			return -EFAULT;
1714 	}
1715 
	/* nr_bits + 1 because get_nodes() decrements maxnode before using it. */
1716 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1717 }
1718 
/*
 * Compat migrate_pages entry point.
 *
 * Each compat bitmap that was supplied is read (at most MAX_NUMNODES bits)
 * into a kernel nodemask and written back as native longs into scratch
 * space obtained from compat_alloc_user_space().  When both bitmaps are
 * given, one allocation of twice the size is made and the new-nodes copy
 * is placed directly behind the old-nodes copy.
 *
 * Returns -EFAULT when any copy fails, otherwise the result of
 * kernel_migrate_pages().
 */
1719 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1720 		       compat_ulong_t, maxnode,
1721 		       const compat_ulong_t __user *, old_nodes,
1722 		       const compat_ulong_t __user *, new_nodes)
1723 {
1724 	unsigned long __user *old = NULL;
1725 	unsigned long __user *new = NULL;
1726 	nodemask_t tmp_mask;
1727 	unsigned long nr_bits;
1728 	unsigned long size;
1729 
1730 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1731 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1732 	if (old_nodes) {
1733 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1734 			return -EFAULT;
1735 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
		/* size is in bytes; step the pointer in units of long. */
1736 		if (new_nodes)
1737 			new = old + size / sizeof(unsigned long);
1738 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1739 			return -EFAULT;
1740 	}
1741 	if (new_nodes) {
1742 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1743 			return -EFAULT;
		/* Only reached with new == NULL when no old_nodes were given. */
1744 		if (new == NULL)
1745 			new = compat_alloc_user_space(size);
1746 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1747 			return -EFAULT;
1748 	}
1749 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1750 }
1751 
1752 #endif /* CONFIG_COMPAT */
1753 
1754 bool vma_migratable(struct vm_area_struct *vma)
1755 {
1756 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1757 		return false;
1758 
1759 	/*
1760 	 * DAX device mappings require predictable access latency, so avoid
1761 	 * incurring periodic faults.
1762 	 */
1763 	if (vma_is_dax(vma))
1764 		return false;
1765 
1766 	if (is_vm_hugetlb_page(vma) &&
1767 		!hugepage_migration_supported(hstate_vma(vma)))
1768 		return false;
1769 
1770 	/*
1771 	 * Migration allocates pages in the highest zone. If we cannot
1772 	 * do so then migration (at least from node to node) is not
1773 	 * possible.
1774 	 */
1775 	if (vma->vm_file &&
1776 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1777 			< policy_zone)
1778 		return false;
1779 	return true;
1780 }
1781 
1782 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1783 						unsigned long addr)
1784 {
1785 	struct mempolicy *pol = NULL;
1786 
1787 	if (vma) {
1788 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1789 			pol = vma->vm_ops->get_policy(vma, addr);
1790 		} else if (vma->vm_policy) {
1791 			pol = vma->vm_policy;
1792 
1793 			/*
1794 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1795 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1796 			 * count on these policies which will be dropped by
1797 			 * mpol_cond_put() later
1798 			 */
1799 			if (mpol_needs_cond_ref(pol))
1800 				mpol_get(pol);
1801 		}
1802 	}
1803 
1804 	return pol;
1805 }
1806 
1807 /*
1808  * get_vma_policy(@vma, @addr)
1809  * @vma: virtual memory area whose policy is sought
1810  * @addr: address in @vma for shared policy lookup
1811  *
1812  * Returns effective policy for a VMA at specified address.
1813  * Falls back to current->mempolicy or system default policy, as necessary.
1814  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1815  * count--added by the get_policy() vm_op, as appropriate--to protect against
1816  * freeing by another task.  It is the caller's responsibility to free the
1817  * extra reference for shared policies.
1818  */
1819 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1820 						unsigned long addr)
1821 {
1822 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1823 
1824 	if (!pol)
1825 		pol = get_task_policy(current);
1826 
1827 	return pol;
1828 }
1829 
1830 bool vma_policy_mof(struct vm_area_struct *vma)
1831 {
1832 	struct mempolicy *pol;
1833 
1834 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1835 		bool ret = false;
1836 
1837 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1838 		if (pol && (pol->flags & MPOL_F_MOF))
1839 			ret = true;
1840 		mpol_cond_put(pol);
1841 
1842 		return ret;
1843 	}
1844 
1845 	pol = vma->vm_policy;
1846 	if (!pol)
1847 		pol = get_task_policy(current);
1848 
1849 	return pol->flags & MPOL_F_MOF;
1850 }
1851 
1852 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1853 {
1854 	enum zone_type dynamic_policy_zone = policy_zone;
1855 
1856 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1857 
1858 	/*
1859 	 * if policy->nodes has movable memory only,
1860 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1861 	 *
1862 	 * policy->nodes is intersect with node_states[N_MEMORY].
1863 	 * so if the following test fails, it implies
1864 	 * policy->nodes has movable memory only.
1865 	 */
1866 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1867 		dynamic_policy_zone = ZONE_MOVABLE;
1868 
1869 	return zone >= dynamic_policy_zone;
1870 }
1871 
1872 /*
1873  * Return a nodemask representing a mempolicy for filtering nodes for
1874  * page allocation
1875  */
1876 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1877 {
1878 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1879 	if (unlikely(policy->mode == MPOL_BIND) &&
1880 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1881 			cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1882 		return &policy->nodes;
1883 
1884 	return NULL;
1885 }
1886 
1887 /* Return the node id preferred by the given mempolicy, or the given id */
1888 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1889 {
1890 	if (policy->mode == MPOL_PREFERRED) {
1891 		nd = first_node(policy->nodes);
1892 	} else {
1893 		/*
1894 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1895 		 * because we might easily break the expectation to stay on the
1896 		 * requested node and not break the policy.
1897 		 */
1898 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1899 	}
1900 
1901 	return nd;
1902 }
1903 
1904 /* Do dynamic interleaving for a process */
1905 static unsigned interleave_nodes(struct mempolicy *policy)
1906 {
1907 	unsigned next;
1908 	struct task_struct *me = current;
1909 
1910 	next = next_node_in(me->il_prev, policy->nodes);
1911 	if (next < MAX_NUMNODES)
1912 		me->il_prev = next;
1913 	return next;
1914 }
1915 
1916 /*
1917  * Depending on the memory policy provide a node from which to allocate the
1918  * next slab entry.
 *
 * Returns numa_mem_id() when called in interrupt context, when the current
 * task has no mempolicy, and for MPOL_LOCAL.  Any mode not handled below
 * hits BUG().
1919  */
1920 unsigned int mempolicy_slab_node(void)
1921 {
1922 	struct mempolicy *policy;
1923 	int node = numa_mem_id();
1924 
1925 	if (in_interrupt())
1926 		return node;
1927 
1928 	policy = current->mempolicy;
1929 	if (!policy)
1930 		return node;
1931 
1932 	switch (policy->mode) {
1933 	case MPOL_PREFERRED:
1934 		return first_node(policy->nodes);
1935 
1936 	case MPOL_INTERLEAVE:
1937 		return interleave_nodes(policy);
1938 
1939 	case MPOL_BIND: {
1940 		struct zoneref *z;
1941 
1942 		/*
1943 		 * Follow bind policy behavior and start allocation at the
1944 		 * first node.
1945 		 */
1946 		struct zonelist *zonelist;
1947 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1948 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1949 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1950 							&policy->nodes);
		/* No zone found within the policy's nodes: use the local node. */
1951 		return z->zone ? zone_to_nid(z->zone) : node;
1952 	}
1953 	case MPOL_LOCAL:
1954 		return node;
1955 
1956 	default:
1957 		BUG();
1958 	}
1959 }
1960 
1961 /*
1962  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1963  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1964  * number of present nodes.
1965  */
1966 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1967 {
1968 	unsigned nnodes = nodes_weight(pol->nodes);
1969 	unsigned target;
1970 	int i;
1971 	int nid;
1972 
1973 	if (!nnodes)
1974 		return numa_node_id();
1975 	target = (unsigned int)n % nnodes;
1976 	nid = first_node(pol->nodes);
1977 	for (i = 0; i < target; i++)
1978 		nid = next_node(nid, pol->nodes);
1979 	return nid;
1980 }
1981 
1982 /* Determine a node number for interleave */
1983 static inline unsigned interleave_nid(struct mempolicy *pol,
1984 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1985 {
1986 	if (vma) {
1987 		unsigned long off;
1988 
1989 		/*
1990 		 * for small pages, there is no difference between
1991 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1992 		 * for huge pages, since vm_pgoff is in units of small
1993 		 * pages, we need to shift off the always 0 bits to get
1994 		 * a useful offset.
1995 		 */
1996 		BUG_ON(shift < PAGE_SHIFT);
1997 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1998 		off += (addr - vma->vm_start) >> shift;
1999 		return offset_il_node(pol, off);
2000 	} else
2001 		return interleave_nodes(pol);
2002 }
2003 
2004 #ifdef CONFIG_HUGETLBFS
2005 /*
2006  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2007  * @vma: virtual memory area whose policy is sought
2008  * @addr: address in @vma for shared policy lookup and interleave policy
2009  * @gfp_flags: for requested zone
2010  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2011  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2012  *
2013  * Returns a nid suitable for a huge page allocation and a pointer
2014  * to the struct mempolicy for conditional unref after allocation.
2015  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
2016  * @nodemask for filtering the zonelist.
2017  *
2018  * Must be protected by read_mems_allowed_begin()
2019  */
2020 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2021 				struct mempolicy **mpol, nodemask_t **nodemask)
2022 {
2023 	int nid;
2024 
2025 	*mpol = get_vma_policy(vma, addr);
2026 	*nodemask = NULL;	/* assume !MPOL_BIND */
2027 
2028 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2029 		nid = interleave_nid(*mpol, vma, addr,
2030 					huge_page_shift(hstate_vma(vma)));
2031 	} else {
2032 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
2033 		if ((*mpol)->mode == MPOL_BIND)
2034 			*nodemask = &(*mpol)->nodes;
2035 	}
2036 	return nid;
2037 }
2038 
2039 /*
2040  * init_nodemask_of_mempolicy
2041  *
2042  * If the current task's mempolicy is "default" [NULL], return 'false'
2043  * to indicate default policy.  Otherwise, extract the policy nodemask
2044  * for 'preferred', 'bind' or 'interleave' policy into the argument
2045  * nodemask, or initialize the argument nodemask to contain the single
2046  * local node for 'local' policy and return 'true' to indicate presence
2047  * of non-default mempolicy.  A NULL @mask also returns 'false'.
2048  *
2049  * We don't bother with reference counting the mempolicy [mpol_get/put]
2050  * because the current task is examining its own mempolicy and a task's
2051  * mempolicy is only ever changed by the task itself.
2052  *
2053  * N.B., it is the caller's responsibility to free a returned nodemask.
2054  */
2055 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2056 {
2057 	struct mempolicy *mempolicy;
2058 
2059 	if (!(mask && current->mempolicy))
2060 		return false;
2061 
	/* The policy is read and copied under the task lock. */
2062 	task_lock(current);
2063 	mempolicy = current->mempolicy;
2064 	switch (mempolicy->mode) {
2065 	case MPOL_PREFERRED:
2066 	case MPOL_BIND:
2067 	case MPOL_INTERLEAVE:
2068 		*mask = mempolicy->nodes;
2069 		break;
2070 
2071 	case MPOL_LOCAL:
2072 		init_nodemask_of_node(mask, numa_node_id());
2073 		break;
2074 
2075 	default:
2076 		BUG();
2077 	}
2078 	task_unlock(current);
2079 
2080 	return true;
2081 }
2082 #endif
2083 
2084 /*
2085  * mempolicy_in_oom_domain
2086  *
2087  * If tsk's mempolicy is "bind", check for intersection between mask and
2088  * the policy nodemask. Otherwise, return true for all other policies
2089  * including "interleave", as a tsk with "interleave" policy may have
2090  * memory allocated from all nodes in system.
2091  *
2092  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2093  */
2094 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2095 					const nodemask_t *mask)
2096 {
2097 	struct mempolicy *mempolicy;
2098 	bool ret = true;
2099 
2100 	if (!mask)
2101 		return ret;
2102 
2103 	task_lock(tsk);
2104 	mempolicy = tsk->mempolicy;
2105 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2106 		ret = nodes_intersects(mempolicy->nodes, *mask);
2107 	task_unlock(tsk);
2108 
2109 	return ret;
2110 }
2111 
2112 /* Allocate a page in interleaved policy.
2113    Own path because it needs to do special accounting. */
2114 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2115 					unsigned nid)
2116 {
2117 	struct page *page;
2118 
2119 	page = __alloc_pages(gfp, order, nid, NULL);
2120 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2121 	if (!static_branch_likely(&vm_numa_stat_key))
2122 		return page;
2123 	if (page && page_to_nid(page) == nid) {
2124 		preempt_disable();
2125 		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2126 		preempt_enable();
2127 	}
2128 	return page;
2129 }
2130 
2131 /**
2132  * alloc_pages_vma - Allocate a page for a VMA.
2133  * @gfp: GFP flags.
2134  * @order: Order of the GFP allocation.
2135  * @vma: Pointer to VMA or NULL if not available.
2136  * @addr: Virtual address of the allocation.  Must be inside @vma.
2137  * @node: Which node to prefer for allocation (modulo policy).
2138  * @hugepage: For hugepages try only the preferred node if possible.
2139  *
2140  * Allocate a page for a specific address in @vma, using the appropriate
2141  * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2142  * of the mm_struct of the VMA to prevent it from going away.  Should be
2143  * used for all allocations for pages that will be mapped into user space.
2144  *
2145  * Return: The page on success or NULL if allocation fails.
2146  */
2147 struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2148 		unsigned long addr, int node, bool hugepage)
2149 {
2150 	struct mempolicy *pol;
2151 	struct page *page;
2152 	int preferred_nid;
2153 	nodemask_t *nmask;
2154 
2155 	pol = get_vma_policy(vma, addr);
2156 
2157 	if (pol->mode == MPOL_INTERLEAVE) {
2158 		unsigned nid;
2159 
2160 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2161 		mpol_cond_put(pol);
2162 		page = alloc_page_interleave(gfp, order, nid);
2163 		goto out;
2164 	}
2165 
2166 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2167 		int hpage_node = node;
2168 
2169 		/*
2170 		 * For hugepage allocation and non-interleave policy which
2171 		 * allows the current node (or other explicitly preferred
2172 		 * node) we only try to allocate from the current/preferred
2173 		 * node and don't fall back to other nodes, as the cost of
2174 		 * remote accesses would likely offset THP benefits.
2175 		 *
2176 		 * If the policy is interleave, or does not allow the current
2177 		 * node in its nodemask, we allocate the standard way.
2178 		 */
2179 		if (pol->mode == MPOL_PREFERRED)
2180 			hpage_node = first_node(pol->nodes);
2181 
2182 		nmask = policy_nodemask(gfp, pol);
2183 		if (!nmask || node_isset(hpage_node, *nmask)) {
2184 			mpol_cond_put(pol);
2185 			/*
2186 			 * First, try to allocate THP only on local node, but
2187 			 * don't reclaim unnecessarily, just compact.
2188 			 */
2189 			page = __alloc_pages_node(hpage_node,
2190 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2191 
2192 			/*
2193 			 * If hugepage allocations are configured to always
2194 			 * synchronous compact or the vma has been madvised
2195 			 * to prefer hugepage backing, retry allowing remote
2196 			 * memory with both reclaim and compact as well.
2197 			 */
2198 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2199 				page = __alloc_pages_node(hpage_node,
2200 								gfp, order);
2201 
2202 			goto out;
2203 		}
2204 	}
2205 
2206 	nmask = policy_nodemask(gfp, pol);
2207 	preferred_nid = policy_node(gfp, pol, node);
2208 	page = __alloc_pages(gfp, order, preferred_nid, nmask);
2209 	mpol_cond_put(pol);
2210 out:
2211 	return page;
2212 }
2213 EXPORT_SYMBOL(alloc_pages_vma);
2214 
2215 /**
2216  * alloc_pages - Allocate pages.
2217  * @gfp: GFP flags.
2218  * @order: Power of two of number of pages to allocate.
2219  *
2220  * Allocate 1 << @order contiguous pages.  The physical address of the
2221  * first page is naturally aligned (eg an order-3 allocation will be aligned
2222  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2223  * process is honoured when in process context.
2224  *
2225  * Context: Can be called from any context, providing the appropriate GFP
2226  * flags are used.
2227  * Return: The page on success or NULL if allocation fails.
2228  */
2229 struct page *alloc_pages(gfp_t gfp, unsigned order)
2230 {
2231 	struct mempolicy *pol = &default_policy;
2232 	struct page *page;
2233 
2234 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2235 		pol = get_task_policy(current);
2236 
2237 	/*
2238 	 * No reference counting needed for current->mempolicy
2239 	 * nor system default_policy
2240 	 */
2241 	if (pol->mode == MPOL_INTERLEAVE)
2242 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2243 	else
2244 		page = __alloc_pages(gfp, order,
2245 				policy_node(gfp, pol, numa_node_id()),
2246 				policy_nodemask(gfp, pol));
2247 
2248 	return page;
2249 }
2250 EXPORT_SYMBOL(alloc_pages);
2251 
2252 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2253 {
2254 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2255 
2256 	if (IS_ERR(pol))
2257 		return PTR_ERR(pol);
2258 	dst->vm_policy = pol;
2259 	return 0;
2260 }
2261 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that
 * changes the cpuset's mems), so we needn't do rebind work for the
 * current task.
 */
2272 
2273 /* Slow path of a mempolicy duplicate */
2274 struct mempolicy *__mpol_dup(struct mempolicy *old)
2275 {
2276 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2277 
2278 	if (!new)
2279 		return ERR_PTR(-ENOMEM);
2280 
2281 	/* task's mempolicy is protected by alloc_lock */
2282 	if (old == current->mempolicy) {
2283 		task_lock(current);
2284 		*new = *old;
2285 		task_unlock(current);
2286 	} else
2287 		*new = *old;
2288 
2289 	if (current_cpuset_is_being_rebound()) {
2290 		nodemask_t mems = cpuset_mems_allowed(current);
2291 		mpol_rebind_policy(new, &mems);
2292 	}
2293 	atomic_set(&new->refcnt, 1);
2294 	return new;
2295 }
2296 
2297 /* Slow path of a mempolicy comparison */
2298 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2299 {
2300 	if (!a || !b)
2301 		return false;
2302 	if (a->mode != b->mode)
2303 		return false;
2304 	if (a->flags != b->flags)
2305 		return false;
2306 	if (mpol_store_user_nodemask(a))
2307 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2308 			return false;
2309 
2310 	switch (a->mode) {
2311 	case MPOL_BIND:
2312 	case MPOL_INTERLEAVE:
2313 	case MPOL_PREFERRED:
2314 		return !!nodes_equal(a->nodes, b->nodes);
2315 	case MPOL_LOCAL:
2316 		return true;
2317 	default:
2318 		BUG();
2319 		return false;
2320 	}
2321 }
2322 
2323 /*
2324  * Shared memory backing store policy support.
2325  *
2326  * Remember policies even when nobody has shared memory mapped.
2327  * The policies are kept in Red-Black tree linked from the inode.
2328  * They are protected by the sp->lock rwlock, which should be held
2329  * for any accesses to the tree.
2330  */
2331 
2332 /*
2333  * lookup first element intersecting start-end.  Caller holds sp->lock for
2334  * reading or for writing
2335  */
2336 static struct sp_node *
2337 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2338 {
2339 	struct rb_node *n = sp->root.rb_node;
2340 
2341 	while (n) {
2342 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2343 
2344 		if (start >= p->end)
2345 			n = n->rb_right;
2346 		else if (end <= p->start)
2347 			n = n->rb_left;
2348 		else
2349 			break;
2350 	}
2351 	if (!n)
2352 		return NULL;
2353 	for (;;) {
2354 		struct sp_node *w = NULL;
2355 		struct rb_node *prev = rb_prev(n);
2356 		if (!prev)
2357 			break;
2358 		w = rb_entry(prev, struct sp_node, nd);
2359 		if (w->end <= start)
2360 			break;
2361 		n = prev;
2362 	}
2363 	return rb_entry(n, struct sp_node, nd);
2364 }
2365 
2366 /*
2367  * Insert a new shared policy into the list.  Caller holds sp->lock for
2368  * writing.
2369  */
2370 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2371 {
2372 	struct rb_node **p = &sp->root.rb_node;
2373 	struct rb_node *parent = NULL;
2374 	struct sp_node *nd;
2375 
2376 	while (*p) {
2377 		parent = *p;
2378 		nd = rb_entry(parent, struct sp_node, nd);
2379 		if (new->start < nd->start)
2380 			p = &(*p)->rb_left;
2381 		else if (new->end > nd->end)
2382 			p = &(*p)->rb_right;
2383 		else
2384 			BUG();
2385 	}
2386 	rb_link_node(&new->nd, parent, p);
2387 	rb_insert_color(&new->nd, &sp->root);
2388 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2389 		 new->policy ? new->policy->mode : 0);
2390 }
2391 
2392 /* Find shared policy intersecting idx */
2393 struct mempolicy *
2394 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2395 {
2396 	struct mempolicy *pol = NULL;
2397 	struct sp_node *sn;
2398 
2399 	if (!sp->root.rb_node)
2400 		return NULL;
2401 	read_lock(&sp->lock);
2402 	sn = sp_lookup(sp, idx, idx+1);
2403 	if (sn) {
2404 		mpol_get(sn->policy);
2405 		pol = sn->policy;
2406 	}
2407 	read_unlock(&sp->lock);
2408 	return pol;
2409 }
2410 
2411 static void sp_free(struct sp_node *n)
2412 {
2413 	mpol_put(n->policy);
2414 	kmem_cache_free(sn_cache, n);
2415 }
2416 
2417 /**
2418  * mpol_misplaced - check whether current page node is valid in policy
2419  *
2420  * @page: page to be checked
2421  * @vma: vm area where page mapped
2422  * @addr: virtual address where page mapped
2423  *
2424  * Lookup current policy node id for vma,addr and "compare to" page's
2425  * node id.  Policy determination "mimics" alloc_page_vma().
2426  * Called from fault path where we know the vma and faulting address.
2427  *
2428  * Return: -1 if the page is in a node that is valid for this policy, or a
2429  * suitable node ID to allocate a replacement page from.
2430  */
2431 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2432 {
2433 	struct mempolicy *pol;
2434 	struct zoneref *z;
2435 	int curnid = page_to_nid(page);
2436 	unsigned long pgoff;
2437 	int thiscpu = raw_smp_processor_id();
2438 	int thisnid = cpu_to_node(thiscpu);
2439 	int polnid = NUMA_NO_NODE;
2440 	int ret = -1;
2441 
2442 	pol = get_vma_policy(vma, addr);
2443 	if (!(pol->flags & MPOL_F_MOF))
2444 		goto out;
2445 
2446 	switch (pol->mode) {
2447 	case MPOL_INTERLEAVE:
2448 		pgoff = vma->vm_pgoff;
2449 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2450 		polnid = offset_il_node(pol, pgoff);
2451 		break;
2452 
2453 	case MPOL_PREFERRED:
2454 		polnid = first_node(pol->nodes);
2455 		break;
2456 
2457 	case MPOL_LOCAL:
2458 		polnid = numa_node_id();
2459 		break;
2460 
2461 	case MPOL_BIND:
2462 		/* Optimize placement among multiple nodes via NUMA balancing */
2463 		if (pol->flags & MPOL_F_MORON) {
2464 			if (node_isset(thisnid, pol->nodes))
2465 				break;
2466 			goto out;
2467 		}
2468 
2469 		/*
2470 		 * allows binding to multiple nodes.
2471 		 * use current page if in policy nodemask,
2472 		 * else select nearest allowed node, if any.
2473 		 * If no allowed nodes, use current [!misplaced].
2474 		 */
2475 		if (node_isset(curnid, pol->nodes))
2476 			goto out;
2477 		z = first_zones_zonelist(
2478 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2479 				gfp_zone(GFP_HIGHUSER),
2480 				&pol->nodes);
2481 		polnid = zone_to_nid(z->zone);
2482 		break;
2483 
2484 	default:
2485 		BUG();
2486 	}
2487 
2488 	/* Migrate the page towards the node whose CPU is referencing it */
2489 	if (pol->flags & MPOL_F_MORON) {
2490 		polnid = thisnid;
2491 
2492 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2493 			goto out;
2494 	}
2495 
2496 	if (curnid != polnid)
2497 		ret = polnid;
2498 out:
2499 	mpol_cond_put(pol);
2500 
2501 	return ret;
2502 }
2503 
2504 /*
2505  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2506  * dropped after task->mempolicy is set to NULL so that any allocation done as
2507  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2508  * policy.
2509  */
2510 void mpol_put_task_policy(struct task_struct *task)
2511 {
2512 	struct mempolicy *pol;
2513 
2514 	task_lock(task);
2515 	pol = task->mempolicy;
2516 	task->mempolicy = NULL;
2517 	task_unlock(task);
2518 	mpol_put(pol);
2519 }
2520 
2521 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2522 {
2523 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2524 	rb_erase(&n->nd, &sp->root);
2525 	sp_free(n);
2526 }
2527 
2528 static void sp_node_init(struct sp_node *node, unsigned long start,
2529 			unsigned long end, struct mempolicy *pol)
2530 {
2531 	node->start = start;
2532 	node->end = end;
2533 	node->policy = pol;
2534 }
2535 
2536 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2537 				struct mempolicy *pol)
2538 {
2539 	struct sp_node *n;
2540 	struct mempolicy *newpol;
2541 
2542 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2543 	if (!n)
2544 		return NULL;
2545 
2546 	newpol = mpol_dup(pol);
2547 	if (IS_ERR(newpol)) {
2548 		kmem_cache_free(sn_cache, n);
2549 		return NULL;
2550 	}
2551 	newpol->flags |= MPOL_F_SHARED;
2552 	sp_node_init(n, start, end, newpol);
2553 
2554 	return n;
2555 }
2556 
2557 /* Replace a policy range. */
2558 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2559 				 unsigned long end, struct sp_node *new)
2560 {
2561 	struct sp_node *n;
2562 	struct sp_node *n_new = NULL;
2563 	struct mempolicy *mpol_new = NULL;
2564 	int ret = 0;
2565 
2566 restart:
2567 	write_lock(&sp->lock);
2568 	n = sp_lookup(sp, start, end);
2569 	/* Take care of old policies in the same range. */
2570 	while (n && n->start < end) {
2571 		struct rb_node *next = rb_next(&n->nd);
2572 		if (n->start >= start) {
2573 			if (n->end <= end)
2574 				sp_delete(sp, n);
2575 			else
2576 				n->start = end;
2577 		} else {
2578 			/* Old policy spanning whole new range. */
2579 			if (n->end > end) {
2580 				if (!n_new)
2581 					goto alloc_new;
2582 
2583 				*mpol_new = *n->policy;
2584 				atomic_set(&mpol_new->refcnt, 1);
2585 				sp_node_init(n_new, end, n->end, mpol_new);
2586 				n->end = start;
2587 				sp_insert(sp, n_new);
2588 				n_new = NULL;
2589 				mpol_new = NULL;
2590 				break;
2591 			} else
2592 				n->end = start;
2593 		}
2594 		if (!next)
2595 			break;
2596 		n = rb_entry(next, struct sp_node, nd);
2597 	}
2598 	if (new)
2599 		sp_insert(sp, new);
2600 	write_unlock(&sp->lock);
2601 	ret = 0;
2602 
2603 err_out:
2604 	if (mpol_new)
2605 		mpol_put(mpol_new);
2606 	if (n_new)
2607 		kmem_cache_free(sn_cache, n_new);
2608 
2609 	return ret;
2610 
2611 alloc_new:
2612 	write_unlock(&sp->lock);
2613 	ret = -ENOMEM;
2614 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2615 	if (!n_new)
2616 		goto err_out;
2617 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2618 	if (!mpol_new)
2619 		goto err_out;
2620 	goto restart;
2621 }
2622 
2623 /**
2624  * mpol_shared_policy_init - initialize shared policy for inode
2625  * @sp: pointer to inode shared policy
2626  * @mpol:  struct mempolicy to install
2627  *
2628  * Install non-NULL @mpol in inode's shared policy rb-tree.
2629  * On entry, the current task has a reference on a non-NULL @mpol.
2630  * This must be released on exit.
2631  * This is called at get_inode() calls and we can use GFP_KERNEL.
2632  */
2633 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2634 {
2635 	int ret;
2636 
2637 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2638 	rwlock_init(&sp->lock);
2639 
2640 	if (mpol) {
2641 		struct vm_area_struct pvma;
2642 		struct mempolicy *new;
2643 		NODEMASK_SCRATCH(scratch);
2644 
2645 		if (!scratch)
2646 			goto put_mpol;
2647 		/* contextualize the tmpfs mount point mempolicy */
2648 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2649 		if (IS_ERR(new))
2650 			goto free_scratch; /* no valid nodemask intersection */
2651 
2652 		task_lock(current);
2653 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2654 		task_unlock(current);
2655 		if (ret)
2656 			goto put_new;
2657 
2658 		/* Create pseudo-vma that contains just the policy */
2659 		vma_init(&pvma, NULL);
2660 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2661 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2662 
2663 put_new:
2664 		mpol_put(new);			/* drop initial ref */
2665 free_scratch:
2666 		NODEMASK_SCRATCH_FREE(scratch);
2667 put_mpol:
2668 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2669 	}
2670 }
2671 
2672 int mpol_set_shared_policy(struct shared_policy *info,
2673 			struct vm_area_struct *vma, struct mempolicy *npol)
2674 {
2675 	int err;
2676 	struct sp_node *new = NULL;
2677 	unsigned long sz = vma_pages(vma);
2678 
2679 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2680 		 vma->vm_pgoff,
2681 		 sz, npol ? npol->mode : -1,
2682 		 npol ? npol->flags : -1,
2683 		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2684 
2685 	if (npol) {
2686 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2687 		if (!new)
2688 			return -ENOMEM;
2689 	}
2690 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2691 	if (err && new)
2692 		sp_free(new);
2693 	return err;
2694 }
2695 
2696 /* Free a backing policy store on inode delete. */
2697 void mpol_free_shared_policy(struct shared_policy *p)
2698 {
2699 	struct sp_node *n;
2700 	struct rb_node *next;
2701 
2702 	if (!p->root.rb_node)
2703 		return;
2704 	write_lock(&p->lock);
2705 	next = rb_first(&p->root);
2706 	while (next) {
2707 		n = rb_entry(next, struct sp_node, nd);
2708 		next = rb_next(&n->nd);
2709 		sp_delete(p, n);
2710 	}
2711 	write_unlock(&p->lock);
2712 }
2713 
2714 #ifdef CONFIG_NUMA_BALANCING
2715 static int __initdata numabalancing_override;
2716 
2717 static void __init check_numabalancing_enable(void)
2718 {
2719 	bool numabalancing_default = false;
2720 
2721 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2722 		numabalancing_default = true;
2723 
2724 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2725 	if (numabalancing_override)
2726 		set_numabalancing_state(numabalancing_override == 1);
2727 
2728 	if (num_online_nodes() > 1 && !numabalancing_override) {
2729 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2730 			numabalancing_default ? "Enabling" : "Disabling");
2731 		set_numabalancing_state(numabalancing_default);
2732 	}
2733 }
2734 
2735 static int __init setup_numabalancing(char *str)
2736 {
2737 	int ret = 0;
2738 	if (!str)
2739 		goto out;
2740 
2741 	if (!strcmp(str, "enable")) {
2742 		numabalancing_override = 1;
2743 		ret = 1;
2744 	} else if (!strcmp(str, "disable")) {
2745 		numabalancing_override = -1;
2746 		ret = 1;
2747 	}
2748 out:
2749 	if (!ret)
2750 		pr_warn("Unable to parse numa_balancing=\n");
2751 
2752 	return ret;
2753 }
2754 __setup("numa_balancing=", setup_numabalancing);
2755 #else
2756 static inline void __init check_numabalancing_enable(void)
2757 {
2758 }
2759 #endif /* CONFIG_NUMA_BALANCING */
2760 
2761 /* assumes fs == KERNEL_DS */
2762 void __init numa_policy_init(void)
2763 {
2764 	nodemask_t interleave_nodes;
2765 	unsigned long largest = 0;
2766 	int nid, prefer = 0;
2767 
2768 	policy_cache = kmem_cache_create("numa_policy",
2769 					 sizeof(struct mempolicy),
2770 					 0, SLAB_PANIC, NULL);
2771 
2772 	sn_cache = kmem_cache_create("shared_policy_node",
2773 				     sizeof(struct sp_node),
2774 				     0, SLAB_PANIC, NULL);
2775 
2776 	for_each_node(nid) {
2777 		preferred_node_policy[nid] = (struct mempolicy) {
2778 			.refcnt = ATOMIC_INIT(1),
2779 			.mode = MPOL_PREFERRED,
2780 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2781 			.nodes = nodemask_of_node(nid),
2782 		};
2783 	}
2784 
2785 	/*
2786 	 * Set interleaving policy for system init. Interleaving is only
2787 	 * enabled across suitably sized nodes (default is >= 16MB), or
2788 	 * fall back to the largest node if they're all smaller.
2789 	 */
2790 	nodes_clear(interleave_nodes);
2791 	for_each_node_state(nid, N_MEMORY) {
2792 		unsigned long total_pages = node_present_pages(nid);
2793 
2794 		/* Preserve the largest node */
2795 		if (largest < total_pages) {
2796 			largest = total_pages;
2797 			prefer = nid;
2798 		}
2799 
2800 		/* Interleave this node? */
2801 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2802 			node_set(nid, interleave_nodes);
2803 	}
2804 
2805 	/* All too small, use the largest */
2806 	if (unlikely(nodes_empty(interleave_nodes)))
2807 		node_set(prefer, interleave_nodes);
2808 
2809 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2810 		pr_err("%s: interleaving failed\n", __func__);
2811 
2812 	check_numabalancing_enable();
2813 }
2814 
2815 /* Reset policy of current process to default */
2816 void numa_default_policy(void)
2817 {
2818 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2819 }
2820 
2821 /*
2822  * Parse and format mempolicy from/to strings
2823  */
2824 
2825 static const char * const policy_modes[] =
2826 {
2827 	[MPOL_DEFAULT]    = "default",
2828 	[MPOL_PREFERRED]  = "prefer",
2829 	[MPOL_BIND]       = "bind",
2830 	[MPOL_INTERLEAVE] = "interleave",
2831 	[MPOL_LOCAL]      = "local",
2832 };
2833 
2834 
2835 #ifdef CONFIG_TMPFS
2836 /**
2837  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2838  * @str:  string containing mempolicy to parse
2839  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2840  *
2841  * Format of input:
2842  *	<mode>[=<flags>][:<nodelist>]
2843  *
2844  * On success, returns 0, else 1
2845  */
2846 int mpol_parse_str(char *str, struct mempolicy **mpol)
2847 {
2848 	struct mempolicy *new = NULL;
2849 	unsigned short mode_flags;
2850 	nodemask_t nodes;
2851 	char *nodelist = strchr(str, ':');
2852 	char *flags = strchr(str, '=');
2853 	int err = 1, mode;
2854 
2855 	if (flags)
2856 		*flags++ = '\0';	/* terminate mode string */
2857 
2858 	if (nodelist) {
2859 		/* NUL-terminate mode or flags string */
2860 		*nodelist++ = '\0';
2861 		if (nodelist_parse(nodelist, nodes))
2862 			goto out;
2863 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2864 			goto out;
2865 	} else
2866 		nodes_clear(nodes);
2867 
2868 	mode = match_string(policy_modes, MPOL_MAX, str);
2869 	if (mode < 0)
2870 		goto out;
2871 
2872 	switch (mode) {
2873 	case MPOL_PREFERRED:
2874 		/*
2875 		 * Insist on a nodelist of one node only, although later
2876 		 * we use first_node(nodes) to grab a single node, so here
2877 		 * nodelist (or nodes) cannot be empty.
2878 		 */
2879 		if (nodelist) {
2880 			char *rest = nodelist;
2881 			while (isdigit(*rest))
2882 				rest++;
2883 			if (*rest)
2884 				goto out;
2885 			if (nodes_empty(nodes))
2886 				goto out;
2887 		}
2888 		break;
2889 	case MPOL_INTERLEAVE:
2890 		/*
2891 		 * Default to online nodes with memory if no nodelist
2892 		 */
2893 		if (!nodelist)
2894 			nodes = node_states[N_MEMORY];
2895 		break;
2896 	case MPOL_LOCAL:
2897 		/*
2898 		 * Don't allow a nodelist;  mpol_new() checks flags
2899 		 */
2900 		if (nodelist)
2901 			goto out;
2902 		break;
2903 	case MPOL_DEFAULT:
2904 		/*
2905 		 * Insist on a empty nodelist
2906 		 */
2907 		if (!nodelist)
2908 			err = 0;
2909 		goto out;
2910 	case MPOL_BIND:
2911 		/*
2912 		 * Insist on a nodelist
2913 		 */
2914 		if (!nodelist)
2915 			goto out;
2916 	}
2917 
2918 	mode_flags = 0;
2919 	if (flags) {
2920 		/*
2921 		 * Currently, we only support two mutually exclusive
2922 		 * mode flags.
2923 		 */
2924 		if (!strcmp(flags, "static"))
2925 			mode_flags |= MPOL_F_STATIC_NODES;
2926 		else if (!strcmp(flags, "relative"))
2927 			mode_flags |= MPOL_F_RELATIVE_NODES;
2928 		else
2929 			goto out;
2930 	}
2931 
2932 	new = mpol_new(mode, mode_flags, &nodes);
2933 	if (IS_ERR(new))
2934 		goto out;
2935 
2936 	/*
2937 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2938 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2939 	 */
2940 	if (mode != MPOL_PREFERRED) {
2941 		new->nodes = nodes;
2942 	} else if (nodelist) {
2943 		nodes_clear(new->nodes);
2944 		node_set(first_node(nodes), new->nodes);
2945 	} else {
2946 		new->mode = MPOL_LOCAL;
2947 	}
2948 
2949 	/*
2950 	 * Save nodes for contextualization: this will be used to "clone"
2951 	 * the mempolicy in a specific context [cpuset] at a later time.
2952 	 */
2953 	new->w.user_nodemask = nodes;
2954 
2955 	err = 0;
2956 
2957 out:
2958 	/* Restore string for error message */
2959 	if (nodelist)
2960 		*--nodelist = ':';
2961 	if (flags)
2962 		*--flags = '=';
2963 	if (!err)
2964 		*mpol = new;
2965 	return err;
2966 }
2967 #endif /* CONFIG_TMPFS */
2968 
2969 /**
2970  * mpol_to_str - format a mempolicy structure for printing
2971  * @buffer:  to contain formatted mempolicy string
2972  * @maxlen:  length of @buffer
2973  * @pol:  pointer to mempolicy to be formatted
2974  *
2975  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2976  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2977  * longest flag, "relative", and to display at least a few node ids.
2978  */
2979 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2980 {
2981 	char *p = buffer;
2982 	nodemask_t nodes = NODE_MASK_NONE;
2983 	unsigned short mode = MPOL_DEFAULT;
2984 	unsigned short flags = 0;
2985 
2986 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2987 		mode = pol->mode;
2988 		flags = pol->flags;
2989 	}
2990 
2991 	switch (mode) {
2992 	case MPOL_DEFAULT:
2993 	case MPOL_LOCAL:
2994 		break;
2995 	case MPOL_PREFERRED:
2996 	case MPOL_BIND:
2997 	case MPOL_INTERLEAVE:
2998 		nodes = pol->nodes;
2999 		break;
3000 	default:
3001 		WARN_ON_ONCE(1);
3002 		snprintf(p, maxlen, "unknown");
3003 		return;
3004 	}
3005 
3006 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3007 
3008 	if (flags & MPOL_MODE_FLAGS) {
3009 		p += snprintf(p, buffer + maxlen - p, "=");
3010 
3011 		/*
3012 		 * Currently, the only defined flags are mutually exclusive
3013 		 */
3014 		if (flags & MPOL_F_STATIC_NODES)
3015 			p += snprintf(p, buffer + maxlen - p, "static");
3016 		else if (flags & MPOL_F_RELATIVE_NODES)
3017 			p += snprintf(p, buffer + maxlen - p, "relative");
3018 	}
3019 
3020 	if (!nodes_empty(nodes))
3021 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3022 			       nodemask_pr_args(&nodes));
3023 }
3024