xref: /linux/mm/mempolicy.c (revision d38c07afc356ddebaa3ed8ecb3f553340e05c969)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem kernel lowmem allocation don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
108 /* Internal flags */
109 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. An specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some check and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = *nodes;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires task
362  * pointer, and updates task mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static int migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	unsigned long start;
414 	unsigned long end;
415 	struct vm_area_struct *first;
416 };
417 
418 /*
419  * Check if the page's nid is in qp->nmask.
420  *
421  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
422  * in the invert of qp->nmask.
423  */
424 static inline bool queue_pages_required(struct page *page,
425 					struct queue_pages *qp)
426 {
427 	int nid = page_to_nid(page);
428 	unsigned long flags = qp->flags;
429 
430 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
431 }
432 
433 /*
434  * queue_pages_pmd() has four possible return values:
435  * 0 - pages are placed on the right node or queued successfully.
436  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
437  *     specified.
438  * 2 - THP was split.
439  * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
440  *        existing page was already on a node that does not follow the
441  *        policy.
442  */
443 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
444 				unsigned long end, struct mm_walk *walk)
445 {
446 	int ret = 0;
447 	struct page *page;
448 	struct queue_pages *qp = walk->private;
449 	unsigned long flags;
450 
451 	if (unlikely(is_pmd_migration_entry(*pmd))) {
452 		ret = -EIO;
453 		goto unlock;
454 	}
455 	page = pmd_page(*pmd);
456 	if (is_huge_zero_page(page)) {
457 		spin_unlock(ptl);
458 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
459 		ret = 2;
460 		goto out;
461 	}
462 	if (!queue_pages_required(page, qp))
463 		goto unlock;
464 
465 	flags = qp->flags;
466 	/* go to thp migration */
467 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
468 		if (!vma_migratable(walk->vma) ||
469 		    migrate_page_add(page, qp->pagelist, flags)) {
470 			ret = 1;
471 			goto unlock;
472 		}
473 	} else
474 		ret = -EIO;
475 unlock:
476 	spin_unlock(ptl);
477 out:
478 	return ret;
479 }
480 
481 /*
482  * Scan through pages checking if pages follow certain conditions,
483  * and move them to the pagelist if they do.
484  *
485  * queue_pages_pte_range() has three possible return values:
486  * 0 - pages are placed on the right node or queued successfully.
487  * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
488  *     specified.
489  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
490  *        on a node that does not follow the policy.
491  */
492 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
493 			unsigned long end, struct mm_walk *walk)
494 {
495 	struct vm_area_struct *vma = walk->vma;
496 	struct page *page;
497 	struct queue_pages *qp = walk->private;
498 	unsigned long flags = qp->flags;
499 	int ret;
500 	bool has_unmovable = false;
501 	pte_t *pte;
502 	spinlock_t *ptl;
503 
504 	ptl = pmd_trans_huge_lock(pmd, vma);
505 	if (ptl) {
506 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
507 		if (ret != 2)
508 			return ret;
509 	}
510 	/* THP was split, fall through to pte walk */
511 
512 	if (pmd_trans_unstable(pmd))
513 		return 0;
514 
515 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
516 	for (; addr != end; pte++, addr += PAGE_SIZE) {
517 		if (!pte_present(*pte))
518 			continue;
519 		page = vm_normal_page(vma, addr, *pte);
520 		if (!page)
521 			continue;
522 		/*
523 		 * vm_normal_page() filters out zero pages, but there might
524 		 * still be PageReserved pages to skip, perhaps in a VDSO.
525 		 */
526 		if (PageReserved(page))
527 			continue;
528 		if (!queue_pages_required(page, qp))
529 			continue;
530 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
531 			/* MPOL_MF_STRICT must be specified if we get here */
532 			if (!vma_migratable(vma)) {
533 				has_unmovable = true;
534 				break;
535 			}
536 
537 			/*
538 			 * Do not abort immediately since there may be
539 			 * temporary off LRU pages in the range.  Still
540 			 * need migrate other LRU pages.
541 			 */
542 			if (migrate_page_add(page, qp->pagelist, flags))
543 				has_unmovable = true;
544 		} else
545 			break;
546 	}
547 	pte_unmap_unlock(pte - 1, ptl);
548 	cond_resched();
549 
550 	if (has_unmovable)
551 		return 1;
552 
553 	return addr != end ? -EIO : 0;
554 }
555 
556 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
557 			       unsigned long addr, unsigned long end,
558 			       struct mm_walk *walk)
559 {
560 	int ret = 0;
561 #ifdef CONFIG_HUGETLB_PAGE
562 	struct queue_pages *qp = walk->private;
563 	unsigned long flags = (qp->flags & MPOL_MF_VALID);
564 	struct page *page;
565 	spinlock_t *ptl;
566 	pte_t entry;
567 
568 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
569 	entry = huge_ptep_get(pte);
570 	if (!pte_present(entry))
571 		goto unlock;
572 	page = pte_page(entry);
573 	if (!queue_pages_required(page, qp))
574 		goto unlock;
575 
576 	if (flags == MPOL_MF_STRICT) {
577 		/*
578 		 * STRICT alone means only detecting misplaced page and no
579 		 * need to further check other vma.
580 		 */
581 		ret = -EIO;
582 		goto unlock;
583 	}
584 
585 	if (!vma_migratable(walk->vma)) {
586 		/*
587 		 * Must be STRICT with MOVE*, otherwise .test_walk() have
588 		 * stopped walking current vma.
589 		 * Detecting misplaced page but allow migrating pages which
590 		 * have been queued.
591 		 */
592 		ret = 1;
593 		goto unlock;
594 	}
595 
596 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
597 	if (flags & (MPOL_MF_MOVE_ALL) ||
598 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
599 		if (!isolate_huge_page(page, qp->pagelist) &&
600 			(flags & MPOL_MF_STRICT))
601 			/*
602 			 * Failed to isolate page but allow migrating pages
603 			 * which have been queued.
604 			 */
605 			ret = 1;
606 	}
607 unlock:
608 	spin_unlock(ptl);
609 #else
610 	BUG();
611 #endif
612 	return ret;
613 }
614 
615 #ifdef CONFIG_NUMA_BALANCING
616 /*
617  * This is used to mark a range of virtual addresses to be inaccessible.
618  * These are later cleared by a NUMA hinting fault. Depending on these
619  * faults, pages may be migrated for better NUMA placement.
620  *
621  * This is assuming that NUMA faults are handled using PROT_NONE. If
622  * an architecture makes a different choice, it will need further
623  * changes to the core.
624  */
625 unsigned long change_prot_numa(struct vm_area_struct *vma,
626 			unsigned long addr, unsigned long end)
627 {
628 	int nr_updated;
629 
630 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
631 	if (nr_updated)
632 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
633 
634 	return nr_updated;
635 }
636 #else
637 static unsigned long change_prot_numa(struct vm_area_struct *vma,
638 			unsigned long addr, unsigned long end)
639 {
640 	return 0;
641 }
642 #endif /* CONFIG_NUMA_BALANCING */
643 
644 static int queue_pages_test_walk(unsigned long start, unsigned long end,
645 				struct mm_walk *walk)
646 {
647 	struct vm_area_struct *vma = walk->vma;
648 	struct queue_pages *qp = walk->private;
649 	unsigned long endvma = vma->vm_end;
650 	unsigned long flags = qp->flags;
651 
652 	/* range check first */
653 	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
654 
655 	if (!qp->first) {
656 		qp->first = vma;
657 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
658 			(qp->start < vma->vm_start))
659 			/* hole at head side of range */
660 			return -EFAULT;
661 	}
662 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
663 		((vma->vm_end < qp->end) &&
664 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
665 		/* hole at middle or tail of range */
666 		return -EFAULT;
667 
668 	/*
669 	 * Need check MPOL_MF_STRICT to return -EIO if possible
670 	 * regardless of vma_migratable
671 	 */
672 	if (!vma_migratable(vma) &&
673 	    !(flags & MPOL_MF_STRICT))
674 		return 1;
675 
676 	if (endvma > end)
677 		endvma = end;
678 
679 	if (flags & MPOL_MF_LAZY) {
680 		/* Similar to task_numa_work, skip inaccessible VMAs */
681 		if (!is_vm_hugetlb_page(vma) &&
682 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
683 			!(vma->vm_flags & VM_MIXEDMAP))
684 			change_prot_numa(vma, start, endvma);
685 		return 1;
686 	}
687 
688 	/* queue pages from current vma */
689 	if (flags & MPOL_MF_VALID)
690 		return 0;
691 	return 1;
692 }
693 
694 static const struct mm_walk_ops queue_pages_walk_ops = {
695 	.hugetlb_entry		= queue_pages_hugetlb,
696 	.pmd_entry		= queue_pages_pte_range,
697 	.test_walk		= queue_pages_test_walk,
698 };
699 
700 /*
701  * Walk through page tables and collect pages to be migrated.
702  *
703  * If pages found in a given range are on a set of nodes (determined by
704  * @nodes and @flags,) it's isolated and queued to the pagelist which is
705  * passed via @private.
706  *
707  * queue_pages_range() has three possible return values:
708  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
709  *     specified.
710  * 0 - queue pages successfully or no misplaced page.
711  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
712  *         memory range specified by nodemask and maxnode points outside
713  *         your accessible address space (-EFAULT)
714  */
715 static int
716 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
717 		nodemask_t *nodes, unsigned long flags,
718 		struct list_head *pagelist)
719 {
720 	int err;
721 	struct queue_pages qp = {
722 		.pagelist = pagelist,
723 		.flags = flags,
724 		.nmask = nodes,
725 		.start = start,
726 		.end = end,
727 		.first = NULL,
728 	};
729 
730 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
731 
732 	if (!qp.first)
733 		/* whole range in hole */
734 		err = -EFAULT;
735 
736 	return err;
737 }
738 
739 /*
740  * Apply policy to a single VMA
741  * This must be called with the mmap_sem held for writing.
742  */
743 static int vma_replace_policy(struct vm_area_struct *vma,
744 						struct mempolicy *pol)
745 {
746 	int err;
747 	struct mempolicy *old;
748 	struct mempolicy *new;
749 
750 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
751 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
752 		 vma->vm_ops, vma->vm_file,
753 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
754 
755 	new = mpol_dup(pol);
756 	if (IS_ERR(new))
757 		return PTR_ERR(new);
758 
759 	if (vma->vm_ops && vma->vm_ops->set_policy) {
760 		err = vma->vm_ops->set_policy(vma, new);
761 		if (err)
762 			goto err_out;
763 	}
764 
765 	old = vma->vm_policy;
766 	vma->vm_policy = new; /* protected by mmap_sem */
767 	mpol_put(old);
768 
769 	return 0;
770  err_out:
771 	mpol_put(new);
772 	return err;
773 }
774 
775 /* Step 2: apply policy to a range and do splits. */
776 static int mbind_range(struct mm_struct *mm, unsigned long start,
777 		       unsigned long end, struct mempolicy *new_pol)
778 {
779 	struct vm_area_struct *next;
780 	struct vm_area_struct *prev;
781 	struct vm_area_struct *vma;
782 	int err = 0;
783 	pgoff_t pgoff;
784 	unsigned long vmstart;
785 	unsigned long vmend;
786 
787 	vma = find_vma(mm, start);
788 	VM_BUG_ON(!vma);
789 
790 	prev = vma->vm_prev;
791 	if (start > vma->vm_start)
792 		prev = vma;
793 
794 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
795 		next = vma->vm_next;
796 		vmstart = max(start, vma->vm_start);
797 		vmend   = min(end, vma->vm_end);
798 
799 		if (mpol_equal(vma_policy(vma), new_pol))
800 			continue;
801 
802 		pgoff = vma->vm_pgoff +
803 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
804 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
805 				 vma->anon_vma, vma->vm_file, pgoff,
806 				 new_pol, vma->vm_userfaultfd_ctx);
807 		if (prev) {
808 			vma = prev;
809 			next = vma->vm_next;
810 			if (mpol_equal(vma_policy(vma), new_pol))
811 				continue;
812 			/* vma_merge() joined vma && vma->next, case 8 */
813 			goto replace;
814 		}
815 		if (vma->vm_start != vmstart) {
816 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
817 			if (err)
818 				goto out;
819 		}
820 		if (vma->vm_end != vmend) {
821 			err = split_vma(vma->vm_mm, vma, vmend, 0);
822 			if (err)
823 				goto out;
824 		}
825  replace:
826 		err = vma_replace_policy(vma, new_pol);
827 		if (err)
828 			goto out;
829 	}
830 
831  out:
832 	return err;
833 }
834 
835 /* Set the process memory policy */
836 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
837 			     nodemask_t *nodes)
838 {
839 	struct mempolicy *new, *old;
840 	NODEMASK_SCRATCH(scratch);
841 	int ret;
842 
843 	if (!scratch)
844 		return -ENOMEM;
845 
846 	new = mpol_new(mode, flags, nodes);
847 	if (IS_ERR(new)) {
848 		ret = PTR_ERR(new);
849 		goto out;
850 	}
851 
852 	task_lock(current);
853 	ret = mpol_set_nodemask(new, nodes, scratch);
854 	if (ret) {
855 		task_unlock(current);
856 		mpol_put(new);
857 		goto out;
858 	}
859 	old = current->mempolicy;
860 	current->mempolicy = new;
861 	if (new && new->mode == MPOL_INTERLEAVE)
862 		current->il_prev = MAX_NUMNODES-1;
863 	task_unlock(current);
864 	mpol_put(old);
865 	ret = 0;
866 out:
867 	NODEMASK_SCRATCH_FREE(scratch);
868 	return ret;
869 }
870 
871 /*
872  * Return nodemask for policy for get_mempolicy() query
873  *
874  * Called with task's alloc_lock held
875  */
876 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
877 {
878 	nodes_clear(*nodes);
879 	if (p == &default_policy)
880 		return;
881 
882 	switch (p->mode) {
883 	case MPOL_BIND:
884 		/* Fall through */
885 	case MPOL_INTERLEAVE:
886 		*nodes = p->v.nodes;
887 		break;
888 	case MPOL_PREFERRED:
889 		if (!(p->flags & MPOL_F_LOCAL))
890 			node_set(p->v.preferred_node, *nodes);
891 		/* else return empty node mask for local allocation */
892 		break;
893 	default:
894 		BUG();
895 	}
896 }
897 
898 static int lookup_node(struct mm_struct *mm, unsigned long addr)
899 {
900 	struct page *p;
901 	int err;
902 
903 	int locked = 1;
904 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
905 	if (err >= 0) {
906 		err = page_to_nid(p);
907 		put_page(p);
908 	}
909 	if (locked)
910 		up_read(&mm->mmap_sem);
911 	return err;
912 }
913 
914 /* Retrieve NUMA policy */
915 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
916 			     unsigned long addr, unsigned long flags)
917 {
918 	int err;
919 	struct mm_struct *mm = current->mm;
920 	struct vm_area_struct *vma = NULL;
921 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
922 
923 	if (flags &
924 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
925 		return -EINVAL;
926 
927 	if (flags & MPOL_F_MEMS_ALLOWED) {
928 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
929 			return -EINVAL;
930 		*policy = 0;	/* just so it's initialized */
931 		task_lock(current);
932 		*nmask  = cpuset_current_mems_allowed;
933 		task_unlock(current);
934 		return 0;
935 	}
936 
937 	if (flags & MPOL_F_ADDR) {
938 		/*
939 		 * Do NOT fall back to task policy if the
940 		 * vma/shared policy at addr is NULL.  We
941 		 * want to return MPOL_DEFAULT in this case.
942 		 */
943 		down_read(&mm->mmap_sem);
944 		vma = find_vma_intersection(mm, addr, addr+1);
945 		if (!vma) {
946 			up_read(&mm->mmap_sem);
947 			return -EFAULT;
948 		}
949 		if (vma->vm_ops && vma->vm_ops->get_policy)
950 			pol = vma->vm_ops->get_policy(vma, addr);
951 		else
952 			pol = vma->vm_policy;
953 	} else if (addr)
954 		return -EINVAL;
955 
956 	if (!pol)
957 		pol = &default_policy;	/* indicates default behavior */
958 
959 	if (flags & MPOL_F_NODE) {
960 		if (flags & MPOL_F_ADDR) {
961 			/*
962 			 * Take a refcount on the mpol, lookup_node()
963 			 * wil drop the mmap_sem, so after calling
964 			 * lookup_node() only "pol" remains valid, "vma"
965 			 * is stale.
966 			 */
967 			pol_refcount = pol;
968 			vma = NULL;
969 			mpol_get(pol);
970 			err = lookup_node(mm, addr);
971 			if (err < 0)
972 				goto out;
973 			*policy = err;
974 		} else if (pol == current->mempolicy &&
975 				pol->mode == MPOL_INTERLEAVE) {
976 			*policy = next_node_in(current->il_prev, pol->v.nodes);
977 		} else {
978 			err = -EINVAL;
979 			goto out;
980 		}
981 	} else {
982 		*policy = pol == &default_policy ? MPOL_DEFAULT :
983 						pol->mode;
984 		/*
985 		 * Internal mempolicy flags must be masked off before exposing
986 		 * the policy to userspace.
987 		 */
988 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
989 	}
990 
991 	err = 0;
992 	if (nmask) {
993 		if (mpol_store_user_nodemask(pol)) {
994 			*nmask = pol->w.user_nodemask;
995 		} else {
996 			task_lock(current);
997 			get_policy_nodemask(pol, nmask);
998 			task_unlock(current);
999 		}
1000 	}
1001 
1002  out:
1003 	mpol_cond_put(pol);
1004 	if (vma)
1005 		up_read(&mm->mmap_sem);
1006 	if (pol_refcount)
1007 		mpol_put(pol_refcount);
1008 	return err;
1009 }
1010 
1011 #ifdef CONFIG_MIGRATION
1012 /*
1013  * page migration, thp tail pages can be passed.
1014  */
1015 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1016 				unsigned long flags)
1017 {
1018 	struct page *head = compound_head(page);
1019 	/*
1020 	 * Avoid migrating a page that is shared with others.
1021 	 */
1022 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1023 		if (!isolate_lru_page(head)) {
1024 			list_add_tail(&head->lru, pagelist);
1025 			mod_node_page_state(page_pgdat(head),
1026 				NR_ISOLATED_ANON + page_is_file_cache(head),
1027 				hpage_nr_pages(head));
1028 		} else if (flags & MPOL_MF_STRICT) {
1029 			/*
1030 			 * Non-movable page may reach here.  And, there may be
1031 			 * temporary off LRU pages or non-LRU movable pages.
1032 			 * Treat them as unmovable pages since they can't be
1033 			 * isolated, so they can't be moved at the moment.  It
1034 			 * should return -EIO for this case too.
1035 			 */
1036 			return -EIO;
1037 		}
1038 	}
1039 
1040 	return 0;
1041 }
1042 
1043 /* page allocation callback for NUMA node migration */
1044 struct page *alloc_new_node_page(struct page *page, unsigned long node)
1045 {
1046 	if (PageHuge(page))
1047 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1048 					node);
1049 	else if (PageTransHuge(page)) {
1050 		struct page *thp;
1051 
1052 		thp = alloc_pages_node(node,
1053 			(GFP_TRANSHUGE | __GFP_THISNODE),
1054 			HPAGE_PMD_ORDER);
1055 		if (!thp)
1056 			return NULL;
1057 		prep_transhuge_page(thp);
1058 		return thp;
1059 	} else
1060 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1061 						    __GFP_THISNODE, 0);
1062 }
1063 
1064 /*
1065  * Migrate pages from one node to a target node.
1066  * Returns error or the number of pages not migrated.
1067  */
1068 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1069 			   int flags)
1070 {
1071 	nodemask_t nmask;
1072 	LIST_HEAD(pagelist);
1073 	int err = 0;
1074 
1075 	nodes_clear(nmask);
1076 	node_set(source, nmask);
1077 
1078 	/*
1079 	 * This does not "check" the range but isolates all pages that
1080 	 * need migration.  Between passing in the full user address
1081 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1082 	 */
1083 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1084 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1085 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1086 
1087 	if (!list_empty(&pagelist)) {
1088 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1089 					MIGRATE_SYNC, MR_SYSCALL);
1090 		if (err)
1091 			putback_movable_pages(&pagelist);
1092 	}
1093 
1094 	return err;
1095 }
1096 
1097 /*
1098  * Move pages between the two nodesets so as to preserve the physical
1099  * layout as much as possible.
1100  *
1101  * Returns the number of page that could not be moved.
1102  */
1103 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1104 		     const nodemask_t *to, int flags)
1105 {
1106 	int busy = 0;
1107 	int err;
1108 	nodemask_t tmp;
1109 
1110 	err = migrate_prep();
1111 	if (err)
1112 		return err;
1113 
1114 	down_read(&mm->mmap_sem);
1115 
1116 	/*
1117 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1118 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1119 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1120 	 * The pair of nodemasks 'to' and 'from' define the map.
1121 	 *
1122 	 * If no pair of bits is found that way, fallback to picking some
1123 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1124 	 * 'source' and 'dest' bits are the same, this represents a node
1125 	 * that will be migrating to itself, so no pages need move.
1126 	 *
1127 	 * If no bits are left in 'tmp', or if all remaining bits left
1128 	 * in 'tmp' correspond to the same bit in 'to', return false
1129 	 * (nothing left to migrate).
1130 	 *
1131 	 * This lets us pick a pair of nodes to migrate between, such that
1132 	 * if possible the dest node is not already occupied by some other
1133 	 * source node, minimizing the risk of overloading the memory on a
1134 	 * node that would happen if we migrated incoming memory to a node
1135 	 * before migrating outgoing memory source that same node.
1136 	 *
1137 	 * A single scan of tmp is sufficient.  As we go, we remember the
1138 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1139 	 * that not only moved, but what's better, moved to an empty slot
1140 	 * (d is not set in tmp), then we break out then, with that pair.
1141 	 * Otherwise when we finish scanning from_tmp, we at least have the
1142 	 * most recent <s, d> pair that moved.  If we get all the way through
1143 	 * the scan of tmp without finding any node that moved, much less
1144 	 * moved to an empty node, then there is nothing left worth migrating.
1145 	 */
1146 
1147 	tmp = *from;
1148 	while (!nodes_empty(tmp)) {
1149 		int s,d;
1150 		int source = NUMA_NO_NODE;
1151 		int dest = 0;
1152 
1153 		for_each_node_mask(s, tmp) {
1154 
1155 			/*
1156 			 * do_migrate_pages() tries to maintain the relative
1157 			 * node relationship of the pages established between
1158 			 * threads and memory areas.
1159                          *
1160 			 * However if the number of source nodes is not equal to
1161 			 * the number of destination nodes we can not preserve
1162 			 * this node relative relationship.  In that case, skip
1163 			 * copying memory from a node that is in the destination
1164 			 * mask.
1165 			 *
1166 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1167 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1168 			 */
1169 
1170 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1171 						(node_isset(s, *to)))
1172 				continue;
1173 
1174 			d = node_remap(s, *from, *to);
1175 			if (s == d)
1176 				continue;
1177 
1178 			source = s;	/* Node moved. Memorize */
1179 			dest = d;
1180 
1181 			/* dest not in remaining from nodes? */
1182 			if (!node_isset(dest, tmp))
1183 				break;
1184 		}
1185 		if (source == NUMA_NO_NODE)
1186 			break;
1187 
1188 		node_clear(source, tmp);
1189 		err = migrate_to_node(mm, source, dest, flags);
1190 		if (err > 0)
1191 			busy += err;
1192 		if (err < 0)
1193 			break;
1194 	}
1195 	up_read(&mm->mmap_sem);
1196 	if (err < 0)
1197 		return err;
1198 	return busy;
1199 
1200 }
1201 
1202 /*
1203  * Allocate a new page for page migration based on vma policy.
1204  * Start by assuming the page is mapped by the same vma as contains @start.
1205  * Search forward from there, if not.  N.B., this assumes that the
1206  * list of pages handed to migrate_pages()--which is how we get here--
1207  * is in virtual address order.
1208  */
1209 static struct page *new_page(struct page *page, unsigned long start)
1210 {
1211 	struct vm_area_struct *vma;
1212 	unsigned long uninitialized_var(address);
1213 
1214 	vma = find_vma(current->mm, start);
1215 	while (vma) {
1216 		address = page_address_in_vma(page, vma);
1217 		if (address != -EFAULT)
1218 			break;
1219 		vma = vma->vm_next;
1220 	}
1221 
1222 	if (PageHuge(page)) {
1223 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1224 				vma, address);
1225 	} else if (PageTransHuge(page)) {
1226 		struct page *thp;
1227 
1228 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1229 					 HPAGE_PMD_ORDER);
1230 		if (!thp)
1231 			return NULL;
1232 		prep_transhuge_page(thp);
1233 		return thp;
1234 	}
1235 	/*
1236 	 * if !vma, alloc_page_vma() will use task or system default policy
1237 	 */
1238 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1239 			vma, address);
1240 }
1241 #else
1242 
1243 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1244 				unsigned long flags)
1245 {
1246 	return -EIO;
1247 }
1248 
1249 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1250 		     const nodemask_t *to, int flags)
1251 {
1252 	return -ENOSYS;
1253 }
1254 
1255 static struct page *new_page(struct page *page, unsigned long start)
1256 {
1257 	return NULL;
1258 }
1259 #endif
1260 
1261 static long do_mbind(unsigned long start, unsigned long len,
1262 		     unsigned short mode, unsigned short mode_flags,
1263 		     nodemask_t *nmask, unsigned long flags)
1264 {
1265 	struct mm_struct *mm = current->mm;
1266 	struct mempolicy *new;
1267 	unsigned long end;
1268 	int err;
1269 	int ret;
1270 	LIST_HEAD(pagelist);
1271 
1272 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1273 		return -EINVAL;
1274 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1275 		return -EPERM;
1276 
1277 	if (start & ~PAGE_MASK)
1278 		return -EINVAL;
1279 
1280 	if (mode == MPOL_DEFAULT)
1281 		flags &= ~MPOL_MF_STRICT;
1282 
1283 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1284 	end = start + len;
1285 
1286 	if (end < start)
1287 		return -EINVAL;
1288 	if (end == start)
1289 		return 0;
1290 
1291 	new = mpol_new(mode, mode_flags, nmask);
1292 	if (IS_ERR(new))
1293 		return PTR_ERR(new);
1294 
1295 	if (flags & MPOL_MF_LAZY)
1296 		new->flags |= MPOL_F_MOF;
1297 
1298 	/*
1299 	 * If we are using the default policy then operation
1300 	 * on discontinuous address spaces is okay after all
1301 	 */
1302 	if (!new)
1303 		flags |= MPOL_MF_DISCONTIG_OK;
1304 
1305 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1306 		 start, start + len, mode, mode_flags,
1307 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1308 
1309 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1310 
1311 		err = migrate_prep();
1312 		if (err)
1313 			goto mpol_out;
1314 	}
1315 	{
1316 		NODEMASK_SCRATCH(scratch);
1317 		if (scratch) {
1318 			down_write(&mm->mmap_sem);
1319 			task_lock(current);
1320 			err = mpol_set_nodemask(new, nmask, scratch);
1321 			task_unlock(current);
1322 			if (err)
1323 				up_write(&mm->mmap_sem);
1324 		} else
1325 			err = -ENOMEM;
1326 		NODEMASK_SCRATCH_FREE(scratch);
1327 	}
1328 	if (err)
1329 		goto mpol_out;
1330 
1331 	ret = queue_pages_range(mm, start, end, nmask,
1332 			  flags | MPOL_MF_INVERT, &pagelist);
1333 
1334 	if (ret < 0) {
1335 		err = ret;
1336 		goto up_out;
1337 	}
1338 
1339 	err = mbind_range(mm, start, end, new);
1340 
1341 	if (!err) {
1342 		int nr_failed = 0;
1343 
1344 		if (!list_empty(&pagelist)) {
1345 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1346 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1347 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1348 			if (nr_failed)
1349 				putback_movable_pages(&pagelist);
1350 		}
1351 
1352 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1353 			err = -EIO;
1354 	} else {
1355 up_out:
1356 		if (!list_empty(&pagelist))
1357 			putback_movable_pages(&pagelist);
1358 	}
1359 
1360 	up_write(&mm->mmap_sem);
1361 mpol_out:
1362 	mpol_put(new);
1363 	return err;
1364 }
1365 
1366 /*
1367  * User space interface with variable sized bitmaps for nodelists.
1368  */
1369 
1370 /* Copy a node mask from user space. */
1371 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1372 		     unsigned long maxnode)
1373 {
1374 	unsigned long k;
1375 	unsigned long t;
1376 	unsigned long nlongs;
1377 	unsigned long endmask;
1378 
1379 	--maxnode;
1380 	nodes_clear(*nodes);
1381 	if (maxnode == 0 || !nmask)
1382 		return 0;
1383 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1384 		return -EINVAL;
1385 
1386 	nlongs = BITS_TO_LONGS(maxnode);
1387 	if ((maxnode % BITS_PER_LONG) == 0)
1388 		endmask = ~0UL;
1389 	else
1390 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1391 
1392 	/*
1393 	 * When the user specified more nodes than supported just check
1394 	 * if the non supported part is all zero.
1395 	 *
1396 	 * If maxnode have more longs than MAX_NUMNODES, check
1397 	 * the bits in that area first. And then go through to
1398 	 * check the rest bits which equal or bigger than MAX_NUMNODES.
1399 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1400 	 */
1401 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1402 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1403 			if (get_user(t, nmask + k))
1404 				return -EFAULT;
1405 			if (k == nlongs - 1) {
1406 				if (t & endmask)
1407 					return -EINVAL;
1408 			} else if (t)
1409 				return -EINVAL;
1410 		}
1411 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1412 		endmask = ~0UL;
1413 	}
1414 
1415 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1416 		unsigned long valid_mask = endmask;
1417 
1418 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1419 		if (get_user(t, nmask + nlongs - 1))
1420 			return -EFAULT;
1421 		if (t & valid_mask)
1422 			return -EINVAL;
1423 	}
1424 
1425 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1426 		return -EFAULT;
1427 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1428 	return 0;
1429 }
1430 
1431 /* Copy a kernel node mask to user space */
1432 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1433 			      nodemask_t *nodes)
1434 {
1435 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1436 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1437 
1438 	if (copy > nbytes) {
1439 		if (copy > PAGE_SIZE)
1440 			return -EINVAL;
1441 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1442 			return -EFAULT;
1443 		copy = nbytes;
1444 	}
1445 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1446 }
1447 
1448 static long kernel_mbind(unsigned long start, unsigned long len,
1449 			 unsigned long mode, const unsigned long __user *nmask,
1450 			 unsigned long maxnode, unsigned int flags)
1451 {
1452 	nodemask_t nodes;
1453 	int err;
1454 	unsigned short mode_flags;
1455 
1456 	start = untagged_addr(start);
1457 	mode_flags = mode & MPOL_MODE_FLAGS;
1458 	mode &= ~MPOL_MODE_FLAGS;
1459 	if (mode >= MPOL_MAX)
1460 		return -EINVAL;
1461 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1462 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1463 		return -EINVAL;
1464 	err = get_nodes(&nodes, nmask, maxnode);
1465 	if (err)
1466 		return err;
1467 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1468 }
1469 
1470 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1471 		unsigned long, mode, const unsigned long __user *, nmask,
1472 		unsigned long, maxnode, unsigned int, flags)
1473 {
1474 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1475 }
1476 
1477 /* Set the process memory policy */
1478 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1479 				 unsigned long maxnode)
1480 {
1481 	int err;
1482 	nodemask_t nodes;
1483 	unsigned short flags;
1484 
1485 	flags = mode & MPOL_MODE_FLAGS;
1486 	mode &= ~MPOL_MODE_FLAGS;
1487 	if ((unsigned int)mode >= MPOL_MAX)
1488 		return -EINVAL;
1489 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1490 		return -EINVAL;
1491 	err = get_nodes(&nodes, nmask, maxnode);
1492 	if (err)
1493 		return err;
1494 	return do_set_mempolicy(mode, flags, &nodes);
1495 }
1496 
1497 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1498 		unsigned long, maxnode)
1499 {
1500 	return kernel_set_mempolicy(mode, nmask, maxnode);
1501 }
1502 
1503 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1504 				const unsigned long __user *old_nodes,
1505 				const unsigned long __user *new_nodes)
1506 {
1507 	struct mm_struct *mm = NULL;
1508 	struct task_struct *task;
1509 	nodemask_t task_nodes;
1510 	int err;
1511 	nodemask_t *old;
1512 	nodemask_t *new;
1513 	NODEMASK_SCRATCH(scratch);
1514 
1515 	if (!scratch)
1516 		return -ENOMEM;
1517 
1518 	old = &scratch->mask1;
1519 	new = &scratch->mask2;
1520 
1521 	err = get_nodes(old, old_nodes, maxnode);
1522 	if (err)
1523 		goto out;
1524 
1525 	err = get_nodes(new, new_nodes, maxnode);
1526 	if (err)
1527 		goto out;
1528 
1529 	/* Find the mm_struct */
1530 	rcu_read_lock();
1531 	task = pid ? find_task_by_vpid(pid) : current;
1532 	if (!task) {
1533 		rcu_read_unlock();
1534 		err = -ESRCH;
1535 		goto out;
1536 	}
1537 	get_task_struct(task);
1538 
1539 	err = -EINVAL;
1540 
1541 	/*
1542 	 * Check if this process has the right to modify the specified process.
1543 	 * Use the regular "ptrace_may_access()" checks.
1544 	 */
1545 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1546 		rcu_read_unlock();
1547 		err = -EPERM;
1548 		goto out_put;
1549 	}
1550 	rcu_read_unlock();
1551 
1552 	task_nodes = cpuset_mems_allowed(task);
1553 	/* Is the user allowed to access the target nodes? */
1554 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1555 		err = -EPERM;
1556 		goto out_put;
1557 	}
1558 
1559 	task_nodes = cpuset_mems_allowed(current);
1560 	nodes_and(*new, *new, task_nodes);
1561 	if (nodes_empty(*new))
1562 		goto out_put;
1563 
1564 	err = security_task_movememory(task);
1565 	if (err)
1566 		goto out_put;
1567 
1568 	mm = get_task_mm(task);
1569 	put_task_struct(task);
1570 
1571 	if (!mm) {
1572 		err = -EINVAL;
1573 		goto out;
1574 	}
1575 
1576 	err = do_migrate_pages(mm, old, new,
1577 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1578 
1579 	mmput(mm);
1580 out:
1581 	NODEMASK_SCRATCH_FREE(scratch);
1582 
1583 	return err;
1584 
1585 out_put:
1586 	put_task_struct(task);
1587 	goto out;
1588 
1589 }
1590 
1591 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1592 		const unsigned long __user *, old_nodes,
1593 		const unsigned long __user *, new_nodes)
1594 {
1595 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1596 }
1597 
1598 
1599 /* Retrieve NUMA policy */
1600 static int kernel_get_mempolicy(int __user *policy,
1601 				unsigned long __user *nmask,
1602 				unsigned long maxnode,
1603 				unsigned long addr,
1604 				unsigned long flags)
1605 {
1606 	int err;
1607 	int uninitialized_var(pval);
1608 	nodemask_t nodes;
1609 
1610 	addr = untagged_addr(addr);
1611 
1612 	if (nmask != NULL && maxnode < nr_node_ids)
1613 		return -EINVAL;
1614 
1615 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1616 
1617 	if (err)
1618 		return err;
1619 
1620 	if (policy && put_user(pval, policy))
1621 		return -EFAULT;
1622 
1623 	if (nmask)
1624 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1625 
1626 	return err;
1627 }
1628 
1629 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1630 		unsigned long __user *, nmask, unsigned long, maxnode,
1631 		unsigned long, addr, unsigned long, flags)
1632 {
1633 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1634 }
1635 
1636 #ifdef CONFIG_COMPAT
1637 
1638 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1639 		       compat_ulong_t __user *, nmask,
1640 		       compat_ulong_t, maxnode,
1641 		       compat_ulong_t, addr, compat_ulong_t, flags)
1642 {
1643 	long err;
1644 	unsigned long __user *nm = NULL;
1645 	unsigned long nr_bits, alloc_size;
1646 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1647 
1648 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1649 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1650 
1651 	if (nmask)
1652 		nm = compat_alloc_user_space(alloc_size);
1653 
1654 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1655 
1656 	if (!err && nmask) {
1657 		unsigned long copy_size;
1658 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1659 		err = copy_from_user(bm, nm, copy_size);
1660 		/* ensure entire bitmap is zeroed */
1661 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1662 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1663 	}
1664 
1665 	return err;
1666 }
1667 
1668 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1669 		       compat_ulong_t, maxnode)
1670 {
1671 	unsigned long __user *nm = NULL;
1672 	unsigned long nr_bits, alloc_size;
1673 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1674 
1675 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1676 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1677 
1678 	if (nmask) {
1679 		if (compat_get_bitmap(bm, nmask, nr_bits))
1680 			return -EFAULT;
1681 		nm = compat_alloc_user_space(alloc_size);
1682 		if (copy_to_user(nm, bm, alloc_size))
1683 			return -EFAULT;
1684 	}
1685 
1686 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1687 }
1688 
1689 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1690 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1691 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1692 {
1693 	unsigned long __user *nm = NULL;
1694 	unsigned long nr_bits, alloc_size;
1695 	nodemask_t bm;
1696 
1697 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1698 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1699 
1700 	if (nmask) {
1701 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1702 			return -EFAULT;
1703 		nm = compat_alloc_user_space(alloc_size);
1704 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1705 			return -EFAULT;
1706 	}
1707 
1708 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1709 }
1710 
1711 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1712 		       compat_ulong_t, maxnode,
1713 		       const compat_ulong_t __user *, old_nodes,
1714 		       const compat_ulong_t __user *, new_nodes)
1715 {
1716 	unsigned long __user *old = NULL;
1717 	unsigned long __user *new = NULL;
1718 	nodemask_t tmp_mask;
1719 	unsigned long nr_bits;
1720 	unsigned long size;
1721 
1722 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1723 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1724 	if (old_nodes) {
1725 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1726 			return -EFAULT;
1727 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1728 		if (new_nodes)
1729 			new = old + size / sizeof(unsigned long);
1730 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1731 			return -EFAULT;
1732 	}
1733 	if (new_nodes) {
1734 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1735 			return -EFAULT;
1736 		if (new == NULL)
1737 			new = compat_alloc_user_space(size);
1738 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1739 			return -EFAULT;
1740 	}
1741 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1742 }
1743 
1744 #endif /* CONFIG_COMPAT */
1745 
1746 bool vma_migratable(struct vm_area_struct *vma)
1747 {
1748 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1749 		return false;
1750 
1751 	/*
1752 	 * DAX device mappings require predictable access latency, so avoid
1753 	 * incurring periodic faults.
1754 	 */
1755 	if (vma_is_dax(vma))
1756 		return false;
1757 
1758 	if (is_vm_hugetlb_page(vma) &&
1759 		!hugepage_migration_supported(hstate_vma(vma)))
1760 		return false;
1761 
1762 	/*
1763 	 * Migration allocates pages in the highest zone. If we cannot
1764 	 * do so then migration (at least from node to node) is not
1765 	 * possible.
1766 	 */
1767 	if (vma->vm_file &&
1768 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1769 			< policy_zone)
1770 		return false;
1771 	return true;
1772 }
1773 
1774 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1775 						unsigned long addr)
1776 {
1777 	struct mempolicy *pol = NULL;
1778 
1779 	if (vma) {
1780 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1781 			pol = vma->vm_ops->get_policy(vma, addr);
1782 		} else if (vma->vm_policy) {
1783 			pol = vma->vm_policy;
1784 
1785 			/*
1786 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1787 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1788 			 * count on these policies which will be dropped by
1789 			 * mpol_cond_put() later
1790 			 */
1791 			if (mpol_needs_cond_ref(pol))
1792 				mpol_get(pol);
1793 		}
1794 	}
1795 
1796 	return pol;
1797 }
1798 
1799 /*
1800  * get_vma_policy(@vma, @addr)
1801  * @vma: virtual memory area whose policy is sought
1802  * @addr: address in @vma for shared policy lookup
1803  *
1804  * Returns effective policy for a VMA at specified address.
1805  * Falls back to current->mempolicy or system default policy, as necessary.
1806  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1807  * count--added by the get_policy() vm_op, as appropriate--to protect against
1808  * freeing by another task.  It is the caller's responsibility to free the
1809  * extra reference for shared policies.
1810  */
1811 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1812 						unsigned long addr)
1813 {
1814 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1815 
1816 	if (!pol)
1817 		pol = get_task_policy(current);
1818 
1819 	return pol;
1820 }
1821 
1822 bool vma_policy_mof(struct vm_area_struct *vma)
1823 {
1824 	struct mempolicy *pol;
1825 
1826 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1827 		bool ret = false;
1828 
1829 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1830 		if (pol && (pol->flags & MPOL_F_MOF))
1831 			ret = true;
1832 		mpol_cond_put(pol);
1833 
1834 		return ret;
1835 	}
1836 
1837 	pol = vma->vm_policy;
1838 	if (!pol)
1839 		pol = get_task_policy(current);
1840 
1841 	return pol->flags & MPOL_F_MOF;
1842 }
1843 
1844 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1845 {
1846 	enum zone_type dynamic_policy_zone = policy_zone;
1847 
1848 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1849 
1850 	/*
1851 	 * if policy->v.nodes has movable memory only,
1852 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1853 	 *
1854 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1855 	 * so if the following test faile, it implies
1856 	 * policy->v.nodes has movable memory only.
1857 	 */
1858 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1859 		dynamic_policy_zone = ZONE_MOVABLE;
1860 
1861 	return zone >= dynamic_policy_zone;
1862 }
1863 
1864 /*
1865  * Return a nodemask representing a mempolicy for filtering nodes for
1866  * page allocation
1867  */
1868 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1869 {
1870 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1871 	if (unlikely(policy->mode == MPOL_BIND) &&
1872 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1873 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1874 		return &policy->v.nodes;
1875 
1876 	return NULL;
1877 }
1878 
1879 /* Return the node id preferred by the given mempolicy, or the given id */
1880 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1881 								int nd)
1882 {
1883 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1884 		nd = policy->v.preferred_node;
1885 	else {
1886 		/*
1887 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1888 		 * because we might easily break the expectation to stay on the
1889 		 * requested node and not break the policy.
1890 		 */
1891 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1892 	}
1893 
1894 	return nd;
1895 }
1896 
1897 /* Do dynamic interleaving for a process */
1898 static unsigned interleave_nodes(struct mempolicy *policy)
1899 {
1900 	unsigned next;
1901 	struct task_struct *me = current;
1902 
1903 	next = next_node_in(me->il_prev, policy->v.nodes);
1904 	if (next < MAX_NUMNODES)
1905 		me->il_prev = next;
1906 	return next;
1907 }
1908 
1909 /*
1910  * Depending on the memory policy provide a node from which to allocate the
1911  * next slab entry.
1912  */
1913 unsigned int mempolicy_slab_node(void)
1914 {
1915 	struct mempolicy *policy;
1916 	int node = numa_mem_id();
1917 
1918 	if (in_interrupt())
1919 		return node;
1920 
1921 	policy = current->mempolicy;
1922 	if (!policy || policy->flags & MPOL_F_LOCAL)
1923 		return node;
1924 
1925 	switch (policy->mode) {
1926 	case MPOL_PREFERRED:
1927 		/*
1928 		 * handled MPOL_F_LOCAL above
1929 		 */
1930 		return policy->v.preferred_node;
1931 
1932 	case MPOL_INTERLEAVE:
1933 		return interleave_nodes(policy);
1934 
1935 	case MPOL_BIND: {
1936 		struct zoneref *z;
1937 
1938 		/*
1939 		 * Follow bind policy behavior and start allocation at the
1940 		 * first node.
1941 		 */
1942 		struct zonelist *zonelist;
1943 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1944 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1945 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1946 							&policy->v.nodes);
1947 		return z->zone ? zone_to_nid(z->zone) : node;
1948 	}
1949 
1950 	default:
1951 		BUG();
1952 	}
1953 }
1954 
1955 /*
1956  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1957  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1958  * number of present nodes.
1959  */
1960 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1961 {
1962 	unsigned nnodes = nodes_weight(pol->v.nodes);
1963 	unsigned target;
1964 	int i;
1965 	int nid;
1966 
1967 	if (!nnodes)
1968 		return numa_node_id();
1969 	target = (unsigned int)n % nnodes;
1970 	nid = first_node(pol->v.nodes);
1971 	for (i = 0; i < target; i++)
1972 		nid = next_node(nid, pol->v.nodes);
1973 	return nid;
1974 }
1975 
1976 /* Determine a node number for interleave */
1977 static inline unsigned interleave_nid(struct mempolicy *pol,
1978 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1979 {
1980 	if (vma) {
1981 		unsigned long off;
1982 
1983 		/*
1984 		 * for small pages, there is no difference between
1985 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1986 		 * for huge pages, since vm_pgoff is in units of small
1987 		 * pages, we need to shift off the always 0 bits to get
1988 		 * a useful offset.
1989 		 */
1990 		BUG_ON(shift < PAGE_SHIFT);
1991 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1992 		off += (addr - vma->vm_start) >> shift;
1993 		return offset_il_node(pol, off);
1994 	} else
1995 		return interleave_nodes(pol);
1996 }
1997 
1998 #ifdef CONFIG_HUGETLBFS
1999 /*
2000  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2001  * @vma: virtual memory area whose policy is sought
2002  * @addr: address in @vma for shared policy lookup and interleave policy
2003  * @gfp_flags: for requested zone
2004  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2005  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2006  *
2007  * Returns a nid suitable for a huge page allocation and a pointer
2008  * to the struct mempolicy for conditional unref after allocation.
2009  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
2010  * @nodemask for filtering the zonelist.
2011  *
2012  * Must be protected by read_mems_allowed_begin()
2013  */
2014 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2015 				struct mempolicy **mpol, nodemask_t **nodemask)
2016 {
2017 	int nid;
2018 
2019 	*mpol = get_vma_policy(vma, addr);
2020 	*nodemask = NULL;	/* assume !MPOL_BIND */
2021 
2022 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2023 		nid = interleave_nid(*mpol, vma, addr,
2024 					huge_page_shift(hstate_vma(vma)));
2025 	} else {
2026 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
2027 		if ((*mpol)->mode == MPOL_BIND)
2028 			*nodemask = &(*mpol)->v.nodes;
2029 	}
2030 	return nid;
2031 }
2032 
2033 /*
2034  * init_nodemask_of_mempolicy
2035  *
2036  * If the current task's mempolicy is "default" [NULL], return 'false'
2037  * to indicate default policy.  Otherwise, extract the policy nodemask
2038  * for 'bind' or 'interleave' policy into the argument nodemask, or
2039  * initialize the argument nodemask to contain the single node for
2040  * 'preferred' or 'local' policy and return 'true' to indicate presence
2041  * of non-default mempolicy.
2042  *
2043  * We don't bother with reference counting the mempolicy [mpol_get/put]
2044  * because the current task is examining it's own mempolicy and a task's
2045  * mempolicy is only ever changed by the task itself.
2046  *
2047  * N.B., it is the caller's responsibility to free a returned nodemask.
2048  */
2049 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2050 {
2051 	struct mempolicy *mempolicy;
2052 	int nid;
2053 
2054 	if (!(mask && current->mempolicy))
2055 		return false;
2056 
2057 	task_lock(current);
2058 	mempolicy = current->mempolicy;
2059 	switch (mempolicy->mode) {
2060 	case MPOL_PREFERRED:
2061 		if (mempolicy->flags & MPOL_F_LOCAL)
2062 			nid = numa_node_id();
2063 		else
2064 			nid = mempolicy->v.preferred_node;
2065 		init_nodemask_of_node(mask, nid);
2066 		break;
2067 
2068 	case MPOL_BIND:
2069 		/* Fall through */
2070 	case MPOL_INTERLEAVE:
2071 		*mask =  mempolicy->v.nodes;
2072 		break;
2073 
2074 	default:
2075 		BUG();
2076 	}
2077 	task_unlock(current);
2078 
2079 	return true;
2080 }
2081 #endif
2082 
2083 /*
2084  * mempolicy_nodemask_intersects
2085  *
2086  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2087  * policy.  Otherwise, check for intersection between mask and the policy
2088  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
2089  * policy, always return true since it may allocate elsewhere on fallback.
2090  *
2091  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2092  */
2093 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2094 					const nodemask_t *mask)
2095 {
2096 	struct mempolicy *mempolicy;
2097 	bool ret = true;
2098 
2099 	if (!mask)
2100 		return ret;
2101 	task_lock(tsk);
2102 	mempolicy = tsk->mempolicy;
2103 	if (!mempolicy)
2104 		goto out;
2105 
2106 	switch (mempolicy->mode) {
2107 	case MPOL_PREFERRED:
2108 		/*
2109 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2110 		 * allocate from, they may fallback to other nodes when oom.
2111 		 * Thus, it's possible for tsk to have allocated memory from
2112 		 * nodes in mask.
2113 		 */
2114 		break;
2115 	case MPOL_BIND:
2116 	case MPOL_INTERLEAVE:
2117 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2118 		break;
2119 	default:
2120 		BUG();
2121 	}
2122 out:
2123 	task_unlock(tsk);
2124 	return ret;
2125 }
2126 
2127 /* Allocate a page in interleaved policy.
2128    Own path because it needs to do special accounting. */
2129 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2130 					unsigned nid)
2131 {
2132 	struct page *page;
2133 
2134 	page = __alloc_pages(gfp, order, nid);
2135 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2136 	if (!static_branch_likely(&vm_numa_stat_key))
2137 		return page;
2138 	if (page && page_to_nid(page) == nid) {
2139 		preempt_disable();
2140 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2141 		preempt_enable();
2142 	}
2143 	return page;
2144 }
2145 
2146 /**
2147  * 	alloc_pages_vma	- Allocate a page for a VMA.
2148  *
2149  * 	@gfp:
2150  *      %GFP_USER    user allocation.
2151  *      %GFP_KERNEL  kernel allocations,
2152  *      %GFP_HIGHMEM highmem/user allocations,
2153  *      %GFP_FS      allocation should not call back into a file system.
2154  *      %GFP_ATOMIC  don't sleep.
2155  *
2156  *	@order:Order of the GFP allocation.
2157  * 	@vma:  Pointer to VMA or NULL if not available.
2158  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2159  *	@node: Which node to prefer for allocation (modulo policy).
2160  *	@hugepage: for hugepages try only the preferred node if possible
2161  *
2162  * 	This function allocates a page from the kernel page pool and applies
2163  *	a NUMA policy associated with the VMA or the current process.
2164  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2165  *	mm_struct of the VMA to prevent it from going away. Should be used for
2166  *	all allocations for pages that will be mapped into user space. Returns
2167  *	NULL when no page can be allocated.
2168  */
2169 struct page *
2170 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2171 		unsigned long addr, int node, bool hugepage)
2172 {
2173 	struct mempolicy *pol;
2174 	struct page *page;
2175 	int preferred_nid;
2176 	nodemask_t *nmask;
2177 
2178 	pol = get_vma_policy(vma, addr);
2179 
2180 	if (pol->mode == MPOL_INTERLEAVE) {
2181 		unsigned nid;
2182 
2183 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2184 		mpol_cond_put(pol);
2185 		page = alloc_page_interleave(gfp, order, nid);
2186 		goto out;
2187 	}
2188 
2189 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2190 		int hpage_node = node;
2191 
2192 		/*
2193 		 * For hugepage allocation and non-interleave policy which
2194 		 * allows the current node (or other explicitly preferred
2195 		 * node) we only try to allocate from the current/preferred
2196 		 * node and don't fall back to other nodes, as the cost of
2197 		 * remote accesses would likely offset THP benefits.
2198 		 *
2199 		 * If the policy is interleave, or does not allow the current
2200 		 * node in its nodemask, we allocate the standard way.
2201 		 */
2202 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2203 			hpage_node = pol->v.preferred_node;
2204 
2205 		nmask = policy_nodemask(gfp, pol);
2206 		if (!nmask || node_isset(hpage_node, *nmask)) {
2207 			mpol_cond_put(pol);
2208 			/*
2209 			 * First, try to allocate THP only on local node, but
2210 			 * don't reclaim unnecessarily, just compact.
2211 			 */
2212 			page = __alloc_pages_node(hpage_node,
2213 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2214 
2215 			/*
2216 			 * If hugepage allocations are configured to always
2217 			 * synchronous compact or the vma has been madvised
2218 			 * to prefer hugepage backing, retry allowing remote
2219 			 * memory with both reclaim and compact as well.
2220 			 */
2221 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2222 				page = __alloc_pages_node(hpage_node,
2223 								gfp, order);
2224 
2225 			goto out;
2226 		}
2227 	}
2228 
2229 	nmask = policy_nodemask(gfp, pol);
2230 	preferred_nid = policy_node(gfp, pol, node);
2231 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2232 	mpol_cond_put(pol);
2233 out:
2234 	return page;
2235 }
2236 EXPORT_SYMBOL(alloc_pages_vma);
2237 
2238 /**
2239  * 	alloc_pages_current - Allocate pages.
2240  *
2241  *	@gfp:
2242  *		%GFP_USER   user allocation,
2243  *      	%GFP_KERNEL kernel allocation,
2244  *      	%GFP_HIGHMEM highmem allocation,
2245  *      	%GFP_FS     don't call back into a file system.
2246  *      	%GFP_ATOMIC don't sleep.
2247  *	@order: Power of two of allocation size in pages. 0 is a single page.
2248  *
2249  *	Allocate a page from the kernel page pool.  When not in
2250  *	interrupt context and apply the current process NUMA policy.
2251  *	Returns NULL when no page can be allocated.
2252  */
2253 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2254 {
2255 	struct mempolicy *pol = &default_policy;
2256 	struct page *page;
2257 
2258 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2259 		pol = get_task_policy(current);
2260 
2261 	/*
2262 	 * No reference counting needed for current->mempolicy
2263 	 * nor system default_policy
2264 	 */
2265 	if (pol->mode == MPOL_INTERLEAVE)
2266 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2267 	else
2268 		page = __alloc_pages_nodemask(gfp, order,
2269 				policy_node(gfp, pol, numa_node_id()),
2270 				policy_nodemask(gfp, pol));
2271 
2272 	return page;
2273 }
2274 EXPORT_SYMBOL(alloc_pages_current);
2275 
2276 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2277 {
2278 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2279 
2280 	if (IS_ERR(pol))
2281 		return PTR_ERR(pol);
2282 	dst->vm_policy = pol;
2283 	return 0;
2284 }
2285 
2286 /*
2287  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2288  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2289  * with the mems_allowed returned by cpuset_mems_allowed().  This
2290  * keeps mempolicies cpuset relative after its cpuset moves.  See
2291  * further kernel/cpuset.c update_nodemask().
2292  *
2293  * current's mempolicy may be rebinded by the other task(the task that changes
2294  * cpuset's mems), so we needn't do rebind work for current task.
2295  */
2296 
2297 /* Slow path of a mempolicy duplicate */
2298 struct mempolicy *__mpol_dup(struct mempolicy *old)
2299 {
2300 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2301 
2302 	if (!new)
2303 		return ERR_PTR(-ENOMEM);
2304 
2305 	/* task's mempolicy is protected by alloc_lock */
2306 	if (old == current->mempolicy) {
2307 		task_lock(current);
2308 		*new = *old;
2309 		task_unlock(current);
2310 	} else
2311 		*new = *old;
2312 
2313 	if (current_cpuset_is_being_rebound()) {
2314 		nodemask_t mems = cpuset_mems_allowed(current);
2315 		mpol_rebind_policy(new, &mems);
2316 	}
2317 	atomic_set(&new->refcnt, 1);
2318 	return new;
2319 }
2320 
2321 /* Slow path of a mempolicy comparison */
2322 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2323 {
2324 	if (!a || !b)
2325 		return false;
2326 	if (a->mode != b->mode)
2327 		return false;
2328 	if (a->flags != b->flags)
2329 		return false;
2330 	if (mpol_store_user_nodemask(a))
2331 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2332 			return false;
2333 
2334 	switch (a->mode) {
2335 	case MPOL_BIND:
2336 		/* Fall through */
2337 	case MPOL_INTERLEAVE:
2338 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2339 	case MPOL_PREFERRED:
2340 		/* a's ->flags is the same as b's */
2341 		if (a->flags & MPOL_F_LOCAL)
2342 			return true;
2343 		return a->v.preferred_node == b->v.preferred_node;
2344 	default:
2345 		BUG();
2346 		return false;
2347 	}
2348 }
2349 
2350 /*
2351  * Shared memory backing store policy support.
2352  *
2353  * Remember policies even when nobody has shared memory mapped.
2354  * The policies are kept in Red-Black tree linked from the inode.
2355  * They are protected by the sp->lock rwlock, which should be held
2356  * for any accesses to the tree.
2357  */
2358 
2359 /*
2360  * lookup first element intersecting start-end.  Caller holds sp->lock for
2361  * reading or for writing
2362  */
2363 static struct sp_node *
2364 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2365 {
2366 	struct rb_node *n = sp->root.rb_node;
2367 
2368 	while (n) {
2369 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2370 
2371 		if (start >= p->end)
2372 			n = n->rb_right;
2373 		else if (end <= p->start)
2374 			n = n->rb_left;
2375 		else
2376 			break;
2377 	}
2378 	if (!n)
2379 		return NULL;
2380 	for (;;) {
2381 		struct sp_node *w = NULL;
2382 		struct rb_node *prev = rb_prev(n);
2383 		if (!prev)
2384 			break;
2385 		w = rb_entry(prev, struct sp_node, nd);
2386 		if (w->end <= start)
2387 			break;
2388 		n = prev;
2389 	}
2390 	return rb_entry(n, struct sp_node, nd);
2391 }
2392 
2393 /*
2394  * Insert a new shared policy into the list.  Caller holds sp->lock for
2395  * writing.
2396  */
2397 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2398 {
2399 	struct rb_node **p = &sp->root.rb_node;
2400 	struct rb_node *parent = NULL;
2401 	struct sp_node *nd;
2402 
2403 	while (*p) {
2404 		parent = *p;
2405 		nd = rb_entry(parent, struct sp_node, nd);
2406 		if (new->start < nd->start)
2407 			p = &(*p)->rb_left;
2408 		else if (new->end > nd->end)
2409 			p = &(*p)->rb_right;
2410 		else
2411 			BUG();
2412 	}
2413 	rb_link_node(&new->nd, parent, p);
2414 	rb_insert_color(&new->nd, &sp->root);
2415 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2416 		 new->policy ? new->policy->mode : 0);
2417 }
2418 
2419 /* Find shared policy intersecting idx */
2420 struct mempolicy *
2421 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2422 {
2423 	struct mempolicy *pol = NULL;
2424 	struct sp_node *sn;
2425 
2426 	if (!sp->root.rb_node)
2427 		return NULL;
2428 	read_lock(&sp->lock);
2429 	sn = sp_lookup(sp, idx, idx+1);
2430 	if (sn) {
2431 		mpol_get(sn->policy);
2432 		pol = sn->policy;
2433 	}
2434 	read_unlock(&sp->lock);
2435 	return pol;
2436 }
2437 
2438 static void sp_free(struct sp_node *n)
2439 {
2440 	mpol_put(n->policy);
2441 	kmem_cache_free(sn_cache, n);
2442 }
2443 
2444 /**
2445  * mpol_misplaced - check whether current page node is valid in policy
2446  *
2447  * @page: page to be checked
2448  * @vma: vm area where page mapped
2449  * @addr: virtual address where page mapped
2450  *
2451  * Lookup current policy node id for vma,addr and "compare to" page's
2452  * node id.
2453  *
2454  * Returns:
2455  *	-1	- not misplaced, page is in the right node
2456  *	node	- node id where the page should be
2457  *
2458  * Policy determination "mimics" alloc_page_vma().
2459  * Called from fault path where we know the vma and faulting address.
2460  */
2461 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2462 {
2463 	struct mempolicy *pol;
2464 	struct zoneref *z;
2465 	int curnid = page_to_nid(page);
2466 	unsigned long pgoff;
2467 	int thiscpu = raw_smp_processor_id();
2468 	int thisnid = cpu_to_node(thiscpu);
2469 	int polnid = NUMA_NO_NODE;
2470 	int ret = -1;
2471 
2472 	pol = get_vma_policy(vma, addr);
2473 	if (!(pol->flags & MPOL_F_MOF))
2474 		goto out;
2475 
2476 	switch (pol->mode) {
2477 	case MPOL_INTERLEAVE:
2478 		pgoff = vma->vm_pgoff;
2479 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2480 		polnid = offset_il_node(pol, pgoff);
2481 		break;
2482 
2483 	case MPOL_PREFERRED:
2484 		if (pol->flags & MPOL_F_LOCAL)
2485 			polnid = numa_node_id();
2486 		else
2487 			polnid = pol->v.preferred_node;
2488 		break;
2489 
2490 	case MPOL_BIND:
2491 
2492 		/*
2493 		 * allows binding to multiple nodes.
2494 		 * use current page if in policy nodemask,
2495 		 * else select nearest allowed node, if any.
2496 		 * If no allowed nodes, use current [!misplaced].
2497 		 */
2498 		if (node_isset(curnid, pol->v.nodes))
2499 			goto out;
2500 		z = first_zones_zonelist(
2501 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2502 				gfp_zone(GFP_HIGHUSER),
2503 				&pol->v.nodes);
2504 		polnid = zone_to_nid(z->zone);
2505 		break;
2506 
2507 	default:
2508 		BUG();
2509 	}
2510 
2511 	/* Migrate the page towards the node whose CPU is referencing it */
2512 	if (pol->flags & MPOL_F_MORON) {
2513 		polnid = thisnid;
2514 
2515 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2516 			goto out;
2517 	}
2518 
2519 	if (curnid != polnid)
2520 		ret = polnid;
2521 out:
2522 	mpol_cond_put(pol);
2523 
2524 	return ret;
2525 }
2526 
2527 /*
2528  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2529  * dropped after task->mempolicy is set to NULL so that any allocation done as
2530  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2531  * policy.
2532  */
2533 void mpol_put_task_policy(struct task_struct *task)
2534 {
2535 	struct mempolicy *pol;
2536 
2537 	task_lock(task);
2538 	pol = task->mempolicy;
2539 	task->mempolicy = NULL;
2540 	task_unlock(task);
2541 	mpol_put(pol);
2542 }
2543 
2544 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2545 {
2546 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2547 	rb_erase(&n->nd, &sp->root);
2548 	sp_free(n);
2549 }
2550 
2551 static void sp_node_init(struct sp_node *node, unsigned long start,
2552 			unsigned long end, struct mempolicy *pol)
2553 {
2554 	node->start = start;
2555 	node->end = end;
2556 	node->policy = pol;
2557 }
2558 
2559 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2560 				struct mempolicy *pol)
2561 {
2562 	struct sp_node *n;
2563 	struct mempolicy *newpol;
2564 
2565 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2566 	if (!n)
2567 		return NULL;
2568 
2569 	newpol = mpol_dup(pol);
2570 	if (IS_ERR(newpol)) {
2571 		kmem_cache_free(sn_cache, n);
2572 		return NULL;
2573 	}
2574 	newpol->flags |= MPOL_F_SHARED;
2575 	sp_node_init(n, start, end, newpol);
2576 
2577 	return n;
2578 }
2579 
2580 /* Replace a policy range. */
2581 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2582 				 unsigned long end, struct sp_node *new)
2583 {
2584 	struct sp_node *n;
2585 	struct sp_node *n_new = NULL;
2586 	struct mempolicy *mpol_new = NULL;
2587 	int ret = 0;
2588 
2589 restart:
2590 	write_lock(&sp->lock);
2591 	n = sp_lookup(sp, start, end);
2592 	/* Take care of old policies in the same range. */
2593 	while (n && n->start < end) {
2594 		struct rb_node *next = rb_next(&n->nd);
2595 		if (n->start >= start) {
2596 			if (n->end <= end)
2597 				sp_delete(sp, n);
2598 			else
2599 				n->start = end;
2600 		} else {
2601 			/* Old policy spanning whole new range. */
2602 			if (n->end > end) {
2603 				if (!n_new)
2604 					goto alloc_new;
2605 
2606 				*mpol_new = *n->policy;
2607 				atomic_set(&mpol_new->refcnt, 1);
2608 				sp_node_init(n_new, end, n->end, mpol_new);
2609 				n->end = start;
2610 				sp_insert(sp, n_new);
2611 				n_new = NULL;
2612 				mpol_new = NULL;
2613 				break;
2614 			} else
2615 				n->end = start;
2616 		}
2617 		if (!next)
2618 			break;
2619 		n = rb_entry(next, struct sp_node, nd);
2620 	}
2621 	if (new)
2622 		sp_insert(sp, new);
2623 	write_unlock(&sp->lock);
2624 	ret = 0;
2625 
2626 err_out:
2627 	if (mpol_new)
2628 		mpol_put(mpol_new);
2629 	if (n_new)
2630 		kmem_cache_free(sn_cache, n_new);
2631 
2632 	return ret;
2633 
2634 alloc_new:
2635 	write_unlock(&sp->lock);
2636 	ret = -ENOMEM;
2637 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2638 	if (!n_new)
2639 		goto err_out;
2640 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2641 	if (!mpol_new)
2642 		goto err_out;
2643 	goto restart;
2644 }
2645 
2646 /**
2647  * mpol_shared_policy_init - initialize shared policy for inode
2648  * @sp: pointer to inode shared policy
2649  * @mpol:  struct mempolicy to install
2650  *
2651  * Install non-NULL @mpol in inode's shared policy rb-tree.
2652  * On entry, the current task has a reference on a non-NULL @mpol.
2653  * This must be released on exit.
2654  * This is called at get_inode() calls and we can use GFP_KERNEL.
2655  */
2656 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2657 {
2658 	int ret;
2659 
2660 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2661 	rwlock_init(&sp->lock);
2662 
2663 	if (mpol) {
2664 		struct vm_area_struct pvma;
2665 		struct mempolicy *new;
2666 		NODEMASK_SCRATCH(scratch);
2667 
2668 		if (!scratch)
2669 			goto put_mpol;
2670 		/* contextualize the tmpfs mount point mempolicy */
2671 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2672 		if (IS_ERR(new))
2673 			goto free_scratch; /* no valid nodemask intersection */
2674 
2675 		task_lock(current);
2676 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2677 		task_unlock(current);
2678 		if (ret)
2679 			goto put_new;
2680 
2681 		/* Create pseudo-vma that contains just the policy */
2682 		vma_init(&pvma, NULL);
2683 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2684 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2685 
2686 put_new:
2687 		mpol_put(new);			/* drop initial ref */
2688 free_scratch:
2689 		NODEMASK_SCRATCH_FREE(scratch);
2690 put_mpol:
2691 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2692 	}
2693 }
2694 
2695 int mpol_set_shared_policy(struct shared_policy *info,
2696 			struct vm_area_struct *vma, struct mempolicy *npol)
2697 {
2698 	int err;
2699 	struct sp_node *new = NULL;
2700 	unsigned long sz = vma_pages(vma);
2701 
2702 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2703 		 vma->vm_pgoff,
2704 		 sz, npol ? npol->mode : -1,
2705 		 npol ? npol->flags : -1,
2706 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2707 
2708 	if (npol) {
2709 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2710 		if (!new)
2711 			return -ENOMEM;
2712 	}
2713 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2714 	if (err && new)
2715 		sp_free(new);
2716 	return err;
2717 }
2718 
2719 /* Free a backing policy store on inode delete. */
2720 void mpol_free_shared_policy(struct shared_policy *p)
2721 {
2722 	struct sp_node *n;
2723 	struct rb_node *next;
2724 
2725 	if (!p->root.rb_node)
2726 		return;
2727 	write_lock(&p->lock);
2728 	next = rb_first(&p->root);
2729 	while (next) {
2730 		n = rb_entry(next, struct sp_node, nd);
2731 		next = rb_next(&n->nd);
2732 		sp_delete(p, n);
2733 	}
2734 	write_unlock(&p->lock);
2735 }
2736 
2737 #ifdef CONFIG_NUMA_BALANCING
2738 static int __initdata numabalancing_override;
2739 
2740 static void __init check_numabalancing_enable(void)
2741 {
2742 	bool numabalancing_default = false;
2743 
2744 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2745 		numabalancing_default = true;
2746 
2747 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2748 	if (numabalancing_override)
2749 		set_numabalancing_state(numabalancing_override == 1);
2750 
2751 	if (num_online_nodes() > 1 && !numabalancing_override) {
2752 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2753 			numabalancing_default ? "Enabling" : "Disabling");
2754 		set_numabalancing_state(numabalancing_default);
2755 	}
2756 }
2757 
2758 static int __init setup_numabalancing(char *str)
2759 {
2760 	int ret = 0;
2761 	if (!str)
2762 		goto out;
2763 
2764 	if (!strcmp(str, "enable")) {
2765 		numabalancing_override = 1;
2766 		ret = 1;
2767 	} else if (!strcmp(str, "disable")) {
2768 		numabalancing_override = -1;
2769 		ret = 1;
2770 	}
2771 out:
2772 	if (!ret)
2773 		pr_warn("Unable to parse numa_balancing=\n");
2774 
2775 	return ret;
2776 }
2777 __setup("numa_balancing=", setup_numabalancing);
2778 #else
2779 static inline void __init check_numabalancing_enable(void)
2780 {
2781 }
2782 #endif /* CONFIG_NUMA_BALANCING */
2783 
2784 /* assumes fs == KERNEL_DS */
2785 void __init numa_policy_init(void)
2786 {
2787 	nodemask_t interleave_nodes;
2788 	unsigned long largest = 0;
2789 	int nid, prefer = 0;
2790 
2791 	policy_cache = kmem_cache_create("numa_policy",
2792 					 sizeof(struct mempolicy),
2793 					 0, SLAB_PANIC, NULL);
2794 
2795 	sn_cache = kmem_cache_create("shared_policy_node",
2796 				     sizeof(struct sp_node),
2797 				     0, SLAB_PANIC, NULL);
2798 
2799 	for_each_node(nid) {
2800 		preferred_node_policy[nid] = (struct mempolicy) {
2801 			.refcnt = ATOMIC_INIT(1),
2802 			.mode = MPOL_PREFERRED,
2803 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2804 			.v = { .preferred_node = nid, },
2805 		};
2806 	}
2807 
2808 	/*
2809 	 * Set interleaving policy for system init. Interleaving is only
2810 	 * enabled across suitably sized nodes (default is >= 16MB), or
2811 	 * fall back to the largest node if they're all smaller.
2812 	 */
2813 	nodes_clear(interleave_nodes);
2814 	for_each_node_state(nid, N_MEMORY) {
2815 		unsigned long total_pages = node_present_pages(nid);
2816 
2817 		/* Preserve the largest node */
2818 		if (largest < total_pages) {
2819 			largest = total_pages;
2820 			prefer = nid;
2821 		}
2822 
2823 		/* Interleave this node? */
2824 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2825 			node_set(nid, interleave_nodes);
2826 	}
2827 
2828 	/* All too small, use the largest */
2829 	if (unlikely(nodes_empty(interleave_nodes)))
2830 		node_set(prefer, interleave_nodes);
2831 
2832 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2833 		pr_err("%s: interleaving failed\n", __func__);
2834 
2835 	check_numabalancing_enable();
2836 }
2837 
2838 /* Reset policy of current process to default */
2839 void numa_default_policy(void)
2840 {
2841 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2842 }
2843 
2844 /*
2845  * Parse and format mempolicy from/to strings
2846  */
2847 
2848 /*
2849  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2850  */
2851 static const char * const policy_modes[] =
2852 {
2853 	[MPOL_DEFAULT]    = "default",
2854 	[MPOL_PREFERRED]  = "prefer",
2855 	[MPOL_BIND]       = "bind",
2856 	[MPOL_INTERLEAVE] = "interleave",
2857 	[MPOL_LOCAL]      = "local",
2858 };
2859 
2860 
2861 #ifdef CONFIG_TMPFS
2862 /**
2863  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2864  * @str:  string containing mempolicy to parse
2865  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2866  *
2867  * Format of input:
2868  *	<mode>[=<flags>][:<nodelist>]
2869  *
2870  * On success, returns 0, else 1
2871  */
2872 int mpol_parse_str(char *str, struct mempolicy **mpol)
2873 {
2874 	struct mempolicy *new = NULL;
2875 	unsigned short mode_flags;
2876 	nodemask_t nodes;
2877 	char *nodelist = strchr(str, ':');
2878 	char *flags = strchr(str, '=');
2879 	int err = 1, mode;
2880 
2881 	if (flags)
2882 		*flags++ = '\0';	/* terminate mode string */
2883 
2884 	if (nodelist) {
2885 		/* NUL-terminate mode or flags string */
2886 		*nodelist++ = '\0';
2887 		if (nodelist_parse(nodelist, nodes))
2888 			goto out;
2889 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2890 			goto out;
2891 	} else
2892 		nodes_clear(nodes);
2893 
2894 	mode = match_string(policy_modes, MPOL_MAX, str);
2895 	if (mode < 0)
2896 		goto out;
2897 
2898 	switch (mode) {
2899 	case MPOL_PREFERRED:
2900 		/*
2901 		 * Insist on a nodelist of one node only, although later
2902 		 * we use first_node(nodes) to grab a single node, so here
2903 		 * nodelist (or nodes) cannot be empty.
2904 		 */
2905 		if (nodelist) {
2906 			char *rest = nodelist;
2907 			while (isdigit(*rest))
2908 				rest++;
2909 			if (*rest)
2910 				goto out;
2911 			if (nodes_empty(nodes))
2912 				goto out;
2913 		}
2914 		break;
2915 	case MPOL_INTERLEAVE:
2916 		/*
2917 		 * Default to online nodes with memory if no nodelist
2918 		 */
2919 		if (!nodelist)
2920 			nodes = node_states[N_MEMORY];
2921 		break;
2922 	case MPOL_LOCAL:
2923 		/*
2924 		 * Don't allow a nodelist;  mpol_new() checks flags
2925 		 */
2926 		if (nodelist)
2927 			goto out;
2928 		mode = MPOL_PREFERRED;
2929 		break;
2930 	case MPOL_DEFAULT:
2931 		/*
2932 		 * Insist on a empty nodelist
2933 		 */
2934 		if (!nodelist)
2935 			err = 0;
2936 		goto out;
2937 	case MPOL_BIND:
2938 		/*
2939 		 * Insist on a nodelist
2940 		 */
2941 		if (!nodelist)
2942 			goto out;
2943 	}
2944 
2945 	mode_flags = 0;
2946 	if (flags) {
2947 		/*
2948 		 * Currently, we only support two mutually exclusive
2949 		 * mode flags.
2950 		 */
2951 		if (!strcmp(flags, "static"))
2952 			mode_flags |= MPOL_F_STATIC_NODES;
2953 		else if (!strcmp(flags, "relative"))
2954 			mode_flags |= MPOL_F_RELATIVE_NODES;
2955 		else
2956 			goto out;
2957 	}
2958 
2959 	new = mpol_new(mode, mode_flags, &nodes);
2960 	if (IS_ERR(new))
2961 		goto out;
2962 
2963 	/*
2964 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2965 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2966 	 */
2967 	if (mode != MPOL_PREFERRED)
2968 		new->v.nodes = nodes;
2969 	else if (nodelist)
2970 		new->v.preferred_node = first_node(nodes);
2971 	else
2972 		new->flags |= MPOL_F_LOCAL;
2973 
2974 	/*
2975 	 * Save nodes for contextualization: this will be used to "clone"
2976 	 * the mempolicy in a specific context [cpuset] at a later time.
2977 	 */
2978 	new->w.user_nodemask = nodes;
2979 
2980 	err = 0;
2981 
2982 out:
2983 	/* Restore string for error message */
2984 	if (nodelist)
2985 		*--nodelist = ':';
2986 	if (flags)
2987 		*--flags = '=';
2988 	if (!err)
2989 		*mpol = new;
2990 	return err;
2991 }
2992 #endif /* CONFIG_TMPFS */
2993 
2994 /**
2995  * mpol_to_str - format a mempolicy structure for printing
2996  * @buffer:  to contain formatted mempolicy string
2997  * @maxlen:  length of @buffer
2998  * @pol:  pointer to mempolicy to be formatted
2999  *
3000  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3001  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3002  * longest flag, "relative", and to display at least a few node ids.
3003  */
3004 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3005 {
3006 	char *p = buffer;
3007 	nodemask_t nodes = NODE_MASK_NONE;
3008 	unsigned short mode = MPOL_DEFAULT;
3009 	unsigned short flags = 0;
3010 
3011 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3012 		mode = pol->mode;
3013 		flags = pol->flags;
3014 	}
3015 
3016 	switch (mode) {
3017 	case MPOL_DEFAULT:
3018 		break;
3019 	case MPOL_PREFERRED:
3020 		if (flags & MPOL_F_LOCAL)
3021 			mode = MPOL_LOCAL;
3022 		else
3023 			node_set(pol->v.preferred_node, nodes);
3024 		break;
3025 	case MPOL_BIND:
3026 	case MPOL_INTERLEAVE:
3027 		nodes = pol->v.nodes;
3028 		break;
3029 	default:
3030 		WARN_ON_ONCE(1);
3031 		snprintf(p, maxlen, "unknown");
3032 		return;
3033 	}
3034 
3035 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3036 
3037 	if (flags & MPOL_MODE_FLAGS) {
3038 		p += snprintf(p, buffer + maxlen - p, "=");
3039 
3040 		/*
3041 		 * Currently, the only defined flags are mutually exclusive
3042 		 */
3043 		if (flags & MPOL_F_STATIC_NODES)
3044 			p += snprintf(p, buffer + maxlen - p, "static");
3045 		else if (flags & MPOL_F_RELATIVE_NODES)
3046 			p += snprintf(p, buffer + maxlen - p, "relative");
3047 	}
3048 
3049 	if (!nodes_empty(nodes))
3050 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3051 			       nodemask_pr_args(&nodes));
3052 }
3053