xref: /linux/mm/mempolicy.c (revision 06bd48b6cd97ef3889b68c8e09014d81dbc463f1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
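
/*
 * Illustrative userspace sketch (not part of this file) of how the policies
 * above are typically selected, via the set_mempolicy() and mbind() syscalls
 * (declared in libnuma's <numaif.h>).  The node numbers and the addr/length
 * variables are assumptions chosen only for the example.
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);	// nodes {0,1}
 *	unsigned long node0   = 1UL << 0;			// node {0}
 *
 *	// Interleave all future allocations of this process over nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01) + 1);
 *
 *	// Restrict an existing mapping to node 0 only.
 *	mbind(addr, length, MPOL_BIND, &node0, 8 * sizeof(node0) + 1, 0);
 */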
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
108 /* Internal flags */
109 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. A specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
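
/*
 * Worked example (node numbers chosen for illustration only): with a
 * user-supplied relative mask of {0,2} and an allowed mask of {4,5,6,7},
 * nodes_fold() leaves {0,2} unchanged (both bits already fall within the
 * 4-node weight) and nodes_onto() maps them onto the first and third
 * allowed nodes, giving a result of {4,6}.
 */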
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding mmap_sem for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some checks and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = *nodes;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires the task
362  * pointer, and updates the task's mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static int migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	unsigned long start;
414 	unsigned long end;
415 	struct vm_area_struct *first;
416 };
417 
418 /*
419  * Check if the page's nid is in qp->nmask.
420  *
421  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
422  * in the complement of qp->nmask.
423  */
424 static inline bool queue_pages_required(struct page *page,
425 					struct queue_pages *qp)
426 {
427 	int nid = page_to_nid(page);
428 	unsigned long flags = qp->flags;
429 
430 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
431 }
432 
433 /*
434  * queue_pages_pmd() has four possible return values:
435  * 0 - pages are placed on the right node or queued successfully.
436  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
437  *     specified.
438  * 2 - THP was split.
439  * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was
440  *        specified and an existing page was already on a node that does
441  *        not follow the policy.
442  */
443 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
444 				unsigned long end, struct mm_walk *walk)
445 	__releases(ptl)
446 {
447 	int ret = 0;
448 	struct page *page;
449 	struct queue_pages *qp = walk->private;
450 	unsigned long flags;
451 
452 	if (unlikely(is_pmd_migration_entry(*pmd))) {
453 		ret = -EIO;
454 		goto unlock;
455 	}
456 	page = pmd_page(*pmd);
457 	if (is_huge_zero_page(page)) {
458 		spin_unlock(ptl);
459 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
460 		ret = 2;
461 		goto out;
462 	}
463 	if (!queue_pages_required(page, qp))
464 		goto unlock;
465 
466 	flags = qp->flags;
467 	/* go to thp migration */
468 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
469 		if (!vma_migratable(walk->vma) ||
470 		    migrate_page_add(page, qp->pagelist, flags)) {
471 			ret = 1;
472 			goto unlock;
473 		}
474 	} else
475 		ret = -EIO;
476 unlock:
477 	spin_unlock(ptl);
478 out:
479 	return ret;
480 }
481 
482 /*
483  * Scan through the pages, checking if they follow certain conditions,
484  * and move them to the pagelist if they do.
485  *
486  * queue_pages_pte_range() has three possible return values:
487  * 0 - pages are placed on the right node or queued successfully.
488  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
489  *     specified.
490  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
491  *        on a node that does not follow the policy.
492  */
493 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
494 			unsigned long end, struct mm_walk *walk)
495 {
496 	struct vm_area_struct *vma = walk->vma;
497 	struct page *page;
498 	struct queue_pages *qp = walk->private;
499 	unsigned long flags = qp->flags;
500 	int ret;
501 	bool has_unmovable = false;
502 	pte_t *pte;
503 	spinlock_t *ptl;
504 
505 	ptl = pmd_trans_huge_lock(pmd, vma);
506 	if (ptl) {
507 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
508 		if (ret != 2)
509 			return ret;
510 	}
511 	/* THP was split, fall through to pte walk */
512 
513 	if (pmd_trans_unstable(pmd))
514 		return 0;
515 
516 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
517 	for (; addr != end; pte++, addr += PAGE_SIZE) {
518 		if (!pte_present(*pte))
519 			continue;
520 		page = vm_normal_page(vma, addr, *pte);
521 		if (!page)
522 			continue;
523 		/*
524 		 * vm_normal_page() filters out zero pages, but there might
525 		 * still be PageReserved pages to skip, perhaps in a VDSO.
526 		 */
527 		if (PageReserved(page))
528 			continue;
529 		if (!queue_pages_required(page, qp))
530 			continue;
531 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
532 			/* MPOL_MF_STRICT must be specified if we get here */
533 			if (!vma_migratable(vma)) {
534 				has_unmovable = true;
535 				break;
536 			}
537 
538 			/*
539 			 * Do not abort immediately since there may be
540 			 * pages temporarily off the LRU in the range.  We still
541 			 * need to migrate the other LRU pages.
542 			 */
543 			if (migrate_page_add(page, qp->pagelist, flags))
544 				has_unmovable = true;
545 		} else
546 			break;
547 	}
548 	pte_unmap_unlock(pte - 1, ptl);
549 	cond_resched();
550 
551 	if (has_unmovable)
552 		return 1;
553 
554 	return addr != end ? -EIO : 0;
555 }
556 
557 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
558 			       unsigned long addr, unsigned long end,
559 			       struct mm_walk *walk)
560 {
561 	int ret = 0;
562 #ifdef CONFIG_HUGETLB_PAGE
563 	struct queue_pages *qp = walk->private;
564 	unsigned long flags = (qp->flags & MPOL_MF_VALID);
565 	struct page *page;
566 	spinlock_t *ptl;
567 	pte_t entry;
568 
569 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
570 	entry = huge_ptep_get(pte);
571 	if (!pte_present(entry))
572 		goto unlock;
573 	page = pte_page(entry);
574 	if (!queue_pages_required(page, qp))
575 		goto unlock;
576 
577 	if (flags == MPOL_MF_STRICT) {
578 		/*
579 		 * STRICT alone means we only detect a misplaced page; there is
580 		 * no need to check further VMAs.
581 		 */
582 		ret = -EIO;
583 		goto unlock;
584 	}
585 
586 	if (!vma_migratable(walk->vma)) {
587 		/*
588 		 * Must be STRICT with MOVE*, otherwise .test_walk() would have
589 		 * stopped walking the current vma.
590 		 * Report the misplaced page, but still allow migrating pages
591 		 * which have already been queued.
592 		 */
593 		ret = 1;
594 		goto unlock;
595 	}
596 
597 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
598 	if (flags & (MPOL_MF_MOVE_ALL) ||
599 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
600 		if (!isolate_huge_page(page, qp->pagelist) &&
601 			(flags & MPOL_MF_STRICT))
602 			/*
603 			 * Failed to isolate the page, but still allow migrating
604 			 * pages which have already been queued.
605 			 */
606 			ret = 1;
607 	}
608 unlock:
609 	spin_unlock(ptl);
610 #else
611 	BUG();
612 #endif
613 	return ret;
614 }
615 
616 #ifdef CONFIG_NUMA_BALANCING
617 /*
618  * This is used to mark a range of virtual addresses to be inaccessible.
619  * These are later cleared by a NUMA hinting fault. Depending on these
620  * faults, pages may be migrated for better NUMA placement.
621  *
622  * This is assuming that NUMA faults are handled using PROT_NONE. If
623  * an architecture makes a different choice, it will need further
624  * changes to the core.
625  */
626 unsigned long change_prot_numa(struct vm_area_struct *vma,
627 			unsigned long addr, unsigned long end)
628 {
629 	int nr_updated;
630 
631 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
632 	if (nr_updated)
633 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 
635 	return nr_updated;
636 }
637 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 			unsigned long addr, unsigned long end)
640 {
641 	return 0;
642 }
643 #endif /* CONFIG_NUMA_BALANCING */
644 
645 static int queue_pages_test_walk(unsigned long start, unsigned long end,
646 				struct mm_walk *walk)
647 {
648 	struct vm_area_struct *vma = walk->vma;
649 	struct queue_pages *qp = walk->private;
650 	unsigned long endvma = vma->vm_end;
651 	unsigned long flags = qp->flags;
652 
653 	/* range check first */
654 	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
655 
656 	if (!qp->first) {
657 		qp->first = vma;
658 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
659 			(qp->start < vma->vm_start))
660 			/* hole at head side of range */
661 			return -EFAULT;
662 	}
663 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
664 		((vma->vm_end < qp->end) &&
665 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
666 		/* hole at middle or tail of range */
667 		return -EFAULT;
668 
669 	/*
670 	 * We need to check MPOL_MF_STRICT so we can return -EIO if possible,
671 	 * regardless of vma_migratable().
672 	 */
673 	if (!vma_migratable(vma) &&
674 	    !(flags & MPOL_MF_STRICT))
675 		return 1;
676 
677 	if (endvma > end)
678 		endvma = end;
679 
680 	if (flags & MPOL_MF_LAZY) {
681 		/* Similar to task_numa_work, skip inaccessible VMAs */
682 		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
683 			!(vma->vm_flags & VM_MIXEDMAP))
684 			change_prot_numa(vma, start, endvma);
685 		return 1;
686 	}
687 
688 	/* queue pages from current vma */
689 	if (flags & MPOL_MF_VALID)
690 		return 0;
691 	return 1;
692 }
693 
694 static const struct mm_walk_ops queue_pages_walk_ops = {
695 	.hugetlb_entry		= queue_pages_hugetlb,
696 	.pmd_entry		= queue_pages_pte_range,
697 	.test_walk		= queue_pages_test_walk,
698 };
699 
700 /*
701  * Walk through page tables and collect pages to be migrated.
702  *
703  * If pages found in a given range are on a set of nodes (determined by
704  * @nodes and @flags), they are isolated and queued to the pagelist, which is
705  * passed via @private.
706  *
707  * queue_pages_range() has three possible return values:
708  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
709  *     specified.
710  * 0 - queue pages successfully or no misplaced page.
711  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
712  *         memory range specified by nodemask and maxnode points outside
713  *         your accessible address space (-EFAULT)
714  */
715 static int
716 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
717 		nodemask_t *nodes, unsigned long flags,
718 		struct list_head *pagelist)
719 {
720 	int err;
721 	struct queue_pages qp = {
722 		.pagelist = pagelist,
723 		.flags = flags,
724 		.nmask = nodes,
725 		.start = start,
726 		.end = end,
727 		.first = NULL,
728 	};
729 
730 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
731 
732 	if (!qp.first)
733 		/* whole range in hole */
734 		err = -EFAULT;
735 
736 	return err;
737 }
738 
739 /*
740  * Apply policy to a single VMA
741  * This must be called with the mmap_sem held for writing.
742  */
743 static int vma_replace_policy(struct vm_area_struct *vma,
744 						struct mempolicy *pol)
745 {
746 	int err;
747 	struct mempolicy *old;
748 	struct mempolicy *new;
749 
750 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
751 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
752 		 vma->vm_ops, vma->vm_file,
753 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
754 
755 	new = mpol_dup(pol);
756 	if (IS_ERR(new))
757 		return PTR_ERR(new);
758 
759 	if (vma->vm_ops && vma->vm_ops->set_policy) {
760 		err = vma->vm_ops->set_policy(vma, new);
761 		if (err)
762 			goto err_out;
763 	}
764 
765 	old = vma->vm_policy;
766 	vma->vm_policy = new; /* protected by mmap_sem */
767 	mpol_put(old);
768 
769 	return 0;
770  err_out:
771 	mpol_put(new);
772 	return err;
773 }
774 
775 /* Step 2: apply policy to a range and do splits. */
776 static int mbind_range(struct mm_struct *mm, unsigned long start,
777 		       unsigned long end, struct mempolicy *new_pol)
778 {
779 	struct vm_area_struct *next;
780 	struct vm_area_struct *prev;
781 	struct vm_area_struct *vma;
782 	int err = 0;
783 	pgoff_t pgoff;
784 	unsigned long vmstart;
785 	unsigned long vmend;
786 
787 	vma = find_vma(mm, start);
788 	VM_BUG_ON(!vma);
789 
790 	prev = vma->vm_prev;
791 	if (start > vma->vm_start)
792 		prev = vma;
793 
794 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
795 		next = vma->vm_next;
796 		vmstart = max(start, vma->vm_start);
797 		vmend   = min(end, vma->vm_end);
798 
799 		if (mpol_equal(vma_policy(vma), new_pol))
800 			continue;
801 
802 		pgoff = vma->vm_pgoff +
803 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
804 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
805 				 vma->anon_vma, vma->vm_file, pgoff,
806 				 new_pol, vma->vm_userfaultfd_ctx);
807 		if (prev) {
808 			vma = prev;
809 			next = vma->vm_next;
810 			if (mpol_equal(vma_policy(vma), new_pol))
811 				continue;
812 			/* vma_merge() joined vma && vma->next, case 8 */
813 			goto replace;
814 		}
815 		if (vma->vm_start != vmstart) {
816 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
817 			if (err)
818 				goto out;
819 		}
820 		if (vma->vm_end != vmend) {
821 			err = split_vma(vma->vm_mm, vma, vmend, 0);
822 			if (err)
823 				goto out;
824 		}
825  replace:
826 		err = vma_replace_policy(vma, new_pol);
827 		if (err)
828 			goto out;
829 	}
830 
831  out:
832 	return err;
833 }
834 
835 /* Set the process memory policy */
836 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
837 			     nodemask_t *nodes)
838 {
839 	struct mempolicy *new, *old;
840 	NODEMASK_SCRATCH(scratch);
841 	int ret;
842 
843 	if (!scratch)
844 		return -ENOMEM;
845 
846 	new = mpol_new(mode, flags, nodes);
847 	if (IS_ERR(new)) {
848 		ret = PTR_ERR(new);
849 		goto out;
850 	}
851 
852 	task_lock(current);
853 	ret = mpol_set_nodemask(new, nodes, scratch);
854 	if (ret) {
855 		task_unlock(current);
856 		mpol_put(new);
857 		goto out;
858 	}
859 	old = current->mempolicy;
860 	current->mempolicy = new;
861 	if (new && new->mode == MPOL_INTERLEAVE)
862 		current->il_prev = MAX_NUMNODES-1;
863 	task_unlock(current);
864 	mpol_put(old);
865 	ret = 0;
866 out:
867 	NODEMASK_SCRATCH_FREE(scratch);
868 	return ret;
869 }
870 
871 /*
872  * Return the nodemask for a policy, for a get_mempolicy() query
873  *
874  * Called with task's alloc_lock held
875  */
876 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
877 {
878 	nodes_clear(*nodes);
879 	if (p == &default_policy)
880 		return;
881 
882 	switch (p->mode) {
883 	case MPOL_BIND:
884 	case MPOL_INTERLEAVE:
885 		*nodes = p->v.nodes;
886 		break;
887 	case MPOL_PREFERRED:
888 		if (!(p->flags & MPOL_F_LOCAL))
889 			node_set(p->v.preferred_node, *nodes);
890 		/* else return empty node mask for local allocation */
891 		break;
892 	default:
893 		BUG();
894 	}
895 }
896 
897 static int lookup_node(struct mm_struct *mm, unsigned long addr)
898 {
899 	struct page *p = NULL;
900 	int err;
901 
902 	int locked = 1;
903 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
904 	if (err == 0) {
905 		/* E.g. GUP interrupted by fatal signal */
906 		err = -EFAULT;
907 	} else if (err > 0) {
908 		err = page_to_nid(p);
909 		put_page(p);
910 	}
911 	if (locked)
912 		up_read(&mm->mmap_sem);
913 	return err;
914 }
915 
916 /* Retrieve NUMA policy */
917 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
918 			     unsigned long addr, unsigned long flags)
919 {
920 	int err;
921 	struct mm_struct *mm = current->mm;
922 	struct vm_area_struct *vma = NULL;
923 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
924 
925 	if (flags &
926 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
927 		return -EINVAL;
928 
929 	if (flags & MPOL_F_MEMS_ALLOWED) {
930 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
931 			return -EINVAL;
932 		*policy = 0;	/* just so it's initialized */
933 		task_lock(current);
934 		*nmask  = cpuset_current_mems_allowed;
935 		task_unlock(current);
936 		return 0;
937 	}
938 
939 	if (flags & MPOL_F_ADDR) {
940 		/*
941 		 * Do NOT fall back to task policy if the
942 		 * vma/shared policy at addr is NULL.  We
943 		 * want to return MPOL_DEFAULT in this case.
944 		 */
945 		down_read(&mm->mmap_sem);
946 		vma = find_vma_intersection(mm, addr, addr+1);
947 		if (!vma) {
948 			up_read(&mm->mmap_sem);
949 			return -EFAULT;
950 		}
951 		if (vma->vm_ops && vma->vm_ops->get_policy)
952 			pol = vma->vm_ops->get_policy(vma, addr);
953 		else
954 			pol = vma->vm_policy;
955 	} else if (addr)
956 		return -EINVAL;
957 
958 	if (!pol)
959 		pol = &default_policy;	/* indicates default behavior */
960 
961 	if (flags & MPOL_F_NODE) {
962 		if (flags & MPOL_F_ADDR) {
963 			/*
964 			 * Take a refcount on the mpol, lookup_node()
965 			 * will drop the mmap_sem, so after calling
966 			 * lookup_node() only "pol" remains valid, "vma"
967 			 * is stale.
968 			 */
969 			pol_refcount = pol;
970 			vma = NULL;
971 			mpol_get(pol);
972 			err = lookup_node(mm, addr);
973 			if (err < 0)
974 				goto out;
975 			*policy = err;
976 		} else if (pol == current->mempolicy &&
977 				pol->mode == MPOL_INTERLEAVE) {
978 			*policy = next_node_in(current->il_prev, pol->v.nodes);
979 		} else {
980 			err = -EINVAL;
981 			goto out;
982 		}
983 	} else {
984 		*policy = pol == &default_policy ? MPOL_DEFAULT :
985 						pol->mode;
986 		/*
987 		 * Internal mempolicy flags must be masked off before exposing
988 		 * the policy to userspace.
989 		 */
990 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
991 	}
992 
993 	err = 0;
994 	if (nmask) {
995 		if (mpol_store_user_nodemask(pol)) {
996 			*nmask = pol->w.user_nodemask;
997 		} else {
998 			task_lock(current);
999 			get_policy_nodemask(pol, nmask);
1000 			task_unlock(current);
1001 		}
1002 	}
1003 
1004  out:
1005 	mpol_cond_put(pol);
1006 	if (vma)
1007 		up_read(&mm->mmap_sem);
1008 	if (pol_refcount)
1009 		mpol_put(pol_refcount);
1010 	return err;
1011 }
1012 
1013 #ifdef CONFIG_MIGRATION
1014 /*
1015  * Isolate a page for migration; THP tail pages can be passed.
1016  */
1017 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1018 				unsigned long flags)
1019 {
1020 	struct page *head = compound_head(page);
1021 	/*
1022 	 * Avoid migrating a page that is shared with others.
1023 	 */
1024 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1025 		if (!isolate_lru_page(head)) {
1026 			list_add_tail(&head->lru, pagelist);
1027 			mod_node_page_state(page_pgdat(head),
1028 				NR_ISOLATED_ANON + page_is_file_lru(head),
1029 				hpage_nr_pages(head));
1030 		} else if (flags & MPOL_MF_STRICT) {
1031 			/*
1032 			 * A non-movable page may reach here.  And there may be
1033 			 * pages temporarily off the LRU, or non-LRU movable pages.
1034 			 * Treat them as unmovable pages since they can't be
1035 			 * isolated, so they can't be moved at the moment.  We
1036 			 * should return -EIO for this case too.
1037 			 */
1038 			return -EIO;
1039 		}
1040 	}
1041 
1042 	return 0;
1043 }
1044 
1045 /* page allocation callback for NUMA node migration */
1046 struct page *alloc_new_node_page(struct page *page, unsigned long node)
1047 {
1048 	if (PageHuge(page))
1049 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1050 					node);
1051 	else if (PageTransHuge(page)) {
1052 		struct page *thp;
1053 
1054 		thp = alloc_pages_node(node,
1055 			(GFP_TRANSHUGE | __GFP_THISNODE),
1056 			HPAGE_PMD_ORDER);
1057 		if (!thp)
1058 			return NULL;
1059 		prep_transhuge_page(thp);
1060 		return thp;
1061 	} else
1062 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1063 						    __GFP_THISNODE, 0);
1064 }
1065 
1066 /*
1067  * Migrate pages from one node to a target node.
1068  * Returns error or the number of pages not migrated.
1069  */
1070 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1071 			   int flags)
1072 {
1073 	nodemask_t nmask;
1074 	LIST_HEAD(pagelist);
1075 	int err = 0;
1076 
1077 	nodes_clear(nmask);
1078 	node_set(source, nmask);
1079 
1080 	/*
1081 	 * This does not "check" the range but isolates all pages that
1082 	 * need migration.  Between passing in the full user address
1083 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1084 	 */
1085 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1086 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1087 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1088 
1089 	if (!list_empty(&pagelist)) {
1090 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1091 					MIGRATE_SYNC, MR_SYSCALL);
1092 		if (err)
1093 			putback_movable_pages(&pagelist);
1094 	}
1095 
1096 	return err;
1097 }
1098 
1099 /*
1100  * Move pages between the two nodesets so as to preserve the physical
1101  * layout as much as possible.
1102  *
1103  * Returns the number of pages that could not be moved.
1104  */
1105 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1106 		     const nodemask_t *to, int flags)
1107 {
1108 	int busy = 0;
1109 	int err;
1110 	nodemask_t tmp;
1111 
1112 	err = migrate_prep();
1113 	if (err)
1114 		return err;
1115 
1116 	down_read(&mm->mmap_sem);
1117 
1118 	/*
1119 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1120 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1121 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1122 	 * The pair of nodemasks 'to' and 'from' define the map.
1123 	 *
1124 	 * If no pair of bits is found that way, fallback to picking some
1125 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1126 	 * 'source' and 'dest' bits are the same, this represents a node
1127 	 * that will be migrating to itself, so no pages need move.
1128 	 *
1129 	 * If no bits are left in 'tmp', or if all remaining bits left
1130 	 * in 'tmp' correspond to the same bit in 'to', return false
1131 	 * (nothing left to migrate).
1132 	 *
1133 	 * This lets us pick a pair of nodes to migrate between, such that
1134 	 * if possible the dest node is not already occupied by some other
1135 	 * source node, minimizing the risk of overloading the memory on a
1136 	 * node that would happen if we migrated incoming memory to a node
1137 	 * before migrating outgoing memory from that same node.
1138 	 *
1139 	 * A single scan of tmp is sufficient.  As we go, we remember the
1140 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1141 	 * that not only moved, but what's better, moved to an empty slot
1142 	 * (d is not set in tmp), then we break out then, with that pair.
1143 	 * Otherwise when we finish scanning tmp, we at least have the
1144 	 * most recent <s, d> pair that moved.  If we get all the way through
1145 	 * the scan of tmp without finding any node that moved, much less
1146 	 * moved to an empty node, then there is nothing left worth migrating.
1147 	 */
1148 
1149 	tmp = *from;
1150 	while (!nodes_empty(tmp)) {
1151 		int s,d;
1152 		int source = NUMA_NO_NODE;
1153 		int dest = 0;
1154 
1155 		for_each_node_mask(s, tmp) {
1156 
1157 			/*
1158 			 * do_migrate_pages() tries to maintain the relative
1159 			 * node relationship of the pages established between
1160 			 * threads and memory areas.
1161                          *
1162 			 *
1163 			 * However if the number of source nodes is not equal to
1164 			 * the number of destination nodes we cannot preserve
1165 			 * this node-relative relationship.  In that case, skip
1166 			 * mask.
1167 			 *
1168 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1169 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1170 			 */
1171 
1172 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1173 						(node_isset(s, *to)))
1174 				continue;
1175 
1176 			d = node_remap(s, *from, *to);
1177 			if (s == d)
1178 				continue;
1179 
1180 			source = s;	/* Node moved. Memorize */
1181 			dest = d;
1182 
1183 			/* dest not in remaining from nodes? */
1184 			if (!node_isset(dest, tmp))
1185 				break;
1186 		}
1187 		if (source == NUMA_NO_NODE)
1188 			break;
1189 
1190 		node_clear(source, tmp);
1191 		err = migrate_to_node(mm, source, dest, flags);
1192 		if (err > 0)
1193 			busy += err;
1194 		if (err < 0)
1195 			break;
1196 	}
1197 	up_read(&mm->mmap_sem);
1198 	if (err < 0)
1199 		return err;
1200 	return busy;
1201 
1202 }
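
/*
 * Worked trace of the loop above (illustrative node numbers): for
 * from = {2,3,4} and to = {3,4,5}, the scan first finds 4 -> 5 (node 5 is
 * not a pending source), migrates it, then 3 -> 4, then 2 -> 3, so a node's
 * outgoing pages are moved away before it receives new ones.
 */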
1203 
1204 /*
1205  * Allocate a new page for page migration based on vma policy.
1206  * Start by assuming the page is mapped by the same vma as contains @start.
1207  * Search forward from there, if not.  N.B., this assumes that the
1208  * list of pages handed to migrate_pages()--which is how we get here--
1209  * is in virtual address order.
1210  */
1211 static struct page *new_page(struct page *page, unsigned long start)
1212 {
1213 	struct vm_area_struct *vma;
1214 	unsigned long uninitialized_var(address);
1215 
1216 	vma = find_vma(current->mm, start);
1217 	while (vma) {
1218 		address = page_address_in_vma(page, vma);
1219 		if (address != -EFAULT)
1220 			break;
1221 		vma = vma->vm_next;
1222 	}
1223 
1224 	if (PageHuge(page)) {
1225 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1226 				vma, address);
1227 	} else if (PageTransHuge(page)) {
1228 		struct page *thp;
1229 
1230 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1231 					 HPAGE_PMD_ORDER);
1232 		if (!thp)
1233 			return NULL;
1234 		prep_transhuge_page(thp);
1235 		return thp;
1236 	}
1237 	/*
1238 	 * if !vma, alloc_page_vma() will use task or system default policy
1239 	 */
1240 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1241 			vma, address);
1242 }
1243 #else
1244 
1245 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1246 				unsigned long flags)
1247 {
1248 	return -EIO;
1249 }
1250 
1251 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1252 		     const nodemask_t *to, int flags)
1253 {
1254 	return -ENOSYS;
1255 }
1256 
1257 static struct page *new_page(struct page *page, unsigned long start)
1258 {
1259 	return NULL;
1260 }
1261 #endif
1262 
1263 static long do_mbind(unsigned long start, unsigned long len,
1264 		     unsigned short mode, unsigned short mode_flags,
1265 		     nodemask_t *nmask, unsigned long flags)
1266 {
1267 	struct mm_struct *mm = current->mm;
1268 	struct mempolicy *new;
1269 	unsigned long end;
1270 	int err;
1271 	int ret;
1272 	LIST_HEAD(pagelist);
1273 
1274 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1275 		return -EINVAL;
1276 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1277 		return -EPERM;
1278 
1279 	if (start & ~PAGE_MASK)
1280 		return -EINVAL;
1281 
1282 	if (mode == MPOL_DEFAULT)
1283 		flags &= ~MPOL_MF_STRICT;
1284 
1285 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1286 	end = start + len;
1287 
1288 	if (end < start)
1289 		return -EINVAL;
1290 	if (end == start)
1291 		return 0;
1292 
1293 	new = mpol_new(mode, mode_flags, nmask);
1294 	if (IS_ERR(new))
1295 		return PTR_ERR(new);
1296 
1297 	if (flags & MPOL_MF_LAZY)
1298 		new->flags |= MPOL_F_MOF;
1299 
1300 	/*
1301 	 * If we are using the default policy then operation
1302 	 * on discontinuous address spaces is okay after all
1303 	 */
1304 	if (!new)
1305 		flags |= MPOL_MF_DISCONTIG_OK;
1306 
1307 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1308 		 start, start + len, mode, mode_flags,
1309 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1310 
1311 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1312 
1313 		err = migrate_prep();
1314 		if (err)
1315 			goto mpol_out;
1316 	}
1317 	{
1318 		NODEMASK_SCRATCH(scratch);
1319 		if (scratch) {
1320 			down_write(&mm->mmap_sem);
1321 			task_lock(current);
1322 			err = mpol_set_nodemask(new, nmask, scratch);
1323 			task_unlock(current);
1324 			if (err)
1325 				up_write(&mm->mmap_sem);
1326 		} else
1327 			err = -ENOMEM;
1328 		NODEMASK_SCRATCH_FREE(scratch);
1329 	}
1330 	if (err)
1331 		goto mpol_out;
1332 
1333 	ret = queue_pages_range(mm, start, end, nmask,
1334 			  flags | MPOL_MF_INVERT, &pagelist);
1335 
1336 	if (ret < 0) {
1337 		err = ret;
1338 		goto up_out;
1339 	}
1340 
1341 	err = mbind_range(mm, start, end, new);
1342 
1343 	if (!err) {
1344 		int nr_failed = 0;
1345 
1346 		if (!list_empty(&pagelist)) {
1347 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1348 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1349 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1350 			if (nr_failed)
1351 				putback_movable_pages(&pagelist);
1352 		}
1353 
1354 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1355 			err = -EIO;
1356 	} else {
1357 up_out:
1358 		if (!list_empty(&pagelist))
1359 			putback_movable_pages(&pagelist);
1360 	}
1361 
1362 	up_write(&mm->mmap_sem);
1363 mpol_out:
1364 	mpol_put(new);
1365 	return err;
1366 }
1367 
1368 /*
1369  * User space interface with variable sized bitmaps for nodelists.
1370  */
1371 
1372 /* Copy a node mask from user space. */
1373 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1374 		     unsigned long maxnode)
1375 {
1376 	unsigned long k;
1377 	unsigned long t;
1378 	unsigned long nlongs;
1379 	unsigned long endmask;
1380 
1381 	--maxnode;
1382 	nodes_clear(*nodes);
1383 	if (maxnode == 0 || !nmask)
1384 		return 0;
1385 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1386 		return -EINVAL;
1387 
1388 	nlongs = BITS_TO_LONGS(maxnode);
1389 	if ((maxnode % BITS_PER_LONG) == 0)
1390 		endmask = ~0UL;
1391 	else
1392 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1393 
1394 	/*
1395 	 * When the user specified more nodes than supported, just check
1396 	 * if the non-supported part is all zero.
1397 	 *
1398 	 * If maxnode has more longs than MAX_NUMNODES, check
1399 	 * the bits in that area first, and then go on to check
1400 	 * the remaining bits, which are equal to or bigger than MAX_NUMNODES.
1401 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1402 	 */
1403 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1404 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1405 			if (get_user(t, nmask + k))
1406 				return -EFAULT;
1407 			if (k == nlongs - 1) {
1408 				if (t & endmask)
1409 					return -EINVAL;
1410 			} else if (t)
1411 				return -EINVAL;
1412 		}
1413 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1414 		endmask = ~0UL;
1415 	}
1416 
1417 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1418 		unsigned long valid_mask = endmask;
1419 
1420 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1421 		if (get_user(t, nmask + nlongs - 1))
1422 			return -EFAULT;
1423 		if (t & valid_mask)
1424 			return -EINVAL;
1425 	}
1426 
1427 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1428 		return -EFAULT;
1429 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1430 	return 0;
1431 }
1432 
1433 /* Copy a kernel node mask to user space */
1434 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1435 			      nodemask_t *nodes)
1436 {
1437 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1438 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1439 
1440 	if (copy > nbytes) {
1441 		if (copy > PAGE_SIZE)
1442 			return -EINVAL;
1443 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1444 			return -EFAULT;
1445 		copy = nbytes;
1446 	}
1447 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1448 }
1449 
1450 static long kernel_mbind(unsigned long start, unsigned long len,
1451 			 unsigned long mode, const unsigned long __user *nmask,
1452 			 unsigned long maxnode, unsigned int flags)
1453 {
1454 	nodemask_t nodes;
1455 	int err;
1456 	unsigned short mode_flags;
1457 
1458 	start = untagged_addr(start);
1459 	mode_flags = mode & MPOL_MODE_FLAGS;
1460 	mode &= ~MPOL_MODE_FLAGS;
1461 	if (mode >= MPOL_MAX)
1462 		return -EINVAL;
1463 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1464 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1465 		return -EINVAL;
1466 	err = get_nodes(&nodes, nmask, maxnode);
1467 	if (err)
1468 		return err;
1469 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1470 }
1471 
1472 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1473 		unsigned long, mode, const unsigned long __user *, nmask,
1474 		unsigned long, maxnode, unsigned int, flags)
1475 {
1476 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1477 }
1478 
1479 /* Set the process memory policy */
1480 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1481 				 unsigned long maxnode)
1482 {
1483 	int err;
1484 	nodemask_t nodes;
1485 	unsigned short flags;
1486 
1487 	flags = mode & MPOL_MODE_FLAGS;
1488 	mode &= ~MPOL_MODE_FLAGS;
1489 	if ((unsigned int)mode >= MPOL_MAX)
1490 		return -EINVAL;
1491 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1492 		return -EINVAL;
1493 	err = get_nodes(&nodes, nmask, maxnode);
1494 	if (err)
1495 		return err;
1496 	return do_set_mempolicy(mode, flags, &nodes);
1497 }
1498 
1499 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1500 		unsigned long, maxnode)
1501 {
1502 	return kernel_set_mempolicy(mode, nmask, maxnode);
1503 }
1504 
1505 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1506 				const unsigned long __user *old_nodes,
1507 				const unsigned long __user *new_nodes)
1508 {
1509 	struct mm_struct *mm = NULL;
1510 	struct task_struct *task;
1511 	nodemask_t task_nodes;
1512 	int err;
1513 	nodemask_t *old;
1514 	nodemask_t *new;
1515 	NODEMASK_SCRATCH(scratch);
1516 
1517 	if (!scratch)
1518 		return -ENOMEM;
1519 
1520 	old = &scratch->mask1;
1521 	new = &scratch->mask2;
1522 
1523 	err = get_nodes(old, old_nodes, maxnode);
1524 	if (err)
1525 		goto out;
1526 
1527 	err = get_nodes(new, new_nodes, maxnode);
1528 	if (err)
1529 		goto out;
1530 
1531 	/* Find the mm_struct */
1532 	rcu_read_lock();
1533 	task = pid ? find_task_by_vpid(pid) : current;
1534 	if (!task) {
1535 		rcu_read_unlock();
1536 		err = -ESRCH;
1537 		goto out;
1538 	}
1539 	get_task_struct(task);
1540 
1541 	err = -EINVAL;
1542 
1543 	/*
1544 	 * Check if this process has the right to modify the specified process.
1545 	 * Use the regular "ptrace_may_access()" checks.
1546 	 */
1547 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1548 		rcu_read_unlock();
1549 		err = -EPERM;
1550 		goto out_put;
1551 	}
1552 	rcu_read_unlock();
1553 
1554 	task_nodes = cpuset_mems_allowed(task);
1555 	/* Is the user allowed to access the target nodes? */
1556 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1557 		err = -EPERM;
1558 		goto out_put;
1559 	}
1560 
1561 	task_nodes = cpuset_mems_allowed(current);
1562 	nodes_and(*new, *new, task_nodes);
1563 	if (nodes_empty(*new))
1564 		goto out_put;
1565 
1566 	err = security_task_movememory(task);
1567 	if (err)
1568 		goto out_put;
1569 
1570 	mm = get_task_mm(task);
1571 	put_task_struct(task);
1572 
1573 	if (!mm) {
1574 		err = -EINVAL;
1575 		goto out;
1576 	}
1577 
1578 	err = do_migrate_pages(mm, old, new,
1579 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1580 
1581 	mmput(mm);
1582 out:
1583 	NODEMASK_SCRATCH_FREE(scratch);
1584 
1585 	return err;
1586 
1587 out_put:
1588 	put_task_struct(task);
1589 	goto out;
1590 
1591 }
1592 
1593 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1594 		const unsigned long __user *, old_nodes,
1595 		const unsigned long __user *, new_nodes)
1596 {
1597 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1598 }
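
/*
 * Illustrative userspace sketch (assumed values, not part of this file):
 * asking the kernel to move another task's pages from node 0 to node 1
 * with the migrate_pages(2) wrapper from libnuma's <numaif.h>.
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0;		// old nodes: {0}
 *	unsigned long to   = 1UL << 1;		// new nodes: {1}
 *
 *	long left = migrate_pages(pid, 8 * sizeof(from) + 1, &from, &to);
 *	// 'left' is the number of pages that could not be moved, or -1.
 */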
1599 
1600 
1601 /* Retrieve NUMA policy */
1602 static int kernel_get_mempolicy(int __user *policy,
1603 				unsigned long __user *nmask,
1604 				unsigned long maxnode,
1605 				unsigned long addr,
1606 				unsigned long flags)
1607 {
1608 	int err;
1609 	int uninitialized_var(pval);
1610 	nodemask_t nodes;
1611 
1612 	addr = untagged_addr(addr);
1613 
1614 	if (nmask != NULL && maxnode < nr_node_ids)
1615 		return -EINVAL;
1616 
1617 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1618 
1619 	if (err)
1620 		return err;
1621 
1622 	if (policy && put_user(pval, policy))
1623 		return -EFAULT;
1624 
1625 	if (nmask)
1626 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1627 
1628 	return err;
1629 }
1630 
1631 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1632 		unsigned long __user *, nmask, unsigned long, maxnode,
1633 		unsigned long, addr, unsigned long, flags)
1634 {
1635 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1636 }
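
/*
 * Illustrative userspace sketch (assumed values, not part of this file):
 * querying the policy in effect for a particular address with
 * get_mempolicy(2).  'addr' is assumed to point into a mapped region.
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	unsigned long nmask[8] = { 0 };		// room for 512 node bits
 *
 *	// MPOL_F_ADDR: report the VMA/shared policy covering 'addr'.
 *	get_mempolicy(&mode, nmask, 8 * sizeof(nmask), addr, MPOL_F_ADDR);
 */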
1637 
1638 #ifdef CONFIG_COMPAT
1639 
1640 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1641 		       compat_ulong_t __user *, nmask,
1642 		       compat_ulong_t, maxnode,
1643 		       compat_ulong_t, addr, compat_ulong_t, flags)
1644 {
1645 	long err;
1646 	unsigned long __user *nm = NULL;
1647 	unsigned long nr_bits, alloc_size;
1648 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1649 
1650 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1651 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1652 
1653 	if (nmask)
1654 		nm = compat_alloc_user_space(alloc_size);
1655 
1656 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1657 
1658 	if (!err && nmask) {
1659 		unsigned long copy_size;
1660 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1661 		err = copy_from_user(bm, nm, copy_size);
1662 		/* ensure entire bitmap is zeroed */
1663 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1664 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1665 	}
1666 
1667 	return err;
1668 }
1669 
1670 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1671 		       compat_ulong_t, maxnode)
1672 {
1673 	unsigned long __user *nm = NULL;
1674 	unsigned long nr_bits, alloc_size;
1675 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1676 
1677 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1678 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1679 
1680 	if (nmask) {
1681 		if (compat_get_bitmap(bm, nmask, nr_bits))
1682 			return -EFAULT;
1683 		nm = compat_alloc_user_space(alloc_size);
1684 		if (copy_to_user(nm, bm, alloc_size))
1685 			return -EFAULT;
1686 	}
1687 
1688 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1689 }
1690 
1691 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1692 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1693 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1694 {
1695 	unsigned long __user *nm = NULL;
1696 	unsigned long nr_bits, alloc_size;
1697 	nodemask_t bm;
1698 
1699 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1700 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1701 
1702 	if (nmask) {
1703 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1704 			return -EFAULT;
1705 		nm = compat_alloc_user_space(alloc_size);
1706 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1707 			return -EFAULT;
1708 	}
1709 
1710 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1711 }
1712 
1713 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1714 		       compat_ulong_t, maxnode,
1715 		       const compat_ulong_t __user *, old_nodes,
1716 		       const compat_ulong_t __user *, new_nodes)
1717 {
1718 	unsigned long __user *old = NULL;
1719 	unsigned long __user *new = NULL;
1720 	nodemask_t tmp_mask;
1721 	unsigned long nr_bits;
1722 	unsigned long size;
1723 
1724 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1725 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1726 	if (old_nodes) {
1727 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1728 			return -EFAULT;
1729 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1730 		if (new_nodes)
1731 			new = old + size / sizeof(unsigned long);
1732 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1733 			return -EFAULT;
1734 	}
1735 	if (new_nodes) {
1736 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1737 			return -EFAULT;
1738 		if (new == NULL)
1739 			new = compat_alloc_user_space(size);
1740 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1741 			return -EFAULT;
1742 	}
1743 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1744 }
1745 
1746 #endif /* CONFIG_COMPAT */
1747 
1748 bool vma_migratable(struct vm_area_struct *vma)
1749 {
1750 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1751 		return false;
1752 
1753 	/*
1754 	 * DAX device mappings require predictable access latency, so avoid
1755 	 * incurring periodic faults.
1756 	 */
1757 	if (vma_is_dax(vma))
1758 		return false;
1759 
1760 	if (is_vm_hugetlb_page(vma) &&
1761 		!hugepage_migration_supported(hstate_vma(vma)))
1762 		return false;
1763 
1764 	/*
1765 	 * Migration allocates pages in the highest zone. If we cannot
1766 	 * do so then migration (at least from node to node) is not
1767 	 * possible.
1768 	 */
1769 	if (vma->vm_file &&
1770 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1771 			< policy_zone)
1772 		return false;
1773 	return true;
1774 }
1775 
1776 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1777 						unsigned long addr)
1778 {
1779 	struct mempolicy *pol = NULL;
1780 
1781 	if (vma) {
1782 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1783 			pol = vma->vm_ops->get_policy(vma, addr);
1784 		} else if (vma->vm_policy) {
1785 			pol = vma->vm_policy;
1786 
1787 			/*
1788 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1789 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1790 			 * count on these policies which will be dropped by
1791 			 * mpol_cond_put() later
1792 			 */
1793 			if (mpol_needs_cond_ref(pol))
1794 				mpol_get(pol);
1795 		}
1796 	}
1797 
1798 	return pol;
1799 }
1800 
1801 /*
1802  * get_vma_policy(@vma, @addr)
1803  * @vma: virtual memory area whose policy is sought
1804  * @addr: address in @vma for shared policy lookup
1805  *
1806  * Returns effective policy for a VMA at specified address.
1807  * Falls back to current->mempolicy or system default policy, as necessary.
1808  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1809  * count--added by the get_policy() vm_op, as appropriate--to protect against
1810  * freeing by another task.  It is the caller's responsibility to free the
1811  * extra reference for shared policies.
1812  */
1813 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1814 						unsigned long addr)
1815 {
1816 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1817 
1818 	if (!pol)
1819 		pol = get_task_policy(current);
1820 
1821 	return pol;
1822 }
1823 
1824 bool vma_policy_mof(struct vm_area_struct *vma)
1825 {
1826 	struct mempolicy *pol;
1827 
1828 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1829 		bool ret = false;
1830 
1831 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1832 		if (pol && (pol->flags & MPOL_F_MOF))
1833 			ret = true;
1834 		mpol_cond_put(pol);
1835 
1836 		return ret;
1837 	}
1838 
1839 	pol = vma->vm_policy;
1840 	if (!pol)
1841 		pol = get_task_policy(current);
1842 
1843 	return pol->flags & MPOL_F_MOF;
1844 }
1845 
1846 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1847 {
1848 	enum zone_type dynamic_policy_zone = policy_zone;
1849 
1850 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1851 
1852 	/*
1853 	 * If policy->v.nodes has movable memory only,
1854 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1855 	 *
1856 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1857 	 * so if the following test fails, it implies
1858 	 * policy->v.nodes has movable memory only.
1859 	 */
1860 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1861 		dynamic_policy_zone = ZONE_MOVABLE;
1862 
1863 	return zone >= dynamic_policy_zone;
1864 }
1865 
1866 /*
1867  * Return a nodemask representing a mempolicy for filtering nodes for
1868  * page allocation
1869  */
1870 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1871 {
1872 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1873 	if (unlikely(policy->mode == MPOL_BIND) &&
1874 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1875 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1876 		return &policy->v.nodes;
1877 
1878 	return NULL;
1879 }
1880 
1881 /* Return the node id preferred by the given mempolicy, or the given id */
1882 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1883 								int nd)
1884 {
1885 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1886 		nd = policy->v.preferred_node;
1887 	else {
1888 		/*
1889 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1890 		 * because we might easily break the expectation to stay on the
1891 		 * requested node and not break the policy.
1892 		 */
1893 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1894 	}
1895 
1896 	return nd;
1897 }
1898 
1899 /* Do dynamic interleaving for a process */
1900 static unsigned interleave_nodes(struct mempolicy *policy)
1901 {
1902 	unsigned next;
1903 	struct task_struct *me = current;
1904 
1905 	next = next_node_in(me->il_prev, policy->v.nodes);
1906 	if (next < MAX_NUMNODES)
1907 		me->il_prev = next;
1908 	return next;
1909 }
1910 
1911 /*
1912  * Depending on the memory policy provide a node from which to allocate the
1913  * next slab entry.
1914  */
1915 unsigned int mempolicy_slab_node(void)
1916 {
1917 	struct mempolicy *policy;
1918 	int node = numa_mem_id();
1919 
1920 	if (in_interrupt())
1921 		return node;
1922 
1923 	policy = current->mempolicy;
1924 	if (!policy || policy->flags & MPOL_F_LOCAL)
1925 		return node;
1926 
1927 	switch (policy->mode) {
1928 	case MPOL_PREFERRED:
1929 		/*
1930 		 * handled MPOL_F_LOCAL above
1931 		 */
1932 		return policy->v.preferred_node;
1933 
1934 	case MPOL_INTERLEAVE:
1935 		return interleave_nodes(policy);
1936 
1937 	case MPOL_BIND: {
1938 		struct zoneref *z;
1939 
1940 		/*
1941 		 * Follow bind policy behavior and start allocation at the
1942 		 * first node.
1943 		 */
1944 		struct zonelist *zonelist;
1945 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1946 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1947 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1948 							&policy->v.nodes);
1949 		return z->zone ? zone_to_nid(z->zone) : node;
1950 	}
1951 
1952 	default:
1953 		BUG();
1954 	}
1955 }
1956 
1957 /*
1958  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1959  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1960  * number of present nodes.
1961  */
1962 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1963 {
1964 	unsigned nnodes = nodes_weight(pol->v.nodes);
1965 	unsigned target;
1966 	int i;
1967 	int nid;
1968 
1969 	if (!nnodes)
1970 		return numa_node_id();
1971 	target = (unsigned int)n % nnodes;
1972 	nid = first_node(pol->v.nodes);
1973 	for (i = 0; i < target; i++)
1974 		nid = next_node(nid, pol->v.nodes);
1975 	return nid;
1976 }
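/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,4,6} (nnodes = 4)
 * and n = 5, target = 5 % 4 = 1, so the walk starts at node 0 and advances
 * once, returning node 2.
 */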
1977 
1978 /* Determine a node number for interleave */
1979 static inline unsigned interleave_nid(struct mempolicy *pol,
1980 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1981 {
1982 	if (vma) {
1983 		unsigned long off;
1984 
1985 		/*
1986 		 * for small pages, there is no difference between
1987 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1988 		 * for huge pages, since vm_pgoff is in units of small
1989 		 * pages, we need to shift off the always 0 bits to get
1990 		 * a useful offset.
1991 		 */
1992 		BUG_ON(shift < PAGE_SHIFT);
1993 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1994 		off += (addr - vma->vm_start) >> shift;
1995 		return offset_il_node(pol, off);
1996 	} else
1997 		return interleave_nodes(pol);
1998 }
1999 
2000 #ifdef CONFIG_HUGETLBFS
2001 /*
2002  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2003  * @vma: virtual memory area whose policy is sought
2004  * @addr: address in @vma for shared policy lookup and interleave policy
2005  * @gfp_flags: for requested zone
2006  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2007  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2008  *
2009  * Returns a nid suitable for a huge page allocation and a pointer
2010  * to the struct mempolicy for conditional unref after allocation.
2011  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
2012  * @nodemask for filtering the zonelist.
2013  *
2014  * Must be protected by read_mems_allowed_begin()
2015  */
2016 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2017 				struct mempolicy **mpol, nodemask_t **nodemask)
2018 {
2019 	int nid;
2020 
2021 	*mpol = get_vma_policy(vma, addr);
2022 	*nodemask = NULL;	/* assume !MPOL_BIND */
2023 
2024 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2025 		nid = interleave_nid(*mpol, vma, addr,
2026 					huge_page_shift(hstate_vma(vma)));
2027 	} else {
2028 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
2029 		if ((*mpol)->mode == MPOL_BIND)
2030 			*nodemask = &(*mpol)->v.nodes;
2031 	}
2032 	return nid;
2033 }
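/*
 * Illustrative sketch (hypothetical helper, not in the kernel): the retry
 * pattern hugetlb-side callers typically wrap around huge_node(), as required
 * by the read_mems_allowed_begin() comment above.  Real callers dequeue from
 * the hugetlb pool; a plain buddy allocation stands in for that step here.
 */
static struct page *huge_page_alloc_sketch(struct vm_area_struct *vma,
					   unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct page *page;
	unsigned int cpuset_mems_cookie;
	int nid;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
	page = __alloc_pages_nodemask(gfp, huge_page_order(hstate_vma(vma)),
				      nid, nodemask);
	mpol_cond_put(mpol);	/* drop ref taken by get_vma_policy() if shared */
	if (!page && read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;
	return page;
}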
2034 
2035 /*
2036  * init_nodemask_of_mempolicy
2037  *
2038  * If the current task's mempolicy is "default" [NULL], return 'false'
2039  * to indicate default policy.  Otherwise, extract the policy nodemask
2040  * for 'bind' or 'interleave' policy into the argument nodemask, or
2041  * initialize the argument nodemask to contain the single node for
2042  * 'preferred' or 'local' policy and return 'true' to indicate presence
2043  * of non-default mempolicy.
2044  *
2045  * We don't bother with reference counting the mempolicy [mpol_get/put]
2046  * because the current task is examining its own mempolicy, and a task's
2047  * mempolicy is only ever changed by the task itself.
2048  *
2049  * N.B., it is the caller's responsibility to free a returned nodemask.
2050  */
2051 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2052 {
2053 	struct mempolicy *mempolicy;
2054 	int nid;
2055 
2056 	if (!(mask && current->mempolicy))
2057 		return false;
2058 
2059 	task_lock(current);
2060 	mempolicy = current->mempolicy;
2061 	switch (mempolicy->mode) {
2062 	case MPOL_PREFERRED:
2063 		if (mempolicy->flags & MPOL_F_LOCAL)
2064 			nid = numa_node_id();
2065 		else
2066 			nid = mempolicy->v.preferred_node;
2067 		init_nodemask_of_node(mask, nid);
2068 		break;
2069 
2070 	case MPOL_BIND:
2071 	case MPOL_INTERLEAVE:
2072 		*mask = mempolicy->v.nodes;
2073 		break;
2074 
2075 	default:
2076 		BUG();
2077 	}
2078 	task_unlock(current);
2079 
2080 	return true;
2081 }
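/*
 * Illustrative sketch (hypothetical helper, not in the kernel): how a caller
 * such as the hugetlb nr_hugepages handler might consume
 * init_nodemask_of_mempolicy(), falling back to all memory nodes when the
 * task policy is default.  NODEMASK_ALLOC()/NODEMASK_FREE() come from
 * <linux/nodemask.h>.
 */
static int count_mempolicy_nodes_sketch(void)
{
	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
	int nr;

	if (!nodes_allowed)
		return -ENOMEM;

	if (!init_nodemask_of_mempolicy(nodes_allowed))
		*nodes_allowed = node_states[N_MEMORY];

	nr = nodes_weight(*nodes_allowed);
	NODEMASK_FREE(nodes_allowed);
	return nr;
}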
2082 #endif
2083 
2084 /*
2085  * mempolicy_nodemask_intersects
2086  *
2087  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2088  * policy.  Otherwise, check for intersection between mask and the policy
2089  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2090  * policy, always return true since it may allocate elsewhere on fallback.
2091  *
2092  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2093  */
2094 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2095 					const nodemask_t *mask)
2096 {
2097 	struct mempolicy *mempolicy;
2098 	bool ret = true;
2099 
2100 	if (!mask)
2101 		return ret;
2102 	task_lock(tsk);
2103 	mempolicy = tsk->mempolicy;
2104 	if (!mempolicy)
2105 		goto out;
2106 
2107 	switch (mempolicy->mode) {
2108 	case MPOL_PREFERRED:
2109 		/*
2110 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
2111 		 * to allocate from; the task may fall back to other nodes when
2112 		 * OOM.  Thus, it's possible for tsk to have allocated memory
2113 		 * from nodes in mask.
2114 		 */
2115 		break;
2116 	case MPOL_BIND:
2117 	case MPOL_INTERLEAVE:
2118 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2119 		break;
2120 	default:
2121 		BUG();
2122 	}
2123 out:
2124 	task_unlock(tsk);
2125 	return ret;
2126 }
2127 
2128 /* Allocate a page in interleaved policy.
2129    Own path because it needs to do special accounting. */
2130 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2131 					unsigned nid)
2132 {
2133 	struct page *page;
2134 
2135 	page = __alloc_pages(gfp, order, nid);
2136 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2137 	if (!static_branch_likely(&vm_numa_stat_key))
2138 		return page;
2139 	if (page && page_to_nid(page) == nid) {
2140 		preempt_disable();
2141 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2142 		preempt_enable();
2143 	}
2144 	return page;
2145 }
2146 
2147 /**
2148  * 	alloc_pages_vma	- Allocate a page for a VMA.
2149  *
2150  * 	@gfp:
2151  *      %GFP_USER    user allocation.
2152  *      %GFP_KERNEL  kernel allocations,
2153  *      %GFP_HIGHMEM highmem/user allocations,
2154  *      %GFP_FS      allocation should not call back into a file system.
2155  *      %GFP_ATOMIC  don't sleep.
2156  *
2157  *	@order: Order of the GFP allocation.
2158  * 	@vma:  Pointer to VMA or NULL if not available.
2159  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2160  *	@node: Which node to prefer for allocation (modulo policy).
2161  *	@hugepage: for hugepages try only the preferred node if possible
2162  *
2163  * 	This function allocates a page from the kernel page pool and applies
2164  *	a NUMA policy associated with the VMA or the current process.
2165  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2166  *	mm_struct of the VMA to prevent it from going away. Should be used for
2167  *	all allocations for pages that will be mapped into user space. Returns
2168  *	NULL when no page can be allocated.
2169  */
2170 struct page *
2171 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2172 		unsigned long addr, int node, bool hugepage)
2173 {
2174 	struct mempolicy *pol;
2175 	struct page *page;
2176 	int preferred_nid;
2177 	nodemask_t *nmask;
2178 
2179 	pol = get_vma_policy(vma, addr);
2180 
2181 	if (pol->mode == MPOL_INTERLEAVE) {
2182 		unsigned nid;
2183 
2184 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2185 		mpol_cond_put(pol);
2186 		page = alloc_page_interleave(gfp, order, nid);
2187 		goto out;
2188 	}
2189 
2190 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2191 		int hpage_node = node;
2192 
2193 		/*
2194 		 * For hugepage allocation and non-interleave policy which
2195 		 * allows the current node (or other explicitly preferred
2196 		 * node) we only try to allocate from the current/preferred
2197 		 * node and don't fall back to other nodes, as the cost of
2198 		 * remote accesses would likely offset THP benefits.
2199 		 *
2200 		 * If the policy is interleave, or does not allow the current
2201 		 * node in its nodemask, we allocate the standard way.
2202 		 */
2203 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2204 			hpage_node = pol->v.preferred_node;
2205 
2206 		nmask = policy_nodemask(gfp, pol);
2207 		if (!nmask || node_isset(hpage_node, *nmask)) {
2208 			mpol_cond_put(pol);
2209 			/*
2210 			 * First, try to allocate THP only on local node, but
2211 			 * don't reclaim unnecessarily, just compact.
2212 			 */
2213 			page = __alloc_pages_node(hpage_node,
2214 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2215 
2216 			/*
2217 			 * If hugepage allocations are configured to always
2218 			 * compact synchronously, or the vma has been madvised
2219 			 * to prefer hugepage backing, retry allowing remote
2220 			 * memory with both reclaim and compaction as well.
2221 			 */
2222 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2223 				page = __alloc_pages_node(hpage_node,
2224 								gfp, order);
2225 
2226 			goto out;
2227 		}
2228 	}
2229 
2230 	nmask = policy_nodemask(gfp, pol);
2231 	preferred_nid = policy_node(gfp, pol, node);
2232 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2233 	mpol_cond_put(pol);
2234 out:
2235 	return page;
2236 }
2237 EXPORT_SYMBOL(alloc_pages_vma);
2238 
2239 /**
2240  * 	alloc_pages_current - Allocate pages.
2241  *
2242  *	@gfp:
2243  *		%GFP_USER   user allocation,
2244  *      	%GFP_KERNEL kernel allocation,
2245  *      	%GFP_HIGHMEM highmem allocation,
2246  *      	%GFP_FS     don't call back into a file system.
2247  *      	%GFP_ATOMIC don't sleep.
2248  *	@order: Power of two of allocation size in pages. 0 is a single page.
2249  *
2250  *	Allocate a page from the kernel page pool.  When not in
2251  *	Allocate a page from the kernel page pool, applying the current
2252  *	process' NUMA policy when not in interrupt context.
2253  */
2254 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2255 {
2256 	struct mempolicy *pol = &default_policy;
2257 	struct page *page;
2258 
2259 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2260 		pol = get_task_policy(current);
2261 
2262 	/*
2263 	 * No reference counting needed for current->mempolicy
2264 	 * nor system default_policy
2265 	 */
2266 	if (pol->mode == MPOL_INTERLEAVE)
2267 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2268 	else
2269 		page = __alloc_pages_nodemask(gfp, order,
2270 				policy_node(gfp, pol, numa_node_id()),
2271 				policy_nodemask(gfp, pol));
2272 
2273 	return page;
2274 }
2275 EXPORT_SYMBOL(alloc_pages_current);
2276 
2277 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2278 {
2279 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2280 
2281 	if (IS_ERR(pol))
2282 		return PTR_ERR(pol);
2283 	dst->vm_policy = pol;
2284 	return 0;
2285 }
2286 
2287 /*
2288  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2289  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2290  * with the mems_allowed returned by cpuset_mems_allowed().  This
2291  * keeps mempolicies cpuset-relative after its cpuset moves.  See
2292  * further kernel/cpuset.c update_nodemask().
2293  *
2294  * current's mempolicy may be rebound by another task (the task that changes
2295  * the cpuset's mems), so we need not do the rebind work for the current task.
2296  */
2297 
2298 /* Slow path of a mempolicy duplicate */
2299 struct mempolicy *__mpol_dup(struct mempolicy *old)
2300 {
2301 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2302 
2303 	if (!new)
2304 		return ERR_PTR(-ENOMEM);
2305 
2306 	/* task's mempolicy is protected by alloc_lock */
2307 	if (old == current->mempolicy) {
2308 		task_lock(current);
2309 		*new = *old;
2310 		task_unlock(current);
2311 	} else
2312 		*new = *old;
2313 
2314 	if (current_cpuset_is_being_rebound()) {
2315 		nodemask_t mems = cpuset_mems_allowed(current);
2316 		mpol_rebind_policy(new, &mems);
2317 	}
2318 	atomic_set(&new->refcnt, 1);
2319 	return new;
2320 }
2321 
2322 /* Slow path of a mempolicy comparison */
2323 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2324 {
2325 	if (!a || !b)
2326 		return false;
2327 	if (a->mode != b->mode)
2328 		return false;
2329 	if (a->flags != b->flags)
2330 		return false;
2331 	if (mpol_store_user_nodemask(a))
2332 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2333 			return false;
2334 
2335 	switch (a->mode) {
2336 	case MPOL_BIND:
2337 	case MPOL_INTERLEAVE:
2338 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2339 	case MPOL_PREFERRED:
2340 		/* a's ->flags is the same as b's */
2341 		if (a->flags & MPOL_F_LOCAL)
2342 			return true;
2343 		return a->v.preferred_node == b->v.preferred_node;
2344 	default:
2345 		BUG();
2346 		return false;
2347 	}
2348 }
2349 
2350 /*
2351  * Shared memory backing store policy support.
2352  *
2353  * Remember policies even when nobody has shared memory mapped.
2354  * The policies are kept in Red-Black tree linked from the inode.
2355  * They are protected by the sp->lock rwlock, which should be held
2356  * for any accesses to the tree.
2357  */
2358 
2359 /*
2360  * lookup first element intersecting start-end.  Caller holds sp->lock for
2361  * reading or for writing
2362  */
2363 static struct sp_node *
2364 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2365 {
2366 	struct rb_node *n = sp->root.rb_node;
2367 
2368 	while (n) {
2369 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2370 
2371 		if (start >= p->end)
2372 			n = n->rb_right;
2373 		else if (end <= p->start)
2374 			n = n->rb_left;
2375 		else
2376 			break;
2377 	}
2378 	if (!n)
2379 		return NULL;
2380 	for (;;) {
2381 		struct sp_node *w = NULL;
2382 		struct rb_node *prev = rb_prev(n);
2383 		if (!prev)
2384 			break;
2385 		w = rb_entry(prev, struct sp_node, nd);
2386 		if (w->end <= start)
2387 			break;
2388 		n = prev;
2389 	}
2390 	return rb_entry(n, struct sp_node, nd);
2391 }
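/*
 * Worked example (illustrative): with ranges [2,5) and [5,9) in the tree,
 * sp_lookup(sp, 4, 8) may descend to [5,9) first, but the backward walk above
 * then steps to [2,5) because its end (5) is greater than start (4), so the
 * lowest intersecting range, [2,5), is returned.
 */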
2392 
2393 /*
2394  * Insert a new shared policy into the list.  Caller holds sp->lock for
2395  * writing.
2396  */
2397 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2398 {
2399 	struct rb_node **p = &sp->root.rb_node;
2400 	struct rb_node *parent = NULL;
2401 	struct sp_node *nd;
2402 
2403 	while (*p) {
2404 		parent = *p;
2405 		nd = rb_entry(parent, struct sp_node, nd);
2406 		if (new->start < nd->start)
2407 			p = &(*p)->rb_left;
2408 		else if (new->end > nd->end)
2409 			p = &(*p)->rb_right;
2410 		else
2411 			BUG();
2412 	}
2413 	rb_link_node(&new->nd, parent, p);
2414 	rb_insert_color(&new->nd, &sp->root);
2415 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2416 		 new->policy ? new->policy->mode : 0);
2417 }
2418 
2419 /* Find shared policy intersecting idx */
2420 struct mempolicy *
2421 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2422 {
2423 	struct mempolicy *pol = NULL;
2424 	struct sp_node *sn;
2425 
2426 	if (!sp->root.rb_node)
2427 		return NULL;
2428 	read_lock(&sp->lock);
2429 	sn = sp_lookup(sp, idx, idx+1);
2430 	if (sn) {
2431 		mpol_get(sn->policy);
2432 		pol = sn->policy;
2433 	}
2434 	read_unlock(&sp->lock);
2435 	return pol;
2436 }
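/*
 * Illustrative sketch (hypothetical helper, not in the kernel): consuming a
 * shared-policy lookup the way shmem does.  Policies stored in the tree are
 * marked MPOL_F_SHARED (see sp_alloc()), so the reference taken by
 * mpol_shared_policy_lookup() is dropped by mpol_cond_put().
 */
static bool shared_index_is_interleaved(struct shared_policy *sp,
					unsigned long idx)
{
	struct mempolicy *pol = mpol_shared_policy_lookup(sp, idx);
	bool ret = pol && pol->mode == MPOL_INTERLEAVE;

	mpol_cond_put(pol);	/* safe when pol is NULL */
	return ret;
}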
2437 
2438 static void sp_free(struct sp_node *n)
2439 {
2440 	mpol_put(n->policy);
2441 	kmem_cache_free(sn_cache, n);
2442 }
2443 
2444 /**
2445  * mpol_misplaced - check whether current page node is valid in policy
2446  *
2447  * @page: page to be checked
2448  * @vma: vm area where page mapped
2449  * @addr: virtual address where page mapped
2450  *
2451  * Lookup current policy node id for vma,addr and "compare to" page's
2452  * node id.
2453  *
2454  * Returns:
2455  *	-1	- not misplaced, page is in the right node
2456  *	node	- node id where the page should be
2457  *
2458  * Policy determination "mimics" alloc_page_vma().
2459  * Called from fault path where we know the vma and faulting address.
2460  */
2461 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2462 {
2463 	struct mempolicy *pol;
2464 	struct zoneref *z;
2465 	int curnid = page_to_nid(page);
2466 	unsigned long pgoff;
2467 	int thiscpu = raw_smp_processor_id();
2468 	int thisnid = cpu_to_node(thiscpu);
2469 	int polnid = NUMA_NO_NODE;
2470 	int ret = -1;
2471 
2472 	pol = get_vma_policy(vma, addr);
2473 	if (!(pol->flags & MPOL_F_MOF))
2474 		goto out;
2475 
2476 	switch (pol->mode) {
2477 	case MPOL_INTERLEAVE:
2478 		pgoff = vma->vm_pgoff;
2479 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2480 		polnid = offset_il_node(pol, pgoff);
2481 		break;
2482 
2483 	case MPOL_PREFERRED:
2484 		if (pol->flags & MPOL_F_LOCAL)
2485 			polnid = numa_node_id();
2486 		else
2487 			polnid = pol->v.preferred_node;
2488 		break;
2489 
2490 	case MPOL_BIND:
2491 
2492 		/*
2493 		 * MPOL_BIND allows binding to multiple nodes.  Use the
2494 		 * current page's node if it is in the policy nodemask,
2495 		 * else select the nearest allowed node, if any.
2496 		 * If there are no allowed nodes, use the current node [!misplaced].
2497 		 */
2498 		if (node_isset(curnid, pol->v.nodes))
2499 			goto out;
2500 		z = first_zones_zonelist(
2501 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2502 				gfp_zone(GFP_HIGHUSER),
2503 				&pol->v.nodes);
2504 		polnid = zone_to_nid(z->zone);
2505 		break;
2506 
2507 	default:
2508 		BUG();
2509 	}
2510 
2511 	/* Migrate the page towards the node whose CPU is referencing it */
2512 	if (pol->flags & MPOL_F_MORON) {
2513 		polnid = thisnid;
2514 
2515 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2516 			goto out;
2517 	}
2518 
2519 	if (curnid != polnid)
2520 		ret = polnid;
2521 out:
2522 	mpol_cond_put(pol);
2523 
2524 	return ret;
2525 }
2526 
2527 /*
2528  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2529  * dropped after task->mempolicy is set to NULL so that any allocation done as
2530  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2531  * policy.
2532  */
2533 void mpol_put_task_policy(struct task_struct *task)
2534 {
2535 	struct mempolicy *pol;
2536 
2537 	task_lock(task);
2538 	pol = task->mempolicy;
2539 	task->mempolicy = NULL;
2540 	task_unlock(task);
2541 	mpol_put(pol);
2542 }
2543 
2544 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2545 {
2546 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2547 	rb_erase(&n->nd, &sp->root);
2548 	sp_free(n);
2549 }
2550 
2551 static void sp_node_init(struct sp_node *node, unsigned long start,
2552 			unsigned long end, struct mempolicy *pol)
2553 {
2554 	node->start = start;
2555 	node->end = end;
2556 	node->policy = pol;
2557 }
2558 
2559 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2560 				struct mempolicy *pol)
2561 {
2562 	struct sp_node *n;
2563 	struct mempolicy *newpol;
2564 
2565 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2566 	if (!n)
2567 		return NULL;
2568 
2569 	newpol = mpol_dup(pol);
2570 	if (IS_ERR(newpol)) {
2571 		kmem_cache_free(sn_cache, n);
2572 		return NULL;
2573 	}
2574 	newpol->flags |= MPOL_F_SHARED;
2575 	sp_node_init(n, start, end, newpol);
2576 
2577 	return n;
2578 }
2579 
2580 /* Replace a policy range. */
2581 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2582 				 unsigned long end, struct sp_node *new)
2583 {
2584 	struct sp_node *n;
2585 	struct sp_node *n_new = NULL;
2586 	struct mempolicy *mpol_new = NULL;
2587 	int ret = 0;
2588 
2589 restart:
2590 	write_lock(&sp->lock);
2591 	n = sp_lookup(sp, start, end);
2592 	/* Take care of old policies in the same range. */
2593 	while (n && n->start < end) {
2594 		struct rb_node *next = rb_next(&n->nd);
2595 		if (n->start >= start) {
2596 			if (n->end <= end)
2597 				sp_delete(sp, n);
2598 			else
2599 				n->start = end;
2600 		} else {
2601 			/* Old policy spanning whole new range. */
2602 			if (n->end > end) {
2603 				if (!n_new)
2604 					goto alloc_new;
2605 
2606 				*mpol_new = *n->policy;
2607 				atomic_set(&mpol_new->refcnt, 1);
2608 				sp_node_init(n_new, end, n->end, mpol_new);
2609 				n->end = start;
2610 				sp_insert(sp, n_new);
2611 				n_new = NULL;
2612 				mpol_new = NULL;
2613 				break;
2614 			} else
2615 				n->end = start;
2616 		}
2617 		if (!next)
2618 			break;
2619 		n = rb_entry(next, struct sp_node, nd);
2620 	}
2621 	if (new)
2622 		sp_insert(sp, new);
2623 	write_unlock(&sp->lock);
2624 	ret = 0;
2625 
2626 err_out:
2627 	if (mpol_new)
2628 		mpol_put(mpol_new);
2629 	if (n_new)
2630 		kmem_cache_free(sn_cache, n_new);
2631 
2632 	return ret;
2633 
2634 alloc_new:
2635 	write_unlock(&sp->lock);
2636 	ret = -ENOMEM;
2637 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2638 	if (!n_new)
2639 		goto err_out;
2640 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2641 	if (!mpol_new)
2642 		goto err_out;
2643 	goto restart;
2644 }
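/*
 * Worked example (illustrative): replacing [3,7) while the tree holds a
 * single policy over [0,10) takes the "old policy spanning whole new range"
 * branch above: the old node is trimmed to [0,3), a duplicate of its policy
 * is inserted as [7,10), and the new node then covers [3,7).
 */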
2645 
2646 /**
2647  * mpol_shared_policy_init - initialize shared policy for inode
2648  * @sp: pointer to inode shared policy
2649  * @mpol:  struct mempolicy to install
2650  *
2651  * Install non-NULL @mpol in inode's shared policy rb-tree.
2652  * On entry, the current task has a reference on a non-NULL @mpol.
2653  * This must be released on exit.
2654  * This is called at get_inode() time, so we can use GFP_KERNEL.
2655  */
2656 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2657 {
2658 	int ret;
2659 
2660 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2661 	rwlock_init(&sp->lock);
2662 
2663 	if (mpol) {
2664 		struct vm_area_struct pvma;
2665 		struct mempolicy *new;
2666 		NODEMASK_SCRATCH(scratch);
2667 
2668 		if (!scratch)
2669 			goto put_mpol;
2670 		/* contextualize the tmpfs mount point mempolicy */
2671 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2672 		if (IS_ERR(new))
2673 			goto free_scratch; /* no valid nodemask intersection */
2674 
2675 		task_lock(current);
2676 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2677 		task_unlock(current);
2678 		if (ret)
2679 			goto put_new;
2680 
2681 		/* Create pseudo-vma that contains just the policy */
2682 		vma_init(&pvma, NULL);
2683 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2684 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2685 
2686 put_new:
2687 		mpol_put(new);			/* drop initial ref */
2688 free_scratch:
2689 		NODEMASK_SCRATCH_FREE(scratch);
2690 put_mpol:
2691 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2692 	}
2693 }
2694 
2695 int mpol_set_shared_policy(struct shared_policy *info,
2696 			struct vm_area_struct *vma, struct mempolicy *npol)
2697 {
2698 	int err;
2699 	struct sp_node *new = NULL;
2700 	unsigned long sz = vma_pages(vma);
2701 
2702 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2703 		 vma->vm_pgoff,
2704 		 sz, npol ? npol->mode : -1,
2705 		 npol ? npol->flags : -1,
2706 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2707 
2708 	if (npol) {
2709 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2710 		if (!new)
2711 			return -ENOMEM;
2712 	}
2713 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2714 	if (err && new)
2715 		sp_free(new);
2716 	return err;
2717 }
2718 
2719 /* Free a backing policy store on inode delete. */
2720 void mpol_free_shared_policy(struct shared_policy *p)
2721 {
2722 	struct sp_node *n;
2723 	struct rb_node *next;
2724 
2725 	if (!p->root.rb_node)
2726 		return;
2727 	write_lock(&p->lock);
2728 	next = rb_first(&p->root);
2729 	while (next) {
2730 		n = rb_entry(next, struct sp_node, nd);
2731 		next = rb_next(&n->nd);
2732 		sp_delete(p, n);
2733 	}
2734 	write_unlock(&p->lock);
2735 }
2736 
2737 #ifdef CONFIG_NUMA_BALANCING
2738 static int __initdata numabalancing_override;
2739 
2740 static void __init check_numabalancing_enable(void)
2741 {
2742 	bool numabalancing_default = false;
2743 
2744 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2745 		numabalancing_default = true;
2746 
2747 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2748 	if (numabalancing_override)
2749 		set_numabalancing_state(numabalancing_override == 1);
2750 
2751 	if (num_online_nodes() > 1 && !numabalancing_override) {
2752 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2753 			numabalancing_default ? "Enabling" : "Disabling");
2754 		set_numabalancing_state(numabalancing_default);
2755 	}
2756 }
2757 
2758 static int __init setup_numabalancing(char *str)
2759 {
2760 	int ret = 0;
2761 	if (!str)
2762 		goto out;
2763 
2764 	if (!strcmp(str, "enable")) {
2765 		numabalancing_override = 1;
2766 		ret = 1;
2767 	} else if (!strcmp(str, "disable")) {
2768 		numabalancing_override = -1;
2769 		ret = 1;
2770 	}
2771 out:
2772 	if (!ret)
2773 		pr_warn("Unable to parse numa_balancing=\n");
2774 
2775 	return ret;
2776 }
2777 __setup("numa_balancing=", setup_numabalancing);
2778 #else
2779 static inline void __init check_numabalancing_enable(void)
2780 {
2781 }
2782 #endif /* CONFIG_NUMA_BALANCING */
2783 
2784 /* assumes fs == KERNEL_DS */
2785 void __init numa_policy_init(void)
2786 {
2787 	nodemask_t interleave_nodes;
2788 	unsigned long largest = 0;
2789 	int nid, prefer = 0;
2790 
2791 	policy_cache = kmem_cache_create("numa_policy",
2792 					 sizeof(struct mempolicy),
2793 					 0, SLAB_PANIC, NULL);
2794 
2795 	sn_cache = kmem_cache_create("shared_policy_node",
2796 				     sizeof(struct sp_node),
2797 				     0, SLAB_PANIC, NULL);
2798 
2799 	for_each_node(nid) {
2800 		preferred_node_policy[nid] = (struct mempolicy) {
2801 			.refcnt = ATOMIC_INIT(1),
2802 			.mode = MPOL_PREFERRED,
2803 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2804 			.v = { .preferred_node = nid, },
2805 		};
2806 	}
2807 
2808 	/*
2809 	 * Set interleaving policy for system init. Interleaving is only
2810 	 * enabled across suitably sized nodes (default is >= 16MB), falling
2811 	 * back to the largest node if they're all smaller.
2812 	 */
2813 	nodes_clear(interleave_nodes);
2814 	for_each_node_state(nid, N_MEMORY) {
2815 		unsigned long total_pages = node_present_pages(nid);
2816 
2817 		/* Preserve the largest node */
2818 		if (largest < total_pages) {
2819 			largest = total_pages;
2820 			prefer = nid;
2821 		}
2822 
2823 		/* Interleave this node? */
2824 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2825 			node_set(nid, interleave_nodes);
2826 	}
2827 
2828 	/* All too small, use the largest */
2829 	if (unlikely(nodes_empty(interleave_nodes)))
2830 		node_set(prefer, interleave_nodes);
2831 
2832 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2833 		pr_err("%s: interleaving failed\n", __func__);
2834 
2835 	check_numabalancing_enable();
2836 }
2837 
2838 /* Reset policy of current process to default */
2839 void numa_default_policy(void)
2840 {
2841 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2842 }
2843 
2844 /*
2845  * Parse and format mempolicy from/to strings
2846  */
2847 
2848 /*
2849  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2850  */
2851 static const char * const policy_modes[] =
2852 {
2853 	[MPOL_DEFAULT]    = "default",
2854 	[MPOL_PREFERRED]  = "prefer",
2855 	[MPOL_BIND]       = "bind",
2856 	[MPOL_INTERLEAVE] = "interleave",
2857 	[MPOL_LOCAL]      = "local",
2858 };
2859 
2860 
2861 #ifdef CONFIG_TMPFS
2862 /**
2863  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2864  * @str:  string containing mempolicy to parse
2865  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2866  *
2867  * Format of input:
2868  *	<mode>[=<flags>][:<nodelist>]
2869  *
2870  * On success, returns 0, else 1
2871  */
2872 int mpol_parse_str(char *str, struct mempolicy **mpol)
2873 {
2874 	struct mempolicy *new = NULL;
2875 	unsigned short mode_flags;
2876 	nodemask_t nodes;
2877 	char *nodelist = strchr(str, ':');
2878 	char *flags = strchr(str, '=');
2879 	int err = 1, mode;
2880 
2881 	if (flags)
2882 		*flags++ = '\0';	/* terminate mode string */
2883 
2884 	if (nodelist) {
2885 		/* NUL-terminate mode or flags string */
2886 		*nodelist++ = '\0';
2887 		if (nodelist_parse(nodelist, nodes))
2888 			goto out;
2889 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2890 			goto out;
2891 	} else
2892 		nodes_clear(nodes);
2893 
2894 	mode = match_string(policy_modes, MPOL_MAX, str);
2895 	if (mode < 0)
2896 		goto out;
2897 
2898 	switch (mode) {
2899 	case MPOL_PREFERRED:
2900 		/*
2901 		 * Insist on a nodelist of one node only; later we use
2902 		 * first_node(nodes) to grab the single node, so the
2903 		 * nodelist (or nodes) cannot be empty here.
2904 		 */
2905 		if (nodelist) {
2906 			char *rest = nodelist;
2907 			while (isdigit(*rest))
2908 				rest++;
2909 			if (*rest)
2910 				goto out;
2911 			if (nodes_empty(nodes))
2912 				goto out;
2913 		}
2914 		break;
2915 	case MPOL_INTERLEAVE:
2916 		/*
2917 		 * Default to online nodes with memory if no nodelist
2918 		 */
2919 		if (!nodelist)
2920 			nodes = node_states[N_MEMORY];
2921 		break;
2922 	case MPOL_LOCAL:
2923 		/*
2924 		 * Don't allow a nodelist;  mpol_new() checks flags
2925 		 */
2926 		if (nodelist)
2927 			goto out;
2928 		mode = MPOL_PREFERRED;
2929 		break;
2930 	case MPOL_DEFAULT:
2931 		/*
2932 		 * Insist on an empty nodelist
2933 		 */
2934 		if (!nodelist)
2935 			err = 0;
2936 		goto out;
2937 	case MPOL_BIND:
2938 		/*
2939 		 * Insist on a nodelist
2940 		 */
2941 		if (!nodelist)
2942 			goto out;
2943 	}
2944 
2945 	mode_flags = 0;
2946 	if (flags) {
2947 		/*
2948 		 * Currently, we only support two mutually exclusive
2949 		 * mode flags.
2950 		 */
2951 		if (!strcmp(flags, "static"))
2952 			mode_flags |= MPOL_F_STATIC_NODES;
2953 		else if (!strcmp(flags, "relative"))
2954 			mode_flags |= MPOL_F_RELATIVE_NODES;
2955 		else
2956 			goto out;
2957 	}
2958 
2959 	new = mpol_new(mode, mode_flags, &nodes);
2960 	if (IS_ERR(new))
2961 		goto out;
2962 
2963 	/*
2964 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2965 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2966 	 */
2967 	if (mode != MPOL_PREFERRED)
2968 		new->v.nodes = nodes;
2969 	else if (nodelist)
2970 		new->v.preferred_node = first_node(nodes);
2971 	else
2972 		new->flags |= MPOL_F_LOCAL;
2973 
2974 	/*
2975 	 * Save nodes for contextualization: this will be used to "clone"
2976 	 * the mempolicy in a specific context [cpuset] at a later time.
2977 	 */
2978 	new->w.user_nodemask = nodes;
2979 
2980 	err = 0;
2981 
2982 out:
2983 	/* Restore string for error message */
2984 	if (nodelist)
2985 		*--nodelist = ':';
2986 	if (flags)
2987 		*--flags = '=';
2988 	if (!err)
2989 		*mpol = new;
2990 	return err;
2991 }
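/*
 * Illustrative sketch (hypothetical helper, not in the kernel): feeding a
 * tmpfs-style mount option through mpol_parse_str().  The strings follow the
 * <mode>[=<flags>][:<nodelist>] format documented above, e.g.
 * "interleave:0-3", "prefer=static:1", "bind=relative:0,2" or "local".
 * The returned mempolicy holds a reference the caller must mpol_put() later.
 */
static struct mempolicy *parse_example_mpol(void)
{
	char buf[] = "interleave:0-3";	/* mpol_parse_str() writes into str */
	struct mempolicy *mpol = NULL;

	if (mpol_parse_str(buf, &mpol))
		return NULL;		/* non-zero means the string was rejected */
	return mpol;
}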
2992 #endif /* CONFIG_TMPFS */
2993 
2994 /**
2995  * mpol_to_str - format a mempolicy structure for printing
2996  * @buffer:  to contain formatted mempolicy string
2997  * @maxlen:  length of @buffer
2998  * @pol:  pointer to mempolicy to be formatted
2999  *
3000  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3001  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3002  * longest flag, "relative", and to display at least a few node ids.
3003  */
3004 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3005 {
3006 	char *p = buffer;
3007 	nodemask_t nodes = NODE_MASK_NONE;
3008 	unsigned short mode = MPOL_DEFAULT;
3009 	unsigned short flags = 0;
3010 
3011 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3012 		mode = pol->mode;
3013 		flags = pol->flags;
3014 	}
3015 
3016 	switch (mode) {
3017 	case MPOL_DEFAULT:
3018 		break;
3019 	case MPOL_PREFERRED:
3020 		if (flags & MPOL_F_LOCAL)
3021 			mode = MPOL_LOCAL;
3022 		else
3023 			node_set(pol->v.preferred_node, nodes);
3024 		break;
3025 	case MPOL_BIND:
3026 	case MPOL_INTERLEAVE:
3027 		nodes = pol->v.nodes;
3028 		break;
3029 	default:
3030 		WARN_ON_ONCE(1);
3031 		snprintf(p, maxlen, "unknown");
3032 		return;
3033 	}
3034 
3035 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3036 
3037 	if (flags & MPOL_MODE_FLAGS) {
3038 		p += snprintf(p, buffer + maxlen - p, "=");
3039 
3040 		/*
3041 		 * Currently, the only defined flags are mutually exclusive
3042 		 */
3043 		if (flags & MPOL_F_STATIC_NODES)
3044 			p += snprintf(p, buffer + maxlen - p, "static");
3045 		else if (flags & MPOL_F_RELATIVE_NODES)
3046 			p += snprintf(p, buffer + maxlen - p, "relative");
3047 	}
3048 
3049 	if (!nodes_empty(nodes))
3050 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3051 			       nodemask_pr_args(&nodes));
3052 }
3053
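/*
 * Illustrative sketch (hypothetical helper, not in the kernel): formatting
 * the current task's policy the way /proc/<pid>/numa_maps does.  A 64-byte
 * buffer comfortably exceeds the recommended minimum of 32.
 */
static void show_current_mempolicy_sketch(void)
{
	char buf[64];

	mpol_to_str(buf, sizeof(buf), current->mempolicy);
	pr_info("current mempolicy: %s\n", buf);	/* e.g. "interleave:0-3" */
}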