xref: /linux/mm/mempolicy.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always cope gracefully with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97 
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 
101 #include "internal.h"
102 
103 /* Internal flags */
104 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
105 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
106 
107 static struct kmem_cache *policy_cache;
108 static struct kmem_cache *sn_cache;
109 
110 /* Highest zone. An specific allocation for a zone below that is not
111    policied. */
112 enum zone_type policy_zone = 0;
113 
114 /*
115  * run-time system-wide default policy => local allocation
116  */
117 static struct mempolicy default_policy = {
118 	.refcnt = ATOMIC_INIT(1), /* never free it */
119 	.mode = MPOL_PREFERRED,
120 	.flags = MPOL_F_LOCAL,
121 };
122 
123 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
124 
125 struct mempolicy *get_task_policy(struct task_struct *p)
126 {
127 	struct mempolicy *pol = p->mempolicy;
128 	int node;
129 
130 	if (pol)
131 		return pol;
132 
133 	node = numa_node_id();
134 	if (node != NUMA_NO_NODE) {
135 		pol = &preferred_node_policy[node];
136 		/* preferred_node_policy is not initialised early in boot */
137 		if (pol->mode)
138 			return pol;
139 	}
140 
141 	return &default_policy;
142 }
143 
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If read-side task has no lock to protect task->mempolicy, write-side
148 	 * task will rebind the task->mempolicy by two step. The first step is
149 	 * setting all the newly nodes, and the second step is cleaning all the
150 	 * disallowed nodes. In this way, we can avoid finding no node to alloc
151 	 * page.
152 	 * If we have a lock to protect task->mempolicy in read-side, we do
153 	 * rebind directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE - do rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the newly nodes
158 	 * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
163 
164 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
165 {
166 	return pol->flags & MPOL_MODE_FLAGS;
167 }
168 
169 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
170 				   const nodemask_t *rel)
171 {
172 	nodemask_t tmp;
173 	nodes_fold(tmp, *orig, nodes_weight(*rel));
174 	nodes_onto(*ret, tmp, *rel);
175 }
176 
177 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (nodes_empty(*nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184 
185 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
186 {
187 	if (!nodes)
188 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
189 	else if (nodes_empty(*nodes))
190 		return -EINVAL;			/*  no allowed nodes */
191 	else
192 		pol->v.preferred_node = first_node(*nodes);
193 	return 0;
194 }
195 
196 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
197 {
198 	if (nodes_empty(*nodes))
199 		return -EINVAL;
200 	pol->v.nodes = *nodes;
201 	return 0;
202 }
203 
204 /*
205  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
206  * any, for the new policy.  mpol_new() has already validated the nodes
207  * parameter with respect to the policy mode and flags.  But, we need to
208  * handle an empty nodemask with MPOL_PREFERRED here.
209  *
210  * Must be called holding task's alloc_lock to protect task's mems_allowed
211  * and mempolicy.  May also be called holding the mmap_semaphore for write.
212  */
213 static int mpol_set_nodemask(struct mempolicy *pol,
214 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
215 {
216 	int ret;
217 
218 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
219 	if (pol == NULL)
220 		return 0;
221 	/* Check N_MEMORY */
222 	nodes_and(nsc->mask1,
223 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
224 
225 	VM_BUG_ON(!nodes);
226 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
227 		nodes = NULL;	/* explicit local allocation */
228 	else {
229 		if (pol->flags & MPOL_F_RELATIVE_NODES)
230 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
231 		else
232 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
233 
234 		if (mpol_store_user_nodemask(pol))
235 			pol->w.user_nodemask = *nodes;
236 		else
237 			pol->w.cpuset_mems_allowed =
238 						cpuset_current_mems_allowed;
239 	}
240 
241 	if (nodes)
242 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
243 	else
244 		ret = mpol_ops[pol->mode].create(pol, NULL);
245 	return ret;
246 }
247 
248 /*
249  * This function just creates a new policy, does some check and simple
250  * initialization. You must invoke mpol_set_nodemask() to set nodes.
251  */
252 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
253 				  nodemask_t *nodes)
254 {
255 	struct mempolicy *policy;
256 
257 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
258 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
259 
260 	if (mode == MPOL_DEFAULT) {
261 		if (nodes && !nodes_empty(*nodes))
262 			return ERR_PTR(-EINVAL);
263 		return NULL;
264 	}
265 	VM_BUG_ON(!nodes);
266 
267 	/*
268 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
269 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
270 	 * All other modes require a valid pointer to a non-empty nodemask.
271 	 */
272 	if (mode == MPOL_PREFERRED) {
273 		if (nodes_empty(*nodes)) {
274 			if (((flags & MPOL_F_STATIC_NODES) ||
275 			     (flags & MPOL_F_RELATIVE_NODES)))
276 				return ERR_PTR(-EINVAL);
277 		}
278 	} else if (mode == MPOL_LOCAL) {
279 		if (!nodes_empty(*nodes))
280 			return ERR_PTR(-EINVAL);
281 		mode = MPOL_PREFERRED;
282 	} else if (nodes_empty(*nodes))
283 		return ERR_PTR(-EINVAL);
284 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
285 	if (!policy)
286 		return ERR_PTR(-ENOMEM);
287 	atomic_set(&policy->refcnt, 1);
288 	policy->mode = mode;
289 	policy->flags = flags;
290 
291 	return policy;
292 }
293 
294 /* Slow path of a mpol destructor. */
295 void __mpol_put(struct mempolicy *p)
296 {
297 	if (!atomic_dec_and_test(&p->refcnt))
298 		return;
299 	kmem_cache_free(policy_cache, p);
300 }
301 
302 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
303 				enum mpol_rebind_step step)
304 {
305 }
306 
307 /*
308  * step:
309  * 	MPOL_REBIND_ONCE  - do rebind work at once
310  * 	MPOL_REBIND_STEP1 - set all the newly nodes
311  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
312  */
313 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
314 				 enum mpol_rebind_step step)
315 {
316 	nodemask_t tmp;
317 
318 	if (pol->flags & MPOL_F_STATIC_NODES)
319 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
320 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
321 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
322 	else {
323 		/*
324 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
325 		 * result
326 		 */
327 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
328 			nodes_remap(tmp, pol->v.nodes,
329 					pol->w.cpuset_mems_allowed, *nodes);
330 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
331 		} else if (step == MPOL_REBIND_STEP2) {
332 			tmp = pol->w.cpuset_mems_allowed;
333 			pol->w.cpuset_mems_allowed = *nodes;
334 		} else
335 			BUG();
336 	}
337 
338 	if (nodes_empty(tmp))
339 		tmp = *nodes;
340 
341 	if (step == MPOL_REBIND_STEP1)
342 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
343 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
344 		pol->v.nodes = tmp;
345 	else
346 		BUG();
347 
348 	if (!node_isset(current->il_next, tmp)) {
349 		current->il_next = next_node_in(current->il_next, tmp);
350 		if (current->il_next >= MAX_NUMNODES)
351 			current->il_next = numa_node_id();
352 	}
353 }
354 
355 static void mpol_rebind_preferred(struct mempolicy *pol,
356 				  const nodemask_t *nodes,
357 				  enum mpol_rebind_step step)
358 {
359 	nodemask_t tmp;
360 
361 	if (pol->flags & MPOL_F_STATIC_NODES) {
362 		int node = first_node(pol->w.user_nodemask);
363 
364 		if (node_isset(node, *nodes)) {
365 			pol->v.preferred_node = node;
366 			pol->flags &= ~MPOL_F_LOCAL;
367 		} else
368 			pol->flags |= MPOL_F_LOCAL;
369 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
370 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
371 		pol->v.preferred_node = first_node(tmp);
372 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
373 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
374 						   pol->w.cpuset_mems_allowed,
375 						   *nodes);
376 		pol->w.cpuset_mems_allowed = *nodes;
377 	}
378 }
379 
380 /*
381  * mpol_rebind_policy - Migrate a policy to a different set of nodes
382  *
383  * If read-side task has no lock to protect task->mempolicy, write-side
384  * task will rebind the task->mempolicy by two step. The first step is
385  * setting all the newly nodes, and the second step is cleaning all the
386  * disallowed nodes. In this way, we can avoid finding no node to alloc
387  * page.
388  * If we have a lock to protect task->mempolicy in read-side, we do
389  * rebind directly.
390  *
391  * step:
392  * 	MPOL_REBIND_ONCE  - do rebind work at once
393  * 	MPOL_REBIND_STEP1 - set all the newly nodes
394  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
395  */
396 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
397 				enum mpol_rebind_step step)
398 {
399 	if (!pol)
400 		return;
401 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
402 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
403 		return;
404 
405 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
406 		return;
407 
408 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
409 		BUG();
410 
411 	if (step == MPOL_REBIND_STEP1)
412 		pol->flags |= MPOL_F_REBINDING;
413 	else if (step == MPOL_REBIND_STEP2)
414 		pol->flags &= ~MPOL_F_REBINDING;
415 	else if (step >= MPOL_REBIND_NSTEP)
416 		BUG();
417 
418 	mpol_ops[pol->mode].rebind(pol, newmask, step);
419 }
420 
421 /*
422  * Wrapper for mpol_rebind_policy() that just requires task
423  * pointer, and updates task mempolicy.
424  *
425  * Called with task's alloc_lock held.
426  */
427 
428 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
429 			enum mpol_rebind_step step)
430 {
431 	mpol_rebind_policy(tsk->mempolicy, new, step);
432 }
433 
434 /*
435  * Rebind each vma in mm to new nodemask.
436  *
437  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
438  */
439 
440 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
441 {
442 	struct vm_area_struct *vma;
443 
444 	down_write(&mm->mmap_sem);
445 	for (vma = mm->mmap; vma; vma = vma->vm_next)
446 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
447 	up_write(&mm->mmap_sem);
448 }
449 
450 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
451 	[MPOL_DEFAULT] = {
452 		.rebind = mpol_rebind_default,
453 	},
454 	[MPOL_INTERLEAVE] = {
455 		.create = mpol_new_interleave,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 	[MPOL_PREFERRED] = {
459 		.create = mpol_new_preferred,
460 		.rebind = mpol_rebind_preferred,
461 	},
462 	[MPOL_BIND] = {
463 		.create = mpol_new_bind,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 };
467 
468 static void migrate_page_add(struct page *page, struct list_head *pagelist,
469 				unsigned long flags);
470 
471 struct queue_pages {
472 	struct list_head *pagelist;
473 	unsigned long flags;
474 	nodemask_t *nmask;
475 	struct vm_area_struct *prev;
476 };
477 
478 /*
479  * Scan through pages checking if pages follow certain conditions,
480  * and move them to the pagelist if they do.
481  */
482 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
483 			unsigned long end, struct mm_walk *walk)
484 {
485 	struct vm_area_struct *vma = walk->vma;
486 	struct page *page;
487 	struct queue_pages *qp = walk->private;
488 	unsigned long flags = qp->flags;
489 	int nid, ret;
490 	pte_t *pte;
491 	spinlock_t *ptl;
492 
493 	if (pmd_trans_huge(*pmd)) {
494 		ptl = pmd_lock(walk->mm, pmd);
495 		if (pmd_trans_huge(*pmd)) {
496 			page = pmd_page(*pmd);
497 			if (is_huge_zero_page(page)) {
498 				spin_unlock(ptl);
499 				split_huge_pmd(vma, pmd, addr);
500 			} else {
501 				get_page(page);
502 				spin_unlock(ptl);
503 				lock_page(page);
504 				ret = split_huge_page(page);
505 				unlock_page(page);
506 				put_page(page);
507 				if (ret)
508 					return 0;
509 			}
510 		} else {
511 			spin_unlock(ptl);
512 		}
513 	}
514 
515 retry:
516 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
517 	for (; addr != end; pte++, addr += PAGE_SIZE) {
518 		if (!pte_present(*pte))
519 			continue;
520 		page = vm_normal_page(vma, addr, *pte);
521 		if (!page)
522 			continue;
523 		/*
524 		 * vm_normal_page() filters out zero pages, but there might
525 		 * still be PageReserved pages to skip, perhaps in a VDSO.
526 		 */
527 		if (PageReserved(page))
528 			continue;
529 		nid = page_to_nid(page);
530 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
531 			continue;
532 		if (PageTransCompound(page) && PageAnon(page)) {
533 			get_page(page);
534 			pte_unmap_unlock(pte, ptl);
535 			lock_page(page);
536 			ret = split_huge_page(page);
537 			unlock_page(page);
538 			put_page(page);
539 			/* Failed to split -- skip. */
540 			if (ret) {
541 				pte = pte_offset_map_lock(walk->mm, pmd,
542 						addr, &ptl);
543 				continue;
544 			}
545 			goto retry;
546 		}
547 
548 		migrate_page_add(page, qp->pagelist, flags);
549 	}
550 	pte_unmap_unlock(pte - 1, ptl);
551 	cond_resched();
552 	return 0;
553 }
554 
555 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
556 			       unsigned long addr, unsigned long end,
557 			       struct mm_walk *walk)
558 {
559 #ifdef CONFIG_HUGETLB_PAGE
560 	struct queue_pages *qp = walk->private;
561 	unsigned long flags = qp->flags;
562 	int nid;
563 	struct page *page;
564 	spinlock_t *ptl;
565 	pte_t entry;
566 
567 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
568 	entry = huge_ptep_get(pte);
569 	if (!pte_present(entry))
570 		goto unlock;
571 	page = pte_page(entry);
572 	nid = page_to_nid(page);
573 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
574 		goto unlock;
575 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
576 	if (flags & (MPOL_MF_MOVE_ALL) ||
577 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
578 		isolate_huge_page(page, qp->pagelist);
579 unlock:
580 	spin_unlock(ptl);
581 #else
582 	BUG();
583 #endif
584 	return 0;
585 }
586 
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the count reported by change_protection() for [@addr, @end) of
 * @vma; a non-zero count is also added to the NUMA_PTE_UPDATES event
 * counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	/*
	 * Same type as this function's return value, so the count is not
	 * squeezed through an int on its way to the caller.
	 */
	unsigned long nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
/* Without CONFIG_NUMA_BALANCING nothing is ever updated. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
615 
616 static int queue_pages_test_walk(unsigned long start, unsigned long end,
617 				struct mm_walk *walk)
618 {
619 	struct vm_area_struct *vma = walk->vma;
620 	struct queue_pages *qp = walk->private;
621 	unsigned long endvma = vma->vm_end;
622 	unsigned long flags = qp->flags;
623 
624 	if (!vma_migratable(vma))
625 		return 1;
626 
627 	if (endvma > end)
628 		endvma = end;
629 	if (vma->vm_start > start)
630 		start = vma->vm_start;
631 
632 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
633 		if (!vma->vm_next && vma->vm_end < end)
634 			return -EFAULT;
635 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
636 			return -EFAULT;
637 	}
638 
639 	qp->prev = vma;
640 
641 	if (flags & MPOL_MF_LAZY) {
642 		/* Similar to task_numa_work, skip inaccessible VMAs */
643 		if (!is_vm_hugetlb_page(vma) &&
644 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
645 			!(vma->vm_flags & VM_MIXEDMAP))
646 			change_prot_numa(vma, start, endvma);
647 		return 1;
648 	}
649 
650 	/* queue pages from current vma */
651 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
652 		return 0;
653 	return 1;
654 }
655 
656 /*
657  * Walk through page tables and collect pages to be migrated.
658  *
659  * If pages found in a given range are on a set of nodes (determined by
660  * @nodes and @flags,) it's isolated and queued to the pagelist which is
661  * passed via @private.)
662  */
663 static int
664 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
665 		nodemask_t *nodes, unsigned long flags,
666 		struct list_head *pagelist)
667 {
668 	struct queue_pages qp = {
669 		.pagelist = pagelist,
670 		.flags = flags,
671 		.nmask = nodes,
672 		.prev = NULL,
673 	};
674 	struct mm_walk queue_pages_walk = {
675 		.hugetlb_entry = queue_pages_hugetlb,
676 		.pmd_entry = queue_pages_pte_range,
677 		.test_walk = queue_pages_test_walk,
678 		.mm = mm,
679 		.private = &qp,
680 	};
681 
682 	return walk_page_range(start, end, &queue_pages_walk);
683 }
684 
685 /*
686  * Apply policy to a single VMA
687  * This must be called with the mmap_sem held for writing.
688  */
689 static int vma_replace_policy(struct vm_area_struct *vma,
690 						struct mempolicy *pol)
691 {
692 	int err;
693 	struct mempolicy *old;
694 	struct mempolicy *new;
695 
696 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
697 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
698 		 vma->vm_ops, vma->vm_file,
699 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
700 
701 	new = mpol_dup(pol);
702 	if (IS_ERR(new))
703 		return PTR_ERR(new);
704 
705 	if (vma->vm_ops && vma->vm_ops->set_policy) {
706 		err = vma->vm_ops->set_policy(vma, new);
707 		if (err)
708 			goto err_out;
709 	}
710 
711 	old = vma->vm_policy;
712 	vma->vm_policy = new; /* protected by mmap_sem */
713 	mpol_put(old);
714 
715 	return 0;
716  err_out:
717 	mpol_put(new);
718 	return err;
719 }
720 
721 /* Step 2: apply policy to a range and do splits. */
722 static int mbind_range(struct mm_struct *mm, unsigned long start,
723 		       unsigned long end, struct mempolicy *new_pol)
724 {
725 	struct vm_area_struct *next;
726 	struct vm_area_struct *prev;
727 	struct vm_area_struct *vma;
728 	int err = 0;
729 	pgoff_t pgoff;
730 	unsigned long vmstart;
731 	unsigned long vmend;
732 
733 	vma = find_vma(mm, start);
734 	if (!vma || vma->vm_start > start)
735 		return -EFAULT;
736 
737 	prev = vma->vm_prev;
738 	if (start > vma->vm_start)
739 		prev = vma;
740 
741 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
742 		next = vma->vm_next;
743 		vmstart = max(start, vma->vm_start);
744 		vmend   = min(end, vma->vm_end);
745 
746 		if (mpol_equal(vma_policy(vma), new_pol))
747 			continue;
748 
749 		pgoff = vma->vm_pgoff +
750 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
751 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
752 				 vma->anon_vma, vma->vm_file, pgoff,
753 				 new_pol, vma->vm_userfaultfd_ctx);
754 		if (prev) {
755 			vma = prev;
756 			next = vma->vm_next;
757 			if (mpol_equal(vma_policy(vma), new_pol))
758 				continue;
759 			/* vma_merge() joined vma && vma->next, case 8 */
760 			goto replace;
761 		}
762 		if (vma->vm_start != vmstart) {
763 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
764 			if (err)
765 				goto out;
766 		}
767 		if (vma->vm_end != vmend) {
768 			err = split_vma(vma->vm_mm, vma, vmend, 0);
769 			if (err)
770 				goto out;
771 		}
772  replace:
773 		err = vma_replace_policy(vma, new_pol);
774 		if (err)
775 			goto out;
776 	}
777 
778  out:
779 	return err;
780 }
781 
782 /* Set the process memory policy */
783 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
784 			     nodemask_t *nodes)
785 {
786 	struct mempolicy *new, *old;
787 	NODEMASK_SCRATCH(scratch);
788 	int ret;
789 
790 	if (!scratch)
791 		return -ENOMEM;
792 
793 	new = mpol_new(mode, flags, nodes);
794 	if (IS_ERR(new)) {
795 		ret = PTR_ERR(new);
796 		goto out;
797 	}
798 
799 	task_lock(current);
800 	ret = mpol_set_nodemask(new, nodes, scratch);
801 	if (ret) {
802 		task_unlock(current);
803 		mpol_put(new);
804 		goto out;
805 	}
806 	old = current->mempolicy;
807 	current->mempolicy = new;
808 	if (new && new->mode == MPOL_INTERLEAVE &&
809 	    nodes_weight(new->v.nodes))
810 		current->il_next = first_node(new->v.nodes);
811 	task_unlock(current);
812 	mpol_put(old);
813 	ret = 0;
814 out:
815 	NODEMASK_SCRATCH_FREE(scratch);
816 	return ret;
817 }
818 
819 /*
820  * Return nodemask for policy for get_mempolicy() query
821  *
822  * Called with task's alloc_lock held
823  */
824 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
825 {
826 	nodes_clear(*nodes);
827 	if (p == &default_policy)
828 		return;
829 
830 	switch (p->mode) {
831 	case MPOL_BIND:
832 		/* Fall through */
833 	case MPOL_INTERLEAVE:
834 		*nodes = p->v.nodes;
835 		break;
836 	case MPOL_PREFERRED:
837 		if (!(p->flags & MPOL_F_LOCAL))
838 			node_set(p->v.preferred_node, *nodes);
839 		/* else return empty node mask for local allocation */
840 		break;
841 	default:
842 		BUG();
843 	}
844 }
845 
846 static int lookup_node(unsigned long addr)
847 {
848 	struct page *p;
849 	int err;
850 
851 	err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
852 	if (err >= 0) {
853 		err = page_to_nid(p);
854 		put_page(p);
855 	}
856 	return err;
857 }
858 
859 /* Retrieve NUMA policy */
860 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
861 			     unsigned long addr, unsigned long flags)
862 {
863 	int err;
864 	struct mm_struct *mm = current->mm;
865 	struct vm_area_struct *vma = NULL;
866 	struct mempolicy *pol = current->mempolicy;
867 
868 	if (flags &
869 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
870 		return -EINVAL;
871 
872 	if (flags & MPOL_F_MEMS_ALLOWED) {
873 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
874 			return -EINVAL;
875 		*policy = 0;	/* just so it's initialized */
876 		task_lock(current);
877 		*nmask  = cpuset_current_mems_allowed;
878 		task_unlock(current);
879 		return 0;
880 	}
881 
882 	if (flags & MPOL_F_ADDR) {
883 		/*
884 		 * Do NOT fall back to task policy if the
885 		 * vma/shared policy at addr is NULL.  We
886 		 * want to return MPOL_DEFAULT in this case.
887 		 */
888 		down_read(&mm->mmap_sem);
889 		vma = find_vma_intersection(mm, addr, addr+1);
890 		if (!vma) {
891 			up_read(&mm->mmap_sem);
892 			return -EFAULT;
893 		}
894 		if (vma->vm_ops && vma->vm_ops->get_policy)
895 			pol = vma->vm_ops->get_policy(vma, addr);
896 		else
897 			pol = vma->vm_policy;
898 	} else if (addr)
899 		return -EINVAL;
900 
901 	if (!pol)
902 		pol = &default_policy;	/* indicates default behavior */
903 
904 	if (flags & MPOL_F_NODE) {
905 		if (flags & MPOL_F_ADDR) {
906 			err = lookup_node(addr);
907 			if (err < 0)
908 				goto out;
909 			*policy = err;
910 		} else if (pol == current->mempolicy &&
911 				pol->mode == MPOL_INTERLEAVE) {
912 			*policy = current->il_next;
913 		} else {
914 			err = -EINVAL;
915 			goto out;
916 		}
917 	} else {
918 		*policy = pol == &default_policy ? MPOL_DEFAULT :
919 						pol->mode;
920 		/*
921 		 * Internal mempolicy flags must be masked off before exposing
922 		 * the policy to userspace.
923 		 */
924 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
925 	}
926 
927 	if (vma) {
928 		up_read(&current->mm->mmap_sem);
929 		vma = NULL;
930 	}
931 
932 	err = 0;
933 	if (nmask) {
934 		if (mpol_store_user_nodemask(pol)) {
935 			*nmask = pol->w.user_nodemask;
936 		} else {
937 			task_lock(current);
938 			get_policy_nodemask(pol, nmask);
939 			task_unlock(current);
940 		}
941 	}
942 
943  out:
944 	mpol_cond_put(pol);
945 	if (vma)
946 		up_read(&current->mm->mmap_sem);
947 	return err;
948 }
949 
950 #ifdef CONFIG_MIGRATION
951 /*
952  * page migration
953  */
954 static void migrate_page_add(struct page *page, struct list_head *pagelist,
955 				unsigned long flags)
956 {
957 	/*
958 	 * Avoid migrating a page that is shared with others.
959 	 */
960 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
961 		if (!isolate_lru_page(page)) {
962 			list_add_tail(&page->lru, pagelist);
963 			inc_zone_page_state(page, NR_ISOLATED_ANON +
964 					    page_is_file_cache(page));
965 		}
966 	}
967 }
968 
969 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
970 {
971 	if (PageHuge(page))
972 		return alloc_huge_page_node(page_hstate(compound_head(page)),
973 					node);
974 	else
975 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
976 						    __GFP_THISNODE, 0);
977 }
978 
979 /*
980  * Migrate pages from one node to a target node.
981  * Returns error or the number of pages not migrated.
982  */
983 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
984 			   int flags)
985 {
986 	nodemask_t nmask;
987 	LIST_HEAD(pagelist);
988 	int err = 0;
989 
990 	nodes_clear(nmask);
991 	node_set(source, nmask);
992 
993 	/*
994 	 * This does not "check" the range but isolates all pages that
995 	 * need migration.  Between passing in the full user address
996 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
997 	 */
998 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
999 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1000 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1001 
1002 	if (!list_empty(&pagelist)) {
1003 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1004 					MIGRATE_SYNC, MR_SYSCALL);
1005 		if (err)
1006 			putback_movable_pages(&pagelist);
1007 	}
1008 
1009 	return err;
1010 }
1011 
1012 /*
1013  * Move pages between the two nodesets so as to preserve the physical
1014  * layout as much as possible.
1015  *
1016  * Returns the number of page that could not be moved.
1017  */
1018 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1019 		     const nodemask_t *to, int flags)
1020 {
1021 	int busy = 0;
1022 	int err;
1023 	nodemask_t tmp;
1024 
1025 	err = migrate_prep();
1026 	if (err)
1027 		return err;
1028 
1029 	down_read(&mm->mmap_sem);
1030 
1031 	/*
1032 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1033 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1034 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1035 	 * The pair of nodemasks 'to' and 'from' define the map.
1036 	 *
1037 	 * If no pair of bits is found that way, fallback to picking some
1038 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1039 	 * 'source' and 'dest' bits are the same, this represents a node
1040 	 * that will be migrating to itself, so no pages need move.
1041 	 *
1042 	 * If no bits are left in 'tmp', or if all remaining bits left
1043 	 * in 'tmp' correspond to the same bit in 'to', return false
1044 	 * (nothing left to migrate).
1045 	 *
1046 	 * This lets us pick a pair of nodes to migrate between, such that
1047 	 * if possible the dest node is not already occupied by some other
1048 	 * source node, minimizing the risk of overloading the memory on a
1049 	 * node that would happen if we migrated incoming memory to a node
1050 	 * before migrating outgoing memory source that same node.
1051 	 *
1052 	 * A single scan of tmp is sufficient.  As we go, we remember the
1053 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1054 	 * that not only moved, but what's better, moved to an empty slot
1055 	 * (d is not set in tmp), then we break out then, with that pair.
1056 	 * Otherwise when we finish scanning from_tmp, we at least have the
1057 	 * most recent <s, d> pair that moved.  If we get all the way through
1058 	 * the scan of tmp without finding any node that moved, much less
1059 	 * moved to an empty node, then there is nothing left worth migrating.
1060 	 */
1061 
1062 	tmp = *from;
1063 	while (!nodes_empty(tmp)) {
1064 		int s,d;
1065 		int source = NUMA_NO_NODE;
1066 		int dest = 0;
1067 
1068 		for_each_node_mask(s, tmp) {
1069 
1070 			/*
1071 			 * do_migrate_pages() tries to maintain the relative
1072 			 * node relationship of the pages established between
1073 			 * threads and memory areas.
1074                          *
1075 			 * However if the number of source nodes is not equal to
1076 			 * the number of destination nodes we can not preserve
1077 			 * this node relative relationship.  In that case, skip
1078 			 * copying memory from a node that is in the destination
1079 			 * mask.
1080 			 *
1081 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1082 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1083 			 */
1084 
1085 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1086 						(node_isset(s, *to)))
1087 				continue;
1088 
1089 			d = node_remap(s, *from, *to);
1090 			if (s == d)
1091 				continue;
1092 
1093 			source = s;	/* Node moved. Memorize */
1094 			dest = d;
1095 
1096 			/* dest not in remaining from nodes? */
1097 			if (!node_isset(dest, tmp))
1098 				break;
1099 		}
1100 		if (source == NUMA_NO_NODE)
1101 			break;
1102 
1103 		node_clear(source, tmp);
1104 		err = migrate_to_node(mm, source, dest, flags);
1105 		if (err > 0)
1106 			busy += err;
1107 		if (err < 0)
1108 			break;
1109 	}
1110 	up_read(&mm->mmap_sem);
1111 	if (err < 0)
1112 		return err;
1113 	return busy;
1114 
1115 }
1116 
1117 /*
1118  * Allocate a new page for page migration based on vma policy.
1119  * Start by assuming the page is mapped by the same vma as contains @start.
1120  * Search forward from there, if not.  N.B., this assumes that the
1121  * list of pages handed to migrate_pages()--which is how we get here--
1122  * is in virtual address order.
1123  */
1124 static struct page *new_page(struct page *page, unsigned long start, int **x)
1125 {
1126 	struct vm_area_struct *vma;
1127 	unsigned long uninitialized_var(address);
1128 
1129 	vma = find_vma(current->mm, start);
1130 	while (vma) {
1131 		address = page_address_in_vma(page, vma);
1132 		if (address != -EFAULT)
1133 			break;
1134 		vma = vma->vm_next;
1135 	}
1136 
1137 	if (PageHuge(page)) {
1138 		BUG_ON(!vma);
1139 		return alloc_huge_page_noerr(vma, address, 1);
1140 	}
1141 	/*
1142 	 * if !vma, alloc_page_vma() will use task or system default policy
1143 	 */
1144 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1145 }
1146 #else
1147 
/*
 * Stub for the #else branch of the preprocessor conditional that guards
 * the page migration code above: nothing is ever queued.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
			     unsigned long flags)
{
}
1152 
1153 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1154 		     const nodemask_t *to, int flags)
1155 {
1156 	return -ENOSYS;
1157 }
1158 
1159 static struct page *new_page(struct page *page, unsigned long start, int **x)
1160 {
1161 	return NULL;
1162 }
1163 #endif
1164 
1165 static long do_mbind(unsigned long start, unsigned long len,
1166 		     unsigned short mode, unsigned short mode_flags,
1167 		     nodemask_t *nmask, unsigned long flags)
1168 {
1169 	struct mm_struct *mm = current->mm;
1170 	struct mempolicy *new;
1171 	unsigned long end;
1172 	int err;
1173 	LIST_HEAD(pagelist);
1174 
1175 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1176 		return -EINVAL;
1177 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1178 		return -EPERM;
1179 
1180 	if (start & ~PAGE_MASK)
1181 		return -EINVAL;
1182 
1183 	if (mode == MPOL_DEFAULT)
1184 		flags &= ~MPOL_MF_STRICT;
1185 
1186 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1187 	end = start + len;
1188 
1189 	if (end < start)
1190 		return -EINVAL;
1191 	if (end == start)
1192 		return 0;
1193 
1194 	new = mpol_new(mode, mode_flags, nmask);
1195 	if (IS_ERR(new))
1196 		return PTR_ERR(new);
1197 
1198 	if (flags & MPOL_MF_LAZY)
1199 		new->flags |= MPOL_F_MOF;
1200 
1201 	/*
1202 	 * If we are using the default policy then operation
1203 	 * on discontinuous address spaces is okay after all
1204 	 */
1205 	if (!new)
1206 		flags |= MPOL_MF_DISCONTIG_OK;
1207 
1208 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1209 		 start, start + len, mode, mode_flags,
1210 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1211 
1212 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1213 
1214 		err = migrate_prep();
1215 		if (err)
1216 			goto mpol_out;
1217 	}
1218 	{
1219 		NODEMASK_SCRATCH(scratch);
1220 		if (scratch) {
1221 			down_write(&mm->mmap_sem);
1222 			task_lock(current);
1223 			err = mpol_set_nodemask(new, nmask, scratch);
1224 			task_unlock(current);
1225 			if (err)
1226 				up_write(&mm->mmap_sem);
1227 		} else
1228 			err = -ENOMEM;
1229 		NODEMASK_SCRATCH_FREE(scratch);
1230 	}
1231 	if (err)
1232 		goto mpol_out;
1233 
1234 	err = queue_pages_range(mm, start, end, nmask,
1235 			  flags | MPOL_MF_INVERT, &pagelist);
1236 	if (!err)
1237 		err = mbind_range(mm, start, end, new);
1238 
1239 	if (!err) {
1240 		int nr_failed = 0;
1241 
1242 		if (!list_empty(&pagelist)) {
1243 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1244 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1245 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1246 			if (nr_failed)
1247 				putback_movable_pages(&pagelist);
1248 		}
1249 
1250 		if (nr_failed && (flags & MPOL_MF_STRICT))
1251 			err = -EIO;
1252 	} else
1253 		putback_movable_pages(&pagelist);
1254 
1255 	up_write(&mm->mmap_sem);
1256  mpol_out:
1257 	mpol_put(new);
1258 	return err;
1259 }
1260 
1261 /*
1262  * User space interface with variable sized bitmaps for nodelists.
1263  */
1264 
1265 /* Copy a node mask from user space. */
1266 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1267 		     unsigned long maxnode)
1268 {
1269 	unsigned long k;
1270 	unsigned long nlongs;
1271 	unsigned long endmask;
1272 
1273 	--maxnode;
1274 	nodes_clear(*nodes);
1275 	if (maxnode == 0 || !nmask)
1276 		return 0;
1277 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1278 		return -EINVAL;
1279 
1280 	nlongs = BITS_TO_LONGS(maxnode);
1281 	if ((maxnode % BITS_PER_LONG) == 0)
1282 		endmask = ~0UL;
1283 	else
1284 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1285 
1286 	/* When the user specified more nodes than supported just check
1287 	   if the non supported part is all zero. */
1288 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1289 		if (nlongs > PAGE_SIZE/sizeof(long))
1290 			return -EINVAL;
1291 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1292 			unsigned long t;
1293 			if (get_user(t, nmask + k))
1294 				return -EFAULT;
1295 			if (k == nlongs - 1) {
1296 				if (t & endmask)
1297 					return -EINVAL;
1298 			} else if (t)
1299 				return -EINVAL;
1300 		}
1301 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1302 		endmask = ~0UL;
1303 	}
1304 
1305 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1306 		return -EFAULT;
1307 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1308 	return 0;
1309 }
1310 
1311 /* Copy a kernel node mask to user space */
1312 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1313 			      nodemask_t *nodes)
1314 {
1315 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1316 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1317 
1318 	if (copy > nbytes) {
1319 		if (copy > PAGE_SIZE)
1320 			return -EINVAL;
1321 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1322 			return -EFAULT;
1323 		copy = nbytes;
1324 	}
1325 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1326 }
1327 
1328 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1329 		unsigned long, mode, const unsigned long __user *, nmask,
1330 		unsigned long, maxnode, unsigned, flags)
1331 {
1332 	nodemask_t nodes;
1333 	int err;
1334 	unsigned short mode_flags;
1335 
1336 	mode_flags = mode & MPOL_MODE_FLAGS;
1337 	mode &= ~MPOL_MODE_FLAGS;
1338 	if (mode >= MPOL_MAX)
1339 		return -EINVAL;
1340 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1341 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1342 		return -EINVAL;
1343 	err = get_nodes(&nodes, nmask, maxnode);
1344 	if (err)
1345 		return err;
1346 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1347 }
1348 
1349 /* Set the process memory policy */
1350 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1351 		unsigned long, maxnode)
1352 {
1353 	int err;
1354 	nodemask_t nodes;
1355 	unsigned short flags;
1356 
1357 	flags = mode & MPOL_MODE_FLAGS;
1358 	mode &= ~MPOL_MODE_FLAGS;
1359 	if ((unsigned int)mode >= MPOL_MAX)
1360 		return -EINVAL;
1361 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1362 		return -EINVAL;
1363 	err = get_nodes(&nodes, nmask, maxnode);
1364 	if (err)
1365 		return err;
1366 	return do_set_mempolicy(mode, flags, &nodes);
1367 }
1368 
1369 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1370 		const unsigned long __user *, old_nodes,
1371 		const unsigned long __user *, new_nodes)
1372 {
1373 	const struct cred *cred = current_cred(), *tcred;
1374 	struct mm_struct *mm = NULL;
1375 	struct task_struct *task;
1376 	nodemask_t task_nodes;
1377 	int err;
1378 	nodemask_t *old;
1379 	nodemask_t *new;
1380 	NODEMASK_SCRATCH(scratch);
1381 
1382 	if (!scratch)
1383 		return -ENOMEM;
1384 
1385 	old = &scratch->mask1;
1386 	new = &scratch->mask2;
1387 
1388 	err = get_nodes(old, old_nodes, maxnode);
1389 	if (err)
1390 		goto out;
1391 
1392 	err = get_nodes(new, new_nodes, maxnode);
1393 	if (err)
1394 		goto out;
1395 
1396 	/* Find the mm_struct */
1397 	rcu_read_lock();
1398 	task = pid ? find_task_by_vpid(pid) : current;
1399 	if (!task) {
1400 		rcu_read_unlock();
1401 		err = -ESRCH;
1402 		goto out;
1403 	}
1404 	get_task_struct(task);
1405 
1406 	err = -EINVAL;
1407 
1408 	/*
1409 	 * Check if this process has the right to modify the specified
1410 	 * process. The right exists if the process has administrative
1411 	 * capabilities, superuser privileges or the same
1412 	 * userid as the target process.
1413 	 */
1414 	tcred = __task_cred(task);
1415 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1416 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1417 	    !capable(CAP_SYS_NICE)) {
1418 		rcu_read_unlock();
1419 		err = -EPERM;
1420 		goto out_put;
1421 	}
1422 	rcu_read_unlock();
1423 
1424 	task_nodes = cpuset_mems_allowed(task);
1425 	/* Is the user allowed to access the target nodes? */
1426 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1427 		err = -EPERM;
1428 		goto out_put;
1429 	}
1430 
1431 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1432 		err = -EINVAL;
1433 		goto out_put;
1434 	}
1435 
1436 	err = security_task_movememory(task);
1437 	if (err)
1438 		goto out_put;
1439 
1440 	mm = get_task_mm(task);
1441 	put_task_struct(task);
1442 
1443 	if (!mm) {
1444 		err = -EINVAL;
1445 		goto out;
1446 	}
1447 
1448 	err = do_migrate_pages(mm, old, new,
1449 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1450 
1451 	mmput(mm);
1452 out:
1453 	NODEMASK_SCRATCH_FREE(scratch);
1454 
1455 	return err;
1456 
1457 out_put:
1458 	put_task_struct(task);
1459 	goto out;
1460 
1461 }
1462 
1463 
1464 /* Retrieve NUMA policy */
1465 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1466 		unsigned long __user *, nmask, unsigned long, maxnode,
1467 		unsigned long, addr, unsigned long, flags)
1468 {
1469 	int err;
1470 	int uninitialized_var(pval);
1471 	nodemask_t nodes;
1472 
1473 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1474 		return -EINVAL;
1475 
1476 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1477 
1478 	if (err)
1479 		return err;
1480 
1481 	if (policy && put_user(pval, policy))
1482 		return -EFAULT;
1483 
1484 	if (nmask)
1485 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1486 
1487 	return err;
1488 }
1489 
1490 #ifdef CONFIG_COMPAT
1491 
1492 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1493 		       compat_ulong_t __user *, nmask,
1494 		       compat_ulong_t, maxnode,
1495 		       compat_ulong_t, addr, compat_ulong_t, flags)
1496 {
1497 	long err;
1498 	unsigned long __user *nm = NULL;
1499 	unsigned long nr_bits, alloc_size;
1500 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1501 
1502 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1503 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1504 
1505 	if (nmask)
1506 		nm = compat_alloc_user_space(alloc_size);
1507 
1508 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1509 
1510 	if (!err && nmask) {
1511 		unsigned long copy_size;
1512 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1513 		err = copy_from_user(bm, nm, copy_size);
1514 		/* ensure entire bitmap is zeroed */
1515 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1516 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1517 	}
1518 
1519 	return err;
1520 }
1521 
1522 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1523 		       compat_ulong_t, maxnode)
1524 {
1525 	long err = 0;
1526 	unsigned long __user *nm = NULL;
1527 	unsigned long nr_bits, alloc_size;
1528 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1529 
1530 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1531 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1532 
1533 	if (nmask) {
1534 		err = compat_get_bitmap(bm, nmask, nr_bits);
1535 		nm = compat_alloc_user_space(alloc_size);
1536 		err |= copy_to_user(nm, bm, alloc_size);
1537 	}
1538 
1539 	if (err)
1540 		return -EFAULT;
1541 
1542 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1543 }
1544 
1545 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1546 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1547 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1548 {
1549 	long err = 0;
1550 	unsigned long __user *nm = NULL;
1551 	unsigned long nr_bits, alloc_size;
1552 	nodemask_t bm;
1553 
1554 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1555 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1556 
1557 	if (nmask) {
1558 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1559 		nm = compat_alloc_user_space(alloc_size);
1560 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1561 	}
1562 
1563 	if (err)
1564 		return -EFAULT;
1565 
1566 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1567 }
1568 
1569 #endif
1570 
1571 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1572 						unsigned long addr)
1573 {
1574 	struct mempolicy *pol = NULL;
1575 
1576 	if (vma) {
1577 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1578 			pol = vma->vm_ops->get_policy(vma, addr);
1579 		} else if (vma->vm_policy) {
1580 			pol = vma->vm_policy;
1581 
1582 			/*
1583 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1584 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1585 			 * count on these policies which will be dropped by
1586 			 * mpol_cond_put() later
1587 			 */
1588 			if (mpol_needs_cond_ref(pol))
1589 				mpol_get(pol);
1590 		}
1591 	}
1592 
1593 	return pol;
1594 }
1595 
1596 /*
1597  * get_vma_policy(@vma, @addr)
1598  * @vma: virtual memory area whose policy is sought
1599  * @addr: address in @vma for shared policy lookup
1600  *
1601  * Returns effective policy for a VMA at specified address.
1602  * Falls back to current->mempolicy or system default policy, as necessary.
1603  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1604  * count--added by the get_policy() vm_op, as appropriate--to protect against
1605  * freeing by another task.  It is the caller's responsibility to free the
1606  * extra reference for shared policies.
1607  */
1608 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1609 						unsigned long addr)
1610 {
1611 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1612 
1613 	if (!pol)
1614 		pol = get_task_policy(current);
1615 
1616 	return pol;
1617 }
1618 
1619 bool vma_policy_mof(struct vm_area_struct *vma)
1620 {
1621 	struct mempolicy *pol;
1622 
1623 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1624 		bool ret = false;
1625 
1626 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1627 		if (pol && (pol->flags & MPOL_F_MOF))
1628 			ret = true;
1629 		mpol_cond_put(pol);
1630 
1631 		return ret;
1632 	}
1633 
1634 	pol = vma->vm_policy;
1635 	if (!pol)
1636 		pol = get_task_policy(current);
1637 
1638 	return pol->flags & MPOL_F_MOF;
1639 }
1640 
1641 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1642 {
1643 	enum zone_type dynamic_policy_zone = policy_zone;
1644 
1645 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1646 
1647 	/*
1648 	 * if policy->v.nodes has movable memory only,
1649 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1650 	 *
1651 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1652 	 * so if the following test faile, it implies
1653 	 * policy->v.nodes has movable memory only.
1654 	 */
1655 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1656 		dynamic_policy_zone = ZONE_MOVABLE;
1657 
1658 	return zone >= dynamic_policy_zone;
1659 }
1660 
1661 /*
1662  * Return a nodemask representing a mempolicy for filtering nodes for
1663  * page allocation
1664  */
1665 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1666 {
1667 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1668 	if (unlikely(policy->mode == MPOL_BIND) &&
1669 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1670 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1671 		return &policy->v.nodes;
1672 
1673 	return NULL;
1674 }
1675 
1676 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1677 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1678 	int nd)
1679 {
1680 	switch (policy->mode) {
1681 	case MPOL_PREFERRED:
1682 		if (!(policy->flags & MPOL_F_LOCAL))
1683 			nd = policy->v.preferred_node;
1684 		break;
1685 	case MPOL_BIND:
1686 		/*
1687 		 * Normally, MPOL_BIND allocations are node-local within the
1688 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1689 		 * current node isn't part of the mask, we use the zonelist for
1690 		 * the first node in the mask instead.
1691 		 */
1692 		if (unlikely(gfp & __GFP_THISNODE) &&
1693 				unlikely(!node_isset(nd, policy->v.nodes)))
1694 			nd = first_node(policy->v.nodes);
1695 		break;
1696 	default:
1697 		BUG();
1698 	}
1699 	return node_zonelist(nd, gfp);
1700 }
1701 
1702 /* Do dynamic interleaving for a process */
1703 static unsigned interleave_nodes(struct mempolicy *policy)
1704 {
1705 	unsigned nid, next;
1706 	struct task_struct *me = current;
1707 
1708 	nid = me->il_next;
1709 	next = next_node_in(nid, policy->v.nodes);
1710 	if (next < MAX_NUMNODES)
1711 		me->il_next = next;
1712 	return nid;
1713 }
1714 
1715 /*
1716  * Depending on the memory policy provide a node from which to allocate the
1717  * next slab entry.
1718  */
1719 unsigned int mempolicy_slab_node(void)
1720 {
1721 	struct mempolicy *policy;
1722 	int node = numa_mem_id();
1723 
1724 	if (in_interrupt())
1725 		return node;
1726 
1727 	policy = current->mempolicy;
1728 	if (!policy || policy->flags & MPOL_F_LOCAL)
1729 		return node;
1730 
1731 	switch (policy->mode) {
1732 	case MPOL_PREFERRED:
1733 		/*
1734 		 * handled MPOL_F_LOCAL above
1735 		 */
1736 		return policy->v.preferred_node;
1737 
1738 	case MPOL_INTERLEAVE:
1739 		return interleave_nodes(policy);
1740 
1741 	case MPOL_BIND: {
1742 		struct zoneref *z;
1743 
1744 		/*
1745 		 * Follow bind policy behavior and start allocation at the
1746 		 * first node.
1747 		 */
1748 		struct zonelist *zonelist;
1749 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1750 		zonelist = &NODE_DATA(node)->node_zonelists[0];
1751 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1752 							&policy->v.nodes);
1753 		return z->zone ? z->zone->node : node;
1754 	}
1755 
1756 	default:
1757 		BUG();
1758 	}
1759 }
1760 
1761 /*
1762  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1763  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1764  * number of present nodes.
1765  */
1766 static unsigned offset_il_node(struct mempolicy *pol,
1767 			       struct vm_area_struct *vma, unsigned long n)
1768 {
1769 	unsigned nnodes = nodes_weight(pol->v.nodes);
1770 	unsigned target;
1771 	int i;
1772 	int nid;
1773 
1774 	if (!nnodes)
1775 		return numa_node_id();
1776 	target = (unsigned int)n % nnodes;
1777 	nid = first_node(pol->v.nodes);
1778 	for (i = 0; i < target; i++)
1779 		nid = next_node(nid, pol->v.nodes);
1780 	return nid;
1781 }
1782 
1783 /* Determine a node number for interleave */
1784 static inline unsigned interleave_nid(struct mempolicy *pol,
1785 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1786 {
1787 	if (vma) {
1788 		unsigned long off;
1789 
1790 		/*
1791 		 * for small pages, there is no difference between
1792 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1793 		 * for huge pages, since vm_pgoff is in units of small
1794 		 * pages, we need to shift off the always 0 bits to get
1795 		 * a useful offset.
1796 		 */
1797 		BUG_ON(shift < PAGE_SHIFT);
1798 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1799 		off += (addr - vma->vm_start) >> shift;
1800 		return offset_il_node(pol, vma, off);
1801 	} else
1802 		return interleave_nodes(pol);
1803 }
1804 
1805 #ifdef CONFIG_HUGETLBFS
1806 /*
1807  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1808  * @vma: virtual memory area whose policy is sought
1809  * @addr: address in @vma for shared policy lookup and interleave policy
1810  * @gfp_flags: for requested zone
1811  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1812  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1813  *
1814  * Returns a zonelist suitable for a huge page allocation and a pointer
1815  * to the struct mempolicy for conditional unref after allocation.
1816  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1817  * @nodemask for filtering the zonelist.
1818  *
1819  * Must be protected by read_mems_allowed_begin()
1820  */
1821 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1822 				gfp_t gfp_flags, struct mempolicy **mpol,
1823 				nodemask_t **nodemask)
1824 {
1825 	struct zonelist *zl;
1826 
1827 	*mpol = get_vma_policy(vma, addr);
1828 	*nodemask = NULL;	/* assume !MPOL_BIND */
1829 
1830 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1831 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1832 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1833 	} else {
1834 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1835 		if ((*mpol)->mode == MPOL_BIND)
1836 			*nodemask = &(*mpol)->v.nodes;
1837 	}
1838 	return zl;
1839 }
1840 
1841 /*
1842  * init_nodemask_of_mempolicy
1843  *
1844  * If the current task's mempolicy is "default" [NULL], return 'false'
1845  * to indicate default policy.  Otherwise, extract the policy nodemask
1846  * for 'bind' or 'interleave' policy into the argument nodemask, or
1847  * initialize the argument nodemask to contain the single node for
1848  * 'preferred' or 'local' policy and return 'true' to indicate presence
1849  * of non-default mempolicy.
1850  *
1851  * We don't bother with reference counting the mempolicy [mpol_get/put]
1852  * because the current task is examining it's own mempolicy and a task's
1853  * mempolicy is only ever changed by the task itself.
1854  *
1855  * N.B., it is the caller's responsibility to free a returned nodemask.
1856  */
1857 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1858 {
1859 	struct mempolicy *mempolicy;
1860 	int nid;
1861 
1862 	if (!(mask && current->mempolicy))
1863 		return false;
1864 
1865 	task_lock(current);
1866 	mempolicy = current->mempolicy;
1867 	switch (mempolicy->mode) {
1868 	case MPOL_PREFERRED:
1869 		if (mempolicy->flags & MPOL_F_LOCAL)
1870 			nid = numa_node_id();
1871 		else
1872 			nid = mempolicy->v.preferred_node;
1873 		init_nodemask_of_node(mask, nid);
1874 		break;
1875 
1876 	case MPOL_BIND:
1877 		/* Fall through */
1878 	case MPOL_INTERLEAVE:
1879 		*mask =  mempolicy->v.nodes;
1880 		break;
1881 
1882 	default:
1883 		BUG();
1884 	}
1885 	task_unlock(current);
1886 
1887 	return true;
1888 }
1889 #endif
1890 
1891 /*
1892  * mempolicy_nodemask_intersects
1893  *
1894  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1895  * policy.  Otherwise, check for intersection between mask and the policy
1896  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1897  * policy, always return true since it may allocate elsewhere on fallback.
1898  *
1899  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1900  */
1901 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1902 					const nodemask_t *mask)
1903 {
1904 	struct mempolicy *mempolicy;
1905 	bool ret = true;
1906 
1907 	if (!mask)
1908 		return ret;
1909 	task_lock(tsk);
1910 	mempolicy = tsk->mempolicy;
1911 	if (!mempolicy)
1912 		goto out;
1913 
1914 	switch (mempolicy->mode) {
1915 	case MPOL_PREFERRED:
1916 		/*
1917 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1918 		 * allocate from, they may fallback to other nodes when oom.
1919 		 * Thus, it's possible for tsk to have allocated memory from
1920 		 * nodes in mask.
1921 		 */
1922 		break;
1923 	case MPOL_BIND:
1924 	case MPOL_INTERLEAVE:
1925 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1926 		break;
1927 	default:
1928 		BUG();
1929 	}
1930 out:
1931 	task_unlock(tsk);
1932 	return ret;
1933 }
1934 
1935 /* Allocate a page in interleaved policy.
1936    Own path because it needs to do special accounting. */
1937 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1938 					unsigned nid)
1939 {
1940 	struct zonelist *zl;
1941 	struct page *page;
1942 
1943 	zl = node_zonelist(nid, gfp);
1944 	page = __alloc_pages(gfp, order, zl);
1945 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1946 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1947 	return page;
1948 }
1949 
1950 /**
1951  * 	alloc_pages_vma	- Allocate a page for a VMA.
1952  *
1953  * 	@gfp:
1954  *      %GFP_USER    user allocation.
1955  *      %GFP_KERNEL  kernel allocations,
1956  *      %GFP_HIGHMEM highmem/user allocations,
1957  *      %GFP_FS      allocation should not call back into a file system.
1958  *      %GFP_ATOMIC  don't sleep.
1959  *
1960  *	@order:Order of the GFP allocation.
1961  * 	@vma:  Pointer to VMA or NULL if not available.
1962  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1963  *	@node: Which node to prefer for allocation (modulo policy).
1964  *	@hugepage: for hugepages try only the preferred node if possible
1965  *
1966  * 	This function allocates a page from the kernel page pool and applies
1967  *	a NUMA policy associated with the VMA or the current process.
1968  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1969  *	mm_struct of the VMA to prevent it from going away. Should be used for
1970  *	all allocations for pages that will be mapped into user space. Returns
1971  *	NULL when no page can be allocated.
1972  */
1973 struct page *
1974 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1975 		unsigned long addr, int node, bool hugepage)
1976 {
1977 	struct mempolicy *pol;
1978 	struct page *page;
1979 	unsigned int cpuset_mems_cookie;
1980 	struct zonelist *zl;
1981 	nodemask_t *nmask;
1982 
1983 retry_cpuset:
1984 	pol = get_vma_policy(vma, addr);
1985 	cpuset_mems_cookie = read_mems_allowed_begin();
1986 
1987 	if (pol->mode == MPOL_INTERLEAVE) {
1988 		unsigned nid;
1989 
1990 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1991 		mpol_cond_put(pol);
1992 		page = alloc_page_interleave(gfp, order, nid);
1993 		goto out;
1994 	}
1995 
1996 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1997 		int hpage_node = node;
1998 
1999 		/*
2000 		 * For hugepage allocation and non-interleave policy which
2001 		 * allows the current node (or other explicitly preferred
2002 		 * node) we only try to allocate from the current/preferred
2003 		 * node and don't fall back to other nodes, as the cost of
2004 		 * remote accesses would likely offset THP benefits.
2005 		 *
2006 		 * If the policy is interleave, or does not allow the current
2007 		 * node in its nodemask, we allocate the standard way.
2008 		 */
2009 		if (pol->mode == MPOL_PREFERRED &&
2010 						!(pol->flags & MPOL_F_LOCAL))
2011 			hpage_node = pol->v.preferred_node;
2012 
2013 		nmask = policy_nodemask(gfp, pol);
2014 		if (!nmask || node_isset(hpage_node, *nmask)) {
2015 			mpol_cond_put(pol);
2016 			page = __alloc_pages_node(hpage_node,
2017 						gfp | __GFP_THISNODE, order);
2018 			goto out;
2019 		}
2020 	}
2021 
2022 	nmask = policy_nodemask(gfp, pol);
2023 	zl = policy_zonelist(gfp, pol, node);
2024 	mpol_cond_put(pol);
2025 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2026 out:
2027 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2028 		goto retry_cpuset;
2029 	return page;
2030 }
2031 
2032 /**
2033  * 	alloc_pages_current - Allocate pages.
2034  *
2035  *	@gfp:
2036  *		%GFP_USER   user allocation,
2037  *      	%GFP_KERNEL kernel allocation,
2038  *      	%GFP_HIGHMEM highmem allocation,
2039  *      	%GFP_FS     don't call back into a file system.
2040  *      	%GFP_ATOMIC don't sleep.
2041  *	@order: Power of two of allocation size in pages. 0 is a single page.
2042  *
2043  *	Allocate a page from the kernel page pool.  When not in
2044  *	interrupt context and apply the current process NUMA policy.
2045  *	Returns NULL when no page can be allocated.
2046  *
2047  *	Don't call cpuset_update_task_memory_state() unless
2048  *	1) it's ok to take cpuset_sem (can WAIT), and
2049  *	2) allocating for current task (not interrupt).
2050  */
2051 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2052 {
2053 	struct mempolicy *pol = &default_policy;
2054 	struct page *page;
2055 	unsigned int cpuset_mems_cookie;
2056 
2057 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2058 		pol = get_task_policy(current);
2059 
2060 retry_cpuset:
2061 	cpuset_mems_cookie = read_mems_allowed_begin();
2062 
2063 	/*
2064 	 * No reference counting needed for current->mempolicy
2065 	 * nor system default_policy
2066 	 */
2067 	if (pol->mode == MPOL_INTERLEAVE)
2068 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2069 	else
2070 		page = __alloc_pages_nodemask(gfp, order,
2071 				policy_zonelist(gfp, pol, numa_node_id()),
2072 				policy_nodemask(gfp, pol));
2073 
2074 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2075 		goto retry_cpuset;
2076 
2077 	return page;
2078 }
2079 EXPORT_SYMBOL(alloc_pages_current);
2080 
2081 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2082 {
2083 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2084 
2085 	if (IS_ERR(pol))
2086 		return PTR_ERR(pol);
2087 	dst->vm_policy = pol;
2088 	return 0;
2089 }
2090 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2101 
2102 /* Slow path of a mempolicy duplicate */
2103 struct mempolicy *__mpol_dup(struct mempolicy *old)
2104 {
2105 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2106 
2107 	if (!new)
2108 		return ERR_PTR(-ENOMEM);
2109 
2110 	/* task's mempolicy is protected by alloc_lock */
2111 	if (old == current->mempolicy) {
2112 		task_lock(current);
2113 		*new = *old;
2114 		task_unlock(current);
2115 	} else
2116 		*new = *old;
2117 
2118 	if (current_cpuset_is_being_rebound()) {
2119 		nodemask_t mems = cpuset_mems_allowed(current);
2120 		if (new->flags & MPOL_F_REBINDING)
2121 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2122 		else
2123 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2124 	}
2125 	atomic_set(&new->refcnt, 1);
2126 	return new;
2127 }
2128 
2129 /* Slow path of a mempolicy comparison */
2130 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2131 {
2132 	if (!a || !b)
2133 		return false;
2134 	if (a->mode != b->mode)
2135 		return false;
2136 	if (a->flags != b->flags)
2137 		return false;
2138 	if (mpol_store_user_nodemask(a))
2139 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2140 			return false;
2141 
2142 	switch (a->mode) {
2143 	case MPOL_BIND:
2144 		/* Fall through */
2145 	case MPOL_INTERLEAVE:
2146 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2147 	case MPOL_PREFERRED:
2148 		return a->v.preferred_node == b->v.preferred_node;
2149 	default:
2150 		BUG();
2151 		return false;
2152 	}
2153 }
2154 
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any access to the tree.
 */
2163 
2164 /*
2165  * lookup first element intersecting start-end.  Caller holds sp->lock for
2166  * reading or for writing
2167  */
2168 static struct sp_node *
2169 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2170 {
2171 	struct rb_node *n = sp->root.rb_node;
2172 
2173 	while (n) {
2174 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2175 
2176 		if (start >= p->end)
2177 			n = n->rb_right;
2178 		else if (end <= p->start)
2179 			n = n->rb_left;
2180 		else
2181 			break;
2182 	}
2183 	if (!n)
2184 		return NULL;
2185 	for (;;) {
2186 		struct sp_node *w = NULL;
2187 		struct rb_node *prev = rb_prev(n);
2188 		if (!prev)
2189 			break;
2190 		w = rb_entry(prev, struct sp_node, nd);
2191 		if (w->end <= start)
2192 			break;
2193 		n = prev;
2194 	}
2195 	return rb_entry(n, struct sp_node, nd);
2196 }
2197 
2198 /*
2199  * Insert a new shared policy into the list.  Caller holds sp->lock for
2200  * writing.
2201  */
2202 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2203 {
2204 	struct rb_node **p = &sp->root.rb_node;
2205 	struct rb_node *parent = NULL;
2206 	struct sp_node *nd;
2207 
2208 	while (*p) {
2209 		parent = *p;
2210 		nd = rb_entry(parent, struct sp_node, nd);
2211 		if (new->start < nd->start)
2212 			p = &(*p)->rb_left;
2213 		else if (new->end > nd->end)
2214 			p = &(*p)->rb_right;
2215 		else
2216 			BUG();
2217 	}
2218 	rb_link_node(&new->nd, parent, p);
2219 	rb_insert_color(&new->nd, &sp->root);
2220 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2221 		 new->policy ? new->policy->mode : 0);
2222 }
2223 
2224 /* Find shared policy intersecting idx */
2225 struct mempolicy *
2226 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2227 {
2228 	struct mempolicy *pol = NULL;
2229 	struct sp_node *sn;
2230 
2231 	if (!sp->root.rb_node)
2232 		return NULL;
2233 	read_lock(&sp->lock);
2234 	sn = sp_lookup(sp, idx, idx+1);
2235 	if (sn) {
2236 		mpol_get(sn->policy);
2237 		pol = sn->policy;
2238 	}
2239 	read_unlock(&sp->lock);
2240 	return pol;
2241 }
2242 
2243 static void sp_free(struct sp_node *n)
2244 {
2245 	mpol_put(n->policy);
2246 	kmem_cache_free(sn_cache, n);
2247 }
2248 
2249 /**
2250  * mpol_misplaced - check whether current page node is valid in policy
2251  *
2252  * @page: page to be checked
2253  * @vma: vm area where page mapped
2254  * @addr: virtual address where page mapped
2255  *
2256  * Lookup current policy node id for vma,addr and "compare to" page's
2257  * node id.
2258  *
2259  * Returns:
2260  *	-1	- not misplaced, page is in the right node
2261  *	node	- node id where the page should be
2262  *
2263  * Policy determination "mimics" alloc_page_vma().
2264  * Called from fault path where we know the vma and faulting address.
2265  */
2266 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2267 {
2268 	struct mempolicy *pol;
2269 	struct zoneref *z;
2270 	int curnid = page_to_nid(page);
2271 	unsigned long pgoff;
2272 	int thiscpu = raw_smp_processor_id();
2273 	int thisnid = cpu_to_node(thiscpu);
2274 	int polnid = -1;
2275 	int ret = -1;
2276 
2277 	BUG_ON(!vma);
2278 
2279 	pol = get_vma_policy(vma, addr);
2280 	if (!(pol->flags & MPOL_F_MOF))
2281 		goto out;
2282 
2283 	switch (pol->mode) {
2284 	case MPOL_INTERLEAVE:
2285 		BUG_ON(addr >= vma->vm_end);
2286 		BUG_ON(addr < vma->vm_start);
2287 
2288 		pgoff = vma->vm_pgoff;
2289 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2290 		polnid = offset_il_node(pol, vma, pgoff);
2291 		break;
2292 
2293 	case MPOL_PREFERRED:
2294 		if (pol->flags & MPOL_F_LOCAL)
2295 			polnid = numa_node_id();
2296 		else
2297 			polnid = pol->v.preferred_node;
2298 		break;
2299 
2300 	case MPOL_BIND:
2301 
2302 		/*
2303 		 * allows binding to multiple nodes.
2304 		 * use current page if in policy nodemask,
2305 		 * else select nearest allowed node, if any.
2306 		 * If no allowed nodes, use current [!misplaced].
2307 		 */
2308 		if (node_isset(curnid, pol->v.nodes))
2309 			goto out;
2310 		z = first_zones_zonelist(
2311 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2312 				gfp_zone(GFP_HIGHUSER),
2313 				&pol->v.nodes);
2314 		polnid = z->zone->node;
2315 		break;
2316 
2317 	default:
2318 		BUG();
2319 	}
2320 
2321 	/* Migrate the page towards the node whose CPU is referencing it */
2322 	if (pol->flags & MPOL_F_MORON) {
2323 		polnid = thisnid;
2324 
2325 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2326 			goto out;
2327 	}
2328 
2329 	if (curnid != polnid)
2330 		ret = polnid;
2331 out:
2332 	mpol_cond_put(pol);
2333 
2334 	return ret;
2335 }
2336 
2337 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2338 {
2339 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2340 	rb_erase(&n->nd, &sp->root);
2341 	sp_free(n);
2342 }
2343 
2344 static void sp_node_init(struct sp_node *node, unsigned long start,
2345 			unsigned long end, struct mempolicy *pol)
2346 {
2347 	node->start = start;
2348 	node->end = end;
2349 	node->policy = pol;
2350 }
2351 
2352 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2353 				struct mempolicy *pol)
2354 {
2355 	struct sp_node *n;
2356 	struct mempolicy *newpol;
2357 
2358 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2359 	if (!n)
2360 		return NULL;
2361 
2362 	newpol = mpol_dup(pol);
2363 	if (IS_ERR(newpol)) {
2364 		kmem_cache_free(sn_cache, n);
2365 		return NULL;
2366 	}
2367 	newpol->flags |= MPOL_F_SHARED;
2368 	sp_node_init(n, start, end, newpol);
2369 
2370 	return n;
2371 }
2372 
2373 /* Replace a policy range. */
2374 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2375 				 unsigned long end, struct sp_node *new)
2376 {
2377 	struct sp_node *n;
2378 	struct sp_node *n_new = NULL;
2379 	struct mempolicy *mpol_new = NULL;
2380 	int ret = 0;
2381 
2382 restart:
2383 	write_lock(&sp->lock);
2384 	n = sp_lookup(sp, start, end);
2385 	/* Take care of old policies in the same range. */
2386 	while (n && n->start < end) {
2387 		struct rb_node *next = rb_next(&n->nd);
2388 		if (n->start >= start) {
2389 			if (n->end <= end)
2390 				sp_delete(sp, n);
2391 			else
2392 				n->start = end;
2393 		} else {
2394 			/* Old policy spanning whole new range. */
2395 			if (n->end > end) {
2396 				if (!n_new)
2397 					goto alloc_new;
2398 
2399 				*mpol_new = *n->policy;
2400 				atomic_set(&mpol_new->refcnt, 1);
2401 				sp_node_init(n_new, end, n->end, mpol_new);
2402 				n->end = start;
2403 				sp_insert(sp, n_new);
2404 				n_new = NULL;
2405 				mpol_new = NULL;
2406 				break;
2407 			} else
2408 				n->end = start;
2409 		}
2410 		if (!next)
2411 			break;
2412 		n = rb_entry(next, struct sp_node, nd);
2413 	}
2414 	if (new)
2415 		sp_insert(sp, new);
2416 	write_unlock(&sp->lock);
2417 	ret = 0;
2418 
2419 err_out:
2420 	if (mpol_new)
2421 		mpol_put(mpol_new);
2422 	if (n_new)
2423 		kmem_cache_free(sn_cache, n_new);
2424 
2425 	return ret;
2426 
2427 alloc_new:
2428 	write_unlock(&sp->lock);
2429 	ret = -ENOMEM;
2430 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2431 	if (!n_new)
2432 		goto err_out;
2433 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2434 	if (!mpol_new)
2435 		goto err_out;
2436 	goto restart;
2437 }
2438 
2439 /**
2440  * mpol_shared_policy_init - initialize shared policy for inode
2441  * @sp: pointer to inode shared policy
2442  * @mpol:  struct mempolicy to install
2443  *
2444  * Install non-NULL @mpol in inode's shared policy rb-tree.
2445  * On entry, the current task has a reference on a non-NULL @mpol.
2446  * This must be released on exit.
2447  * This is called at get_inode() calls and we can use GFP_KERNEL.
2448  */
2449 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2450 {
2451 	int ret;
2452 
2453 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2454 	rwlock_init(&sp->lock);
2455 
2456 	if (mpol) {
2457 		struct vm_area_struct pvma;
2458 		struct mempolicy *new;
2459 		NODEMASK_SCRATCH(scratch);
2460 
2461 		if (!scratch)
2462 			goto put_mpol;
2463 		/* contextualize the tmpfs mount point mempolicy */
2464 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2465 		if (IS_ERR(new))
2466 			goto free_scratch; /* no valid nodemask intersection */
2467 
2468 		task_lock(current);
2469 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2470 		task_unlock(current);
2471 		if (ret)
2472 			goto put_new;
2473 
2474 		/* Create pseudo-vma that contains just the policy */
2475 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2476 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2477 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2478 
2479 put_new:
2480 		mpol_put(new);			/* drop initial ref */
2481 free_scratch:
2482 		NODEMASK_SCRATCH_FREE(scratch);
2483 put_mpol:
2484 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2485 	}
2486 }
2487 
2488 int mpol_set_shared_policy(struct shared_policy *info,
2489 			struct vm_area_struct *vma, struct mempolicy *npol)
2490 {
2491 	int err;
2492 	struct sp_node *new = NULL;
2493 	unsigned long sz = vma_pages(vma);
2494 
2495 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2496 		 vma->vm_pgoff,
2497 		 sz, npol ? npol->mode : -1,
2498 		 npol ? npol->flags : -1,
2499 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2500 
2501 	if (npol) {
2502 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2503 		if (!new)
2504 			return -ENOMEM;
2505 	}
2506 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2507 	if (err && new)
2508 		sp_free(new);
2509 	return err;
2510 }
2511 
2512 /* Free a backing policy store on inode delete. */
2513 void mpol_free_shared_policy(struct shared_policy *p)
2514 {
2515 	struct sp_node *n;
2516 	struct rb_node *next;
2517 
2518 	if (!p->root.rb_node)
2519 		return;
2520 	write_lock(&p->lock);
2521 	next = rb_first(&p->root);
2522 	while (next) {
2523 		n = rb_entry(next, struct sp_node, nd);
2524 		next = rb_next(&n->nd);
2525 		sp_delete(p, n);
2526 	}
2527 	write_unlock(&p->lock);
2528 }
2529 
2530 #ifdef CONFIG_NUMA_BALANCING
2531 static int __initdata numabalancing_override;
2532 
2533 static void __init check_numabalancing_enable(void)
2534 {
2535 	bool numabalancing_default = false;
2536 
2537 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2538 		numabalancing_default = true;
2539 
2540 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2541 	if (numabalancing_override)
2542 		set_numabalancing_state(numabalancing_override == 1);
2543 
2544 	if (num_online_nodes() > 1 && !numabalancing_override) {
2545 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2546 			numabalancing_default ? "Enabling" : "Disabling");
2547 		set_numabalancing_state(numabalancing_default);
2548 	}
2549 }
2550 
2551 static int __init setup_numabalancing(char *str)
2552 {
2553 	int ret = 0;
2554 	if (!str)
2555 		goto out;
2556 
2557 	if (!strcmp(str, "enable")) {
2558 		numabalancing_override = 1;
2559 		ret = 1;
2560 	} else if (!strcmp(str, "disable")) {
2561 		numabalancing_override = -1;
2562 		ret = 1;
2563 	}
2564 out:
2565 	if (!ret)
2566 		pr_warn("Unable to parse numa_balancing=\n");
2567 
2568 	return ret;
2569 }
2570 __setup("numa_balancing=", setup_numabalancing);
2571 #else
2572 static inline void __init check_numabalancing_enable(void)
2573 {
2574 }
2575 #endif /* CONFIG_NUMA_BALANCING */
2576 
2577 /* assumes fs == KERNEL_DS */
2578 void __init numa_policy_init(void)
2579 {
2580 	nodemask_t interleave_nodes;
2581 	unsigned long largest = 0;
2582 	int nid, prefer = 0;
2583 
2584 	policy_cache = kmem_cache_create("numa_policy",
2585 					 sizeof(struct mempolicy),
2586 					 0, SLAB_PANIC, NULL);
2587 
2588 	sn_cache = kmem_cache_create("shared_policy_node",
2589 				     sizeof(struct sp_node),
2590 				     0, SLAB_PANIC, NULL);
2591 
2592 	for_each_node(nid) {
2593 		preferred_node_policy[nid] = (struct mempolicy) {
2594 			.refcnt = ATOMIC_INIT(1),
2595 			.mode = MPOL_PREFERRED,
2596 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2597 			.v = { .preferred_node = nid, },
2598 		};
2599 	}
2600 
2601 	/*
2602 	 * Set interleaving policy for system init. Interleaving is only
2603 	 * enabled across suitably sized nodes (default is >= 16MB), or
2604 	 * fall back to the largest node if they're all smaller.
2605 	 */
2606 	nodes_clear(interleave_nodes);
2607 	for_each_node_state(nid, N_MEMORY) {
2608 		unsigned long total_pages = node_present_pages(nid);
2609 
2610 		/* Preserve the largest node */
2611 		if (largest < total_pages) {
2612 			largest = total_pages;
2613 			prefer = nid;
2614 		}
2615 
2616 		/* Interleave this node? */
2617 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2618 			node_set(nid, interleave_nodes);
2619 	}
2620 
2621 	/* All too small, use the largest */
2622 	if (unlikely(nodes_empty(interleave_nodes)))
2623 		node_set(prefer, interleave_nodes);
2624 
2625 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2626 		pr_err("%s: interleaving failed\n", __func__);
2627 
2628 	check_numabalancing_enable();
2629 }
2630 
2631 /* Reset policy of current process to default */
2632 void numa_default_policy(void)
2633 {
2634 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2635 }
2636 
2637 /*
2638  * Parse and format mempolicy from/to strings
2639  */
2640 
2641 /*
2642  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2643  */
2644 static const char * const policy_modes[] =
2645 {
2646 	[MPOL_DEFAULT]    = "default",
2647 	[MPOL_PREFERRED]  = "prefer",
2648 	[MPOL_BIND]       = "bind",
2649 	[MPOL_INTERLEAVE] = "interleave",
2650 	[MPOL_LOCAL]      = "local",
2651 };
2652 
2653 
2654 #ifdef CONFIG_TMPFS
2655 /**
2656  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2657  * @str:  string containing mempolicy to parse
2658  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2659  *
2660  * Format of input:
2661  *	<mode>[=<flags>][:<nodelist>]
2662  *
2663  * On success, returns 0, else 1
2664  */
2665 int mpol_parse_str(char *str, struct mempolicy **mpol)
2666 {
2667 	struct mempolicy *new = NULL;
2668 	unsigned short mode;
2669 	unsigned short mode_flags;
2670 	nodemask_t nodes;
2671 	char *nodelist = strchr(str, ':');
2672 	char *flags = strchr(str, '=');
2673 	int err = 1;
2674 
2675 	if (nodelist) {
2676 		/* NUL-terminate mode or flags string */
2677 		*nodelist++ = '\0';
2678 		if (nodelist_parse(nodelist, nodes))
2679 			goto out;
2680 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2681 			goto out;
2682 	} else
2683 		nodes_clear(nodes);
2684 
2685 	if (flags)
2686 		*flags++ = '\0';	/* terminate mode string */
2687 
2688 	for (mode = 0; mode < MPOL_MAX; mode++) {
2689 		if (!strcmp(str, policy_modes[mode])) {
2690 			break;
2691 		}
2692 	}
2693 	if (mode >= MPOL_MAX)
2694 		goto out;
2695 
2696 	switch (mode) {
2697 	case MPOL_PREFERRED:
2698 		/*
2699 		 * Insist on a nodelist of one node only
2700 		 */
2701 		if (nodelist) {
2702 			char *rest = nodelist;
2703 			while (isdigit(*rest))
2704 				rest++;
2705 			if (*rest)
2706 				goto out;
2707 		}
2708 		break;
2709 	case MPOL_INTERLEAVE:
2710 		/*
2711 		 * Default to online nodes with memory if no nodelist
2712 		 */
2713 		if (!nodelist)
2714 			nodes = node_states[N_MEMORY];
2715 		break;
2716 	case MPOL_LOCAL:
2717 		/*
2718 		 * Don't allow a nodelist;  mpol_new() checks flags
2719 		 */
2720 		if (nodelist)
2721 			goto out;
2722 		mode = MPOL_PREFERRED;
2723 		break;
2724 	case MPOL_DEFAULT:
2725 		/*
2726 		 * Insist on a empty nodelist
2727 		 */
2728 		if (!nodelist)
2729 			err = 0;
2730 		goto out;
2731 	case MPOL_BIND:
2732 		/*
2733 		 * Insist on a nodelist
2734 		 */
2735 		if (!nodelist)
2736 			goto out;
2737 	}
2738 
2739 	mode_flags = 0;
2740 	if (flags) {
2741 		/*
2742 		 * Currently, we only support two mutually exclusive
2743 		 * mode flags.
2744 		 */
2745 		if (!strcmp(flags, "static"))
2746 			mode_flags |= MPOL_F_STATIC_NODES;
2747 		else if (!strcmp(flags, "relative"))
2748 			mode_flags |= MPOL_F_RELATIVE_NODES;
2749 		else
2750 			goto out;
2751 	}
2752 
2753 	new = mpol_new(mode, mode_flags, &nodes);
2754 	if (IS_ERR(new))
2755 		goto out;
2756 
2757 	/*
2758 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2759 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2760 	 */
2761 	if (mode != MPOL_PREFERRED)
2762 		new->v.nodes = nodes;
2763 	else if (nodelist)
2764 		new->v.preferred_node = first_node(nodes);
2765 	else
2766 		new->flags |= MPOL_F_LOCAL;
2767 
2768 	/*
2769 	 * Save nodes for contextualization: this will be used to "clone"
2770 	 * the mempolicy in a specific context [cpuset] at a later time.
2771 	 */
2772 	new->w.user_nodemask = nodes;
2773 
2774 	err = 0;
2775 
2776 out:
2777 	/* Restore string for error message */
2778 	if (nodelist)
2779 		*--nodelist = ':';
2780 	if (flags)
2781 		*--flags = '=';
2782 	if (!err)
2783 		*mpol = new;
2784 	return err;
2785 }
2786 #endif /* CONFIG_TMPFS */
2787 
2788 /**
2789  * mpol_to_str - format a mempolicy structure for printing
2790  * @buffer:  to contain formatted mempolicy string
2791  * @maxlen:  length of @buffer
2792  * @pol:  pointer to mempolicy to be formatted
2793  *
2794  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2795  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2796  * longest flag, "relative", and to display at least a few node ids.
2797  */
2798 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2799 {
2800 	char *p = buffer;
2801 	nodemask_t nodes = NODE_MASK_NONE;
2802 	unsigned short mode = MPOL_DEFAULT;
2803 	unsigned short flags = 0;
2804 
2805 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2806 		mode = pol->mode;
2807 		flags = pol->flags;
2808 	}
2809 
2810 	switch (mode) {
2811 	case MPOL_DEFAULT:
2812 		break;
2813 	case MPOL_PREFERRED:
2814 		if (flags & MPOL_F_LOCAL)
2815 			mode = MPOL_LOCAL;
2816 		else
2817 			node_set(pol->v.preferred_node, nodes);
2818 		break;
2819 	case MPOL_BIND:
2820 	case MPOL_INTERLEAVE:
2821 		nodes = pol->v.nodes;
2822 		break;
2823 	default:
2824 		WARN_ON_ONCE(1);
2825 		snprintf(p, maxlen, "unknown");
2826 		return;
2827 	}
2828 
2829 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2830 
2831 	if (flags & MPOL_MODE_FLAGS) {
2832 		p += snprintf(p, buffer + maxlen - p, "=");
2833 
2834 		/*
2835 		 * Currently, the only defined flags are mutually exclusive
2836 		 */
2837 		if (flags & MPOL_F_STATIC_NODES)
2838 			p += snprintf(p, buffer + maxlen - p, "static");
2839 		else if (flags & MPOL_F_RELATIVE_NODES)
2840 			p += snprintf(p, buffer + maxlen - p, "relative");
2841 	}
2842 
2843 	if (!nodes_empty(nodes))
2844 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2845 			       nodemask_pr_args(&nodes));
2846 }
2847