xref: /linux/mm/mempolicy.c (revision 31a1b26f16e822577def5402ffc79cfe4aed2db9)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                for anonymous memory. For the process policy a per-process
21  *                counter is used.
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and proceeding to the last. It would be better if bind truly
26  *                restricted the allocation to the specified memory nodes.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case, NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use the default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
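/*
 * Illustration (not part of the kernel): a rough userspace sketch of how
 * the policies above are selected, assuming libnuma's <numaif.h> wrappers
 * for the set_mempolicy()/mbind() syscalls; error handling is omitted.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes = 0x3;		// nodes 0 and 1
 *
 *	// Process policy: interleave future allocations over nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	// VMA policy: bind one mapping to node 0 only, overriding the
 *	// process policy for faults in this range.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 0x1;
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */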
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/export.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h>
94 
95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h>
97 #include <linux/random.h>
98 
99 #include "internal.h"
100 
101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 static struct mempolicy default_policy = {
116 	.refcnt = ATOMIC_INIT(1), /* never free it */
117 	.mode = MPOL_PREFERRED,
118 	.flags = MPOL_F_LOCAL,
119 };
120 
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125 	struct mempolicy *pol = p->mempolicy;
126 	int node;
127 
128 	if (!pol) {
129 		node = numa_node_id();
130 		if (node != NUMA_NO_NODE)
131 			pol = &preferred_node_policy[node];
132 
133 		/* preferred_node_policy is not initialised early in boot */
134 		if (!pol->mode)
135 			pol = NULL;
136 	}
137 
138 	return pol;
139 }
140 
141 static const struct mempolicy_operations {
142 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
143 	/*
144 	 * If the read-side task has no lock to protect task->mempolicy, the
145 	 * write-side task rebinds task->mempolicy in two steps. The first step
146 	 * sets all the newly allowed nodes, and the second step clears all the
147 	 * disallowed nodes. This way we avoid a window in which no node is
148 	 * allowed for allocation.
149 	 * If the read side holds a lock protecting task->mempolicy, we rebind
150 	 * directly.
151 	 *
152 	 * step:
153 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
154 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
155 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
156 	 */
157 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
158 			enum mpol_rebind_step step);
159 } mpol_ops[MPOL_MAX];
160 
161 /* Check that the nodemask contains at least one populated zone */
162 static int is_valid_nodemask(const nodemask_t *nodemask)
163 {
164 	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
165 }
166 
167 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
168 {
169 	return pol->flags & MPOL_MODE_FLAGS;
170 }
171 
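/*
 * Map a user-supplied relative nodemask onto the actually allowed nodes:
 * roughly, bit n of @orig (taken modulo the number of bits set in @rel)
 * selects the n-th set bit of @rel.  For example, with @rel = {4,5,6} and
 * @orig = {0,2}, the result is {4,6}.
 */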
172 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
173 				   const nodemask_t *rel)
174 {
175 	nodemask_t tmp;
176 	nodes_fold(tmp, *orig, nodes_weight(*rel));
177 	nodes_onto(*ret, tmp, *rel);
178 }
179 
180 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
181 {
182 	if (nodes_empty(*nodes))
183 		return -EINVAL;
184 	pol->v.nodes = *nodes;
185 	return 0;
186 }
187 
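/*
 * For MPOL_PREFERRED a NULL @nodes means explicit local allocation;
 * otherwise the first node of the mask becomes the preferred node.
 */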
188 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
189 {
190 	if (!nodes)
191 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
192 	else if (nodes_empty(*nodes))
193 		return -EINVAL;			/*  no allowed nodes */
194 	else
195 		pol->v.preferred_node = first_node(*nodes);
196 	return 0;
197 }
198 
199 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
200 {
201 	if (!is_valid_nodemask(nodes))
202 		return -EINVAL;
203 	pol->v.nodes = *nodes;
204 	return 0;
205 }
206 
207 /*
208  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
209  * any, for the new policy.  mpol_new() has already validated the nodes
210  * parameter with respect to the policy mode and flags.  But, we need to
211  * handle an empty nodemask with MPOL_PREFERRED here.
212  *
213  * Must be called holding task's alloc_lock to protect task's mems_allowed
214  * and mempolicy.  May also be called holding mmap_sem for write.
215  */
216 static int mpol_set_nodemask(struct mempolicy *pol,
217 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
218 {
219 	int ret;
220 
221 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
222 	if (pol == NULL)
223 		return 0;
224 	/* Check N_MEMORY */
225 	nodes_and(nsc->mask1,
226 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
227 
228 	VM_BUG_ON(!nodes);
229 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
230 		nodes = NULL;	/* explicit local allocation */
231 	else {
232 		if (pol->flags & MPOL_F_RELATIVE_NODES)
233 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
234 		else
235 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
236 
237 		if (mpol_store_user_nodemask(pol))
238 			pol->w.user_nodemask = *nodes;
239 		else
240 			pol->w.cpuset_mems_allowed =
241 						cpuset_current_mems_allowed;
242 	}
243 
244 	if (nodes)
245 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
246 	else
247 		ret = mpol_ops[pol->mode].create(pol, NULL);
248 	return ret;
249 }
250 
251 /*
252  * This function just creates a new policy, does some basic checks and
253  * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
254  */
255 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
256 				  nodemask_t *nodes)
257 {
258 	struct mempolicy *policy;
259 
260 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
261 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
262 
263 	if (mode == MPOL_DEFAULT) {
264 		if (nodes && !nodes_empty(*nodes))
265 			return ERR_PTR(-EINVAL);
266 		return NULL;
267 	}
268 	VM_BUG_ON(!nodes);
269 
270 	/*
271 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
272 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
273 	 * All other modes require a valid pointer to a non-empty nodemask.
274 	 */
275 	if (mode == MPOL_PREFERRED) {
276 		if (nodes_empty(*nodes)) {
277 			if (((flags & MPOL_F_STATIC_NODES) ||
278 			     (flags & MPOL_F_RELATIVE_NODES)))
279 				return ERR_PTR(-EINVAL);
280 		}
281 	} else if (mode == MPOL_LOCAL) {
282 		if (!nodes_empty(*nodes))
283 			return ERR_PTR(-EINVAL);
284 		mode = MPOL_PREFERRED;
285 	} else if (nodes_empty(*nodes))
286 		return ERR_PTR(-EINVAL);
287 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
288 	if (!policy)
289 		return ERR_PTR(-ENOMEM);
290 	atomic_set(&policy->refcnt, 1);
291 	policy->mode = mode;
292 	policy->flags = flags;
293 
294 	return policy;
295 }
296 
297 /* Slow path of an mpol destructor. */
298 void __mpol_put(struct mempolicy *p)
299 {
300 	if (!atomic_dec_and_test(&p->refcnt))
301 		return;
302 	kmem_cache_free(policy_cache, p);
303 }
304 
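/* MPOL_DEFAULT carries no nodemask, so there is nothing to rebind. */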
305 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
306 				enum mpol_rebind_step step)
307 {
308 }
309 
310 /*
311  * step:
312  * 	MPOL_REBIND_ONCE  - do the rebind work at once
313  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
314  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
315  */
316 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
317 				 enum mpol_rebind_step step)
318 {
319 	nodemask_t tmp;
320 
321 	if (pol->flags & MPOL_F_STATIC_NODES)
322 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
323 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
324 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
325 	else {
326 		/*
327 		 * if step == MPOL_REBIND_STEP1, we use ->w.cpuset_mems_allowed
328 		 * to cache the intermediate result
329 		 */
330 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
331 			nodes_remap(tmp, pol->v.nodes,
332 					pol->w.cpuset_mems_allowed, *nodes);
333 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
334 		} else if (step == MPOL_REBIND_STEP2) {
335 			tmp = pol->w.cpuset_mems_allowed;
336 			pol->w.cpuset_mems_allowed = *nodes;
337 		} else
338 			BUG();
339 	}
340 
341 	if (nodes_empty(tmp))
342 		tmp = *nodes;
343 
344 	if (step == MPOL_REBIND_STEP1)
345 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
346 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
347 		pol->v.nodes = tmp;
348 	else
349 		BUG();
350 
351 	if (!node_isset(current->il_next, tmp)) {
352 		current->il_next = next_node(current->il_next, tmp);
353 		if (current->il_next >= MAX_NUMNODES)
354 			current->il_next = first_node(tmp);
355 		if (current->il_next >= MAX_NUMNODES)
356 			current->il_next = numa_node_id();
357 	}
358 }
359 
360 static void mpol_rebind_preferred(struct mempolicy *pol,
361 				  const nodemask_t *nodes,
362 				  enum mpol_rebind_step step)
363 {
364 	nodemask_t tmp;
365 
366 	if (pol->flags & MPOL_F_STATIC_NODES) {
367 		int node = first_node(pol->w.user_nodemask);
368 
369 		if (node_isset(node, *nodes)) {
370 			pol->v.preferred_node = node;
371 			pol->flags &= ~MPOL_F_LOCAL;
372 		} else
373 			pol->flags |= MPOL_F_LOCAL;
374 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
375 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
376 		pol->v.preferred_node = first_node(tmp);
377 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
378 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
379 						   pol->w.cpuset_mems_allowed,
380 						   *nodes);
381 		pol->w.cpuset_mems_allowed = *nodes;
382 	}
383 }
384 
385 /*
386  * mpol_rebind_policy - Migrate a policy to a different set of nodes
387  *
388  * If the read-side task has no lock to protect task->mempolicy, the
389  * write-side task rebinds task->mempolicy in two steps. The first step
390  * sets all the newly allowed nodes, and the second step clears all the
391  * disallowed nodes. This way we avoid a window in which no node is
392  * allowed for allocation.
393  * If the read side holds a lock protecting task->mempolicy, we rebind
394  * directly.
395  *
396  * step:
397  * 	MPOL_REBIND_ONCE  - do the rebind work at once
398  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
399  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
400  */
401 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
402 				enum mpol_rebind_step step)
403 {
404 	if (!pol)
405 		return;
406 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
407 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
408 		return;
409 
410 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
411 		return;
412 
413 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
414 		BUG();
415 
416 	if (step == MPOL_REBIND_STEP1)
417 		pol->flags |= MPOL_F_REBINDING;
418 	else if (step == MPOL_REBIND_STEP2)
419 		pol->flags &= ~MPOL_F_REBINDING;
420 	else if (step >= MPOL_REBIND_NSTEP)
421 		BUG();
422 
423 	mpol_ops[pol->mode].rebind(pol, newmask, step);
424 }
425 
426 /*
427  * Wrapper for mpol_rebind_policy() that just requires a task
428  * pointer, and updates the task's mempolicy.
429  *
430  * Called with task's alloc_lock held.
431  */
432 
433 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
434 			enum mpol_rebind_step step)
435 {
436 	mpol_rebind_policy(tsk->mempolicy, new, step);
437 }
438 
439 /*
440  * Rebind each vma in mm to the new nodemask.
441  *
442  * Called holding a reference to mm.  Takes mm->mmap_sem during the call.
443  */
444 
445 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
446 {
447 	struct vm_area_struct *vma;
448 
449 	down_write(&mm->mmap_sem);
450 	for (vma = mm->mmap; vma; vma = vma->vm_next)
451 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
452 	up_write(&mm->mmap_sem);
453 }
454 
455 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
456 	[MPOL_DEFAULT] = {
457 		.rebind = mpol_rebind_default,
458 	},
459 	[MPOL_INTERLEAVE] = {
460 		.create = mpol_new_interleave,
461 		.rebind = mpol_rebind_nodemask,
462 	},
463 	[MPOL_PREFERRED] = {
464 		.create = mpol_new_preferred,
465 		.rebind = mpol_rebind_preferred,
466 	},
467 	[MPOL_BIND] = {
468 		.create = mpol_new_bind,
469 		.rebind = mpol_rebind_nodemask,
470 	},
471 };
472 
473 static void migrate_page_add(struct page *page, struct list_head *pagelist,
474 				unsigned long flags);
475 
476 /* Scan the ptes in a range, checking whether the pages are on the required nodes. */
477 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
478 		unsigned long addr, unsigned long end,
479 		const nodemask_t *nodes, unsigned long flags,
480 		void *private)
481 {
482 	pte_t *orig_pte;
483 	pte_t *pte;
484 	spinlock_t *ptl;
485 
486 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
487 	do {
488 		struct page *page;
489 		int nid;
490 
491 		if (!pte_present(*pte))
492 			continue;
493 		page = vm_normal_page(vma, addr, *pte);
494 		if (!page)
495 			continue;
496 		/*
497 		 * vm_normal_page() filters out zero pages, but there might
498 		 * still be PageReserved pages to skip, perhaps in a VDSO.
499 		 */
500 		if (PageReserved(page))
501 			continue;
502 		nid = page_to_nid(page);
503 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
504 			continue;
505 
506 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
507 			migrate_page_add(page, private, flags);
508 		else
509 			break;
510 	} while (pte++, addr += PAGE_SIZE, addr != end);
511 	pte_unmap_unlock(orig_pte, ptl);
512 	return addr != end;
513 }
514 
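/*
 * Walk the pmd level of [addr, end), splitting any transparent huge pages
 * so that check_pte_range() only ever sees base pages.
 */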
515 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
516 		unsigned long addr, unsigned long end,
517 		const nodemask_t *nodes, unsigned long flags,
518 		void *private)
519 {
520 	pmd_t *pmd;
521 	unsigned long next;
522 
523 	pmd = pmd_offset(pud, addr);
524 	do {
525 		next = pmd_addr_end(addr, end);
526 		split_huge_page_pmd(vma, addr, pmd);
527 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
528 			continue;
529 		if (check_pte_range(vma, pmd, addr, next, nodes,
530 				    flags, private))
531 			return -EIO;
532 	} while (pmd++, addr = next, addr != end);
533 	return 0;
534 }
535 
536 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
537 		unsigned long addr, unsigned long end,
538 		const nodemask_t *nodes, unsigned long flags,
539 		void *private)
540 {
541 	pud_t *pud;
542 	unsigned long next;
543 
544 	pud = pud_offset(pgd, addr);
545 	do {
546 		next = pud_addr_end(addr, end);
547 		if (pud_none_or_clear_bad(pud))
548 			continue;
549 		if (check_pmd_range(vma, pud, addr, next, nodes,
550 				    flags, private))
551 			return -EIO;
552 	} while (pud++, addr = next, addr != end);
553 	return 0;
554 }
555 
556 static inline int check_pgd_range(struct vm_area_struct *vma,
557 		unsigned long addr, unsigned long end,
558 		const nodemask_t *nodes, unsigned long flags,
559 		void *private)
560 {
561 	pgd_t *pgd;
562 	unsigned long next;
563 
564 	pgd = pgd_offset(vma->vm_mm, addr);
565 	do {
566 		next = pgd_addr_end(addr, end);
567 		if (pgd_none_or_clear_bad(pgd))
568 			continue;
569 		if (check_pud_range(vma, pgd, addr, next, nodes,
570 				    flags, private))
571 			return -EIO;
572 	} while (pgd++, addr = next, addr != end);
573 	return 0;
574 }
575 
576 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
577 /*
578  * This is used to mark a range of virtual addresses to be inaccessible.
579  * These are later cleared by a NUMA hinting fault. Depending on these
580  * faults, pages may be migrated for better NUMA placement.
581  *
582  * This is assuming that NUMA faults are handled using PROT_NONE. If
583  * an architecture makes a different choice, it will need further
584  * changes to the core.
585  */
586 unsigned long change_prot_numa(struct vm_area_struct *vma,
587 			unsigned long addr, unsigned long end)
588 {
589 	int nr_updated;
590 	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
591 
592 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
593 	if (nr_updated)
594 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
595 
596 	return nr_updated;
597 }
598 #else
599 static unsigned long change_prot_numa(struct vm_area_struct *vma,
600 			unsigned long addr, unsigned long end)
601 {
602 	return 0;
603 }
604 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
605 
606 /*
607  * Check if all pages in a range are on a set of nodes.
608  * If pagelist != NULL then isolate pages from the LRU and
609  * put them on the pagelist.
610  */
611 static struct vm_area_struct *
612 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
613 		const nodemask_t *nodes, unsigned long flags, void *private)
614 {
615 	int err;
616 	struct vm_area_struct *first, *vma, *prev;
617 
618 
619 	first = find_vma(mm, start);
620 	if (!first)
621 		return ERR_PTR(-EFAULT);
622 	prev = NULL;
623 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
624 		unsigned long endvma = vma->vm_end;
625 
626 		if (endvma > end)
627 			endvma = end;
628 		if (vma->vm_start > start)
629 			start = vma->vm_start;
630 
631 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
632 			if (!vma->vm_next && vma->vm_end < end)
633 				return ERR_PTR(-EFAULT);
634 			if (prev && prev->vm_end < vma->vm_start)
635 				return ERR_PTR(-EFAULT);
636 		}
637 
638 		if (is_vm_hugetlb_page(vma))
639 			goto next;
640 
641 		if (flags & MPOL_MF_LAZY) {
642 			change_prot_numa(vma, start, endvma);
643 			goto next;
644 		}
645 
646 		if ((flags & MPOL_MF_STRICT) ||
647 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
648 		      vma_migratable(vma))) {
649 
650 			err = check_pgd_range(vma, start, endvma, nodes,
651 						flags, private);
652 			if (err) {
653 				first = ERR_PTR(err);
654 				break;
655 			}
656 		}
657 next:
658 		prev = vma;
659 	}
660 	return first;
661 }
662 
663 /*
664  * Apply policy to a single VMA
665  * This must be called with the mmap_sem held for writing.
666  */
667 static int vma_replace_policy(struct vm_area_struct *vma,
668 						struct mempolicy *pol)
669 {
670 	int err;
671 	struct mempolicy *old;
672 	struct mempolicy *new;
673 
674 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
675 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
676 		 vma->vm_ops, vma->vm_file,
677 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
678 
679 	new = mpol_dup(pol);
680 	if (IS_ERR(new))
681 		return PTR_ERR(new);
682 
683 	if (vma->vm_ops && vma->vm_ops->set_policy) {
684 		err = vma->vm_ops->set_policy(vma, new);
685 		if (err)
686 			goto err_out;
687 	}
688 
689 	old = vma->vm_policy;
690 	vma->vm_policy = new; /* protected by mmap_sem */
691 	mpol_put(old);
692 
693 	return 0;
694  err_out:
695 	mpol_put(new);
696 	return err;
697 }
698 
699 /* Step 2: apply policy to a range and do splits. */
700 static int mbind_range(struct mm_struct *mm, unsigned long start,
701 		       unsigned long end, struct mempolicy *new_pol)
702 {
703 	struct vm_area_struct *next;
704 	struct vm_area_struct *prev;
705 	struct vm_area_struct *vma;
706 	int err = 0;
707 	pgoff_t pgoff;
708 	unsigned long vmstart;
709 	unsigned long vmend;
710 
711 	vma = find_vma(mm, start);
712 	if (!vma || vma->vm_start > start)
713 		return -EFAULT;
714 
715 	prev = vma->vm_prev;
716 	if (start > vma->vm_start)
717 		prev = vma;
718 
719 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
720 		next = vma->vm_next;
721 		vmstart = max(start, vma->vm_start);
722 		vmend   = min(end, vma->vm_end);
723 
724 		if (mpol_equal(vma_policy(vma), new_pol))
725 			continue;
726 
727 		pgoff = vma->vm_pgoff +
728 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
729 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
730 				  vma->anon_vma, vma->vm_file, pgoff,
731 				  new_pol);
732 		if (prev) {
733 			vma = prev;
734 			next = vma->vm_next;
735 			if (mpol_equal(vma_policy(vma), new_pol))
736 				continue;
737 			/* vma_merge() joined vma && vma->next, case 8 */
738 			goto replace;
739 		}
740 		if (vma->vm_start != vmstart) {
741 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
742 			if (err)
743 				goto out;
744 		}
745 		if (vma->vm_end != vmend) {
746 			err = split_vma(vma->vm_mm, vma, vmend, 0);
747 			if (err)
748 				goto out;
749 		}
750  replace:
751 		err = vma_replace_policy(vma, new_pol);
752 		if (err)
753 			goto out;
754 	}
755 
756  out:
757 	return err;
758 }
759 
760 /*
761  * Update task->flags PF_MEMPOLICY bit: set iff non-default
762  * mempolicy.  Allows more rapid checking of this (combined perhaps
763  * with other PF_* flag bits) on memory allocation hot code paths.
764  *
765  * If called from outside this file, the task 'p' should -only- be
766  * a newly forked child not yet visible on the task list, because
767  * manipulating the task flags of a visible task is not safe.
768  *
769  * The above limitation is why this routine has the funny name
770  * mpol_fix_fork_child_flag().
771  *
772  * It is also safe to call this with a task pointer of current,
773  * which the static wrapper mpol_set_task_struct_flag() does,
774  * for use within this file.
775  */
776 
777 void mpol_fix_fork_child_flag(struct task_struct *p)
778 {
779 	if (p->mempolicy)
780 		p->flags |= PF_MEMPOLICY;
781 	else
782 		p->flags &= ~PF_MEMPOLICY;
783 }
784 
785 static void mpol_set_task_struct_flag(void)
786 {
787 	mpol_fix_fork_child_flag(current);
788 }
789 
790 /* Set the process memory policy */
791 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
792 			     nodemask_t *nodes)
793 {
794 	struct mempolicy *new, *old;
795 	struct mm_struct *mm = current->mm;
796 	NODEMASK_SCRATCH(scratch);
797 	int ret;
798 
799 	if (!scratch)
800 		return -ENOMEM;
801 
802 	new = mpol_new(mode, flags, nodes);
803 	if (IS_ERR(new)) {
804 		ret = PTR_ERR(new);
805 		goto out;
806 	}
807 	/*
808 	 * prevent changing our mempolicy while show_numa_maps()
809 	 * is using it.
810 	 * Note:  do_set_mempolicy() can be called at init time
811 	 * with no 'mm'.
812 	 */
813 	if (mm)
814 		down_write(&mm->mmap_sem);
815 	task_lock(current);
816 	ret = mpol_set_nodemask(new, nodes, scratch);
817 	if (ret) {
818 		task_unlock(current);
819 		if (mm)
820 			up_write(&mm->mmap_sem);
821 		mpol_put(new);
822 		goto out;
823 	}
824 	old = current->mempolicy;
825 	current->mempolicy = new;
826 	mpol_set_task_struct_flag();
827 	if (new && new->mode == MPOL_INTERLEAVE &&
828 	    nodes_weight(new->v.nodes))
829 		current->il_next = first_node(new->v.nodes);
830 	task_unlock(current);
831 	if (mm)
832 		up_write(&mm->mmap_sem);
833 
834 	mpol_put(old);
835 	ret = 0;
836 out:
837 	NODEMASK_SCRATCH_FREE(scratch);
838 	return ret;
839 }
840 
841 /*
842  * Return nodemask for policy for get_mempolicy() query
843  *
844  * Called with task's alloc_lock held
845  */
846 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
847 {
848 	nodes_clear(*nodes);
849 	if (p == &default_policy)
850 		return;
851 
852 	switch (p->mode) {
853 	case MPOL_BIND:
854 		/* Fall through */
855 	case MPOL_INTERLEAVE:
856 		*nodes = p->v.nodes;
857 		break;
858 	case MPOL_PREFERRED:
859 		if (!(p->flags & MPOL_F_LOCAL))
860 			node_set(p->v.preferred_node, *nodes);
861 		/* else return empty node mask for local allocation */
862 		break;
863 	default:
864 		BUG();
865 	}
866 }
867 
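/*
 * Return the node id of the page currently backing @addr, faulting the
 * page in via get_user_pages() if necessary.
 */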
868 static int lookup_node(struct mm_struct *mm, unsigned long addr)
869 {
870 	struct page *p;
871 	int err;
872 
873 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
874 	if (err >= 0) {
875 		err = page_to_nid(p);
876 		put_page(p);
877 	}
878 	return err;
879 }
880 
881 /* Retrieve NUMA policy */
882 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
883 			     unsigned long addr, unsigned long flags)
884 {
885 	int err;
886 	struct mm_struct *mm = current->mm;
887 	struct vm_area_struct *vma = NULL;
888 	struct mempolicy *pol = current->mempolicy;
889 
890 	if (flags &
891 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
892 		return -EINVAL;
893 
894 	if (flags & MPOL_F_MEMS_ALLOWED) {
895 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
896 			return -EINVAL;
897 		*policy = 0;	/* just so it's initialized */
898 		task_lock(current);
899 		*nmask  = cpuset_current_mems_allowed;
900 		task_unlock(current);
901 		return 0;
902 	}
903 
904 	if (flags & MPOL_F_ADDR) {
905 		/*
906 		 * Do NOT fall back to task policy if the
907 		 * vma/shared policy at addr is NULL.  We
908 		 * want to return MPOL_DEFAULT in this case.
909 		 */
910 		down_read(&mm->mmap_sem);
911 		vma = find_vma_intersection(mm, addr, addr+1);
912 		if (!vma) {
913 			up_read(&mm->mmap_sem);
914 			return -EFAULT;
915 		}
916 		if (vma->vm_ops && vma->vm_ops->get_policy)
917 			pol = vma->vm_ops->get_policy(vma, addr);
918 		else
919 			pol = vma->vm_policy;
920 	} else if (addr)
921 		return -EINVAL;
922 
923 	if (!pol)
924 		pol = &default_policy;	/* indicates default behavior */
925 
926 	if (flags & MPOL_F_NODE) {
927 		if (flags & MPOL_F_ADDR) {
928 			err = lookup_node(mm, addr);
929 			if (err < 0)
930 				goto out;
931 			*policy = err;
932 		} else if (pol == current->mempolicy &&
933 				pol->mode == MPOL_INTERLEAVE) {
934 			*policy = current->il_next;
935 		} else {
936 			err = -EINVAL;
937 			goto out;
938 		}
939 	} else {
940 		*policy = pol == &default_policy ? MPOL_DEFAULT :
941 						pol->mode;
942 		/*
943 		 * Internal mempolicy flags must be masked off before exposing
944 		 * the policy to userspace.
945 		 */
946 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
947 	}
948 
949 	if (vma) {
950 		up_read(&current->mm->mmap_sem);
951 		vma = NULL;
952 	}
953 
954 	err = 0;
955 	if (nmask) {
956 		if (mpol_store_user_nodemask(pol)) {
957 			*nmask = pol->w.user_nodemask;
958 		} else {
959 			task_lock(current);
960 			get_policy_nodemask(pol, nmask);
961 			task_unlock(current);
962 		}
963 	}
964 
965  out:
966 	mpol_cond_put(pol);
967 	if (vma)
968 		up_read(&current->mm->mmap_sem);
969 	return err;
970 }
971 
972 #ifdef CONFIG_MIGRATION
973 /*
974  * page migration
975  */
976 static void migrate_page_add(struct page *page, struct list_head *pagelist,
977 				unsigned long flags)
978 {
979 	/*
980 	 * Avoid migrating a page that is shared with others.
981 	 */
982 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
983 		if (!isolate_lru_page(page)) {
984 			list_add_tail(&page->lru, pagelist);
985 			inc_zone_page_state(page, NR_ISOLATED_ANON +
986 					    page_is_file_cache(page));
987 		}
988 	}
989 }
990 
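/*
 * Allocation callback for migrate_pages(): allocate the replacement page
 * directly on the target node.
 */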
991 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
992 {
993 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
994 }
995 
996 /*
997  * Migrate pages from one node to a target node.
998  * Returns error or the number of pages not migrated.
999  */
1000 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1001 			   int flags)
1002 {
1003 	nodemask_t nmask;
1004 	LIST_HEAD(pagelist);
1005 	int err = 0;
1006 
1007 	nodes_clear(nmask);
1008 	node_set(source, nmask);
1009 
1010 	/*
1011 	 * This does not "check" the range but isolates all pages that
1012 	 * need migration.  Between passing in the full user address
1013  * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1014 	 */
1015 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1016 	check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1017 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1018 
1019 	if (!list_empty(&pagelist)) {
1020 		err = migrate_pages(&pagelist, new_node_page, dest,
1021 					MIGRATE_SYNC, MR_SYSCALL);
1022 		if (err)
1023 			putback_lru_pages(&pagelist);
1024 	}
1025 
1026 	return err;
1027 }
1028 
1029 /*
1030  * Move pages between the two nodesets so as to preserve the physical
1031  * layout as much as possible.
1032  *
1033  * Returns the number of pages that could not be moved.
1034  */
1035 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1036 		     const nodemask_t *to, int flags)
1037 {
1038 	int busy = 0;
1039 	int err;
1040 	nodemask_t tmp;
1041 
1042 	err = migrate_prep();
1043 	if (err)
1044 		return err;
1045 
1046 	down_read(&mm->mmap_sem);
1047 
1048 	err = migrate_vmas(mm, from, to, flags);
1049 	if (err)
1050 		goto out;
1051 
1052 	/*
1053 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1054 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1055 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1056 	 * The pair of nodemasks 'to' and 'from' define the map.
1057 	 *
1058  * If no pair of bits is found that way, fall back to picking some
1059 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1060 	 * 'source' and 'dest' bits are the same, this represents a node
1061 	 * that will be migrating to itself, so no pages need move.
1062 	 *
1063 	 * If no bits are left in 'tmp', or if all remaining bits left
1064 	 * in 'tmp' correspond to the same bit in 'to', return false
1065 	 * (nothing left to migrate).
1066 	 *
1067 	 * This lets us pick a pair of nodes to migrate between, such that
1068 	 * if possible the dest node is not already occupied by some other
1069 	 * source node, minimizing the risk of overloading the memory on a
1070 	 * node that would happen if we migrated incoming memory to a node
1071 	 * before migrating outgoing memory source that same node.
1072 	 *
1073 	 * A single scan of tmp is sufficient.  As we go, we remember the
1074 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1075 	 * that not only moved, but what's better, moved to an empty slot
1076 	 * (d is not set in tmp), then we break out then, with that pair.
1077  * Otherwise when we finish scanning tmp, we at least have the
1078 	 * most recent <s, d> pair that moved.  If we get all the way through
1079 	 * the scan of tmp without finding any node that moved, much less
1080 	 * moved to an empty node, then there is nothing left worth migrating.
1081 	 */
1082 
1083 	tmp = *from;
1084 	while (!nodes_empty(tmp)) {
1085 		int s,d;
1086 		int source = -1;
1087 		int dest = 0;
1088 
1089 		for_each_node_mask(s, tmp) {
1090 
1091 			/*
1092 			 * do_migrate_pages() tries to maintain the relative
1093 			 * node relationship of the pages established between
1094 			 * threads and memory areas.
1095 			 *
1096 			 * However, if the number of source nodes is not equal to
1097 			 * the number of destination nodes, we cannot preserve
1098 			 * this relative node relationship.  In that case, skip
1099 			 * copying memory from a node that is in the destination
1100 			 * mask.
1101 			 *
1102 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1103 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1104 			 */
1105 
1106 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1107 						(node_isset(s, *to)))
1108 				continue;
1109 
1110 			d = node_remap(s, *from, *to);
1111 			if (s == d)
1112 				continue;
1113 
1114 			source = s;	/* Node moved. Memorize */
1115 			dest = d;
1116 
1117 			/* dest not in remaining from nodes? */
1118 			if (!node_isset(dest, tmp))
1119 				break;
1120 		}
1121 		if (source == -1)
1122 			break;
1123 
1124 		node_clear(source, tmp);
1125 		err = migrate_to_node(mm, source, dest, flags);
1126 		if (err > 0)
1127 			busy += err;
1128 		if (err < 0)
1129 			break;
1130 	}
1131 out:
1132 	up_read(&mm->mmap_sem);
1133 	if (err < 0)
1134 		return err;
1135 	return busy;
1136 
1137 }
1138 
1139 /*
1140  * Allocate a new page for page migration based on vma policy.
1141  * Start by assuming the page is mapped by the vma pointed to by @private;
1142  * if not, search forward from there.  N.B., this assumes that the
1143  * list of pages handed to migrate_pages()--which is how we get here--
1144  * is in virtual address order.
1145  */
1146 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1147 {
1148 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1149 	unsigned long uninitialized_var(address);
1150 
1151 	while (vma) {
1152 		address = page_address_in_vma(page, vma);
1153 		if (address != -EFAULT)
1154 			break;
1155 		vma = vma->vm_next;
1156 	}
1157 
1158 	/*
1159 	 * if !vma, alloc_page_vma() will use task or system default policy
1160 	 */
1161 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1162 }
1163 #else
1164 
1165 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1166 				unsigned long flags)
1167 {
1168 }
1169 
1170 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1171 		     const nodemask_t *to, int flags)
1172 {
1173 	return -ENOSYS;
1174 }
1175 
1176 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1177 {
1178 	return NULL;
1179 }
1180 #endif
1181 
1182 static long do_mbind(unsigned long start, unsigned long len,
1183 		     unsigned short mode, unsigned short mode_flags,
1184 		     nodemask_t *nmask, unsigned long flags)
1185 {
1186 	struct vm_area_struct *vma;
1187 	struct mm_struct *mm = current->mm;
1188 	struct mempolicy *new;
1189 	unsigned long end;
1190 	int err;
1191 	LIST_HEAD(pagelist);
1192 
1193 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1194 		return -EINVAL;
1195 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1196 		return -EPERM;
1197 
1198 	if (start & ~PAGE_MASK)
1199 		return -EINVAL;
1200 
1201 	if (mode == MPOL_DEFAULT)
1202 		flags &= ~MPOL_MF_STRICT;
1203 
1204 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1205 	end = start + len;
1206 
1207 	if (end < start)
1208 		return -EINVAL;
1209 	if (end == start)
1210 		return 0;
1211 
1212 	new = mpol_new(mode, mode_flags, nmask);
1213 	if (IS_ERR(new))
1214 		return PTR_ERR(new);
1215 
1216 	if (flags & MPOL_MF_LAZY)
1217 		new->flags |= MPOL_F_MOF;
1218 
1219 	/*
1220 	 * If we are using the default policy then operations
1221 	 * on discontinuous address spaces are okay after all
1222 	 */
1223 	if (!new)
1224 		flags |= MPOL_MF_DISCONTIG_OK;
1225 
1226 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1227 		 start, start + len, mode, mode_flags,
1228 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1229 
1230 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1231 
1232 		err = migrate_prep();
1233 		if (err)
1234 			goto mpol_out;
1235 	}
1236 	{
1237 		NODEMASK_SCRATCH(scratch);
1238 		if (scratch) {
1239 			down_write(&mm->mmap_sem);
1240 			task_lock(current);
1241 			err = mpol_set_nodemask(new, nmask, scratch);
1242 			task_unlock(current);
1243 			if (err)
1244 				up_write(&mm->mmap_sem);
1245 		} else
1246 			err = -ENOMEM;
1247 		NODEMASK_SCRATCH_FREE(scratch);
1248 	}
1249 	if (err)
1250 		goto mpol_out;
1251 
1252 	vma = check_range(mm, start, end, nmask,
1253 			  flags | MPOL_MF_INVERT, &pagelist);
1254 
1255 	err = PTR_ERR(vma);	/* maybe ... */
1256 	if (!IS_ERR(vma))
1257 		err = mbind_range(mm, start, end, new);
1258 
1259 	if (!err) {
1260 		int nr_failed = 0;
1261 
1262 		if (!list_empty(&pagelist)) {
1263 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1264 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1265 					(unsigned long)vma,
1266 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1267 			if (nr_failed)
1268 				putback_lru_pages(&pagelist);
1269 		}
1270 
1271 		if (nr_failed && (flags & MPOL_MF_STRICT))
1272 			err = -EIO;
1273 	} else
1274 		putback_lru_pages(&pagelist);
1275 
1276 	up_write(&mm->mmap_sem);
1277  mpol_out:
1278 	mpol_put(new);
1279 	return err;
1280 }
1281 
1282 /*
1283  * User space interface with variable sized bitmaps for nodelists.
1284  */
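/*
 * Illustration (not part of the kernel): a rough userspace sketch of the
 * bitmap convention, assuming libnuma's <numaif.h> wrapper for the
 * get_mempolicy() syscall.  Note that when a non-NULL nodemask is passed,
 * sys_get_mempolicy() below insists on maxnode >= MAX_NUMNODES; passing a
 * NULL nodemask sidesteps that.
 *
 *	#include <numaif.h>
 *
 *	int node;
 *	// Query which node currently backs the page at address p.
 *	get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR);
 *	// On success, node holds the node id of the page at p.
 */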
1285 
1286 /* Copy a node mask from user space. */
1287 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1288 		     unsigned long maxnode)
1289 {
1290 	unsigned long k;
1291 	unsigned long nlongs;
1292 	unsigned long endmask;
1293 
1294 	--maxnode;
1295 	nodes_clear(*nodes);
1296 	if (maxnode == 0 || !nmask)
1297 		return 0;
1298 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1299 		return -EINVAL;
1300 
1301 	nlongs = BITS_TO_LONGS(maxnode);
1302 	if ((maxnode % BITS_PER_LONG) == 0)
1303 		endmask = ~0UL;
1304 	else
1305 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1306 
1307 	/* When the user specified more nodes than supported, just check
1308 	   that the unsupported part is all zero. */
1309 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1310 		if (nlongs > PAGE_SIZE/sizeof(long))
1311 			return -EINVAL;
1312 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1313 			unsigned long t;
1314 			if (get_user(t, nmask + k))
1315 				return -EFAULT;
1316 			if (k == nlongs - 1) {
1317 				if (t & endmask)
1318 					return -EINVAL;
1319 			} else if (t)
1320 				return -EINVAL;
1321 		}
1322 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1323 		endmask = ~0UL;
1324 	}
1325 
1326 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1327 		return -EFAULT;
1328 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1329 	return 0;
1330 }
1331 
1332 /* Copy a kernel node mask to user space */
1333 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1334 			      nodemask_t *nodes)
1335 {
1336 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1337 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1338 
1339 	if (copy > nbytes) {
1340 		if (copy > PAGE_SIZE)
1341 			return -EINVAL;
1342 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1343 			return -EFAULT;
1344 		copy = nbytes;
1345 	}
1346 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1347 }
1348 
1349 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1350 		unsigned long, mode, unsigned long __user *, nmask,
1351 		unsigned long, maxnode, unsigned, flags)
1352 {
1353 	nodemask_t nodes;
1354 	int err;
1355 	unsigned short mode_flags;
1356 
1357 	mode_flags = mode & MPOL_MODE_FLAGS;
1358 	mode &= ~MPOL_MODE_FLAGS;
1359 	if (mode >= MPOL_MAX)
1360 		return -EINVAL;
1361 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1362 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1363 		return -EINVAL;
1364 	err = get_nodes(&nodes, nmask, maxnode);
1365 	if (err)
1366 		return err;
1367 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1368 }
1369 
1370 /* Set the process memory policy */
1371 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1372 		unsigned long, maxnode)
1373 {
1374 	int err;
1375 	nodemask_t nodes;
1376 	unsigned short flags;
1377 
1378 	flags = mode & MPOL_MODE_FLAGS;
1379 	mode &= ~MPOL_MODE_FLAGS;
1380 	if ((unsigned int)mode >= MPOL_MAX)
1381 		return -EINVAL;
1382 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1383 		return -EINVAL;
1384 	err = get_nodes(&nodes, nmask, maxnode);
1385 	if (err)
1386 		return err;
1387 	return do_set_mempolicy(mode, flags, &nodes);
1388 }
1389 
1390 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1391 		const unsigned long __user *, old_nodes,
1392 		const unsigned long __user *, new_nodes)
1393 {
1394 	const struct cred *cred = current_cred(), *tcred;
1395 	struct mm_struct *mm = NULL;
1396 	struct task_struct *task;
1397 	nodemask_t task_nodes;
1398 	int err;
1399 	nodemask_t *old;
1400 	nodemask_t *new;
1401 	NODEMASK_SCRATCH(scratch);
1402 
1403 	if (!scratch)
1404 		return -ENOMEM;
1405 
1406 	old = &scratch->mask1;
1407 	new = &scratch->mask2;
1408 
1409 	err = get_nodes(old, old_nodes, maxnode);
1410 	if (err)
1411 		goto out;
1412 
1413 	err = get_nodes(new, new_nodes, maxnode);
1414 	if (err)
1415 		goto out;
1416 
1417 	/* Find the mm_struct */
1418 	rcu_read_lock();
1419 	task = pid ? find_task_by_vpid(pid) : current;
1420 	if (!task) {
1421 		rcu_read_unlock();
1422 		err = -ESRCH;
1423 		goto out;
1424 	}
1425 	get_task_struct(task);
1426 
1427 	err = -EINVAL;
1428 
1429 	/*
1430 	 * Check if this process has the right to modify the specified
1431 	 * process. The right exists if the process has administrative
1432 	 * capabilities, superuser privileges or the same
1433 	 * userid as the target process.
1434 	 */
1435 	tcred = __task_cred(task);
1436 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1437 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1438 	    !capable(CAP_SYS_NICE)) {
1439 		rcu_read_unlock();
1440 		err = -EPERM;
1441 		goto out_put;
1442 	}
1443 	rcu_read_unlock();
1444 
1445 	task_nodes = cpuset_mems_allowed(task);
1446 	/* Is the user allowed to access the target nodes? */
1447 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1448 		err = -EPERM;
1449 		goto out_put;
1450 	}
1451 
1452 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1453 		err = -EINVAL;
1454 		goto out_put;
1455 	}
1456 
1457 	err = security_task_movememory(task);
1458 	if (err)
1459 		goto out_put;
1460 
1461 	mm = get_task_mm(task);
1462 	put_task_struct(task);
1463 
1464 	if (!mm) {
1465 		err = -EINVAL;
1466 		goto out;
1467 	}
1468 
1469 	err = do_migrate_pages(mm, old, new,
1470 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1471 
1472 	mmput(mm);
1473 out:
1474 	NODEMASK_SCRATCH_FREE(scratch);
1475 
1476 	return err;
1477 
1478 out_put:
1479 	put_task_struct(task);
1480 	goto out;
1481 
1482 }
1483 
1484 
1485 /* Retrieve NUMA policy */
1486 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1487 		unsigned long __user *, nmask, unsigned long, maxnode,
1488 		unsigned long, addr, unsigned long, flags)
1489 {
1490 	int err;
1491 	int uninitialized_var(pval);
1492 	nodemask_t nodes;
1493 
1494 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1495 		return -EINVAL;
1496 
1497 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1498 
1499 	if (err)
1500 		return err;
1501 
1502 	if (policy && put_user(pval, policy))
1503 		return -EFAULT;
1504 
1505 	if (nmask)
1506 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1507 
1508 	return err;
1509 }
1510 
1511 #ifdef CONFIG_COMPAT
1512 
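/*
 * 32-bit compat wrappers: convert between the compat_ulong_t bitmap layout
 * and the native unsigned long layout via a temporary buffer obtained with
 * compat_alloc_user_space(), then call the native syscalls.
 */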
1513 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1514 				     compat_ulong_t __user *nmask,
1515 				     compat_ulong_t maxnode,
1516 				     compat_ulong_t addr, compat_ulong_t flags)
1517 {
1518 	long err;
1519 	unsigned long __user *nm = NULL;
1520 	unsigned long nr_bits, alloc_size;
1521 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1522 
1523 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1524 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1525 
1526 	if (nmask)
1527 		nm = compat_alloc_user_space(alloc_size);
1528 
1529 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1530 
1531 	if (!err && nmask) {
1532 		unsigned long copy_size;
1533 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1534 		err = copy_from_user(bm, nm, copy_size);
1535 		/* ensure entire bitmap is zeroed */
1536 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1537 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1538 	}
1539 
1540 	return err;
1541 }
1542 
1543 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1544 				     compat_ulong_t maxnode)
1545 {
1546 	long err = 0;
1547 	unsigned long __user *nm = NULL;
1548 	unsigned long nr_bits, alloc_size;
1549 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1550 
1551 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1552 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1553 
1554 	if (nmask) {
1555 		err = compat_get_bitmap(bm, nmask, nr_bits);
1556 		nm = compat_alloc_user_space(alloc_size);
1557 		err |= copy_to_user(nm, bm, alloc_size);
1558 	}
1559 
1560 	if (err)
1561 		return -EFAULT;
1562 
1563 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1564 }
1565 
1566 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1567 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1568 			     compat_ulong_t maxnode, compat_ulong_t flags)
1569 {
1570 	long err = 0;
1571 	unsigned long __user *nm = NULL;
1572 	unsigned long nr_bits, alloc_size;
1573 	nodemask_t bm;
1574 
1575 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1576 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1577 
1578 	if (nmask) {
1579 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1580 		nm = compat_alloc_user_space(alloc_size);
1581 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1582 	}
1583 
1584 	if (err)
1585 		return -EFAULT;
1586 
1587 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1588 }
1589 
1590 #endif
1591 
1592 /*
1593  * get_vma_policy(@task, @vma, @addr)
1594  * @task - task for fallback if vma policy == default
1595  * @vma   - virtual memory area whose policy is sought
1596  * @addr  - address in @vma for shared policy lookup
1597  *
1598  * Returns effective policy for a VMA at specified address.
1599  * Falls back to @task or system default policy, as necessary.
1600  * Current or other task's task mempolicy and non-shared vma policies must be
1601  * protected by task_lock(task) by the caller.
1602  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1603  * count--added by the get_policy() vm_op, as appropriate--to protect against
1604  * freeing by another task.  It is the caller's responsibility to free the
1605  * extra reference for shared policies.
1606  */
1607 struct mempolicy *get_vma_policy(struct task_struct *task,
1608 		struct vm_area_struct *vma, unsigned long addr)
1609 {
1610 	struct mempolicy *pol = get_task_policy(task);
1611 
1612 	if (vma) {
1613 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1614 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1615 									addr);
1616 			if (vpol)
1617 				pol = vpol;
1618 		} else if (vma->vm_policy) {
1619 			pol = vma->vm_policy;
1620 
1621 			/*
1622 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1623 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1624 			 * count on these policies which will be dropped by
1625 			 * mpol_cond_put() later
1626 			 */
1627 			if (mpol_needs_cond_ref(pol))
1628 				mpol_get(pol);
1629 		}
1630 	}
1631 	if (!pol)
1632 		pol = &default_policy;
1633 	return pol;
1634 }
1635 
1636 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1637 {
1638 	enum zone_type dynamic_policy_zone = policy_zone;
1639 
1640 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1641 
1642 	/*
1643 	 * If policy->v.nodes has movable memory only,
1644 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1645 	 *
1646 	 * policy->v.nodes has already been intersected with node_states[N_MEMORY],
1647 	 * so if the following test fails, it implies that
1648 	 * policy->v.nodes has movable memory only.
1649 	 */
1650 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1651 		dynamic_policy_zone = ZONE_MOVABLE;
1652 
1653 	return zone >= dynamic_policy_zone;
1654 }
1655 
1656 /*
1657  * Return a nodemask representing a mempolicy for filtering nodes for
1658  * page allocation
1659  */
1660 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1661 {
1662 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1663 	if (unlikely(policy->mode == MPOL_BIND) &&
1664 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1665 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1666 		return &policy->v.nodes;
1667 
1668 	return NULL;
1669 }
1670 
1671 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1672 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1673 	int nd)
1674 {
1675 	switch (policy->mode) {
1676 	case MPOL_PREFERRED:
1677 		if (!(policy->flags & MPOL_F_LOCAL))
1678 			nd = policy->v.preferred_node;
1679 		break;
1680 	case MPOL_BIND:
1681 		/*
1682 		 * Normally, MPOL_BIND allocations are node-local within the
1683 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1684 		 * current node isn't part of the mask, we use the zonelist for
1685 		 * the first node in the mask instead.
1686 		 */
1687 		if (unlikely(gfp & __GFP_THISNODE) &&
1688 				unlikely(!node_isset(nd, policy->v.nodes)))
1689 			nd = first_node(policy->v.nodes);
1690 		break;
1691 	default:
1692 		BUG();
1693 	}
1694 	return node_zonelist(nd, gfp);
1695 }
1696 
1697 /* Do dynamic interleaving for a process */
1698 static unsigned interleave_nodes(struct mempolicy *policy)
1699 {
1700 	unsigned nid, next;
1701 	struct task_struct *me = current;
1702 
1703 	nid = me->il_next;
1704 	next = next_node(nid, policy->v.nodes);
1705 	if (next >= MAX_NUMNODES)
1706 		next = first_node(policy->v.nodes);
1707 	if (next < MAX_NUMNODES)
1708 		me->il_next = next;
1709 	return nid;
1710 }
1711 
1712 /*
1713  * Depending on the memory policy provide a node from which to allocate the
1714  * next slab entry.
1715  * The policy must be protected from freeing by the caller.  If it is
1716  * the current task's mempolicy, this protection is implicit, as only the
1717  * task can change its policy.  The system default policy requires no
1718  * such protection.
1719  */
1720 unsigned slab_node(void)
1721 {
1722 	struct mempolicy *policy;
1723 
1724 	if (in_interrupt())
1725 		return numa_node_id();
1726 
1727 	policy = current->mempolicy;
1728 	if (!policy || policy->flags & MPOL_F_LOCAL)
1729 		return numa_node_id();
1730 
1731 	switch (policy->mode) {
1732 	case MPOL_PREFERRED:
1733 		/*
1734 		 * handled MPOL_F_LOCAL above
1735 		 */
1736 		return policy->v.preferred_node;
1737 
1738 	case MPOL_INTERLEAVE:
1739 		return interleave_nodes(policy);
1740 
1741 	case MPOL_BIND: {
1742 		/*
1743 		 * Follow bind policy behavior and start allocation at the
1744 		 * first node.
1745 		 */
1746 		struct zonelist *zonelist;
1747 		struct zone *zone;
1748 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1749 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1750 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1751 							&policy->v.nodes,
1752 							&zone);
1753 		return zone ? zone->node : numa_node_id();
1754 	}
1755 
1756 	default:
1757 		BUG();
1758 	}
1759 }
1760 
1761 /* Do static interleaving for a VMA with known offset. */
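/*
 * The node is chosen by taking @off modulo the number of nodes in the
 * policy and counting that many set bits into pol->v.nodes.  For example,
 * with pol->v.nodes = {0,2,5} and off = 7, nnodes = 3 and target = 1, so
 * node 2 is selected.
 */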
1762 static unsigned offset_il_node(struct mempolicy *pol,
1763 		struct vm_area_struct *vma, unsigned long off)
1764 {
1765 	unsigned nnodes = nodes_weight(pol->v.nodes);
1766 	unsigned target;
1767 	int c;
1768 	int nid = -1;
1769 
1770 	if (!nnodes)
1771 		return numa_node_id();
1772 	target = (unsigned int)off % nnodes;
1773 	c = 0;
1774 	do {
1775 		nid = next_node(nid, pol->v.nodes);
1776 		c++;
1777 	} while (c <= target);
1778 	return nid;
1779 }
1780 
1781 /* Determine a node number for interleave */
1782 static inline unsigned interleave_nid(struct mempolicy *pol,
1783 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1784 {
1785 	if (vma) {
1786 		unsigned long off;
1787 
1788 		/*
1789 		 * for small pages, there is no difference between
1790 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1791 		 * for huge pages, since vm_pgoff is in units of small
1792 		 * pages, we need to shift off the always 0 bits to get
1793 		 * a useful offset.
1794 		 */
1795 		BUG_ON(shift < PAGE_SHIFT);
1796 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1797 		off += (addr - vma->vm_start) >> shift;
1798 		return offset_il_node(pol, vma, off);
1799 	} else
1800 		return interleave_nodes(pol);
1801 }
1802 
1803 /*
1804  * Return the bit number of a random bit set in the nodemask.
1805  * (returns -1 if nodemask is empty)
1806  */
1807 int node_random(const nodemask_t *maskp)
1808 {
1809 	int w, bit = -1;
1810 
1811 	w = nodes_weight(*maskp);
1812 	if (w)
1813 		bit = bitmap_ord_to_pos(maskp->bits,
1814 			get_random_int() % w, MAX_NUMNODES);
1815 	return bit;
1816 }
1817 
1818 #ifdef CONFIG_HUGETLBFS
1819 /*
1820  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1821  * @vma = virtual memory area whose policy is sought
1822  * @addr = address in @vma for shared policy lookup and interleave policy
1823  * @gfp_flags = for requested zone
1824  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1825  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1826  *
1827  * Returns a zonelist suitable for a huge page allocation and a pointer
1828  * to the struct mempolicy for conditional unref after allocation.
1829  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1830  * @nodemask for filtering the zonelist.
1831  *
1832  * Must be protected by get_mems_allowed()
1833  */
1834 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1835 				gfp_t gfp_flags, struct mempolicy **mpol,
1836 				nodemask_t **nodemask)
1837 {
1838 	struct zonelist *zl;
1839 
1840 	*mpol = get_vma_policy(current, vma, addr);
1841 	*nodemask = NULL;	/* assume !MPOL_BIND */
1842 
1843 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1844 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1845 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1846 	} else {
1847 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1848 		if ((*mpol)->mode == MPOL_BIND)
1849 			*nodemask = &(*mpol)->v.nodes;
1850 	}
1851 	return zl;
1852 }
1853 
1854 /*
1855  * init_nodemask_of_mempolicy
1856  *
1857  * If the current task's mempolicy is "default" [NULL], return 'false'
1858  * to indicate default policy.  Otherwise, extract the policy nodemask
1859  * for 'bind' or 'interleave' policy into the argument nodemask, or
1860  * initialize the argument nodemask to contain the single node for
1861  * 'preferred' or 'local' policy and return 'true' to indicate presence
1862  * of non-default mempolicy.
1863  *
1864  * We don't bother with reference counting the mempolicy [mpol_get/put]
1865  * because the current task is examining its own mempolicy and a task's
1866  * mempolicy is only ever changed by the task itself.
1867  *
1868  * N.B., it is the caller's responsibility to free a returned nodemask.
1869  */
1870 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1871 {
1872 	struct mempolicy *mempolicy;
1873 	int nid;
1874 
1875 	if (!(mask && current->mempolicy))
1876 		return false;
1877 
1878 	task_lock(current);
1879 	mempolicy = current->mempolicy;
1880 	switch (mempolicy->mode) {
1881 	case MPOL_PREFERRED:
1882 		if (mempolicy->flags & MPOL_F_LOCAL)
1883 			nid = numa_node_id();
1884 		else
1885 			nid = mempolicy->v.preferred_node;
1886 		init_nodemask_of_node(mask, nid);
1887 		break;
1888 
1889 	case MPOL_BIND:
1890 		/* Fall through */
1891 	case MPOL_INTERLEAVE:
1892 		*mask = mempolicy->v.nodes;
1893 		break;
1894 
1895 	default:
1896 		BUG();
1897 	}
1898 	task_unlock(current);
1899 
1900 	return true;
1901 }
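
/*
 * For example, a task running under MPOL_INTERLEAVE over nodes 0-1 gets
 * *mask = {0,1}; under MPOL_PREFERRED with MPOL_F_LOCAL it gets just the
 * node it is currently running on.  The hugetlb code uses this so that
 * "nr_hugepages_mempolicy" style requests only touch the nodes allowed
 * by the caller's mempolicy.
 */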
1902 #endif
1903 
1904 /*
1905  * mempolicy_nodemask_intersects
1906  *
1907  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1908  * policy.  Otherwise, check for intersection between mask and the policy
1909  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1910  * policy, always return true since it may allocate elsewhere on fallback.
1911  *
1912  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1913  */
1914 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1915 					const nodemask_t *mask)
1916 {
1917 	struct mempolicy *mempolicy;
1918 	bool ret = true;
1919 
1920 	if (!mask)
1921 		return ret;
1922 	task_lock(tsk);
1923 	mempolicy = tsk->mempolicy;
1924 	if (!mempolicy)
1925 		goto out;
1926 
1927 	switch (mempolicy->mode) {
1928 	case MPOL_PREFERRED:
1929 		/*
1930 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1931 		 * to allocate from; the task may fall back to other nodes on OOM.
1932 		 * Thus, it's possible for tsk to have allocated memory from
1933 		 * nodes in mask.
1934 		 */
1935 		break;
1936 	case MPOL_BIND:
1937 	case MPOL_INTERLEAVE:
1938 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1939 		break;
1940 	default:
1941 		BUG();
1942 	}
1943 out:
1944 	task_unlock(tsk);
1945 	return ret;
1946 }
1947 
1948 /* Allocate a page in interleaved policy.
1949    Own path because it needs to do special accounting. */
1950 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1951 					unsigned nid)
1952 {
1953 	struct zonelist *zl;
1954 	struct page *page;
1955 
1956 	zl = node_zonelist(nid, gfp);
1957 	page = __alloc_pages(gfp, order, zl);
1958 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1959 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1960 	return page;
1961 }
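
/*
 * The NUMA_INTERLEAVE_HIT counter bumped here is what shows up as
 * "interleave_hit" in /sys/devices/system/node/nodeN/numastat: a hit is
 * only counted when the page was allocated from the first zone of the
 * requested node's zonelist, i.e. the interleave target was honoured.
 */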
1962 
1963 /**
1964  * 	alloc_pages_vma	- Allocate a page for a VMA.
1965  *
1966  * 	@gfp:
1967  *      %GFP_USER    user allocation.
1968  *      %GFP_KERNEL  kernel allocations,
1969  *      %GFP_HIGHMEM highmem/user allocations,
1970  *      %GFP_FS      allocation should not call back into a file system.
1971  *      %GFP_ATOMIC  don't sleep.
1972  *
1973  *	@order: Order of the GFP allocation.
1974  * 	@vma:  Pointer to VMA or NULL if not available.
1975  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1976  *
1977  * 	This function allocates a page from the kernel page pool and applies
1978  *	a NUMA policy associated with the VMA or the current process.
1979  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1980  *	mm_struct of the VMA to prevent it from going away. Should be used for
1981  *	all allocations for pages that will be mapped into
1982  * 	user space. Returns NULL when no page can be allocated.
1983  *
1984  *	Should be called with the mmap_sem of the vma held.
1985  */
1986 struct page *
1987 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1988 		unsigned long addr, int node)
1989 {
1990 	struct mempolicy *pol;
1991 	struct page *page;
1992 	unsigned int cpuset_mems_cookie;
1993 
1994 retry_cpuset:
1995 	pol = get_vma_policy(current, vma, addr);
1996 	cpuset_mems_cookie = get_mems_allowed();
1997 
1998 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1999 		unsigned nid;
2000 
2001 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2002 		mpol_cond_put(pol);
2003 		page = alloc_page_interleave(gfp, order, nid);
2004 		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2005 			goto retry_cpuset;
2006 
2007 		return page;
2008 	}
2009 	page = __alloc_pages_nodemask(gfp, order,
2010 				      policy_zonelist(gfp, pol, node),
2011 				      policy_nodemask(gfp, pol));
2012 	if (unlikely(mpol_needs_cond_ref(pol)))
2013 		__mpol_put(pol);
2014 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2015 		goto retry_cpuset;
2016 	return page;
2017 }
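
/*
 * Most callers reach this through the alloc_page_vma() wrapper in
 * <linux/gfp.h>, which passes order 0 and the local node, e.g. from the
 * anonymous fault path (simplified, illustrative sketch):
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */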
2018 
2019 /**
2020  * 	alloc_pages_current - Allocate pages.
2021  *
2022  *	@gfp:
2023  *		%GFP_USER   user allocation,
2024  *      	%GFP_KERNEL kernel allocation,
2025  *      	%GFP_HIGHMEM highmem allocation,
2026  *      	%GFP_FS     don't call back into a file system.
2027  *      	%GFP_ATOMIC don't sleep.
2028  *	@order: Power of two of allocation size in pages. 0 is a single page.
2029  *
2030  *	Allocate a page from the kernel page pool.  When not in
2031  *	interrupt context, apply the current process' NUMA policy.
2032  *	Returns NULL when no page can be allocated.
2033  *
2034  *	Don't call cpuset_update_task_memory_state() unless
2035  *	1) it's ok to take cpuset_sem (can WAIT), and
2036  *	2) allocating for current task (not interrupt).
2037  */
2038 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2039 {
2040 	struct mempolicy *pol = get_task_policy(current);
2041 	struct page *page;
2042 	unsigned int cpuset_mems_cookie;
2043 
2044 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2045 		pol = &default_policy;
2046 
2047 retry_cpuset:
2048 	cpuset_mems_cookie = get_mems_allowed();
2049 
2050 	/*
2051 	 * No reference counting needed for current->mempolicy
2052 	 * nor system default_policy
2053 	 */
2054 	if (pol->mode == MPOL_INTERLEAVE)
2055 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2056 	else
2057 		page = __alloc_pages_nodemask(gfp, order,
2058 				policy_zonelist(gfp, pol, numa_node_id()),
2059 				policy_nodemask(gfp, pol));
2060 
2061 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2062 		goto retry_cpuset;
2063 
2064 	return page;
2065 }
2066 EXPORT_SYMBOL(alloc_pages_current);
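
/*
 * On CONFIG_NUMA kernels the alloc_pages(gfp, order) helper in
 * <linux/gfp.h> expands to alloc_pages_current(), so an ordinary
 * allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 * transparently honours the calling task's mempolicy, falling back to
 * default_policy (local allocation) in interrupt context, for
 * __GFP_THISNODE requests, or when the task has no policy set.
 */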
2067 
2068 /*
2069  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2070  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2071  * with the mems_allowed returned by cpuset_mems_allowed().  This
2072  * keeps mempolicies cpuset relative after its cpuset moves.  See
2073  * further kernel/cpuset.c update_nodemask().
2074  *
2075  * current's mempolicy may be rebound by another task (the task that changes
2076  * the cpuset's mems), so we needn't do rebind work for the current task.
2077  */
2078 
2079 /* Slow path of a mempolicy duplicate */
2080 struct mempolicy *__mpol_dup(struct mempolicy *old)
2081 {
2082 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2083 
2084 	if (!new)
2085 		return ERR_PTR(-ENOMEM);
2086 
2087 	/* task's mempolicy is protected by alloc_lock */
2088 	if (old == current->mempolicy) {
2089 		task_lock(current);
2090 		*new = *old;
2091 		task_unlock(current);
2092 	} else
2093 		*new = *old;
2094 
2095 	rcu_read_lock();
2096 	if (current_cpuset_is_being_rebound()) {
2097 		nodemask_t mems = cpuset_mems_allowed(current);
2098 		if (new->flags & MPOL_F_REBINDING)
2099 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2100 		else
2101 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2102 	}
2103 	rcu_read_unlock();
2104 	atomic_set(&new->refcnt, 1);
2105 	return new;
2106 }
2107 
2108 /* Slow path of a mempolicy comparison */
2109 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2110 {
2111 	if (!a || !b)
2112 		return false;
2113 	if (a->mode != b->mode)
2114 		return false;
2115 	if (a->flags != b->flags)
2116 		return false;
2117 	if (mpol_store_user_nodemask(a))
2118 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2119 			return false;
2120 
2121 	switch (a->mode) {
2122 	case MPOL_BIND:
2123 		/* Fall through */
2124 	case MPOL_INTERLEAVE:
2125 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2126 	case MPOL_PREFERRED:
2127 		return a->v.preferred_node == b->v.preferred_node;
2128 	default:
2129 		BUG();
2130 		return false;
2131 	}
2132 }
2133 
2134 /*
2135  * Shared memory backing store policy support.
2136  *
2137  * Remember policies even when nobody has shared memory mapped.
2138  * The policies are kept in Red-Black tree linked from the inode.
2139  * They are protected by the sp->lock spinlock, which should be held
2140  * for any accesses to the tree.
2141  */
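
/*
 * Illustration: after an mbind(addr, len, MPOL_BIND, ...) on a mapping of
 * a tmpfs file, the offsets covered by [addr, addr+len) are recorded here
 * as sp_nodes keyed by file page offsets (pgoff), e.g.
 *
 *	[ 0, 16) -> MPOL_BIND {0}	[16, 64) -> MPOL_INTERLEAVE {0,1}
 *
 * so later faults on any mapping of the same object, by any task, find
 * the same policy for a given offset.
 */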
2142 
2143 /* lookup first element intersecting start-end */
2144 /* Caller holds sp->lock */
2145 static struct sp_node *
2146 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2147 {
2148 	struct rb_node *n = sp->root.rb_node;
2149 
2150 	while (n) {
2151 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2152 
2153 		if (start >= p->end)
2154 			n = n->rb_right;
2155 		else if (end <= p->start)
2156 			n = n->rb_left;
2157 		else
2158 			break;
2159 	}
2160 	if (!n)
2161 		return NULL;
2162 	for (;;) {
2163 		struct sp_node *w = NULL;
2164 		struct rb_node *prev = rb_prev(n);
2165 		if (!prev)
2166 			break;
2167 		w = rb_entry(prev, struct sp_node, nd);
2168 		if (w->end <= start)
2169 			break;
2170 		n = prev;
2171 	}
2172 	return rb_entry(n, struct sp_node, nd);
2173 }
2174 
2175 /* Insert a new shared policy into the list. */
2176 /* Caller holds sp->lock */
2177 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2178 {
2179 	struct rb_node **p = &sp->root.rb_node;
2180 	struct rb_node *parent = NULL;
2181 	struct sp_node *nd;
2182 
2183 	while (*p) {
2184 		parent = *p;
2185 		nd = rb_entry(parent, struct sp_node, nd);
2186 		if (new->start < nd->start)
2187 			p = &(*p)->rb_left;
2188 		else if (new->end > nd->end)
2189 			p = &(*p)->rb_right;
2190 		else
2191 			BUG();
2192 	}
2193 	rb_link_node(&new->nd, parent, p);
2194 	rb_insert_color(&new->nd, &sp->root);
2195 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2196 		 new->policy ? new->policy->mode : 0);
2197 }
2198 
2199 /* Find shared policy intersecting idx */
2200 struct mempolicy *
2201 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2202 {
2203 	struct mempolicy *pol = NULL;
2204 	struct sp_node *sn;
2205 
2206 	if (!sp->root.rb_node)
2207 		return NULL;
2208 	spin_lock(&sp->lock);
2209 	sn = sp_lookup(sp, idx, idx+1);
2210 	if (sn) {
2211 		mpol_get(sn->policy);
2212 		pol = sn->policy;
2213 	}
2214 	spin_unlock(&sp->lock);
2215 	return pol;
2216 }
2217 
2218 static void sp_free(struct sp_node *n)
2219 {
2220 	mpol_put(n->policy);
2221 	kmem_cache_free(sn_cache, n);
2222 }
2223 
2224 /**
2225  * mpol_misplaced - check whether current page node is valid in policy
2226  *
2227  * @page   - page to be checked
2228  * @vma    - vm area where page mapped
2229  * @addr   - virtual address where page mapped
2230  *
2231  * Look up the current policy node id for vma,addr and compare it to the
2232  * page's node id.
2233  *
2234  * Returns:
2235  *	-1	- not misplaced, page is in the right node
2236  *	node	- node id where the page should be
2237  *
2238  * Policy determination "mimics" alloc_page_vma().
2239  * Called from fault path where we know the vma and faulting address.
2240  */
2241 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2242 {
2243 	struct mempolicy *pol;
2244 	struct zone *zone;
2245 	int curnid = page_to_nid(page);
2246 	unsigned long pgoff;
2247 	int polnid = -1;
2248 	int ret = -1;
2249 
2250 	BUG_ON(!vma);
2251 
2252 	pol = get_vma_policy(current, vma, addr);
2253 	if (!(pol->flags & MPOL_F_MOF))
2254 		goto out;
2255 
2256 	switch (pol->mode) {
2257 	case MPOL_INTERLEAVE:
2258 		BUG_ON(addr >= vma->vm_end);
2259 		BUG_ON(addr < vma->vm_start);
2260 
2261 		pgoff = vma->vm_pgoff;
2262 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2263 		polnid = offset_il_node(pol, vma, pgoff);
2264 		break;
2265 
2266 	case MPOL_PREFERRED:
2267 		if (pol->flags & MPOL_F_LOCAL)
2268 			polnid = numa_node_id();
2269 		else
2270 			polnid = pol->v.preferred_node;
2271 		break;
2272 
2273 	case MPOL_BIND:
2274 		/*
2275 		 * allows binding to multiple nodes.
2276 		 * use current page if in policy nodemask,
2277 		 * else select nearest allowed node, if any.
2278 		 * If no allowed nodes, use current [!misplaced].
2279 		 */
2280 		if (node_isset(curnid, pol->v.nodes))
2281 			goto out;
2282 		(void)first_zones_zonelist(
2283 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2284 				gfp_zone(GFP_HIGHUSER),
2285 				&pol->v.nodes, &zone);
2286 		polnid = zone->node;
2287 		break;
2288 
2289 	default:
2290 		BUG();
2291 	}
2292 
2293 	/* Migrate the page towards the node whose CPU is referencing it */
2294 	if (pol->flags & MPOL_F_MORON) {
2295 		int last_nid;
2296 
2297 		polnid = numa_node_id();
2298 
2299 		/*
2300 		 * Multi-stage node selection is used in conjunction
2301 		 * with a periodic migration fault to build a temporal
2302 		 * task<->page relation. By using a two-stage filter we
2303 		 * remove short/unlikely relations.
2304 		 *
2305 		 * Using P(p) ~ n_p / n_t as per frequentist
2306 		 * probability, we can equate a task's usage of a
2307 		 * particular page (n_p) per total usage of this
2308 		 * page (n_t) (in a given time-span) to a probability.
2309 		 *
2310 		 * Our periodic faults will sample this probability and
2311 		 * getting the same result twice in a row, given these
2312 		 * samples are fully independent, is then given by
2313 		 * P(n)^2, provided our sample period is sufficiently
2314 		 * short compared to the usage pattern.
2315 		 *
2316 		 * This quadratic squishes small probabilities, making
2317 		 * it less likely we act on an unlikely task<->page
2318 		 * relation.
2319 		 */
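		/*
		 * Numeric illustration: if this task accounts for only 10%
		 * of the accesses to this page, the chance that two
		 * consecutive samples both land on it is ~1%, so a
		 * migration toward this node is rarely requested for such
		 * a weak task<->page relation.
		 */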
2320 		last_nid = page_nid_xchg_last(page, polnid);
2321 		if (last_nid != polnid)
2322 			goto out;
2323 	}
2324 
2325 	if (curnid != polnid)
2326 		ret = polnid;
2327 out:
2328 	mpol_cond_put(pol);
2329 
2330 	return ret;
2331 }
2332 
2333 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2334 {
2335 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2336 	rb_erase(&n->nd, &sp->root);
2337 	sp_free(n);
2338 }
2339 
2340 static void sp_node_init(struct sp_node *node, unsigned long start,
2341 			unsigned long end, struct mempolicy *pol)
2342 {
2343 	node->start = start;
2344 	node->end = end;
2345 	node->policy = pol;
2346 }
2347 
2348 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2349 				struct mempolicy *pol)
2350 {
2351 	struct sp_node *n;
2352 	struct mempolicy *newpol;
2353 
2354 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2355 	if (!n)
2356 		return NULL;
2357 
2358 	newpol = mpol_dup(pol);
2359 	if (IS_ERR(newpol)) {
2360 		kmem_cache_free(sn_cache, n);
2361 		return NULL;
2362 	}
2363 	newpol->flags |= MPOL_F_SHARED;
2364 	sp_node_init(n, start, end, newpol);
2365 
2366 	return n;
2367 }
2368 
2369 /* Replace a policy range. */
2370 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2371 				 unsigned long end, struct sp_node *new)
2372 {
2373 	struct sp_node *n;
2374 	struct sp_node *n_new = NULL;
2375 	struct mempolicy *mpol_new = NULL;
2376 	int ret = 0;
2377 
2378 restart:
2379 	spin_lock(&sp->lock);
2380 	n = sp_lookup(sp, start, end);
2381 	/* Take care of old policies in the same range. */
2382 	while (n && n->start < end) {
2383 		struct rb_node *next = rb_next(&n->nd);
2384 		if (n->start >= start) {
2385 			if (n->end <= end)
2386 				sp_delete(sp, n);
2387 			else
2388 				n->start = end;
2389 		} else {
2390 			/* Old policy spanning whole new range. */
2391 			if (n->end > end) {
2392 				if (!n_new)
2393 					goto alloc_new;
2394 
2395 				*mpol_new = *n->policy;
2396 				atomic_set(&mpol_new->refcnt, 1);
2397 				sp_node_init(n_new, end, n->end, mpol_new);
2398 				n->end = start;
2399 				sp_insert(sp, n_new);
2400 				n_new = NULL;
2401 				mpol_new = NULL;
2402 				break;
2403 			} else
2404 				n->end = start;
2405 		}
2406 		if (!next)
2407 			break;
2408 		n = rb_entry(next, struct sp_node, nd);
2409 	}
2410 	if (new)
2411 		sp_insert(sp, new);
2412 	spin_unlock(&sp->lock);
2413 	ret = 0;
2414 
2415 err_out:
2416 	if (mpol_new)
2417 		mpol_put(mpol_new);
2418 	if (n_new)
2419 		kmem_cache_free(sn_cache, n_new);
2420 
2421 	return ret;
2422 
2423 alloc_new:
2424 	spin_unlock(&sp->lock);
2425 	ret = -ENOMEM;
2426 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2427 	if (!n_new)
2428 		goto err_out;
2429 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2430 	if (!mpol_new)
2431 		goto err_out;
2432 	goto restart;
2433 }
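
/*
 * Illustration of the range surgery above: replacing [2,6) in a tree that
 * holds a single node [0,8):policyA with a new node [2,6):policyB leaves
 * three nodes behind:
 *
 *	[0,2):policyA	[2,6):policyB	[6,8):copy-of-policyA
 *
 * The tail copy is why a spare sp_node/mempolicy pair may have to be
 * allocated (outside sp->lock) before restarting the walk.
 */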
2434 
2435 /**
2436  * mpol_shared_policy_init - initialize shared policy for inode
2437  * @sp: pointer to inode shared policy
2438  * @mpol:  struct mempolicy to install
2439  *
2440  * Install non-NULL @mpol in inode's shared policy rb-tree.
2441  * On entry, the current task has a reference on a non-NULL @mpol.
2442  * This must be released on exit.
2443  * This is called at get_inode() time, so we can use GFP_KERNEL.
2444  */
2445 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2446 {
2447 	int ret;
2448 
2449 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2450 	spin_lock_init(&sp->lock);
2451 
2452 	if (mpol) {
2453 		struct vm_area_struct pvma;
2454 		struct mempolicy *new;
2455 		NODEMASK_SCRATCH(scratch);
2456 
2457 		if (!scratch)
2458 			goto put_mpol;
2459 		/* contextualize the tmpfs mount point mempolicy */
2460 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2461 		if (IS_ERR(new))
2462 			goto free_scratch; /* no valid nodemask intersection */
2463 
2464 		task_lock(current);
2465 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2466 		task_unlock(current);
2467 		if (ret)
2468 			goto put_new;
2469 
2470 		/* Create pseudo-vma that contains just the policy */
2471 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2472 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2473 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2474 
2475 put_new:
2476 		mpol_put(new);			/* drop initial ref */
2477 free_scratch:
2478 		NODEMASK_SCRATCH_FREE(scratch);
2479 put_mpol:
2480 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2481 	}
2482 }
2483 
2484 int mpol_set_shared_policy(struct shared_policy *info,
2485 			struct vm_area_struct *vma, struct mempolicy *npol)
2486 {
2487 	int err;
2488 	struct sp_node *new = NULL;
2489 	unsigned long sz = vma_pages(vma);
2490 
2491 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2492 		 vma->vm_pgoff,
2493 		 sz, npol ? npol->mode : -1,
2494 		 npol ? npol->flags : -1,
2495 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2496 
2497 	if (npol) {
2498 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2499 		if (!new)
2500 			return -ENOMEM;
2501 	}
2502 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2503 	if (err && new)
2504 		sp_free(new);
2505 	return err;
2506 }
2507 
2508 /* Free a backing policy store on inode delete. */
2509 void mpol_free_shared_policy(struct shared_policy *p)
2510 {
2511 	struct sp_node *n;
2512 	struct rb_node *next;
2513 
2514 	if (!p->root.rb_node)
2515 		return;
2516 	spin_lock(&p->lock);
2517 	next = rb_first(&p->root);
2518 	while (next) {
2519 		n = rb_entry(next, struct sp_node, nd);
2520 		next = rb_next(&n->nd);
2521 		sp_delete(p, n);
2522 	}
2523 	spin_unlock(&p->lock);
2524 }
2525 
2526 #ifdef CONFIG_NUMA_BALANCING
2527 static bool __initdata numabalancing_override;
2528 
2529 static void __init check_numabalancing_enable(void)
2530 {
2531 	bool numabalancing_default = false;
2532 
2533 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2534 		numabalancing_default = true;
2535 
2536 	if (nr_node_ids > 1 && !numabalancing_override) {
2537 		printk(KERN_INFO "Enabling automatic NUMA balancing. "
2538 			"Configure with numa_balancing= or the kernel.numa_balancing sysctl\n");
2539 		set_numabalancing_state(numabalancing_default);
2540 	}
2541 }
2542 
2543 static int __init setup_numabalancing(char *str)
2544 {
2545 	int ret = 0;
2546 	if (!str)
2547 		goto out;
2548 	numabalancing_override = true;
2549 
2550 	if (!strcmp(str, "enable")) {
2551 		set_numabalancing_state(true);
2552 		ret = 1;
2553 	} else if (!strcmp(str, "disable")) {
2554 		set_numabalancing_state(false);
2555 		ret = 1;
2556 	}
2557 out:
2558 	if (!ret)
2559 		printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2560 
2561 	return ret;
2562 }
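
/*
 * Example: booting with "numa_balancing=disable" on the kernel command
 * line forces the feature off even when
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is set; "numa_balancing=enable"
 * does the opposite on machines where it would otherwise stay off.
 */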
2563 __setup("numa_balancing=", setup_numabalancing);
2564 #else
2565 static inline void __init check_numabalancing_enable(void)
2566 {
2567 }
2568 #endif /* CONFIG_NUMA_BALANCING */
2569 
2570 /* assumes fs == KERNEL_DS */
2571 void __init numa_policy_init(void)
2572 {
2573 	nodemask_t interleave_nodes;
2574 	unsigned long largest = 0;
2575 	int nid, prefer = 0;
2576 
2577 	policy_cache = kmem_cache_create("numa_policy",
2578 					 sizeof(struct mempolicy),
2579 					 0, SLAB_PANIC, NULL);
2580 
2581 	sn_cache = kmem_cache_create("shared_policy_node",
2582 				     sizeof(struct sp_node),
2583 				     0, SLAB_PANIC, NULL);
2584 
2585 	for_each_node(nid) {
2586 		preferred_node_policy[nid] = (struct mempolicy) {
2587 			.refcnt = ATOMIC_INIT(1),
2588 			.mode = MPOL_PREFERRED,
2589 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2590 			.v = { .preferred_node = nid, },
2591 		};
2592 	}
2593 
2594 	/*
2595 	 * Set interleaving policy for system init. Interleaving is only
2596 	 * enabled across suitably sized nodes (default is >= 16MB), or
2597 	 * fall back to the largest node if they're all smaller.
2598 	 */
2599 	nodes_clear(interleave_nodes);
2600 	for_each_node_state(nid, N_MEMORY) {
2601 		unsigned long total_pages = node_present_pages(nid);
2602 
2603 		/* Preserve the largest node */
2604 		if (largest < total_pages) {
2605 			largest = total_pages;
2606 			prefer = nid;
2607 		}
2608 
2609 		/* Interleave this node? */
2610 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2611 			node_set(nid, interleave_nodes);
2612 	}
2613 
2614 	/* All too small, use the largest */
2615 	if (unlikely(nodes_empty(interleave_nodes)))
2616 		node_set(prefer, interleave_nodes);
2617 
2618 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2619 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2620 
2621 	check_numabalancing_enable();
2622 }
2623 
2624 /* Reset policy of current process to default */
2625 void numa_default_policy(void)
2626 {
2627 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2628 }
2629 
2630 /*
2631  * Parse and format mempolicy from/to strings
2632  */
2633 
2634 /*
2635  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2636  */
2637 static const char * const policy_modes[] =
2638 {
2639 	[MPOL_DEFAULT]    = "default",
2640 	[MPOL_PREFERRED]  = "prefer",
2641 	[MPOL_BIND]       = "bind",
2642 	[MPOL_INTERLEAVE] = "interleave",
2643 	[MPOL_LOCAL]      = "local",
2644 };
2645 
2646 
2647 #ifdef CONFIG_TMPFS
2648 /**
2649  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2650  * @str:  string containing mempolicy to parse
2651  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2652  *
2653  * Format of input:
2654  *	<mode>[=<flags>][:<nodelist>]
2655  *
2656  * On success, returns 0, else 1
2657  */
2658 int mpol_parse_str(char *str, struct mempolicy **mpol)
2659 {
2660 	struct mempolicy *new = NULL;
2661 	unsigned short mode;
2662 	unsigned short mode_flags;
2663 	nodemask_t nodes;
2664 	char *nodelist = strchr(str, ':');
2665 	char *flags = strchr(str, '=');
2666 	int err = 1;
2667 
2668 	if (nodelist) {
2669 		/* NUL-terminate mode or flags string */
2670 		*nodelist++ = '\0';
2671 		if (nodelist_parse(nodelist, nodes))
2672 			goto out;
2673 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2674 			goto out;
2675 	} else
2676 		nodes_clear(nodes);
2677 
2678 	if (flags)
2679 		*flags++ = '\0';	/* terminate mode string */
2680 
2681 	for (mode = 0; mode < MPOL_MAX; mode++) {
2682 		if (!strcmp(str, policy_modes[mode])) {
2683 			break;
2684 		}
2685 	}
2686 	if (mode >= MPOL_MAX)
2687 		goto out;
2688 
2689 	switch (mode) {
2690 	case MPOL_PREFERRED:
2691 		/*
2692 		 * Insist on a nodelist of one node only
2693 		 */
2694 		if (nodelist) {
2695 			char *rest = nodelist;
2696 			while (isdigit(*rest))
2697 				rest++;
2698 			if (*rest)
2699 				goto out;
2700 		}
2701 		break;
2702 	case MPOL_INTERLEAVE:
2703 		/*
2704 		 * Default to online nodes with memory if no nodelist
2705 		 */
2706 		if (!nodelist)
2707 			nodes = node_states[N_MEMORY];
2708 		break;
2709 	case MPOL_LOCAL:
2710 		/*
2711 		 * Don't allow a nodelist;  mpol_new() checks flags
2712 		 */
2713 		if (nodelist)
2714 			goto out;
2715 		mode = MPOL_PREFERRED;
2716 		break;
2717 	case MPOL_DEFAULT:
2718 		/*
2719 		 * Insist on an empty nodelist
2720 		 */
2721 		if (!nodelist)
2722 			err = 0;
2723 		goto out;
2724 	case MPOL_BIND:
2725 		/*
2726 		 * Insist on a nodelist
2727 		 */
2728 		if (!nodelist)
2729 			goto out;
2730 	}
2731 
2732 	mode_flags = 0;
2733 	if (flags) {
2734 		/*
2735 		 * Currently, we only support two mutually exclusive
2736 		 * mode flags.
2737 		 */
2738 		if (!strcmp(flags, "static"))
2739 			mode_flags |= MPOL_F_STATIC_NODES;
2740 		else if (!strcmp(flags, "relative"))
2741 			mode_flags |= MPOL_F_RELATIVE_NODES;
2742 		else
2743 			goto out;
2744 	}
2745 
2746 	new = mpol_new(mode, mode_flags, &nodes);
2747 	if (IS_ERR(new))
2748 		goto out;
2749 
2750 	/*
2751 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2752 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2753 	 */
2754 	if (mode != MPOL_PREFERRED)
2755 		new->v.nodes = nodes;
2756 	else if (nodelist)
2757 		new->v.preferred_node = first_node(nodes);
2758 	else
2759 		new->flags |= MPOL_F_LOCAL;
2760 
2761 	/*
2762 	 * Save nodes for contextualization: this will be used to "clone"
2763 	 * the mempolicy in a specific context [cpuset] at a later time.
2764 	 */
2765 	new->w.user_nodemask = nodes;
2766 
2767 	err = 0;
2768 
2769 out:
2770 	/* Restore string for error message */
2771 	if (nodelist)
2772 		*--nodelist = ':';
2773 	if (flags)
2774 		*--flags = '=';
2775 	if (!err)
2776 		*mpol = new;
2777 	return err;
2778 }
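
/*
 * Example inputs, as seen in tmpfs "mpol=" mount options (illustrative):
 *
 *	mpol=interleave:0-3		spread pages over nodes 0-3
 *	mpol=bind=static:0,2		bind to nodes 0 and 2, static flag
 *	mpol=prefer:1			prefer node 1
 *	mpol=local			allocate on the faulting node
 *
 * i.e. "mount -t tmpfs -o size=1g,mpol=interleave:0-3 tmpfs /mnt" would
 * reach this parser with str == "interleave:0-3".
 */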
2779 #endif /* CONFIG_TMPFS */
2780 
2781 /**
2782  * mpol_to_str - format a mempolicy structure for printing
2783  * @buffer:  to contain formatted mempolicy string
2784  * @maxlen:  length of @buffer
2785  * @pol:  pointer to mempolicy to be formatted
2786  *
2787  * Convert a mempolicy into a string.
2788  * Returns the number of characters in buffer (if positive)
2789  * or an error (negative)
2790  */
2791 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2792 {
2793 	char *p = buffer;
2794 	int l;
2795 	nodemask_t nodes;
2796 	unsigned short mode;
2797 	unsigned short flags = pol ? pol->flags : 0;
2798 
2799 	/*
2800 	 * Sanity check:  room for longest mode, flag and some nodes
2801 	 */
2802 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2803 
2804 	if (!pol || pol == &default_policy)
2805 		mode = MPOL_DEFAULT;
2806 	else
2807 		mode = pol->mode;
2808 
2809 	switch (mode) {
2810 	case MPOL_DEFAULT:
2811 		nodes_clear(nodes);
2812 		break;
2813 
2814 	case MPOL_PREFERRED:
2815 		nodes_clear(nodes);
2816 		if (flags & MPOL_F_LOCAL)
2817 			mode = MPOL_LOCAL;
2818 		else
2819 			node_set(pol->v.preferred_node, nodes);
2820 		break;
2821 
2822 	case MPOL_BIND:
2823 		/* Fall through */
2824 	case MPOL_INTERLEAVE:
2825 		nodes = pol->v.nodes;
2826 		break;
2827 
2828 	default:
2829 		return -EINVAL;
2830 	}
2831 
2832 	l = strlen(policy_modes[mode]);
2833 	if (buffer + maxlen < p + l + 1)
2834 		return -ENOSPC;
2835 
2836 	strcpy(p, policy_modes[mode]);
2837 	p += l;
2838 
2839 	if (flags & MPOL_MODE_FLAGS) {
2840 		if (buffer + maxlen < p + 2)
2841 			return -ENOSPC;
2842 		*p++ = '=';
2843 
2844 		/*
2845 		 * Currently, the only defined flags are mutually exclusive
2846 		 */
2847 		if (flags & MPOL_F_STATIC_NODES)
2848 			p += snprintf(p, buffer + maxlen - p, "static");
2849 		else if (flags & MPOL_F_RELATIVE_NODES)
2850 			p += snprintf(p, buffer + maxlen - p, "relative");
2851 	}
2852 
2853 	if (!nodes_empty(nodes)) {
2854 		if (buffer + maxlen < p + 2)
2855 			return -ENOSPC;
2856 		*p++ = ':';
2857 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2858 	}
2859 	return p - buffer;
2860 }
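
/*
 * Example outputs (mirroring the parse format above): "default",
 * "prefer:1", "local", "bind=static:0,2", "interleave:0-3".  This is
 * what shows up as the "mpol=" option in /proc/mounts for tmpfs and in
 * /proc/<pid>/numa_maps for per-VMA policies.
 */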
2861