xref: /linux/mm/mempolicy.c (revision 1ccd4b7bfdcfcc8cc7ffc4a9c11d3ac5b6da8ca0)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Supports four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and proceeding to the last. It would be better if bind truly
26  *                restricted the allocation to the given memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case, node -1 here means do the allocation
30  *                on the node local to the current CPU. This is normally
31  *                identical to default, but useful to set in a VMA when you
32  *                have a non-default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
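
/*
 * Illustrative userspace usage (a sketch, not part of this file): the
 * policies above are normally installed with the set_mempolicy(2) and
 * mbind(2) syscalls.  Node numbers and lengths are example values only.
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	  nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 64);	  process policy
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &nodes, 64, MPOL_MF_STRICT);  VMA policy
 */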
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/module.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 
94 #include <asm/tlbflush.h>
95 #include <asm/uaccess.h>
96 #include <linux/random.h>
97 
98 #include "internal.h"
99 
100 /* Internal flags */
101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
103 
104 static struct kmem_cache *policy_cache;
105 static struct kmem_cache *sn_cache;
106 
107 /* Highest zone. A specific allocation for a zone below that is not
108    policied. */
109 enum zone_type policy_zone = 0;
110 
111 /*
112  * run-time system-wide default policy => local allocation
113  */
114 struct mempolicy default_policy = {
115 	.refcnt = ATOMIC_INIT(1), /* never free it */
116 	.mode = MPOL_PREFERRED,
117 	.flags = MPOL_F_LOCAL,
118 };
119 
120 static const struct mempolicy_operations {
121 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 	/*
123 	 * If the read-side task has no lock to protect task->mempolicy, the
124 	 * write-side task will rebind task->mempolicy in two steps. The first
125 	 * step sets all the newly allowed nodes, and the second step clears
126 	 * all the disallowed nodes. This way we avoid ever being left with no
127 	 * node to allocate pages from.
128 	 * If we have a lock to protect task->mempolicy on the read side, we
129 	 * rebind directly.
130 	 *
131 	 * step:
132 	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
133 	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
134 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
135 	 */
136 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137 			enum mpol_rebind_step step);
138 } mpol_ops[MPOL_MAX];
139 
140 /* Check that the nodemask contains at least one populated zone */
141 static int is_valid_nodemask(const nodemask_t *nodemask)
142 {
143 	int nd, k;
144 
145 	for_each_node_mask(nd, *nodemask) {
146 		struct zone *z;
147 
148 		for (k = 0; k <= policy_zone; k++) {
149 			z = &NODE_DATA(nd)->node_zones[k];
150 			if (z->present_pages > 0)
151 				return 1;
152 		}
153 	}
154 
155 	return 0;
156 }
157 
158 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
159 {
160 	return pol->flags & MPOL_MODE_FLAGS;
161 }
162 
163 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
164 				   const nodemask_t *rel)
165 {
166 	nodemask_t tmp;
167 	nodes_fold(tmp, *orig, nodes_weight(*rel));
168 	nodes_onto(*ret, tmp, *rel);
169 }
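
/*
 * Example (illustrative): with MPOL_F_RELATIVE_NODES the user's mask is
 * interpreted relative to the set of allowed nodes.  If *rel is {4,5}
 * (weight 2) and *orig is {0,1}, nodes_fold() folds *orig modulo 2 and
 * nodes_onto() maps bit 0 onto node 4 and bit 1 onto node 5, so *ret
 * becomes {4,5}.
 */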
170 
171 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
172 {
173 	if (nodes_empty(*nodes))
174 		return -EINVAL;
175 	pol->v.nodes = *nodes;
176 	return 0;
177 }
178 
179 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
180 {
181 	if (!nodes)
182 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
183 	else if (nodes_empty(*nodes))
184 		return -EINVAL;			/*  no allowed nodes */
185 	else
186 		pol->v.preferred_node = first_node(*nodes);
187 	return 0;
188 }
189 
190 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191 {
192 	if (!is_valid_nodemask(nodes))
193 		return -EINVAL;
194 	pol->v.nodes = *nodes;
195 	return 0;
196 }
197 
198 /*
199  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
200  * any, for the new policy.  mpol_new() has already validated the nodes
201  * parameter with respect to the policy mode and flags.  But, we need to
202  * handle an empty nodemask with MPOL_PREFERRED here.
203  *
204  * Must be called holding task's alloc_lock to protect task's mems_allowed
205  * and mempolicy.  May also be called holding mmap_sem for write.
206  */
207 static int mpol_set_nodemask(struct mempolicy *pol,
208 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
209 {
210 	int ret;
211 
212 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 	if (pol == NULL)
214 		return 0;
215 	/* Check N_HIGH_MEMORY */
216 	nodes_and(nsc->mask1,
217 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
218 
219 	VM_BUG_ON(!nodes);
220 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
221 		nodes = NULL;	/* explicit local allocation */
222 	else {
223 		if (pol->flags & MPOL_F_RELATIVE_NODES)
224 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
225 		else
226 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
227 
228 		if (mpol_store_user_nodemask(pol))
229 			pol->w.user_nodemask = *nodes;
230 		else
231 			pol->w.cpuset_mems_allowed =
232 						cpuset_current_mems_allowed;
233 	}
234 
235 	if (nodes)
236 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
237 	else
238 		ret = mpol_ops[pol->mode].create(pol, NULL);
239 	return ret;
240 }
241 
242 /*
243  * This function just creates a new policy, does some checks and simple
244  * initialization. You must invoke mpol_set_nodemask() to set nodes.
245  */
246 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
247 				  nodemask_t *nodes)
248 {
249 	struct mempolicy *policy;
250 
251 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
252 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
253 
254 	if (mode == MPOL_DEFAULT) {
255 		if (nodes && !nodes_empty(*nodes))
256 			return ERR_PTR(-EINVAL);
257 		return NULL;	/* simply delete any existing policy */
258 	}
259 	VM_BUG_ON(!nodes);
260 
261 	/*
262 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
263 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
264 	 * All other modes require a valid pointer to a non-empty nodemask.
265 	 */
266 	if (mode == MPOL_PREFERRED) {
267 		if (nodes_empty(*nodes)) {
268 			if (((flags & MPOL_F_STATIC_NODES) ||
269 			     (flags & MPOL_F_RELATIVE_NODES)))
270 				return ERR_PTR(-EINVAL);
271 		}
272 	} else if (nodes_empty(*nodes))
273 		return ERR_PTR(-EINVAL);
274 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
275 	if (!policy)
276 		return ERR_PTR(-ENOMEM);
277 	atomic_set(&policy->refcnt, 1);
278 	policy->mode = mode;
279 	policy->flags = flags;
280 
281 	return policy;
282 }
283 
284 /* Slow path of a mpol destructor. */
285 void __mpol_put(struct mempolicy *p)
286 {
287 	if (!atomic_dec_and_test(&p->refcnt))
288 		return;
289 	kmem_cache_free(policy_cache, p);
290 }
291 
292 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293 				enum mpol_rebind_step step)
294 {
295 }
296 
297 /*
298  * step:
299  * 	MPOL_REBIND_ONCE  - do rebind work at once
300  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
301  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
302  */
303 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304 				 enum mpol_rebind_step step)
305 {
306 	nodemask_t tmp;
307 
308 	if (pol->flags & MPOL_F_STATIC_NODES)
309 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
310 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
311 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
312 	else {
313 		/*
314 		 * for MPOL_REBIND_STEP1, use ->w.cpuset_mems_allowed to cache
315 		 * the intermediate result so that MPOL_REBIND_STEP2 can use it
316 		 */
317 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318 			nodes_remap(tmp, pol->v.nodes,
319 					pol->w.cpuset_mems_allowed, *nodes);
320 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321 		} else if (step == MPOL_REBIND_STEP2) {
322 			tmp = pol->w.cpuset_mems_allowed;
323 			pol->w.cpuset_mems_allowed = *nodes;
324 		} else
325 			BUG();
326 	}
327 
328 	if (nodes_empty(tmp))
329 		tmp = *nodes;
330 
331 	if (step == MPOL_REBIND_STEP1)
332 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334 		pol->v.nodes = tmp;
335 	else
336 		BUG();
337 
338 	if (!node_isset(current->il_next, tmp)) {
339 		current->il_next = next_node(current->il_next, tmp);
340 		if (current->il_next >= MAX_NUMNODES)
341 			current->il_next = first_node(tmp);
342 		if (current->il_next >= MAX_NUMNODES)
343 			current->il_next = numa_node_id();
344 	}
345 }
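
/*
 * Worked example (illustrative): an MPOL_INTERLEAVE policy over {0,1}
 * (no static/relative flags) being rebound to new cpuset mems {2,3}.
 * STEP1 remaps {0,1} -> {2,3} and ORs it in, leaving v.nodes = {0,1,2,3}
 * and caching {2,3} in w.cpuset_mems_allowed; STEP2 then installs the
 * cached {2,3}.  The mask is never empty in between, so concurrent
 * lock-free readers always find a node to allocate from.
 */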
346 
347 static void mpol_rebind_preferred(struct mempolicy *pol,
348 				  const nodemask_t *nodes,
349 				  enum mpol_rebind_step step)
350 {
351 	nodemask_t tmp;
352 
353 	if (pol->flags & MPOL_F_STATIC_NODES) {
354 		int node = first_node(pol->w.user_nodemask);
355 
356 		if (node_isset(node, *nodes)) {
357 			pol->v.preferred_node = node;
358 			pol->flags &= ~MPOL_F_LOCAL;
359 		} else
360 			pol->flags |= MPOL_F_LOCAL;
361 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
362 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
363 		pol->v.preferred_node = first_node(tmp);
364 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
365 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
366 						   pol->w.cpuset_mems_allowed,
367 						   *nodes);
368 		pol->w.cpuset_mems_allowed = *nodes;
369 	}
370 }
371 
372 /*
373  * mpol_rebind_policy - Migrate a policy to a different set of nodes
374  *
375  * If the read-side task has no lock to protect task->mempolicy, the
376  * write-side task will rebind task->mempolicy in two steps. The first
377  * step sets all the newly allowed nodes, and the second step clears all
378  * the disallowed nodes. This way we avoid ever being left with no node
379  * to allocate pages from.
380  * If we have a lock to protect task->mempolicy on the read side, we
381  * rebind directly.
382  *
383  * step:
384  * 	MPOL_REBIND_ONCE  - do the rebind work at once
385  * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
386  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
387  */
388 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389 				enum mpol_rebind_step step)
390 {
391 	if (!pol)
392 		return;
393 	if (!mpol_store_user_nodemask(pol) && step == 0 &&
394 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 		return;
396 
397 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398 		return;
399 
400 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401 		BUG();
402 
403 	if (step == MPOL_REBIND_STEP1)
404 		pol->flags |= MPOL_F_REBINDING;
405 	else if (step == MPOL_REBIND_STEP2)
406 		pol->flags &= ~MPOL_F_REBINDING;
407 	else if (step >= MPOL_REBIND_NSTEP)
408 		BUG();
409 
410 	mpol_ops[pol->mode].rebind(pol, newmask, step);
411 }
412 
413 /*
414  * Wrapper for mpol_rebind_policy() that just requires the task
415  * pointer, and updates the task's mempolicy.
416  *
417  * Called with task's alloc_lock held.
418  */
419 
420 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 			enum mpol_rebind_step step)
422 {
423 	mpol_rebind_policy(tsk->mempolicy, new, step);
424 }
425 
426 /*
427  * Rebind each vma in mm to new nodemask.
428  *
429  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
430  */
431 
432 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
433 {
434 	struct vm_area_struct *vma;
435 
436 	down_write(&mm->mmap_sem);
437 	for (vma = mm->mmap; vma; vma = vma->vm_next)
438 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
439 	up_write(&mm->mmap_sem);
440 }
441 
442 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
443 	[MPOL_DEFAULT] = {
444 		.rebind = mpol_rebind_default,
445 	},
446 	[MPOL_INTERLEAVE] = {
447 		.create = mpol_new_interleave,
448 		.rebind = mpol_rebind_nodemask,
449 	},
450 	[MPOL_PREFERRED] = {
451 		.create = mpol_new_preferred,
452 		.rebind = mpol_rebind_preferred,
453 	},
454 	[MPOL_BIND] = {
455 		.create = mpol_new_bind,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 };
459 
460 static void migrate_page_add(struct page *page, struct list_head *pagelist,
461 				unsigned long flags);
462 
463 /* Scan the pte range, checking whether pages satisfy the given node constraints. */
464 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
465 		unsigned long addr, unsigned long end,
466 		const nodemask_t *nodes, unsigned long flags,
467 		void *private)
468 {
469 	pte_t *orig_pte;
470 	pte_t *pte;
471 	spinlock_t *ptl;
472 
473 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
474 	do {
475 		struct page *page;
476 		int nid;
477 
478 		if (!pte_present(*pte))
479 			continue;
480 		page = vm_normal_page(vma, addr, *pte);
481 		if (!page)
482 			continue;
483 		/*
484 		 * vm_normal_page() filters out zero pages, but there might
485 		 * still be PageReserved pages to skip, perhaps in a VDSO.
486 		 * And we cannot move PageKsm pages sensibly or safely yet.
487 		 */
488 		if (PageReserved(page) || PageKsm(page))
489 			continue;
490 		nid = page_to_nid(page);
491 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
492 			continue;
493 
494 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
495 			migrate_page_add(page, private, flags);
496 		else
497 			break;
498 	} while (pte++, addr += PAGE_SIZE, addr != end);
499 	pte_unmap_unlock(orig_pte, ptl);
500 	return addr != end;
501 }
502 
503 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
504 		unsigned long addr, unsigned long end,
505 		const nodemask_t *nodes, unsigned long flags,
506 		void *private)
507 {
508 	pmd_t *pmd;
509 	unsigned long next;
510 
511 	pmd = pmd_offset(pud, addr);
512 	do {
513 		next = pmd_addr_end(addr, end);
514 		split_huge_page_pmd(vma->vm_mm, pmd);
515 		if (pmd_none_or_clear_bad(pmd))
516 			continue;
517 		if (check_pte_range(vma, pmd, addr, next, nodes,
518 				    flags, private))
519 			return -EIO;
520 	} while (pmd++, addr = next, addr != end);
521 	return 0;
522 }
523 
524 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
525 		unsigned long addr, unsigned long end,
526 		const nodemask_t *nodes, unsigned long flags,
527 		void *private)
528 {
529 	pud_t *pud;
530 	unsigned long next;
531 
532 	pud = pud_offset(pgd, addr);
533 	do {
534 		next = pud_addr_end(addr, end);
535 		if (pud_none_or_clear_bad(pud))
536 			continue;
537 		if (check_pmd_range(vma, pud, addr, next, nodes,
538 				    flags, private))
539 			return -EIO;
540 	} while (pud++, addr = next, addr != end);
541 	return 0;
542 }
543 
544 static inline int check_pgd_range(struct vm_area_struct *vma,
545 		unsigned long addr, unsigned long end,
546 		const nodemask_t *nodes, unsigned long flags,
547 		void *private)
548 {
549 	pgd_t *pgd;
550 	unsigned long next;
551 
552 	pgd = pgd_offset(vma->vm_mm, addr);
553 	do {
554 		next = pgd_addr_end(addr, end);
555 		if (pgd_none_or_clear_bad(pgd))
556 			continue;
557 		if (check_pud_range(vma, pgd, addr, next, nodes,
558 				    flags, private))
559 			return -EIO;
560 	} while (pgd++, addr = next, addr != end);
561 	return 0;
562 }
563 
564 /*
565  * Check if all pages in a range are on a set of nodes.
566  * If pagelist != NULL then isolate pages from the LRU and
567  * put them on the pagelist.
568  */
569 static struct vm_area_struct *
570 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
571 		const nodemask_t *nodes, unsigned long flags, void *private)
572 {
573 	int err;
574 	struct vm_area_struct *first, *vma, *prev;
575 
576 
577 	first = find_vma(mm, start);
578 	if (!first)
579 		return ERR_PTR(-EFAULT);
580 	prev = NULL;
581 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
582 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
583 			if (!vma->vm_next && vma->vm_end < end)
584 				return ERR_PTR(-EFAULT);
585 			if (prev && prev->vm_end < vma->vm_start)
586 				return ERR_PTR(-EFAULT);
587 		}
588 		if (!is_vm_hugetlb_page(vma) &&
589 		    ((flags & MPOL_MF_STRICT) ||
590 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
591 				vma_migratable(vma)))) {
592 			unsigned long endvma = vma->vm_end;
593 
594 			if (endvma > end)
595 				endvma = end;
596 			if (vma->vm_start > start)
597 				start = vma->vm_start;
598 			err = check_pgd_range(vma, start, endvma, nodes,
599 						flags, private);
600 			if (err) {
601 				first = ERR_PTR(err);
602 				break;
603 			}
604 		}
605 		prev = vma;
606 	}
607 	return first;
608 }
609 
610 /* Apply policy to a single VMA */
611 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
612 {
613 	int err = 0;
614 	struct mempolicy *old = vma->vm_policy;
615 
616 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
617 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
618 		 vma->vm_ops, vma->vm_file,
619 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
620 
621 	if (vma->vm_ops && vma->vm_ops->set_policy)
622 		err = vma->vm_ops->set_policy(vma, new);
623 	if (!err) {
624 		mpol_get(new);
625 		vma->vm_policy = new;
626 		mpol_put(old);
627 	}
628 	return err;
629 }
630 
631 /* Step 2: apply policy to a range and do splits. */
632 static int mbind_range(struct mm_struct *mm, unsigned long start,
633 		       unsigned long end, struct mempolicy *new_pol)
634 {
635 	struct vm_area_struct *next;
636 	struct vm_area_struct *prev;
637 	struct vm_area_struct *vma;
638 	int err = 0;
639 	pgoff_t pgoff;
640 	unsigned long vmstart;
641 	unsigned long vmend;
642 
643 	vma = find_vma_prev(mm, start, &prev);
644 	if (!vma || vma->vm_start > start)
645 		return -EFAULT;
646 
647 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
648 		next = vma->vm_next;
649 		vmstart = max(start, vma->vm_start);
650 		vmend   = min(end, vma->vm_end);
651 
652 		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
653 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
654 				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
655 		if (prev) {
656 			vma = prev;
657 			next = vma->vm_next;
658 			continue;
659 		}
660 		if (vma->vm_start != vmstart) {
661 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
662 			if (err)
663 				goto out;
664 		}
665 		if (vma->vm_end != vmend) {
666 			err = split_vma(vma->vm_mm, vma, vmend, 0);
667 			if (err)
668 				goto out;
669 		}
670 		err = policy_vma(vma, new_pol);
671 		if (err)
672 			goto out;
673 	}
674 
675  out:
676 	return err;
677 }
678 
679 /*
680  * Update task->flags PF_MEMPOLICY bit: set iff non-default
681  * mempolicy.  Allows more rapid checking of this (combined perhaps
682  * with other PF_* flag bits) on memory allocation hot code paths.
683  *
684  * If called from outside this file, the task 'p' should -only- be
685  * a newly forked child not yet visible on the task list, because
686  * manipulating the task flags of a visible task is not safe.
687  *
688  * The above limitation is why this routine has the funny name
689  * mpol_fix_fork_child_flag().
690  *
691  * It is also safe to call this with a task pointer of current,
692  * which the static wrapper mpol_set_task_struct_flag() does,
693  * for use within this file.
694  */
695 
696 void mpol_fix_fork_child_flag(struct task_struct *p)
697 {
698 	if (p->mempolicy)
699 		p->flags |= PF_MEMPOLICY;
700 	else
701 		p->flags &= ~PF_MEMPOLICY;
702 }
703 
704 static void mpol_set_task_struct_flag(void)
705 {
706 	mpol_fix_fork_child_flag(current);
707 }
708 
709 /* Set the process memory policy */
710 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
711 			     nodemask_t *nodes)
712 {
713 	struct mempolicy *new, *old;
714 	struct mm_struct *mm = current->mm;
715 	NODEMASK_SCRATCH(scratch);
716 	int ret;
717 
718 	if (!scratch)
719 		return -ENOMEM;
720 
721 	new = mpol_new(mode, flags, nodes);
722 	if (IS_ERR(new)) {
723 		ret = PTR_ERR(new);
724 		goto out;
725 	}
726 	/*
727 	 * prevent changing our mempolicy while show_numa_maps()
728 	 * is using it.
729 	 * Note:  do_set_mempolicy() can be called at init time
730 	 * with no 'mm'.
731 	 */
732 	if (mm)
733 		down_write(&mm->mmap_sem);
734 	task_lock(current);
735 	ret = mpol_set_nodemask(new, nodes, scratch);
736 	if (ret) {
737 		task_unlock(current);
738 		if (mm)
739 			up_write(&mm->mmap_sem);
740 		mpol_put(new);
741 		goto out;
742 	}
743 	old = current->mempolicy;
744 	current->mempolicy = new;
745 	mpol_set_task_struct_flag();
746 	if (new && new->mode == MPOL_INTERLEAVE &&
747 	    nodes_weight(new->v.nodes))
748 		current->il_next = first_node(new->v.nodes);
749 	task_unlock(current);
750 	if (mm)
751 		up_write(&mm->mmap_sem);
752 
753 	mpol_put(old);
754 	ret = 0;
755 out:
756 	NODEMASK_SCRATCH_FREE(scratch);
757 	return ret;
758 }
759 
760 /*
761  * Return nodemask for policy for get_mempolicy() query
762  *
763  * Called with task's alloc_lock held
764  */
765 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
766 {
767 	nodes_clear(*nodes);
768 	if (p == &default_policy)
769 		return;
770 
771 	switch (p->mode) {
772 	case MPOL_BIND:
773 		/* Fall through */
774 	case MPOL_INTERLEAVE:
775 		*nodes = p->v.nodes;
776 		break;
777 	case MPOL_PREFERRED:
778 		if (!(p->flags & MPOL_F_LOCAL))
779 			node_set(p->v.preferred_node, *nodes);
780 		/* else return empty node mask for local allocation */
781 		break;
782 	default:
783 		BUG();
784 	}
785 }
786 
787 static int lookup_node(struct mm_struct *mm, unsigned long addr)
788 {
789 	struct page *p;
790 	int err;
791 
792 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
793 	if (err >= 0) {
794 		err = page_to_nid(p);
795 		put_page(p);
796 	}
797 	return err;
798 }
799 
800 /* Retrieve NUMA policy */
801 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
802 			     unsigned long addr, unsigned long flags)
803 {
804 	int err;
805 	struct mm_struct *mm = current->mm;
806 	struct vm_area_struct *vma = NULL;
807 	struct mempolicy *pol = current->mempolicy;
808 
809 	if (flags &
810 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
811 		return -EINVAL;
812 
813 	if (flags & MPOL_F_MEMS_ALLOWED) {
814 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
815 			return -EINVAL;
816 		*policy = 0;	/* just so it's initialized */
817 		task_lock(current);
818 		*nmask  = cpuset_current_mems_allowed;
819 		task_unlock(current);
820 		return 0;
821 	}
822 
823 	if (flags & MPOL_F_ADDR) {
824 		/*
825 		 * Do NOT fall back to task policy if the
826 		 * vma/shared policy at addr is NULL.  We
827 		 * want to return MPOL_DEFAULT in this case.
828 		 */
829 		down_read(&mm->mmap_sem);
830 		vma = find_vma_intersection(mm, addr, addr+1);
831 		if (!vma) {
832 			up_read(&mm->mmap_sem);
833 			return -EFAULT;
834 		}
835 		if (vma->vm_ops && vma->vm_ops->get_policy)
836 			pol = vma->vm_ops->get_policy(vma, addr);
837 		else
838 			pol = vma->vm_policy;
839 	} else if (addr)
840 		return -EINVAL;
841 
842 	if (!pol)
843 		pol = &default_policy;	/* indicates default behavior */
844 
845 	if (flags & MPOL_F_NODE) {
846 		if (flags & MPOL_F_ADDR) {
847 			err = lookup_node(mm, addr);
848 			if (err < 0)
849 				goto out;
850 			*policy = err;
851 		} else if (pol == current->mempolicy &&
852 				pol->mode == MPOL_INTERLEAVE) {
853 			*policy = current->il_next;
854 		} else {
855 			err = -EINVAL;
856 			goto out;
857 		}
858 	} else {
859 		*policy = pol == &default_policy ? MPOL_DEFAULT :
860 						pol->mode;
861 		/*
862 		 * Internal mempolicy flags must be masked off before exposing
863 		 * the policy to userspace.
864 		 */
865 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
866 	}
867 
868 	if (vma) {
869 		up_read(&current->mm->mmap_sem);
870 		vma = NULL;
871 	}
872 
873 	err = 0;
874 	if (nmask) {
875 		if (mpol_store_user_nodemask(pol)) {
876 			*nmask = pol->w.user_nodemask;
877 		} else {
878 			task_lock(current);
879 			get_policy_nodemask(pol, nmask);
880 			task_unlock(current);
881 		}
882 	}
883 
884  out:
885 	mpol_cond_put(pol);
886 	if (vma)
887 		up_read(&current->mm->mmap_sem);
888 	return err;
889 }
890 
891 #ifdef CONFIG_MIGRATION
892 /*
893  * page migration
894  */
895 static void migrate_page_add(struct page *page, struct list_head *pagelist,
896 				unsigned long flags)
897 {
898 	/*
899 	 * Avoid migrating a page that is shared with others.
900 	 */
901 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
902 		if (!isolate_lru_page(page)) {
903 			list_add_tail(&page->lru, pagelist);
904 			inc_zone_page_state(page, NR_ISOLATED_ANON +
905 					    page_is_file_cache(page));
906 		}
907 	}
908 }
909 
910 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
911 {
912 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
913 }
914 
915 /*
916  * Migrate pages from one node to a target node.
917  * Returns error or the number of pages not migrated.
918  */
919 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
920 			   int flags)
921 {
922 	nodemask_t nmask;
923 	LIST_HEAD(pagelist);
924 	int err = 0;
925 	struct vm_area_struct *vma;
926 
927 	nodes_clear(nmask);
928 	node_set(source, nmask);
929 
930 	vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
931 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
932 	if (IS_ERR(vma))
933 		return PTR_ERR(vma);
934 
935 	if (!list_empty(&pagelist)) {
936 		err = migrate_pages(&pagelist, new_node_page, dest,
937 								false, true);
938 		if (err)
939 			putback_lru_pages(&pagelist);
940 	}
941 
942 	return err;
943 }
944 
945 /*
946  * Move pages between the two nodesets so as to preserve the physical
947  * layout as much as possible.
948  *
949  * Returns the number of pages that could not be moved.
950  */
951 int do_migrate_pages(struct mm_struct *mm,
952 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
953 {
954 	int busy = 0;
955 	int err;
956 	nodemask_t tmp;
957 
958 	err = migrate_prep();
959 	if (err)
960 		return err;
961 
962 	down_read(&mm->mmap_sem);
963 
964 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
965 	if (err)
966 		goto out;
967 
968 	/*
969 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
970 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
971 	 * bit in 'tmp', and return that <source, dest> pair for migration.
972 	 * The pair of nodemasks 'to' and 'from' define the map.
973 	 *
974 	 * If no pair of bits is found that way, fall back to picking some
975 	 * pair of 'source' and 'dest' bits that are not the same.  If the
976 	 * 'source' and 'dest' bits are the same, this represents a node
977 	 * that will be migrating to itself, so no pages need move.
978 	 *
979 	 * If no bits are left in 'tmp', or if all remaining bits left
980 	 * in 'tmp' correspond to the same bit in 'to', return false
981 	 * (nothing left to migrate).
982 	 *
983 	 * This lets us pick a pair of nodes to migrate between, such that
984 	 * if possible the dest node is not already occupied by some other
985 	 * source node, minimizing the risk of overloading the memory on a
986 	 * node that would happen if we migrated incoming memory to a node
987 	 * before migrating outgoing memory from that same node.
988 	 *
989 	 * A single scan of tmp is sufficient.  As we go, we remember the
990 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
991 	 * that not only moved, but what's better, moved to an empty slot
992 	 * (d is not set in tmp), then we break out then, with that pair.
993 	 * Otherwise when we finish scanning tmp, we at least have the
994 	 * most recent <s, d> pair that moved.  If we get all the way through
995 	 * the scan of tmp without finding any node that moved, much less
996 	 * moved to an empty node, then there is nothing left worth migrating.
997 	 */
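	/*
	 * Example (illustrative): from_nodes = {0,1}, to_nodes = {2,3}.
	 * First pass over tmp = {0,1}: s=0 maps to d=2, and 2 is not set
	 * in tmp, so migrate 0 -> 2 and clear 0.  Next pass: s=1 maps to
	 * d=3, so migrate 1 -> 3.  tmp is then empty and we are done.
	 */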
998 
999 	tmp = *from_nodes;
1000 	while (!nodes_empty(tmp)) {
1001 		int s,d;
1002 		int source = -1;
1003 		int dest = 0;
1004 
1005 		for_each_node_mask(s, tmp) {
1006 			d = node_remap(s, *from_nodes, *to_nodes);
1007 			if (s == d)
1008 				continue;
1009 
1010 			source = s;	/* Node moved. Memorize */
1011 			dest = d;
1012 
1013 			/* dest not in remaining from nodes? */
1014 			if (!node_isset(dest, tmp))
1015 				break;
1016 		}
1017 		if (source == -1)
1018 			break;
1019 
1020 		node_clear(source, tmp);
1021 		err = migrate_to_node(mm, source, dest, flags);
1022 		if (err > 0)
1023 			busy += err;
1024 		if (err < 0)
1025 			break;
1026 	}
1027 out:
1028 	up_read(&mm->mmap_sem);
1029 	if (err < 0)
1030 		return err;
1031 	return busy;
1032 
1033 }
1034 
1035 /*
1036  * Allocate a new page for page migration based on vma policy.
1037  * Start by assuming that the page is mapped by the vma pointed to by @private.
1038  * Search forward from there, if not.  N.B., this assumes that the
1039  * list of pages handed to migrate_pages()--which is how we get here--
1040  * is in virtual address order.
1041  */
1042 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1043 {
1044 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1045 	unsigned long uninitialized_var(address);
1046 
1047 	while (vma) {
1048 		address = page_address_in_vma(page, vma);
1049 		if (address != -EFAULT)
1050 			break;
1051 		vma = vma->vm_next;
1052 	}
1053 
1054 	/*
1055 	 * if !vma, alloc_page_vma() will use task or system default policy
1056 	 */
1057 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1058 }
1059 #else
1060 
1061 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1062 				unsigned long flags)
1063 {
1064 }
1065 
1066 int do_migrate_pages(struct mm_struct *mm,
1067 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1068 {
1069 	return -ENOSYS;
1070 }
1071 
1072 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1073 {
1074 	return NULL;
1075 }
1076 #endif
1077 
1078 static long do_mbind(unsigned long start, unsigned long len,
1079 		     unsigned short mode, unsigned short mode_flags,
1080 		     nodemask_t *nmask, unsigned long flags)
1081 {
1082 	struct vm_area_struct *vma;
1083 	struct mm_struct *mm = current->mm;
1084 	struct mempolicy *new;
1085 	unsigned long end;
1086 	int err;
1087 	LIST_HEAD(pagelist);
1088 
1089 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1090 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1091 		return -EINVAL;
1092 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1093 		return -EPERM;
1094 
1095 	if (start & ~PAGE_MASK)
1096 		return -EINVAL;
1097 
1098 	if (mode == MPOL_DEFAULT)
1099 		flags &= ~MPOL_MF_STRICT;
1100 
1101 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1102 	end = start + len;
1103 
1104 	if (end < start)
1105 		return -EINVAL;
1106 	if (end == start)
1107 		return 0;
1108 
1109 	new = mpol_new(mode, mode_flags, nmask);
1110 	if (IS_ERR(new))
1111 		return PTR_ERR(new);
1112 
1113 	/*
1114 	 * If we are using the default policy then operating
1115 	 * on discontinuous address spaces is okay after all.
1116 	 */
1117 	if (!new)
1118 		flags |= MPOL_MF_DISCONTIG_OK;
1119 
1120 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1121 		 start, start + len, mode, mode_flags,
1122 		 nmask ? nodes_addr(*nmask)[0] : -1);
1123 
1124 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1125 
1126 		err = migrate_prep();
1127 		if (err)
1128 			goto mpol_out;
1129 	}
1130 	{
1131 		NODEMASK_SCRATCH(scratch);
1132 		if (scratch) {
1133 			down_write(&mm->mmap_sem);
1134 			task_lock(current);
1135 			err = mpol_set_nodemask(new, nmask, scratch);
1136 			task_unlock(current);
1137 			if (err)
1138 				up_write(&mm->mmap_sem);
1139 		} else
1140 			err = -ENOMEM;
1141 		NODEMASK_SCRATCH_FREE(scratch);
1142 	}
1143 	if (err)
1144 		goto mpol_out;
1145 
1146 	vma = check_range(mm, start, end, nmask,
1147 			  flags | MPOL_MF_INVERT, &pagelist);
1148 
1149 	err = PTR_ERR(vma);
1150 	if (!IS_ERR(vma)) {
1151 		int nr_failed = 0;
1152 
1153 		err = mbind_range(mm, start, end, new);
1154 
1155 		if (!list_empty(&pagelist)) {
1156 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1157 						(unsigned long)vma,
1158 						false, true);
1159 			if (nr_failed)
1160 				putback_lru_pages(&pagelist);
1161 		}
1162 
1163 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1164 			err = -EIO;
1165 	} else
1166 		putback_lru_pages(&pagelist);
1167 
1168 	up_write(&mm->mmap_sem);
1169  mpol_out:
1170 	mpol_put(new);
1171 	return err;
1172 }
1173 
1174 /*
1175  * User space interface with variable-sized bitmaps for nodelists.
1176  */
1177 
1178 /* Copy a node mask from user space. */
1179 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1180 		     unsigned long maxnode)
1181 {
1182 	unsigned long k;
1183 	unsigned long nlongs;
1184 	unsigned long endmask;
1185 
1186 	--maxnode;
1187 	nodes_clear(*nodes);
1188 	if (maxnode == 0 || !nmask)
1189 		return 0;
1190 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1191 		return -EINVAL;
1192 
1193 	nlongs = BITS_TO_LONGS(maxnode);
1194 	if ((maxnode % BITS_PER_LONG) == 0)
1195 		endmask = ~0UL;
1196 	else
1197 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1198 
1199 	/* When the user specifies more nodes than supported, just check
1200 	   that the unsupported part is all zero. */
1201 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1202 		if (nlongs > PAGE_SIZE/sizeof(long))
1203 			return -EINVAL;
1204 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1205 			unsigned long t;
1206 			if (get_user(t, nmask + k))
1207 				return -EFAULT;
1208 			if (k == nlongs - 1) {
1209 				if (t & endmask)
1210 					return -EINVAL;
1211 			} else if (t)
1212 				return -EINVAL;
1213 		}
1214 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1215 		endmask = ~0UL;
1216 	}
1217 
1218 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1219 		return -EFAULT;
1220 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1221 	return 0;
1222 }
1223 
1224 /* Copy a kernel node mask to user space */
1225 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1226 			      nodemask_t *nodes)
1227 {
1228 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1229 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1230 
1231 	if (copy > nbytes) {
1232 		if (copy > PAGE_SIZE)
1233 			return -EINVAL;
1234 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1235 			return -EFAULT;
1236 		copy = nbytes;
1237 	}
1238 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1239 }
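
/*
 * Example (illustrative): assuming MAX_NUMNODES = 1024 on a 64-bit kernel,
 * nbytes = 128.  A caller passing maxnode = 2048 asks for copy = 256 bytes,
 * so the 128 bytes beyond the kernel mask are cleared in user space and
 * only the real 128-byte nodemask is copied out.
 */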
1240 
1241 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1242 		unsigned long, mode, unsigned long __user *, nmask,
1243 		unsigned long, maxnode, unsigned, flags)
1244 {
1245 	nodemask_t nodes;
1246 	int err;
1247 	unsigned short mode_flags;
1248 
1249 	mode_flags = mode & MPOL_MODE_FLAGS;
1250 	mode &= ~MPOL_MODE_FLAGS;
1251 	if (mode >= MPOL_MAX)
1252 		return -EINVAL;
1253 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1254 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1255 		return -EINVAL;
1256 	err = get_nodes(&nodes, nmask, maxnode);
1257 	if (err)
1258 		return err;
1259 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1260 }
1261 
1262 /* Set the process memory policy */
1263 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1264 		unsigned long, maxnode)
1265 {
1266 	int err;
1267 	nodemask_t nodes;
1268 	unsigned short flags;
1269 
1270 	flags = mode & MPOL_MODE_FLAGS;
1271 	mode &= ~MPOL_MODE_FLAGS;
1272 	if ((unsigned int)mode >= MPOL_MAX)
1273 		return -EINVAL;
1274 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1275 		return -EINVAL;
1276 	err = get_nodes(&nodes, nmask, maxnode);
1277 	if (err)
1278 		return err;
1279 	return do_set_mempolicy(mode, flags, &nodes);
1280 }
1281 
1282 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1283 		const unsigned long __user *, old_nodes,
1284 		const unsigned long __user *, new_nodes)
1285 {
1286 	const struct cred *cred = current_cred(), *tcred;
1287 	struct mm_struct *mm = NULL;
1288 	struct task_struct *task;
1289 	nodemask_t task_nodes;
1290 	int err;
1291 	nodemask_t *old;
1292 	nodemask_t *new;
1293 	NODEMASK_SCRATCH(scratch);
1294 
1295 	if (!scratch)
1296 		return -ENOMEM;
1297 
1298 	old = &scratch->mask1;
1299 	new = &scratch->mask2;
1300 
1301 	err = get_nodes(old, old_nodes, maxnode);
1302 	if (err)
1303 		goto out;
1304 
1305 	err = get_nodes(new, new_nodes, maxnode);
1306 	if (err)
1307 		goto out;
1308 
1309 	/* Find the mm_struct */
1310 	rcu_read_lock();
1311 	task = pid ? find_task_by_vpid(pid) : current;
1312 	if (!task) {
1313 		rcu_read_unlock();
1314 		err = -ESRCH;
1315 		goto out;
1316 	}
1317 	mm = get_task_mm(task);
1318 	rcu_read_unlock();
1319 
1320 	err = -EINVAL;
1321 	if (!mm)
1322 		goto out;
1323 
1324 	/*
1325 	 * Check if this process has the right to modify the specified
1326 	 * process. The right exists if the process has administrative
1327 	 * capabilities, superuser privileges or the same
1328 	 * userid as the target process.
1329 	 */
1330 	rcu_read_lock();
1331 	tcred = __task_cred(task);
1332 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1333 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1334 	    !capable(CAP_SYS_NICE)) {
1335 		rcu_read_unlock();
1336 		err = -EPERM;
1337 		goto out;
1338 	}
1339 	rcu_read_unlock();
1340 
1341 	task_nodes = cpuset_mems_allowed(task);
1342 	/* Is the user allowed to access the target nodes? */
1343 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1344 		err = -EPERM;
1345 		goto out;
1346 	}
1347 
1348 	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1349 		err = -EINVAL;
1350 		goto out;
1351 	}
1352 
1353 	err = security_task_movememory(task);
1354 	if (err)
1355 		goto out;
1356 
1357 	err = do_migrate_pages(mm, old, new,
1358 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1359 out:
1360 	if (mm)
1361 		mmput(mm);
1362 	NODEMASK_SCRATCH_FREE(scratch);
1363 
1364 	return err;
1365 }
1366 
1367 
1368 /* Retrieve NUMA policy */
1369 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1370 		unsigned long __user *, nmask, unsigned long, maxnode,
1371 		unsigned long, addr, unsigned long, flags)
1372 {
1373 	int err;
1374 	int uninitialized_var(pval);
1375 	nodemask_t nodes;
1376 
1377 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1378 		return -EINVAL;
1379 
1380 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1381 
1382 	if (err)
1383 		return err;
1384 
1385 	if (policy && put_user(pval, policy))
1386 		return -EFAULT;
1387 
1388 	if (nmask)
1389 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1390 
1391 	return err;
1392 }
1393 
1394 #ifdef CONFIG_COMPAT
1395 
1396 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1397 				     compat_ulong_t __user *nmask,
1398 				     compat_ulong_t maxnode,
1399 				     compat_ulong_t addr, compat_ulong_t flags)
1400 {
1401 	long err;
1402 	unsigned long __user *nm = NULL;
1403 	unsigned long nr_bits, alloc_size;
1404 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1405 
1406 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1407 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1408 
1409 	if (nmask)
1410 		nm = compat_alloc_user_space(alloc_size);
1411 
1412 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1413 
1414 	if (!err && nmask) {
1415 		err = copy_from_user(bm, nm, alloc_size);
1416 		/* ensure entire bitmap is zeroed */
1417 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1418 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1419 	}
1420 
1421 	return err;
1422 }
1423 
1424 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1425 				     compat_ulong_t maxnode)
1426 {
1427 	long err = 0;
1428 	unsigned long __user *nm = NULL;
1429 	unsigned long nr_bits, alloc_size;
1430 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1431 
1432 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1433 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1434 
1435 	if (nmask) {
1436 		err = compat_get_bitmap(bm, nmask, nr_bits);
1437 		nm = compat_alloc_user_space(alloc_size);
1438 		err |= copy_to_user(nm, bm, alloc_size);
1439 	}
1440 
1441 	if (err)
1442 		return -EFAULT;
1443 
1444 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1445 }
1446 
1447 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1448 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1449 			     compat_ulong_t maxnode, compat_ulong_t flags)
1450 {
1451 	long err = 0;
1452 	unsigned long __user *nm = NULL;
1453 	unsigned long nr_bits, alloc_size;
1454 	nodemask_t bm;
1455 
1456 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1457 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1458 
1459 	if (nmask) {
1460 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1461 		nm = compat_alloc_user_space(alloc_size);
1462 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1463 	}
1464 
1465 	if (err)
1466 		return -EFAULT;
1467 
1468 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1469 }
1470 
1471 #endif
1472 
1473 /*
1474  * get_vma_policy(@task, @vma, @addr)
1475  * @task - task for fallback if vma policy == default
1476  * @vma   - virtual memory area whose policy is sought
1477  * @addr  - address in @vma for shared policy lookup
1478  *
1479  * Returns effective policy for a VMA at specified address.
1480  * Falls back to @task or system default policy, as necessary.
1481  * Current or other task's task mempolicy and non-shared vma policies
1482  * are protected by the task's mmap_sem, which must be held for read by
1483  * the caller.
1484  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1485  * count--added by the get_policy() vm_op, as appropriate--to protect against
1486  * freeing by another task.  It is the caller's responsibility to free the
1487  * extra reference for shared policies.
1488  */
1489 struct mempolicy *get_vma_policy(struct task_struct *task,
1490 		struct vm_area_struct *vma, unsigned long addr)
1491 {
1492 	struct mempolicy *pol = task->mempolicy;
1493 
1494 	if (vma) {
1495 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1496 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1497 									addr);
1498 			if (vpol)
1499 				pol = vpol;
1500 		} else if (vma->vm_policy)
1501 			pol = vma->vm_policy;
1502 	}
1503 	if (!pol)
1504 		pol = &default_policy;
1505 	return pol;
1506 }
1507 
1508 /*
1509  * Return a nodemask representing a mempolicy for filtering nodes for
1510  * page allocation
1511  */
1512 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1513 {
1514 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1515 	if (unlikely(policy->mode == MPOL_BIND) &&
1516 			gfp_zone(gfp) >= policy_zone &&
1517 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1518 		return &policy->v.nodes;
1519 
1520 	return NULL;
1521 }
1522 
1523 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1524 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1525 	int nd)
1526 {
1527 	switch (policy->mode) {
1528 	case MPOL_PREFERRED:
1529 		if (!(policy->flags & MPOL_F_LOCAL))
1530 			nd = policy->v.preferred_node;
1531 		break;
1532 	case MPOL_BIND:
1533 		/*
1534 		 * Normally, MPOL_BIND allocations are node-local within the
1535 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1536 		 * current node isn't part of the mask, we use the zonelist for
1537 		 * the first node in the mask instead.
1538 		 */
1539 		if (unlikely(gfp & __GFP_THISNODE) &&
1540 				unlikely(!node_isset(nd, policy->v.nodes)))
1541 			nd = first_node(policy->v.nodes);
1542 		break;
1543 	default:
1544 		BUG();
1545 	}
1546 	return node_zonelist(nd, gfp);
1547 }
1548 
1549 /* Do dynamic interleaving for a process */
1550 static unsigned interleave_nodes(struct mempolicy *policy)
1551 {
1552 	unsigned nid, next;
1553 	struct task_struct *me = current;
1554 
1555 	nid = me->il_next;
1556 	next = next_node(nid, policy->v.nodes);
1557 	if (next >= MAX_NUMNODES)
1558 		next = first_node(policy->v.nodes);
1559 	if (next < MAX_NUMNODES)
1560 		me->il_next = next;
1561 	return nid;
1562 }
1563 
1564 /*
1565  * Depending on the memory policy, provide a node from which to allocate the
1566  * next slab entry.
1567  * @policy must be protected from freeing by the caller.  If @policy is
1568  * the current task's mempolicy, this protection is implicit, as only the
1569  * task can change its policy.  The system default policy requires no
1570  * such protection.
1571  */
1572 unsigned slab_node(struct mempolicy *policy)
1573 {
1574 	if (!policy || policy->flags & MPOL_F_LOCAL)
1575 		return numa_node_id();
1576 
1577 	switch (policy->mode) {
1578 	case MPOL_PREFERRED:
1579 		/*
1580 		 * handled MPOL_F_LOCAL above
1581 		 */
1582 		return policy->v.preferred_node;
1583 
1584 	case MPOL_INTERLEAVE:
1585 		return interleave_nodes(policy);
1586 
1587 	case MPOL_BIND: {
1588 		/*
1589 		 * Follow bind policy behavior and start allocation at the
1590 		 * first node.
1591 		 */
1592 		struct zonelist *zonelist;
1593 		struct zone *zone;
1594 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1595 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1596 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1597 							&policy->v.nodes,
1598 							&zone);
1599 		return zone ? zone->node : numa_node_id();
1600 	}
1601 
1602 	default:
1603 		BUG();
1604 	}
1605 }
1606 
1607 /* Do static interleaving for a VMA with known offset. */
1608 static unsigned offset_il_node(struct mempolicy *pol,
1609 		struct vm_area_struct *vma, unsigned long off)
1610 {
1611 	unsigned nnodes = nodes_weight(pol->v.nodes);
1612 	unsigned target;
1613 	int c;
1614 	int nid = -1;
1615 
1616 	if (!nnodes)
1617 		return numa_node_id();
1618 	target = (unsigned int)off % nnodes;
1619 	c = 0;
1620 	do {
1621 		nid = next_node(nid, pol->v.nodes);
1622 		c++;
1623 	} while (c <= target);
1624 	return nid;
1625 }
1626 
1627 /* Determine a node number for interleave */
1628 static inline unsigned interleave_nid(struct mempolicy *pol,
1629 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1630 {
1631 	if (vma) {
1632 		unsigned long off;
1633 
1634 		/*
1635 		 * for small pages, there is no difference between
1636 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1637 		 * for huge pages, since vm_pgoff is in units of small
1638 		 * pages, we need to shift off the always 0 bits to get
1639 		 * a useful offset.
1640 		 */
1641 		BUG_ON(shift < PAGE_SHIFT);
1642 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1643 		off += (addr - vma->vm_start) >> shift;
1644 		return offset_il_node(pol, vma, off);
1645 	} else
1646 		return interleave_nodes(pol);
1647 }
1648 
1649 /*
1650  * Return the bit number of a random bit set in the nodemask.
1651  * (returns -1 if nodemask is empty)
1652  */
1653 int node_random(const nodemask_t *maskp)
1654 {
1655 	int w, bit = -1;
1656 
1657 	w = nodes_weight(*maskp);
1658 	if (w)
1659 		bit = bitmap_ord_to_pos(maskp->bits,
1660 			get_random_int() % w, MAX_NUMNODES);
1661 	return bit;
1662 }
1663 
1664 #ifdef CONFIG_HUGETLBFS
1665 /*
1666  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1667  * @vma = virtual memory area whose policy is sought
1668  * @addr = address in @vma for shared policy lookup and interleave policy
1669  * @gfp_flags = for requested zone
1670  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1671  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1672  *
1673  * Returns a zonelist suitable for a huge page allocation and a pointer
1674  * to the struct mempolicy for conditional unref after allocation.
1675  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1676  * @nodemask for filtering the zonelist.
1677  *
1678  * Must be protected by get_mems_allowed()
1679  */
1680 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1681 				gfp_t gfp_flags, struct mempolicy **mpol,
1682 				nodemask_t **nodemask)
1683 {
1684 	struct zonelist *zl;
1685 
1686 	*mpol = get_vma_policy(current, vma, addr);
1687 	*nodemask = NULL;	/* assume !MPOL_BIND */
1688 
1689 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1690 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1691 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1692 	} else {
1693 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1694 		if ((*mpol)->mode == MPOL_BIND)
1695 			*nodemask = &(*mpol)->v.nodes;
1696 	}
1697 	return zl;
1698 }
1699 
1700 /*
1701  * init_nodemask_of_mempolicy
1702  *
1703  * If the current task's mempolicy is "default" [NULL], return 'false'
1704  * to indicate default policy.  Otherwise, extract the policy nodemask
1705  * for 'bind' or 'interleave' policy into the argument nodemask, or
1706  * initialize the argument nodemask to contain the single node for
1707  * 'preferred' or 'local' policy and return 'true' to indicate presence
1708  * of non-default mempolicy.
1709  *
1710  * We don't bother with reference counting the mempolicy [mpol_get/put]
1711  * because the current task is examining its own mempolicy and a task's
1712  * mempolicy is only ever changed by the task itself.
1713  *
1714  * N.B., it is the caller's responsibility to free a returned nodemask.
1715  */
1716 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1717 {
1718 	struct mempolicy *mempolicy;
1719 	int nid;
1720 
1721 	if (!(mask && current->mempolicy))
1722 		return false;
1723 
1724 	task_lock(current);
1725 	mempolicy = current->mempolicy;
1726 	switch (mempolicy->mode) {
1727 	case MPOL_PREFERRED:
1728 		if (mempolicy->flags & MPOL_F_LOCAL)
1729 			nid = numa_node_id();
1730 		else
1731 			nid = mempolicy->v.preferred_node;
1732 		init_nodemask_of_node(mask, nid);
1733 		break;
1734 
1735 	case MPOL_BIND:
1736 		/* Fall through */
1737 	case MPOL_INTERLEAVE:
1738 		*mask =  mempolicy->v.nodes;
1739 		break;
1740 
1741 	default:
1742 		BUG();
1743 	}
1744 	task_unlock(current);
1745 
1746 	return true;
1747 }
1748 #endif
1749 
1750 /*
1751  * mempolicy_nodemask_intersects
1752  *
1753  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1754  * policy.  Otherwise, check for intersection between mask and the policy
1755  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1756  * policy, always return true since it may allocate elsewhere on fallback.
1757  *
1758  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1759  */
1760 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1761 					const nodemask_t *mask)
1762 {
1763 	struct mempolicy *mempolicy;
1764 	bool ret = true;
1765 
1766 	if (!mask)
1767 		return ret;
1768 	task_lock(tsk);
1769 	mempolicy = tsk->mempolicy;
1770 	if (!mempolicy)
1771 		goto out;
1772 
1773 	switch (mempolicy->mode) {
1774 	case MPOL_PREFERRED:
1775 		/*
1776 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1777 		 * allocate from; allocations may fall back to other nodes when OOM.
1778 		 * Thus, it's possible for tsk to have allocated memory from
1779 		 * nodes in mask.
1780 		 */
1781 		break;
1782 	case MPOL_BIND:
1783 	case MPOL_INTERLEAVE:
1784 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1785 		break;
1786 	default:
1787 		BUG();
1788 	}
1789 out:
1790 	task_unlock(tsk);
1791 	return ret;
1792 }
1793 
1794 /* Allocate a page under the interleave policy.
1795    Separate path because it needs to do special accounting. */
1796 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1797 					unsigned nid)
1798 {
1799 	struct zonelist *zl;
1800 	struct page *page;
1801 
1802 	zl = node_zonelist(nid, gfp);
1803 	page = __alloc_pages(gfp, order, zl);
1804 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1805 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1806 	return page;
1807 }
1808 
1809 /**
1810  * 	alloc_pages_vma	- Allocate a page for a VMA.
1811  *
1812  * 	@gfp:
1813  *      %GFP_USER    user allocation.
1814  *      %GFP_KERNEL  kernel allocations,
1815  *      %GFP_HIGHMEM highmem/user allocations,
1816  *      %GFP_FS      allocation should not call back into a file system.
1817  *      %GFP_ATOMIC  don't sleep.
1818  *
1819  *	@order:Order of the GFP allocation.
1820  * 	@vma:  Pointer to VMA or NULL if not available.
1821  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1822  *
1823  * 	This function allocates a page from the kernel page pool and applies
1824  *	a NUMA policy associated with the VMA or the current process.
1825  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1826  *	mm_struct of the VMA to prevent it from going away. Should be used for
1827  *	all allocations for pages that will be mapped into
1828  * 	user space. Returns NULL when no page can be allocated.
1829  *
1830  *	Should be called with the mmap_sem of the vma's mm held.
1831  */
1832 struct page *
1833 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1834 		unsigned long addr, int node)
1835 {
1836 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1837 	struct zonelist *zl;
1838 	struct page *page;
1839 
1840 	get_mems_allowed();
1841 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1842 		unsigned nid;
1843 
1844 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1845 		mpol_cond_put(pol);
1846 		page = alloc_page_interleave(gfp, order, nid);
1847 		put_mems_allowed();
1848 		return page;
1849 	}
1850 	zl = policy_zonelist(gfp, pol, node);
1851 	if (unlikely(mpol_needs_cond_ref(pol))) {
1852 		/*
1853 		 * slow path: ref counted shared policy
1854 		 */
1855 		struct page *page = __alloc_pages_nodemask(gfp, order,
1856 						zl, policy_nodemask(gfp, pol));
1857 		__mpol_put(pol);
1858 		put_mems_allowed();
1859 		return page;
1860 	}
1861 	/*
1862 	 * fast path:  default or task policy
1863 	 */
1864 	page = __alloc_pages_nodemask(gfp, order, zl,
1865 				      policy_nodemask(gfp, pol));
1866 	put_mems_allowed();
1867 	return page;
1868 }
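
/*
 * Example: fault-path callers normally go through the order-0
 * alloc_page_vma() wrapper (see gfp.h) while holding mmap_sem for read,
 * roughly as follows (illustrative sketch, error handling omitted):
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, addr);
 *	...
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
 *	if (!page)
 *		goto oom;
 *	...
 *	up_read(&mm->mmap_sem);
 */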
1869 
1870 /**
1871  * 	alloc_pages_current - Allocate pages.
1872  *
1873  *	@gfp:
1874  *		%GFP_USER   user allocation,
1875  *      	%GFP_KERNEL kernel allocation,
1876  *      	%GFP_HIGHMEM highmem allocation,
1877  *      	%GFP_FS     don't call back into a file system.
1878  *      	%GFP_ATOMIC don't sleep.
1879  *	@order: Power of two of allocation size in pages. 0 is a single page.
1880  *
1881  *	Allocate a page from the kernel page pool.  When not in
1882  *	interrupt context, apply the current process' NUMA policy.
1883  *	Returns NULL when no page can be allocated.
1884  *
1885  *	Don't call cpuset_update_task_memory_state() unless
1886  *	1) it's ok to take cpuset_sem (can WAIT), and
1887  *	2) allocating for current task (not interrupt).
1888  */
1889 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1890 {
1891 	struct mempolicy *pol = current->mempolicy;
1892 	struct page *page;
1893 
1894 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1895 		pol = &default_policy;
1896 
1897 	get_mems_allowed();
1898 	/*
1899 	 * No reference counting needed for current->mempolicy
1900 	 * nor system default_policy
1901 	 */
1902 	if (pol->mode == MPOL_INTERLEAVE)
1903 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1904 	else
1905 		page = __alloc_pages_nodemask(gfp, order,
1906 				policy_zonelist(gfp, pol, numa_node_id()),
1907 				policy_nodemask(gfp, pol));
1908 	put_mems_allowed();
1909 	return page;
1910 }
1911 EXPORT_SYMBOL(alloc_pages_current);
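
/*
 * Example: most callers reach this through the alloc_pages() wrapper,
 * which gfp.h maps to alloc_pages_current() on NUMA builds (illustrative;
 * order 2 requests four contiguous pages):
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (!page)
 *		return -ENOMEM;
 */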
1912 
1913 /*
1914  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1915  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1916  * with the mems_allowed returned by cpuset_mems_allowed().  This
1917  * keeps mempolicies cpuset relative after its cpuset moves.  See
1918  * further kernel/cpuset.c update_nodemask().
1919  *
1920  * current's mempolicy may be rebound by another task (the task that changes
1921  * the cpuset's mems), so we needn't do the rebind work for the current task.
1922  */
1923 
1924 /* Slow path of a mempolicy duplicate */
1925 struct mempolicy *__mpol_dup(struct mempolicy *old)
1926 {
1927 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1928 
1929 	if (!new)
1930 		return ERR_PTR(-ENOMEM);
1931 
1932 	/* task's mempolicy is protected by alloc_lock */
1933 	if (old == current->mempolicy) {
1934 		task_lock(current);
1935 		*new = *old;
1936 		task_unlock(current);
1937 	} else
1938 		*new = *old;
1939 
1940 	rcu_read_lock();
1941 	if (current_cpuset_is_being_rebound()) {
1942 		nodemask_t mems = cpuset_mems_allowed(current);
1943 		if (new->flags & MPOL_F_REBINDING)
1944 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1945 		else
1946 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1947 	}
1948 	rcu_read_unlock();
1949 	atomic_set(&new->refcnt, 1);
1950 	return new;
1951 }
1952 
1953 /*
1954  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1955  * eliminate the MPOL_F_* flags that require conditional ref and
1956  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1957  * after return.  Use the returned value.
1958  *
1959  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1960  * policy lookup, even if the policy needs/has extra ref on lookup.
1961  * shmem_readahead needs this.
1962  */
1963 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1964 						struct mempolicy *frompol)
1965 {
1966 	if (!mpol_needs_cond_ref(frompol))
1967 		return frompol;
1968 
1969 	*tompol = *frompol;
1970 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1971 	__mpol_put(frompol);
1972 	return tompol;
1973 }
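
/*
 * Example: a caller that wants to reuse a single policy lookup for several
 * allocations can take an on-stack copy through the mpol_cond_copy()
 * wrapper; shmem's swap readahead follows this pattern (sketch, field
 * names approximate):
 *
 *	struct mempolicy mpol, *spol;
 *
 *	spol = mpol_cond_copy(&mpol,
 *			mpol_shared_policy_lookup(&info->policy, idx));
 *	...use spol for each page in the readahead window; no further
 *	   reference counting is needed on the copy...
 */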
1974 
1975 /* Slow path of a mempolicy comparison */
1976 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1977 {
1978 	if (!a || !b)
1979 		return 0;
1980 	if (a->mode != b->mode)
1981 		return 0;
1982 	if (a->flags != b->flags)
1983 		return 0;
1984 	if (mpol_store_user_nodemask(a))
1985 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1986 			return 0;
1987 
1988 	switch (a->mode) {
1989 	case MPOL_BIND:
1990 		/* Fall through */
1991 	case MPOL_INTERLEAVE:
1992 		return nodes_equal(a->v.nodes, b->v.nodes);
1993 	case MPOL_PREFERRED:
1994 		return a->v.preferred_node == b->v.preferred_node;
1995 	default:
1996 		BUG();
1997 		return 0;
1998 	}
1999 }
2000 
2001 /*
2002  * Shared memory backing store policy support.
2003  *
2004  * Remember policies even when nobody has shared memory mapped.
2005  * The policies are kept in Red-Black tree linked from the inode.
2006  * They are protected by the sp->lock spinlock, which should be held
2007  * for any accesses to the tree.
2008  */
2009 
2010 /* lookup first element intersecting start-end */
2011 /* Caller holds sp->lock */
2012 static struct sp_node *
2013 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2014 {
2015 	struct rb_node *n = sp->root.rb_node;
2016 
2017 	while (n) {
2018 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2019 
2020 		if (start >= p->end)
2021 			n = n->rb_right;
2022 		else if (end <= p->start)
2023 			n = n->rb_left;
2024 		else
2025 			break;
2026 	}
2027 	if (!n)
2028 		return NULL;
2029 	for (;;) {
2030 		struct sp_node *w = NULL;
2031 		struct rb_node *prev = rb_prev(n);
2032 		if (!prev)
2033 			break;
2034 		w = rb_entry(prev, struct sp_node, nd);
2035 		if (w->end <= start)
2036 			break;
2037 		n = prev;
2038 	}
2039 	return rb_entry(n, struct sp_node, nd);
2040 }
2041 
2042 /* Insert a new shared policy into the list. */
2043 /* Caller holds sp->lock */
2044 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2045 {
2046 	struct rb_node **p = &sp->root.rb_node;
2047 	struct rb_node *parent = NULL;
2048 	struct sp_node *nd;
2049 
2050 	while (*p) {
2051 		parent = *p;
2052 		nd = rb_entry(parent, struct sp_node, nd);
2053 		if (new->start < nd->start)
2054 			p = &(*p)->rb_left;
2055 		else if (new->end > nd->end)
2056 			p = &(*p)->rb_right;
2057 		else
2058 			BUG();
2059 	}
2060 	rb_link_node(&new->nd, parent, p);
2061 	rb_insert_color(&new->nd, &sp->root);
2062 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2063 		 new->policy ? new->policy->mode : 0);
2064 }
2065 
2066 /* Find shared policy intersecting idx */
2067 struct mempolicy *
2068 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2069 {
2070 	struct mempolicy *pol = NULL;
2071 	struct sp_node *sn;
2072 
2073 	if (!sp->root.rb_node)
2074 		return NULL;
2075 	spin_lock(&sp->lock);
2076 	sn = sp_lookup(sp, idx, idx+1);
2077 	if (sn) {
2078 		mpol_get(sn->policy);
2079 		pol = sn->policy;
2080 	}
2081 	spin_unlock(&sp->lock);
2082 	return pol;
2083 }
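
/*
 * Example: tmpfs-style code looks up the policy for a given file index and
 * applies it to the allocation, dropping the reference the lookup may have
 * taken (illustrative sketch; 'info' and 'index' are hypothetical):
 *
 *	struct mempolicy *pol;
 *
 *	pol = mpol_shared_policy_lookup(&info->policy, index);
 *	...allocate the page under 'pol'...
 *	mpol_cond_put(pol);
 */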
2084 
2085 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2086 {
2087 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2088 	rb_erase(&n->nd, &sp->root);
2089 	mpol_put(n->policy);
2090 	kmem_cache_free(sn_cache, n);
2091 }
2092 
2093 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2094 				struct mempolicy *pol)
2095 {
2096 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2097 
2098 	if (!n)
2099 		return NULL;
2100 	n->start = start;
2101 	n->end = end;
2102 	mpol_get(pol);
2103 	pol->flags |= MPOL_F_SHARED;	/* for unref */
2104 	n->policy = pol;
2105 	return n;
2106 }
2107 
2108 /* Replace a policy range. */
2109 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2110 				 unsigned long end, struct sp_node *new)
2111 {
2112 	struct sp_node *n, *new2 = NULL;
2113 
2114 restart:
2115 	spin_lock(&sp->lock);
2116 	n = sp_lookup(sp, start, end);
2117 	/* Take care of old policies in the same range. */
2118 	while (n && n->start < end) {
2119 		struct rb_node *next = rb_next(&n->nd);
2120 		if (n->start >= start) {
2121 			if (n->end <= end)
2122 				sp_delete(sp, n);
2123 			else
2124 				n->start = end;
2125 		} else {
2126 			/* Old policy spanning whole new range. */
2127 			if (n->end > end) {
2128 				if (!new2) {
2129 					spin_unlock(&sp->lock);
2130 					new2 = sp_alloc(end, n->end, n->policy);
2131 					if (!new2)
2132 						return -ENOMEM;
2133 					goto restart;
2134 				}
2135 				n->end = start;
2136 				sp_insert(sp, new2);
2137 				new2 = NULL;
2138 				break;
2139 			} else
2140 				n->end = start;
2141 		}
2142 		if (!next)
2143 			break;
2144 		n = rb_entry(next, struct sp_node, nd);
2145 	}
2146 	if (new)
2147 		sp_insert(sp, new);
2148 	spin_unlock(&sp->lock);
2149 	if (new2) {
2150 		mpol_put(new2->policy);
2151 		kmem_cache_free(sn_cache, new2);
2152 	}
2153 	return 0;
2154 }
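
/*
 * Example of the splitting above: if an existing node covers pages [0, 10)
 * and a new policy is installed for [3, 7), the old node is trimmed to
 * [0, 3), the new node is inserted for [3, 7), and 'new2' (carrying the
 * old policy) is inserted to cover the remaining tail [7, 10).
 */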
2155 
2156 /**
2157  * mpol_shared_policy_init - initialize shared policy for inode
2158  * @sp: pointer to inode shared policy
2159  * @mpol:  struct mempolicy to install
2160  *
2161  * Install non-NULL @mpol in inode's shared policy rb-tree.
2162  * On entry, the current task has a reference on a non-NULL @mpol.
2163  * This must be released on exit.
2164  * This is called at get_inode() time, so we can use GFP_KERNEL.
2165  */
2166 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2167 {
2168 	int ret;
2169 
2170 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2171 	spin_lock_init(&sp->lock);
2172 
2173 	if (mpol) {
2174 		struct vm_area_struct pvma;
2175 		struct mempolicy *new;
2176 		NODEMASK_SCRATCH(scratch);
2177 
2178 		if (!scratch)
2179 			goto put_mpol;
2180 		/* contextualize the tmpfs mount point mempolicy */
2181 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2182 		if (IS_ERR(new))
2183 			goto free_scratch; /* no valid nodemask intersection */
2184 
2185 		task_lock(current);
2186 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2187 		task_unlock(current);
2188 		if (ret)
2189 			goto put_new;
2190 
2191 		/* Create pseudo-vma that contains just the policy */
2192 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2193 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2194 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2195 
2196 put_new:
2197 		mpol_put(new);			/* drop initial ref */
2198 free_scratch:
2199 		NODEMASK_SCRATCH_FREE(scratch);
2200 put_mpol:
2201 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2202 	}
2203 }
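
/*
 * Example: tmpfs initializes each new inode's shared policy from the
 * mount-point mempolicy, handing over its reference (sketch modelled on
 * shmem_get_inode(); names approximate):
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 */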
2204 
2205 int mpol_set_shared_policy(struct shared_policy *info,
2206 			struct vm_area_struct *vma, struct mempolicy *npol)
2207 {
2208 	int err;
2209 	struct sp_node *new = NULL;
2210 	unsigned long sz = vma_pages(vma);
2211 
2212 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2213 		 vma->vm_pgoff,
2214 		 sz, npol ? npol->mode : -1,
2215 		 npol ? npol->flags : -1,
2216 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2217 
2218 	if (npol) {
2219 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2220 		if (!new)
2221 			return -ENOMEM;
2222 	}
2223 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2224 	if (err && new)
2225 		kmem_cache_free(sn_cache, new);
2226 	return err;
2227 }
2228 
2229 /* Free a backing policy store on inode delete. */
2230 void mpol_free_shared_policy(struct shared_policy *p)
2231 {
2232 	struct sp_node *n;
2233 	struct rb_node *next;
2234 
2235 	if (!p->root.rb_node)
2236 		return;
2237 	spin_lock(&p->lock);
2238 	next = rb_first(&p->root);
2239 	while (next) {
2240 		n = rb_entry(next, struct sp_node, nd);
2241 		next = rb_next(&n->nd);
2242 		rb_erase(&n->nd, &p->root);
2243 		mpol_put(n->policy);
2244 		kmem_cache_free(sn_cache, n);
2245 	}
2246 	spin_unlock(&p->lock);
2247 }
2248 
2249 /* assumes fs == KERNEL_DS */
2250 void __init numa_policy_init(void)
2251 {
2252 	nodemask_t interleave_nodes;
2253 	unsigned long largest = 0;
2254 	int nid, prefer = 0;
2255 
2256 	policy_cache = kmem_cache_create("numa_policy",
2257 					 sizeof(struct mempolicy),
2258 					 0, SLAB_PANIC, NULL);
2259 
2260 	sn_cache = kmem_cache_create("shared_policy_node",
2261 				     sizeof(struct sp_node),
2262 				     0, SLAB_PANIC, NULL);
2263 
2264 	/*
2265 	 * Set interleaving policy for system init. Interleaving is only
2266 	 * enabled across suitably sized nodes (default is >= 16MB), or
2267 	 * fall back to the largest node if they're all smaller.
2268 	 */
2269 	nodes_clear(interleave_nodes);
2270 	for_each_node_state(nid, N_HIGH_MEMORY) {
2271 		unsigned long total_pages = node_present_pages(nid);
2272 
2273 		/* Preserve the largest node */
2274 		if (largest < total_pages) {
2275 			largest = total_pages;
2276 			prefer = nid;
2277 		}
2278 
2279 		/* Interleave this node? */
2280 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2281 			node_set(nid, interleave_nodes);
2282 	}
2283 
2284 	/* All too small, use the largest */
2285 	if (unlikely(nodes_empty(interleave_nodes)))
2286 		node_set(prefer, interleave_nodes);
2287 
2288 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2289 		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2290 }
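
/*
 * Example: on a machine with 2GB, 2GB and 8MB nodes, init's policy
 * interleaves over the two large nodes and ignores the 8MB one; if every
 * node were below the 16MB threshold, only the single largest node would
 * be used instead.
 */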
2291 
2292 /* Reset policy of current process to default */
2293 void numa_default_policy(void)
2294 {
2295 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2296 }
2297 
2298 /*
2299  * Parse and format mempolicy from/to strings
2300  */
2301 
2302 /*
2303  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2304  * Used only for mpol_parse_str() and mpol_to_str()
2305  */
2306 #define MPOL_LOCAL MPOL_MAX
2307 static const char * const policy_modes[] =
2308 {
2309 	[MPOL_DEFAULT]    = "default",
2310 	[MPOL_PREFERRED]  = "prefer",
2311 	[MPOL_BIND]       = "bind",
2312 	[MPOL_INTERLEAVE] = "interleave",
2313 	[MPOL_LOCAL]      = "local"
2314 };
2315 
2317 #ifdef CONFIG_TMPFS
2318 /**
2319  * mpol_parse_str - parse string to mempolicy
2320  * @str:  string containing mempolicy to parse
2321  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2322  * @no_context:  flag whether to "contextualize" the mempolicy
2323  *
2324  * Format of input:
2325  *	<mode>[=<flags>][:<nodelist>]
2326  *
2327  * if @no_context is true, save the input nodemask in w.user_nodemask in
2328  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2329  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2330  * mount option.  Note that if 'static' or 'relative' mode flags were
2331  * specified, the input nodemask will already have been saved.  Saving
2332  * it again is redundant, but safe.
2333  *
2334  * On success, returns 0, else 1
2335  */
2336 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2337 {
2338 	struct mempolicy *new = NULL;
2339 	unsigned short mode;
2340 	unsigned short uninitialized_var(mode_flags);
2341 	nodemask_t nodes;
2342 	char *nodelist = strchr(str, ':');
2343 	char *flags = strchr(str, '=');
2344 	int err = 1;
2345 
2346 	if (nodelist) {
2347 		/* NUL-terminate mode or flags string */
2348 		*nodelist++ = '\0';
2349 		if (nodelist_parse(nodelist, nodes))
2350 			goto out;
2351 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2352 			goto out;
2353 	} else
2354 		nodes_clear(nodes);
2355 
2356 	if (flags)
2357 		*flags++ = '\0';	/* terminate mode string */
2358 
2359 	for (mode = 0; mode <= MPOL_LOCAL; mode++)
2360 		if (!strcmp(str, policy_modes[mode]))
2361 			break;
2364 	if (mode > MPOL_LOCAL)
2365 		goto out;
2366 
2367 	switch (mode) {
2368 	case MPOL_PREFERRED:
2369 		/*
2370 		 * Insist on a nodelist of one node only
2371 		 */
2372 		if (nodelist) {
2373 			char *rest = nodelist;
2374 			while (isdigit(*rest))
2375 				rest++;
2376 			if (*rest)
2377 				goto out;
2378 		}
2379 		break;
2380 	case MPOL_INTERLEAVE:
2381 		/*
2382 		 * Default to online nodes with memory if no nodelist
2383 		 */
2384 		if (!nodelist)
2385 			nodes = node_states[N_HIGH_MEMORY];
2386 		break;
2387 	case MPOL_LOCAL:
2388 		/*
2389 		 * Don't allow a nodelist;  mpol_new() checks flags
2390 		 */
2391 		if (nodelist)
2392 			goto out;
2393 		mode = MPOL_PREFERRED;
2394 		break;
2395 	case MPOL_DEFAULT:
2396 		/*
2397 		 * Insist on a empty nodelist
2398 		 */
2399 		if (!nodelist)
2400 			err = 0;
2401 		goto out;
2402 	case MPOL_BIND:
2403 		/*
2404 		 * Insist on a nodelist
2405 		 */
2406 		if (!nodelist)
2407 			goto out;
2408 	}
2409 
2410 	mode_flags = 0;
2411 	if (flags) {
2412 		/*
2413 		 * Currently, we only support two mutually exclusive
2414 		 * mode flags.
2415 		 */
2416 		if (!strcmp(flags, "static"))
2417 			mode_flags |= MPOL_F_STATIC_NODES;
2418 		else if (!strcmp(flags, "relative"))
2419 			mode_flags |= MPOL_F_RELATIVE_NODES;
2420 		else
2421 			goto out;
2422 	}
2423 
2424 	new = mpol_new(mode, mode_flags, &nodes);
2425 	if (IS_ERR(new))
2426 		goto out;
2427 
2428 	if (no_context) {
2429 		/* save for contextualization */
2430 		new->w.user_nodemask = nodes;
2431 	} else {
2432 		int ret;
2433 		NODEMASK_SCRATCH(scratch);
2434 		if (scratch) {
2435 			task_lock(current);
2436 			ret = mpol_set_nodemask(new, &nodes, scratch);
2437 			task_unlock(current);
2438 		} else
2439 			ret = -ENOMEM;
2440 		NODEMASK_SCRATCH_FREE(scratch);
2441 		if (ret) {
2442 			mpol_put(new);
2443 			goto out;
2444 		}
2445 	}
2446 	err = 0;
2447 
2448 out:
2449 	/* Restore string for error message */
2450 	if (nodelist)
2451 		*--nodelist = ':';
2452 	if (flags)
2453 		*--flags = '=';
2454 	if (!err)
2455 		*mpol = new;
2456 	return err;
2457 }
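
/*
 * Example inputs in the format above, as used for the tmpfs "mpol=" mount
 * option (nodelists must be a subset of the nodes with memory):
 *
 *	"default"			nodelist not allowed
 *	"prefer:2"			exactly one node
 *	"bind:0-3,8"			nodelist required
 *	"interleave"			defaults to all nodes with memory
 *	"interleave=static:0-3"		static (physical) nodemask
 *	"local"				MPOL_PREFERRED + MPOL_F_LOCAL
 */
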
2458 #endif /* CONFIG_TMPFS */
2459 
2460 /**
2461  * mpol_to_str - format a mempolicy structure for printing
2462  * @buffer:  to contain formatted mempolicy string
2463  * @maxlen:  length of @buffer
2464  * @pol:  pointer to mempolicy to be formatted
2465  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2466  *
2467  * Convert a mempolicy into a string.
2468  * Returns the number of characters in buffer (if positive)
2469  * or an error (negative)
2470  */
2471 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2472 {
2473 	char *p = buffer;
2474 	int l;
2475 	nodemask_t nodes;
2476 	unsigned short mode;
2477 	unsigned short flags = pol ? pol->flags : 0;
2478 
2479 	/*
2480 	 * Sanity check:  room for longest mode, flag and some nodes
2481 	 */
2482 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2483 
2484 	if (!pol || pol == &default_policy)
2485 		mode = MPOL_DEFAULT;
2486 	else
2487 		mode = pol->mode;
2488 
2489 	switch (mode) {
2490 	case MPOL_DEFAULT:
2491 		nodes_clear(nodes);
2492 		break;
2493 
2494 	case MPOL_PREFERRED:
2495 		nodes_clear(nodes);
2496 		if (flags & MPOL_F_LOCAL)
2497 			mode = MPOL_LOCAL;	/* pseudo-policy */
2498 		else
2499 			node_set(pol->v.preferred_node, nodes);
2500 		break;
2501 
2502 	case MPOL_BIND:
2503 		/* Fall through */
2504 	case MPOL_INTERLEAVE:
2505 		if (no_context)
2506 			nodes = pol->w.user_nodemask;
2507 		else
2508 			nodes = pol->v.nodes;
2509 		break;
2510 
2511 	default:
2512 		BUG();
2513 	}
2514 
2515 	l = strlen(policy_modes[mode]);
2516 	if (buffer + maxlen < p + l + 1)
2517 		return -ENOSPC;
2518 
2519 	strcpy(p, policy_modes[mode]);
2520 	p += l;
2521 
2522 	if (flags & MPOL_MODE_FLAGS) {
2523 		if (buffer + maxlen < p + 2)
2524 			return -ENOSPC;
2525 		*p++ = '=';
2526 
2527 		/*
2528 		 * Currently, the only defined flags are mutually exclusive
2529 		 */
2530 		if (flags & MPOL_F_STATIC_NODES)
2531 			p += snprintf(p, buffer + maxlen - p, "static");
2532 		else if (flags & MPOL_F_RELATIVE_NODES)
2533 			p += snprintf(p, buffer + maxlen - p, "relative");
2534 	}
2535 
2536 	if (!nodes_empty(nodes)) {
2537 		if (buffer + maxlen < p + 2)
2538 			return -ENOSPC;
2539 		*p++ = ':';
2540 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2541 	}
2542 	return p - buffer;
2543 }
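
/*
 * Example outputs, mirroring the parse format above; this is what shows
 * up e.g. in /proc/<pid>/numa_maps and in tmpfs mount options:
 *
 *	"default"
 *	"prefer:2"
 *	"bind:0-3,8"
 *	"interleave=relative:0-3"
 *	"local"
 */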
2544