xref: /linux/mm/mempolicy.c (revision 273b281fa22c293963ee3e6eec418f5dda2dbc83)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
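/*
 * Rough userspace-side sketch of the policies above (illustrative, not part
 * of this file; the wrappers and MPOL_* constants normally come from
 * libnuma's <numaif.h>).  A nodemask is passed as an array of unsigned longs:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 2);	// nodes 0 and 2
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *	mbind(addr, len, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 *
 * The first call sets the process policy, the second applies a VMA policy
 * to [addr, addr + len).
 */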
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/gfp.h>
77 #include <linux/slab.h>
78 #include <linux/string.h>
79 #include <linux/module.h>
80 #include <linux/nsproxy.h>
81 #include <linux/interrupt.h>
82 #include <linux/init.h>
83 #include <linux/compat.h>
84 #include <linux/swap.h>
85 #include <linux/seq_file.h>
86 #include <linux/proc_fs.h>
87 #include <linux/migrate.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 #include "internal.h"
97 
98 /* Internal flags */
99 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
100 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
101 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
102 
103 static struct kmem_cache *policy_cache;
104 static struct kmem_cache *sn_cache;
105 
106 /* Highest zone. A specific allocation for a zone below that is not
107    policied. */
108 enum zone_type policy_zone = 0;
109 
110 /*
111  * run-time system-wide default policy => local allocation
112  */
113 struct mempolicy default_policy = {
114 	.refcnt = ATOMIC_INIT(1), /* never free it */
115 	.mode = MPOL_PREFERRED,
116 	.flags = MPOL_F_LOCAL,
117 };
118 
119 static const struct mempolicy_operations {
120 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
122 } mpol_ops[MPOL_MAX];
123 
124 /* Check that the nodemask contains at least one populated zone */
125 static int is_valid_nodemask(const nodemask_t *nodemask)
126 {
127 	int nd, k;
128 
129 	/* Check that there is something useful in this mask */
130 	k = policy_zone;
131 
132 	for_each_node_mask(nd, *nodemask) {
133 		struct zone *z;
134 
135 		for (k = 0; k <= policy_zone; k++) {
136 			z = &NODE_DATA(nd)->node_zones[k];
137 			if (z->present_pages > 0)
138 				return 1;
139 		}
140 	}
141 
142 	return 0;
143 }
144 
145 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
146 {
147 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
148 }
149 
150 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
151 				   const nodemask_t *rel)
152 {
153 	nodemask_t tmp;
154 	nodes_fold(tmp, *orig, nodes_weight(*rel));
155 	nodes_onto(*ret, tmp, *rel);
156 }
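/*
 * Worked example (illustrative, not from the original source): with
 * cpuset-relative nodes *orig = {0,2,5} and allowed set *rel = {4,6,8}
 * (weight 3), nodes_fold() wraps the relative node numbers modulo 3,
 * giving {0,2}, and nodes_onto() maps bit i to the i-th node of *rel,
 * so the result is {4,8}: "relative node 0" -> node 4,
 * "relative node 2" -> node 8.
 */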
157 
158 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
159 {
160 	if (nodes_empty(*nodes))
161 		return -EINVAL;
162 	pol->v.nodes = *nodes;
163 	return 0;
164 }
165 
166 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
167 {
168 	if (!nodes)
169 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
170 	else if (nodes_empty(*nodes))
171 		return -EINVAL;			/*  no allowed nodes */
172 	else
173 		pol->v.preferred_node = first_node(*nodes);
174 	return 0;
175 }
176 
177 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (!is_valid_nodemask(nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184 
185 /*
186  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187  * any, for the new policy.  mpol_new() has already validated the nodes
188  * parameter with respect to the policy mode and flags.  But, we need to
189  * handle an empty nodemask with MPOL_PREFERRED here.
190  *
191  * Must be called holding task's alloc_lock to protect task's mems_allowed
192  * and mempolicy.  May also be called holding the mmap_sem for write.
193  */
194 static int mpol_set_nodemask(struct mempolicy *pol,
195 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
196 {
197 	int ret;
198 
199 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 	if (pol == NULL)
201 		return 0;
202 	/* Check N_HIGH_MEMORY */
203 	nodes_and(nsc->mask1,
204 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
205 
206 	VM_BUG_ON(!nodes);
207 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
208 		nodes = NULL;	/* explicit local allocation */
209 	else {
210 		if (pol->flags & MPOL_F_RELATIVE_NODES)
211 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
212 		else
213 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
214 
215 		if (mpol_store_user_nodemask(pol))
216 			pol->w.user_nodemask = *nodes;
217 		else
218 			pol->w.cpuset_mems_allowed =
219 						cpuset_current_mems_allowed;
220 	}
221 
222 	if (nodes)
223 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224 	else
225 		ret = mpol_ops[pol->mode].create(pol, NULL);
226 	return ret;
227 }
228 
229 /*
230  * This function just creates a new policy, does some checks and simple
231  * initialization. You must invoke mpol_set_nodemask() to set nodes.
232  */
233 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
234 				  nodemask_t *nodes)
235 {
236 	struct mempolicy *policy;
237 
238 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
239 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
240 
241 	if (mode == MPOL_DEFAULT) {
242 		if (nodes && !nodes_empty(*nodes))
243 			return ERR_PTR(-EINVAL);
244 		return NULL;	/* simply delete any existing policy */
245 	}
246 	VM_BUG_ON(!nodes);
247 
248 	/*
249 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
250 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
251 	 * All other modes require a valid pointer to a non-empty nodemask.
252 	 */
253 	if (mode == MPOL_PREFERRED) {
254 		if (nodes_empty(*nodes)) {
255 			if (((flags & MPOL_F_STATIC_NODES) ||
256 			     (flags & MPOL_F_RELATIVE_NODES)))
257 				return ERR_PTR(-EINVAL);
258 		}
259 	} else if (nodes_empty(*nodes))
260 		return ERR_PTR(-EINVAL);
261 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
262 	if (!policy)
263 		return ERR_PTR(-ENOMEM);
264 	atomic_set(&policy->refcnt, 1);
265 	policy->mode = mode;
266 	policy->flags = flags;
267 
268 	return policy;
269 }
270 
271 /* Slow path of a mpol destructor. */
272 void __mpol_put(struct mempolicy *p)
273 {
274 	if (!atomic_dec_and_test(&p->refcnt))
275 		return;
276 	kmem_cache_free(policy_cache, p);
277 }
278 
279 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
280 {
281 }
282 
283 static void mpol_rebind_nodemask(struct mempolicy *pol,
284 				 const nodemask_t *nodes)
285 {
286 	nodemask_t tmp;
287 
288 	if (pol->flags & MPOL_F_STATIC_NODES)
289 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
290 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
291 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
292 	else {
293 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
294 			    *nodes);
295 		pol->w.cpuset_mems_allowed = *nodes;
296 	}
297 
298 	pol->v.nodes = tmp;
299 	if (!node_isset(current->il_next, tmp)) {
300 		current->il_next = next_node(current->il_next, tmp);
301 		if (current->il_next >= MAX_NUMNODES)
302 			current->il_next = first_node(tmp);
303 		if (current->il_next >= MAX_NUMNODES)
304 			current->il_next = numa_node_id();
305 	}
306 }
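/*
 * Illustrative example (not from the original source): suppose the user set
 * MPOL_INTERLEAVE over nodes {0,1} and the cpuset's mems then change from
 * {0,1} to {2,3}.  With MPOL_F_STATIC_NODES the mask stays the literal {0,1}
 * intersected with the new mems (here: empty); with MPOL_F_RELATIVE_NODES
 * the stored relative mask {0,1} is remapped onto {2,3}; with neither flag
 * the default remap above also yields {2,3}.
 */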
307 
308 static void mpol_rebind_preferred(struct mempolicy *pol,
309 				  const nodemask_t *nodes)
310 {
311 	nodemask_t tmp;
312 
313 	if (pol->flags & MPOL_F_STATIC_NODES) {
314 		int node = first_node(pol->w.user_nodemask);
315 
316 		if (node_isset(node, *nodes)) {
317 			pol->v.preferred_node = node;
318 			pol->flags &= ~MPOL_F_LOCAL;
319 		} else
320 			pol->flags |= MPOL_F_LOCAL;
321 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
322 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 		pol->v.preferred_node = first_node(tmp);
324 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
325 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
326 						   pol->w.cpuset_mems_allowed,
327 						   *nodes);
328 		pol->w.cpuset_mems_allowed = *nodes;
329 	}
330 }
331 
332 /* Migrate a policy to a different set of nodes */
333 static void mpol_rebind_policy(struct mempolicy *pol,
334 			       const nodemask_t *newmask)
335 {
336 	if (!pol)
337 		return;
338 	if (!mpol_store_user_nodemask(pol) &&
339 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
340 		return;
341 	mpol_ops[pol->mode].rebind(pol, newmask);
342 }
343 
344 /*
345  * Wrapper for mpol_rebind_policy() that just requires task
346  * pointer, and updates task mempolicy.
347  *
348  * Called with task's alloc_lock held.
349  */
350 
351 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
352 {
353 	mpol_rebind_policy(tsk->mempolicy, new);
354 }
355 
356 /*
357  * Rebind each vma in mm to new nodemask.
358  *
359  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
360  */
361 
362 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
363 {
364 	struct vm_area_struct *vma;
365 
366 	down_write(&mm->mmap_sem);
367 	for (vma = mm->mmap; vma; vma = vma->vm_next)
368 		mpol_rebind_policy(vma->vm_policy, new);
369 	up_write(&mm->mmap_sem);
370 }
371 
372 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
373 	[MPOL_DEFAULT] = {
374 		.rebind = mpol_rebind_default,
375 	},
376 	[MPOL_INTERLEAVE] = {
377 		.create = mpol_new_interleave,
378 		.rebind = mpol_rebind_nodemask,
379 	},
380 	[MPOL_PREFERRED] = {
381 		.create = mpol_new_preferred,
382 		.rebind = mpol_rebind_preferred,
383 	},
384 	[MPOL_BIND] = {
385 		.create = mpol_new_bind,
386 		.rebind = mpol_rebind_nodemask,
387 	},
388 };
389 
390 static void gather_stats(struct page *, void *, int pte_dirty);
391 static void migrate_page_add(struct page *page, struct list_head *pagelist,
392 				unsigned long flags);
393 
394 /* Scan through the pages, checking whether they satisfy the given conditions. */
395 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
396 		unsigned long addr, unsigned long end,
397 		const nodemask_t *nodes, unsigned long flags,
398 		void *private)
399 {
400 	pte_t *orig_pte;
401 	pte_t *pte;
402 	spinlock_t *ptl;
403 
404 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
405 	do {
406 		struct page *page;
407 		int nid;
408 
409 		if (!pte_present(*pte))
410 			continue;
411 		page = vm_normal_page(vma, addr, *pte);
412 		if (!page)
413 			continue;
414 		/*
415 		 * The check for PageReserved here is important to avoid
416 		 * handling zero pages and other pages that may have been
417 		 * marked special by the system.
418 		 *
419 		 * If PageReserved were not checked here then, e.g.,
420 		 * the location of the zero page could have an influence
421 		 * on MPOL_MF_STRICT, zero pages would be counted in
422 		 * the per-node stats, and there would be useless attempts
423 		 * to put zero pages on the migration list.
424 		 */
425 		if (PageReserved(page))
426 			continue;
427 		nid = page_to_nid(page);
428 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
429 			continue;
430 
431 		if (flags & MPOL_MF_STATS)
432 			gather_stats(page, private, pte_dirty(*pte));
433 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
434 			migrate_page_add(page, private, flags);
435 		else
436 			break;
437 	} while (pte++, addr += PAGE_SIZE, addr != end);
438 	pte_unmap_unlock(orig_pte, ptl);
439 	return addr != end;
440 }
441 
442 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
443 		unsigned long addr, unsigned long end,
444 		const nodemask_t *nodes, unsigned long flags,
445 		void *private)
446 {
447 	pmd_t *pmd;
448 	unsigned long next;
449 
450 	pmd = pmd_offset(pud, addr);
451 	do {
452 		next = pmd_addr_end(addr, end);
453 		if (pmd_none_or_clear_bad(pmd))
454 			continue;
455 		if (check_pte_range(vma, pmd, addr, next, nodes,
456 				    flags, private))
457 			return -EIO;
458 	} while (pmd++, addr = next, addr != end);
459 	return 0;
460 }
461 
462 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
463 		unsigned long addr, unsigned long end,
464 		const nodemask_t *nodes, unsigned long flags,
465 		void *private)
466 {
467 	pud_t *pud;
468 	unsigned long next;
469 
470 	pud = pud_offset(pgd, addr);
471 	do {
472 		next = pud_addr_end(addr, end);
473 		if (pud_none_or_clear_bad(pud))
474 			continue;
475 		if (check_pmd_range(vma, pud, addr, next, nodes,
476 				    flags, private))
477 			return -EIO;
478 	} while (pud++, addr = next, addr != end);
479 	return 0;
480 }
481 
482 static inline int check_pgd_range(struct vm_area_struct *vma,
483 		unsigned long addr, unsigned long end,
484 		const nodemask_t *nodes, unsigned long flags,
485 		void *private)
486 {
487 	pgd_t *pgd;
488 	unsigned long next;
489 
490 	pgd = pgd_offset(vma->vm_mm, addr);
491 	do {
492 		next = pgd_addr_end(addr, end);
493 		if (pgd_none_or_clear_bad(pgd))
494 			continue;
495 		if (check_pud_range(vma, pgd, addr, next, nodes,
496 				    flags, private))
497 			return -EIO;
498 	} while (pgd++, addr = next, addr != end);
499 	return 0;
500 }
501 
502 /*
503  * Check if all pages in a range are on a set of nodes.
504  * If pagelist != NULL then isolate pages from the LRU and
505  * put them on the pagelist.
506  */
507 static struct vm_area_struct *
508 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
509 		const nodemask_t *nodes, unsigned long flags, void *private)
510 {
511 	int err;
512 	struct vm_area_struct *first, *vma, *prev;
513 
514 
515 	first = find_vma(mm, start);
516 	if (!first)
517 		return ERR_PTR(-EFAULT);
518 	prev = NULL;
519 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
520 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
521 			if (!vma->vm_next && vma->vm_end < end)
522 				return ERR_PTR(-EFAULT);
523 			if (prev && prev->vm_end < vma->vm_start)
524 				return ERR_PTR(-EFAULT);
525 		}
526 		if (!is_vm_hugetlb_page(vma) &&
527 		    ((flags & MPOL_MF_STRICT) ||
528 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
529 				vma_migratable(vma)))) {
530 			unsigned long endvma = vma->vm_end;
531 
532 			if (endvma > end)
533 				endvma = end;
534 			if (vma->vm_start > start)
535 				start = vma->vm_start;
536 			err = check_pgd_range(vma, start, endvma, nodes,
537 						flags, private);
538 			if (err) {
539 				first = ERR_PTR(err);
540 				break;
541 			}
542 		}
543 		prev = vma;
544 	}
545 	return first;
546 }
547 
548 /* Apply policy to a single VMA */
549 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
550 {
551 	int err = 0;
552 	struct mempolicy *old = vma->vm_policy;
553 
554 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
555 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
556 		 vma->vm_ops, vma->vm_file,
557 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
558 
559 	if (vma->vm_ops && vma->vm_ops->set_policy)
560 		err = vma->vm_ops->set_policy(vma, new);
561 	if (!err) {
562 		mpol_get(new);
563 		vma->vm_policy = new;
564 		mpol_put(old);
565 	}
566 	return err;
567 }
568 
569 /* Step 2: apply policy to a range and do splits. */
570 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
571 		       unsigned long end, struct mempolicy *new)
572 {
573 	struct vm_area_struct *next;
574 	int err;
575 
576 	err = 0;
577 	for (; vma && vma->vm_start < end; vma = next) {
578 		next = vma->vm_next;
579 		if (vma->vm_start < start)
580 			err = split_vma(vma->vm_mm, vma, start, 1);
581 		if (!err && vma->vm_end > end)
582 			err = split_vma(vma->vm_mm, vma, end, 0);
583 		if (!err)
584 			err = policy_vma(vma, new);
585 		if (err)
586 			break;
587 	}
588 	return err;
589 }
590 
591 /*
592  * Update task->flags PF_MEMPOLICY bit: set iff non-default
593  * mempolicy.  Allows more rapid checking of this (combined perhaps
594  * with other PF_* flag bits) on memory allocation hot code paths.
595  *
596  * If called from outside this file, the task 'p' should -only- be
597  * a newly forked child not yet visible on the task list, because
598  * manipulating the task flags of a visible task is not safe.
599  *
600  * The above limitation is why this routine has the funny name
601  * mpol_fix_fork_child_flag().
602  *
603  * It is also safe to call this with a task pointer of current,
604  * which the static wrapper mpol_set_task_struct_flag() does,
605  * for use within this file.
606  */
607 
608 void mpol_fix_fork_child_flag(struct task_struct *p)
609 {
610 	if (p->mempolicy)
611 		p->flags |= PF_MEMPOLICY;
612 	else
613 		p->flags &= ~PF_MEMPOLICY;
614 }
615 
616 static void mpol_set_task_struct_flag(void)
617 {
618 	mpol_fix_fork_child_flag(current);
619 }
620 
621 /* Set the process memory policy */
622 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
623 			     nodemask_t *nodes)
624 {
625 	struct mempolicy *new, *old;
626 	struct mm_struct *mm = current->mm;
627 	NODEMASK_SCRATCH(scratch);
628 	int ret;
629 
630 	if (!scratch)
631 		return -ENOMEM;
632 
633 	new = mpol_new(mode, flags, nodes);
634 	if (IS_ERR(new)) {
635 		ret = PTR_ERR(new);
636 		goto out;
637 	}
638 	/*
639 	 * prevent changing our mempolicy while show_numa_maps()
640 	 * is using it.
641 	 * Note:  do_set_mempolicy() can be called at init time
642 	 * with no 'mm'.
643 	 */
644 	if (mm)
645 		down_write(&mm->mmap_sem);
646 	task_lock(current);
647 	ret = mpol_set_nodemask(new, nodes, scratch);
648 	if (ret) {
649 		task_unlock(current);
650 		if (mm)
651 			up_write(&mm->mmap_sem);
652 		mpol_put(new);
653 		goto out;
654 	}
655 	old = current->mempolicy;
656 	current->mempolicy = new;
657 	mpol_set_task_struct_flag();
658 	if (new && new->mode == MPOL_INTERLEAVE &&
659 	    nodes_weight(new->v.nodes))
660 		current->il_next = first_node(new->v.nodes);
661 	task_unlock(current);
662 	if (mm)
663 		up_write(&mm->mmap_sem);
664 
665 	mpol_put(old);
666 	ret = 0;
667 out:
668 	NODEMASK_SCRATCH_FREE(scratch);
669 	return ret;
670 }
671 
672 /*
673  * Return nodemask for policy for get_mempolicy() query
674  *
675  * Called with task's alloc_lock held
676  */
677 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
678 {
679 	nodes_clear(*nodes);
680 	if (p == &default_policy)
681 		return;
682 
683 	switch (p->mode) {
684 	case MPOL_BIND:
685 		/* Fall through */
686 	case MPOL_INTERLEAVE:
687 		*nodes = p->v.nodes;
688 		break;
689 	case MPOL_PREFERRED:
690 		if (!(p->flags & MPOL_F_LOCAL))
691 			node_set(p->v.preferred_node, *nodes);
692 		/* else return empty node mask for local allocation */
693 		break;
694 	default:
695 		BUG();
696 	}
697 }
698 
699 static int lookup_node(struct mm_struct *mm, unsigned long addr)
700 {
701 	struct page *p;
702 	int err;
703 
704 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
705 	if (err >= 0) {
706 		err = page_to_nid(p);
707 		put_page(p);
708 	}
709 	return err;
710 }
711 
712 /* Retrieve NUMA policy */
713 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
714 			     unsigned long addr, unsigned long flags)
715 {
716 	int err;
717 	struct mm_struct *mm = current->mm;
718 	struct vm_area_struct *vma = NULL;
719 	struct mempolicy *pol = current->mempolicy;
720 
721 	if (flags &
722 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
723 		return -EINVAL;
724 
725 	if (flags & MPOL_F_MEMS_ALLOWED) {
726 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
727 			return -EINVAL;
728 		*policy = 0;	/* just so it's initialized */
729 		task_lock(current);
730 		*nmask  = cpuset_current_mems_allowed;
731 		task_unlock(current);
732 		return 0;
733 	}
734 
735 	if (flags & MPOL_F_ADDR) {
736 		/*
737 		 * Do NOT fall back to task policy if the
738 		 * vma/shared policy at addr is NULL.  We
739 		 * want to return MPOL_DEFAULT in this case.
740 		 */
741 		down_read(&mm->mmap_sem);
742 		vma = find_vma_intersection(mm, addr, addr+1);
743 		if (!vma) {
744 			up_read(&mm->mmap_sem);
745 			return -EFAULT;
746 		}
747 		if (vma->vm_ops && vma->vm_ops->get_policy)
748 			pol = vma->vm_ops->get_policy(vma, addr);
749 		else
750 			pol = vma->vm_policy;
751 	} else if (addr)
752 		return -EINVAL;
753 
754 	if (!pol)
755 		pol = &default_policy;	/* indicates default behavior */
756 
757 	if (flags & MPOL_F_NODE) {
758 		if (flags & MPOL_F_ADDR) {
759 			err = lookup_node(mm, addr);
760 			if (err < 0)
761 				goto out;
762 			*policy = err;
763 		} else if (pol == current->mempolicy &&
764 				pol->mode == MPOL_INTERLEAVE) {
765 			*policy = current->il_next;
766 		} else {
767 			err = -EINVAL;
768 			goto out;
769 		}
770 	} else {
771 		*policy = pol == &default_policy ? MPOL_DEFAULT :
772 						pol->mode;
773 		/*
774 		 * Internal mempolicy flags must be masked off before exposing
775 		 * the policy to userspace.
776 		 */
777 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
778 	}
779 
780 	if (vma) {
781 		up_read(&current->mm->mmap_sem);
782 		vma = NULL;
783 	}
784 
785 	err = 0;
786 	if (nmask) {
787 		task_lock(current);
788 		get_policy_nodemask(pol, nmask);
789 		task_unlock(current);
790 	}
791 
792  out:
793 	mpol_cond_put(pol);
794 	if (vma)
795 		up_read(&current->mm->mmap_sem);
796 	return err;
797 }
798 
799 #ifdef CONFIG_MIGRATION
800 /*
801  * page migration
802  */
803 static void migrate_page_add(struct page *page, struct list_head *pagelist,
804 				unsigned long flags)
805 {
806 	/*
807 	 * Avoid migrating a page that is shared with others.
808 	 */
809 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
810 		if (!isolate_lru_page(page)) {
811 			list_add_tail(&page->lru, pagelist);
812 		}
813 	}
814 }
815 
816 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
817 {
818 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
819 }
820 
821 /*
822  * Migrate pages from one node to a target node.
823  * Returns error or the number of pages not migrated.
824  */
825 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
826 			   int flags)
827 {
828 	nodemask_t nmask;
829 	LIST_HEAD(pagelist);
830 	int err = 0;
831 
832 	nodes_clear(nmask);
833 	node_set(source, nmask);
834 
835 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
836 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
837 
838 	if (!list_empty(&pagelist))
839 		err = migrate_pages(&pagelist, new_node_page, dest);
840 
841 	return err;
842 }
843 
844 /*
845  * Move pages between the two nodesets so as to preserve the physical
846  * layout as much as possible.
847  *
848  * Returns the number of pages that could not be moved.
849  */
850 int do_migrate_pages(struct mm_struct *mm,
851 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
852 {
853 	int busy = 0;
854 	int err;
855 	nodemask_t tmp;
856 
857 	err = migrate_prep();
858 	if (err)
859 		return err;
860 
861 	down_read(&mm->mmap_sem);
862 
863 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
864 	if (err)
865 		goto out;
866 
867 /*
868  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
869  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
870  * bit in 'tmp', and return that <source, dest> pair for migration.
871  * The pair of nodemasks 'to' and 'from' define the map.
872  *
873  * If no pair of bits is found that way, fallback to picking some
874  * pair of 'source' and 'dest' bits that are not the same.  If the
875  * 'source' and 'dest' bits are the same, this represents a node
876  * that will be migrating to itself, so no pages need move.
877  *
878  * If no bits are left in 'tmp', or if all remaining bits left
879  * in 'tmp' correspond to the same bit in 'to', return false
880  * (nothing left to migrate).
881  *
882  * This lets us pick a pair of nodes to migrate between, such that
883  * if possible the dest node is not already occupied by some other
884  * source node, minimizing the risk of overloading the memory on a
885  * node, which would happen if we migrated incoming memory to a node
886  * before migrating outgoing memory off that same node.
887  *
888  * A single scan of tmp is sufficient.  As we go, we remember the
889  * most recent <s, d> pair that moved (s != d).  If we find a pair
890  * that not only moved, but what's better, moved to an empty slot
891  * (d is not set in tmp), then we break out right away with that pair.
892  * Otherwise, when we finish scanning tmp, we at least have the
893  * most recent <s, d> pair that moved.  If we get all the way through
894  * the scan of tmp without finding any node that moved, much less
895  * moved to an empty node, then there is nothing left worth migrating.
896  */
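	/*
	 * Worked example (illustrative): from_nodes = {0,1}, to_nodes = {1,2}.
	 * Scanning tmp = {0,1}: node 0 maps to 1 (dest still in tmp, keep
	 * looking), node 1 maps to 2 (dest not in tmp, use it).  So node 1 is
	 * drained to node 2 first, and only then node 0 is migrated to node 1,
	 * which avoids piling new pages onto node 1 before it has been emptied.
	 */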
897 
898 	tmp = *from_nodes;
899 	while (!nodes_empty(tmp)) {
900 		int s, d;
901 		int source = -1;
902 		int dest = 0;
903 
904 		for_each_node_mask(s, tmp) {
905 			d = node_remap(s, *from_nodes, *to_nodes);
906 			if (s == d)
907 				continue;
908 
909 			source = s;	/* Node moved. Memorize */
910 			dest = d;
911 
912 			/* dest not in remaining from nodes? */
913 			if (!node_isset(dest, tmp))
914 				break;
915 		}
916 		if (source == -1)
917 			break;
918 
919 		node_clear(source, tmp);
920 		err = migrate_to_node(mm, source, dest, flags);
921 		if (err > 0)
922 			busy += err;
923 		if (err < 0)
924 			break;
925 	}
926 out:
927 	up_read(&mm->mmap_sem);
928 	if (err < 0)
929 		return err;
930 	return busy;
931 
932 }
933 
934 /*
935  * Allocate a new page for page migration based on vma policy.
936  * Start assuming that page is mapped by vma pointed to by @private.
937  * Search forward from there, if not.  N.B., this assumes that the
938  * list of pages handed to migrate_pages()--which is how we get here--
939  * is in virtual address order.
940  */
941 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
942 {
943 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
944 	unsigned long uninitialized_var(address);
945 
946 	while (vma) {
947 		address = page_address_in_vma(page, vma);
948 		if (address != -EFAULT)
949 			break;
950 		vma = vma->vm_next;
951 	}
952 
953 	/*
954 	 * if !vma, alloc_page_vma() will use task or system default policy
955 	 */
956 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
957 }
958 #else
959 
960 static void migrate_page_add(struct page *page, struct list_head *pagelist,
961 				unsigned long flags)
962 {
963 }
964 
965 int do_migrate_pages(struct mm_struct *mm,
966 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
967 {
968 	return -ENOSYS;
969 }
970 
971 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
972 {
973 	return NULL;
974 }
975 #endif
976 
977 static long do_mbind(unsigned long start, unsigned long len,
978 		     unsigned short mode, unsigned short mode_flags,
979 		     nodemask_t *nmask, unsigned long flags)
980 {
981 	struct vm_area_struct *vma;
982 	struct mm_struct *mm = current->mm;
983 	struct mempolicy *new;
984 	unsigned long end;
985 	int err;
986 	LIST_HEAD(pagelist);
987 
988 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
989 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
990 		return -EINVAL;
991 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
992 		return -EPERM;
993 
994 	if (start & ~PAGE_MASK)
995 		return -EINVAL;
996 
997 	if (mode == MPOL_DEFAULT)
998 		flags &= ~MPOL_MF_STRICT;
999 
1000 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1001 	end = start + len;
1002 
1003 	if (end < start)
1004 		return -EINVAL;
1005 	if (end == start)
1006 		return 0;
1007 
1008 	new = mpol_new(mode, mode_flags, nmask);
1009 	if (IS_ERR(new))
1010 		return PTR_ERR(new);
1011 
1012 	/*
1013 	 * If we are using the default policy then operations
1014 	 * on discontinuous address spaces are okay after all.
1015 	 */
1016 	if (!new)
1017 		flags |= MPOL_MF_DISCONTIG_OK;
1018 
1019 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1020 		 start, start + len, mode, mode_flags,
1021 		 nmask ? nodes_addr(*nmask)[0] : -1);
1022 
1023 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1024 
1025 		err = migrate_prep();
1026 		if (err)
1027 			goto mpol_out;
1028 	}
1029 	{
1030 		NODEMASK_SCRATCH(scratch);
1031 		if (scratch) {
1032 			down_write(&mm->mmap_sem);
1033 			task_lock(current);
1034 			err = mpol_set_nodemask(new, nmask, scratch);
1035 			task_unlock(current);
1036 			if (err)
1037 				up_write(&mm->mmap_sem);
1038 		} else
1039 			err = -ENOMEM;
1040 		NODEMASK_SCRATCH_FREE(scratch);
1041 	}
1042 	if (err)
1043 		goto mpol_out;
1044 
1045 	vma = check_range(mm, start, end, nmask,
1046 			  flags | MPOL_MF_INVERT, &pagelist);
1047 
1048 	err = PTR_ERR(vma);
1049 	if (!IS_ERR(vma)) {
1050 		int nr_failed = 0;
1051 
1052 		err = mbind_range(vma, start, end, new);
1053 
1054 		if (!list_empty(&pagelist))
1055 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1056 						(unsigned long)vma);
1057 
1058 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1059 			err = -EIO;
1060 	} else
1061 		putback_lru_pages(&pagelist);
1062 
1063 	up_write(&mm->mmap_sem);
1064  mpol_out:
1065 	mpol_put(new);
1066 	return err;
1067 }
1068 
1069 /*
1070  * User space interface with variable sized bitmaps for nodelists.
1071  */
1072 
1073 /* Copy a node mask from user space. */
1074 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1075 		     unsigned long maxnode)
1076 {
1077 	unsigned long k;
1078 	unsigned long nlongs;
1079 	unsigned long endmask;
1080 
1081 	--maxnode;
1082 	nodes_clear(*nodes);
1083 	if (maxnode == 0 || !nmask)
1084 		return 0;
1085 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1086 		return -EINVAL;
1087 
1088 	nlongs = BITS_TO_LONGS(maxnode);
1089 	if ((maxnode % BITS_PER_LONG) == 0)
1090 		endmask = ~0UL;
1091 	else
1092 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1093 
1094 	/* When the user specifies more nodes than supported, just check
1095 	   that the unsupported part is all zero. */
1096 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1097 		if (nlongs > PAGE_SIZE/sizeof(long))
1098 			return -EINVAL;
1099 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1100 			unsigned long t;
1101 			if (get_user(t, nmask + k))
1102 				return -EFAULT;
1103 			if (k == nlongs - 1) {
1104 				if (t & endmask)
1105 					return -EINVAL;
1106 			} else if (t)
1107 				return -EINVAL;
1108 		}
1109 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1110 		endmask = ~0UL;
1111 	}
1112 
1113 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1114 		return -EFAULT;
1115 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1116 	return 0;
1117 }
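/*
 * Arithmetic sketch (illustrative, assuming a 64-bit kernel): if userspace
 * passes maxnode = 17, the code above uses maxnode - 1 = 16 bits, so
 * nlongs = 1 and endmask = (1UL << 16) - 1 = 0xffff; only node bits 0..15
 * survive the final mask, and any higher bit set by the caller is dropped.
 */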
1118 
1119 /* Copy a kernel node mask to user space */
1120 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1121 			      nodemask_t *nodes)
1122 {
1123 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1124 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1125 
1126 	if (copy > nbytes) {
1127 		if (copy > PAGE_SIZE)
1128 			return -EINVAL;
1129 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1130 			return -EFAULT;
1131 		copy = nbytes;
1132 	}
1133 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1134 }
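/*
 * Arithmetic sketch (illustrative, assuming 64-bit longs and a kernel built
 * with MAX_NUMNODES = 64): for maxnode = 1024, copy = ALIGN(1023, 64) / 8 =
 * 128 bytes while nbytes = 8, so the 120 user-space bytes beyond the kernel
 * nodemask are cleared and only the first 8 bytes are copied out.
 */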
1135 
1136 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1137 		unsigned long, mode, unsigned long __user *, nmask,
1138 		unsigned long, maxnode, unsigned, flags)
1139 {
1140 	nodemask_t nodes;
1141 	int err;
1142 	unsigned short mode_flags;
1143 
1144 	mode_flags = mode & MPOL_MODE_FLAGS;
1145 	mode &= ~MPOL_MODE_FLAGS;
1146 	if (mode >= MPOL_MAX)
1147 		return -EINVAL;
1148 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1149 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1150 		return -EINVAL;
1151 	err = get_nodes(&nodes, nmask, maxnode);
1152 	if (err)
1153 		return err;
1154 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1155 }
1156 
1157 /* Set the process memory policy */
1158 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1159 		unsigned long, maxnode)
1160 {
1161 	int err;
1162 	nodemask_t nodes;
1163 	unsigned short flags;
1164 
1165 	flags = mode & MPOL_MODE_FLAGS;
1166 	mode &= ~MPOL_MODE_FLAGS;
1167 	if ((unsigned int)mode >= MPOL_MAX)
1168 		return -EINVAL;
1169 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1170 		return -EINVAL;
1171 	err = get_nodes(&nodes, nmask, maxnode);
1172 	if (err)
1173 		return err;
1174 	return do_set_mempolicy(mode, flags, &nodes);
1175 }
1176 
1177 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1178 		const unsigned long __user *, old_nodes,
1179 		const unsigned long __user *, new_nodes)
1180 {
1181 	const struct cred *cred = current_cred(), *tcred;
1182 	struct mm_struct *mm;
1183 	struct task_struct *task;
1184 	nodemask_t old;
1185 	nodemask_t new;
1186 	nodemask_t task_nodes;
1187 	int err;
1188 
1189 	err = get_nodes(&old, old_nodes, maxnode);
1190 	if (err)
1191 		return err;
1192 
1193 	err = get_nodes(&new, new_nodes, maxnode);
1194 	if (err)
1195 		return err;
1196 
1197 	/* Find the mm_struct */
1198 	read_lock(&tasklist_lock);
1199 	task = pid ? find_task_by_vpid(pid) : current;
1200 	if (!task) {
1201 		read_unlock(&tasklist_lock);
1202 		return -ESRCH;
1203 	}
1204 	mm = get_task_mm(task);
1205 	read_unlock(&tasklist_lock);
1206 
1207 	if (!mm)
1208 		return -EINVAL;
1209 
1210 	/*
1211 	 * Check if this process has the right to modify the specified
1212 	 * process. The right exists if the process has administrative
1213 	 * capabilities, superuser privileges or the same
1214 	 * userid as the target process.
1215 	 */
1216 	rcu_read_lock();
1217 	tcred = __task_cred(task);
1218 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1219 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1220 	    !capable(CAP_SYS_NICE)) {
1221 		rcu_read_unlock();
1222 		err = -EPERM;
1223 		goto out;
1224 	}
1225 	rcu_read_unlock();
1226 
1227 	task_nodes = cpuset_mems_allowed(task);
1228 	/* Is the user allowed to access the target nodes? */
1229 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1230 		err = -EPERM;
1231 		goto out;
1232 	}
1233 
1234 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1235 		err = -EINVAL;
1236 		goto out;
1237 	}
1238 
1239 	err = security_task_movememory(task);
1240 	if (err)
1241 		goto out;
1242 
1243 	err = do_migrate_pages(mm, &old, &new,
1244 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1245 out:
1246 	mmput(mm);
1247 	return err;
1248 }
1249 
1250 
1251 /* Retrieve NUMA policy */
1252 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1253 		unsigned long __user *, nmask, unsigned long, maxnode,
1254 		unsigned long, addr, unsigned long, flags)
1255 {
1256 	int err;
1257 	int uninitialized_var(pval);
1258 	nodemask_t nodes;
1259 
1260 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1261 		return -EINVAL;
1262 
1263 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1264 
1265 	if (err)
1266 		return err;
1267 
1268 	if (policy && put_user(pval, policy))
1269 		return -EFAULT;
1270 
1271 	if (nmask)
1272 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1273 
1274 	return err;
1275 }
1276 
1277 #ifdef CONFIG_COMPAT
1278 
1279 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1280 				     compat_ulong_t __user *nmask,
1281 				     compat_ulong_t maxnode,
1282 				     compat_ulong_t addr, compat_ulong_t flags)
1283 {
1284 	long err;
1285 	unsigned long __user *nm = NULL;
1286 	unsigned long nr_bits, alloc_size;
1287 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1288 
1289 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1290 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1291 
1292 	if (nmask)
1293 		nm = compat_alloc_user_space(alloc_size);
1294 
1295 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1296 
1297 	if (!err && nmask) {
1298 		err = copy_from_user(bm, nm, alloc_size);
1299 		/* ensure entire bitmap is zeroed */
1300 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1301 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1302 	}
1303 
1304 	return err;
1305 }
1306 
1307 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1308 				     compat_ulong_t maxnode)
1309 {
1310 	long err = 0;
1311 	unsigned long __user *nm = NULL;
1312 	unsigned long nr_bits, alloc_size;
1313 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1314 
1315 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1316 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1317 
1318 	if (nmask) {
1319 		err = compat_get_bitmap(bm, nmask, nr_bits);
1320 		nm = compat_alloc_user_space(alloc_size);
1321 		err |= copy_to_user(nm, bm, alloc_size);
1322 	}
1323 
1324 	if (err)
1325 		return -EFAULT;
1326 
1327 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1328 }
1329 
1330 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1331 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1332 			     compat_ulong_t maxnode, compat_ulong_t flags)
1333 {
1334 	long err = 0;
1335 	unsigned long __user *nm = NULL;
1336 	unsigned long nr_bits, alloc_size;
1337 	nodemask_t bm;
1338 
1339 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1340 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1341 
1342 	if (nmask) {
1343 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1344 		nm = compat_alloc_user_space(alloc_size);
1345 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1346 	}
1347 
1348 	if (err)
1349 		return -EFAULT;
1350 
1351 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1352 }
1353 
1354 #endif
1355 
1356 /*
1357  * get_vma_policy(@task, @vma, @addr)
1358  * @task - task for fallback if vma policy == default
1359  * @vma   - virtual memory area whose policy is sought
1360  * @addr  - address in @vma for shared policy lookup
1361  *
1362  * Returns effective policy for a VMA at specified address.
1363  * Falls back to @task or system default policy, as necessary.
1364  * Current or other task's task mempolicy and non-shared vma policies
1365  * are protected by the task's mmap_sem, which must be held for read by
1366  * the caller.
1367  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1368  * count--added by the get_policy() vm_op, as appropriate--to protect against
1369  * freeing by another task.  It is the caller's responsibility to free the
1370  * extra reference for shared policies.
1371  */
1372 static struct mempolicy *get_vma_policy(struct task_struct *task,
1373 		struct vm_area_struct *vma, unsigned long addr)
1374 {
1375 	struct mempolicy *pol = task->mempolicy;
1376 
1377 	if (vma) {
1378 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1379 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1380 									addr);
1381 			if (vpol)
1382 				pol = vpol;
1383 		} else if (vma->vm_policy)
1384 			pol = vma->vm_policy;
1385 	}
1386 	if (!pol)
1387 		pol = &default_policy;
1388 	return pol;
1389 }
1390 
1391 /*
1392  * Return a nodemask representing a mempolicy for filtering nodes for
1393  * page allocation
1394  */
1395 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1396 {
1397 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1398 	if (unlikely(policy->mode == MPOL_BIND) &&
1399 			gfp_zone(gfp) >= policy_zone &&
1400 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1401 		return &policy->v.nodes;
1402 
1403 	return NULL;
1404 }
1405 
1406 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1407 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1408 {
1409 	int nd = numa_node_id();
1410 
1411 	switch (policy->mode) {
1412 	case MPOL_PREFERRED:
1413 		if (!(policy->flags & MPOL_F_LOCAL))
1414 			nd = policy->v.preferred_node;
1415 		break;
1416 	case MPOL_BIND:
1417 		/*
1418 		 * Normally, MPOL_BIND allocations are node-local within the
1419 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1420 		 * current node is part of the mask, we use the zonelist for
1421 		 * the first node in the mask instead.
1422 		 */
1423 		if (unlikely(gfp & __GFP_THISNODE) &&
1424 				unlikely(!node_isset(nd, policy->v.nodes)))
1425 			nd = first_node(policy->v.nodes);
1426 		break;
1427 	case MPOL_INTERLEAVE: /* should not happen */
1428 		break;
1429 	default:
1430 		BUG();
1431 	}
1432 	return node_zonelist(nd, gfp);
1433 }
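/*
 * Illustrative example (not from the original source): with an MPOL_BIND
 * policy over nodes {2,3}, a __GFP_THISNODE allocation issued while running
 * on node 0 gets the zonelist of node 2 (the first node in the mask) instead
 * of the local node, since node 0 is not part of the bind mask.
 */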
1434 
1435 /* Do dynamic interleaving for a process */
1436 static unsigned interleave_nodes(struct mempolicy *policy)
1437 {
1438 	unsigned nid, next;
1439 	struct task_struct *me = current;
1440 
1441 	nid = me->il_next;
1442 	next = next_node(nid, policy->v.nodes);
1443 	if (next >= MAX_NUMNODES)
1444 		next = first_node(policy->v.nodes);
1445 	if (next < MAX_NUMNODES)
1446 		me->il_next = next;
1447 	return nid;
1448 }
1449 
1450 /*
1451  * Depending on the memory policy, provide a node from which to allocate the
1452  * next slab entry.
1453  * @policy must be protected from freeing by the caller.  If @policy is
1454  * the current task's mempolicy, this protection is implicit, as only the
1455  * task can change its policy.  The system default policy requires no
1456  * such protection.
1457  */
1458 unsigned slab_node(struct mempolicy *policy)
1459 {
1460 	if (!policy || policy->flags & MPOL_F_LOCAL)
1461 		return numa_node_id();
1462 
1463 	switch (policy->mode) {
1464 	case MPOL_PREFERRED:
1465 		/*
1466 		 * handled MPOL_F_LOCAL above
1467 		 */
1468 		return policy->v.preferred_node;
1469 
1470 	case MPOL_INTERLEAVE:
1471 		return interleave_nodes(policy);
1472 
1473 	case MPOL_BIND: {
1474 		/*
1475 		 * Follow bind policy behavior and start allocation at the
1476 		 * first node.
1477 		 */
1478 		struct zonelist *zonelist;
1479 		struct zone *zone;
1480 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1481 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1482 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1483 							&policy->v.nodes,
1484 							&zone);
1485 		return zone->node;
1486 	}
1487 
1488 	default:
1489 		BUG();
1490 	}
1491 }
1492 
1493 /* Do static interleaving for a VMA with known offset. */
1494 static unsigned offset_il_node(struct mempolicy *pol,
1495 		struct vm_area_struct *vma, unsigned long off)
1496 {
1497 	unsigned nnodes = nodes_weight(pol->v.nodes);
1498 	unsigned target;
1499 	int c;
1500 	int nid = -1;
1501 
1502 	if (!nnodes)
1503 		return numa_node_id();
1504 	target = (unsigned int)off % nnodes;
1505 	c = 0;
1506 	do {
1507 		nid = next_node(nid, pol->v.nodes);
1508 		c++;
1509 	} while (c <= target);
1510 	return nid;
1511 }
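/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} (nnodes = 3)
 * and off = 7, target = 7 % 3 = 1, so the loop above returns the second
 * node in the mask, i.e. node 2.  Consecutive offsets therefore cycle
 * through the interleave set 0, 2, 5, 0, 2, 5, ...
 */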
1512 
1513 /* Determine a node number for interleave */
1514 static inline unsigned interleave_nid(struct mempolicy *pol,
1515 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1516 {
1517 	if (vma) {
1518 		unsigned long off;
1519 
1520 		/*
1521 		 * for small pages, there is no difference between
1522 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1523 		 * for huge pages, since vm_pgoff is in units of small
1524 		 * pages, we need to shift off the always 0 bits to get
1525 		 * a useful offset.
1526 		 */
1527 		BUG_ON(shift < PAGE_SHIFT);
1528 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1529 		off += (addr - vma->vm_start) >> shift;
1530 		return offset_il_node(pol, vma, off);
1531 	} else
1532 		return interleave_nodes(pol);
1533 }
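/*
 * Illustrative example (assuming 4KB base pages and 2MB huge pages, so
 * shift = 21): a fault at vma->vm_start + 4MB in a mapping with vm_pgoff = 0
 * gives off = (0 >> 9) + (4MB >> 21) = 2, so the third huge page in the
 * mapping uses the interleave node chosen for offset 2.
 */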
1534 
1535 #ifdef CONFIG_HUGETLBFS
1536 /*
1537  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1538  * @vma = virtual memory area whose policy is sought
1539  * @addr = address in @vma for shared policy lookup and interleave policy
1540  * @gfp_flags = for requested zone
1541  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1542  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1543  *
1544  * Returns a zonelist suitable for a huge page allocation and a pointer
1545  * to the struct mempolicy for conditional unref after allocation.
1546  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1547  * @nodemask for filtering the zonelist.
1548  */
1549 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1550 				gfp_t gfp_flags, struct mempolicy **mpol,
1551 				nodemask_t **nodemask)
1552 {
1553 	struct zonelist *zl;
1554 
1555 	*mpol = get_vma_policy(current, vma, addr);
1556 	*nodemask = NULL;	/* assume !MPOL_BIND */
1557 
1558 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1559 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1560 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1561 	} else {
1562 		zl = policy_zonelist(gfp_flags, *mpol);
1563 		if ((*mpol)->mode == MPOL_BIND)
1564 			*nodemask = &(*mpol)->v.nodes;
1565 	}
1566 	return zl;
1567 }
1568 #endif
1569 
1570 /* Allocate a page in interleaved policy.
1571    Own path because it needs to do special accounting. */
1572 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1573 					unsigned nid)
1574 {
1575 	struct zonelist *zl;
1576 	struct page *page;
1577 
1578 	zl = node_zonelist(nid, gfp);
1579 	page = __alloc_pages(gfp, order, zl);
1580 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1581 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1582 	return page;
1583 }
1584 
1585 /**
1586  * 	alloc_page_vma	- Allocate a page for a VMA.
1587  *
1588  * 	@gfp:
1589  *      %GFP_USER    user allocation.
1590  *      %GFP_KERNEL  kernel allocations,
1591  *      %GFP_HIGHMEM highmem/user allocations,
1592  *      %GFP_FS      allocation should not call back into a file system.
1593  *      %GFP_ATOMIC  don't sleep.
1594  *
1595  * 	@vma:  Pointer to VMA or NULL if not available.
1596  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1597  *
1598  * 	This function allocates a page from the kernel page pool and applies
1599  *	a NUMA policy associated with the VMA or the current process.
1600  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1601  *	mm_struct of the VMA to prevent it from going away. Should be used for
1602  *	all allocations for pages that will be mapped into
1603  * 	user space. Returns NULL when no page can be allocated.
1604  *
1605  *	Should be called with the mmap_sem of the vma held.
1606  */
1607 struct page *
1608 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1609 {
1610 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1611 	struct zonelist *zl;
1612 
1613 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1614 		unsigned nid;
1615 
1616 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1617 		mpol_cond_put(pol);
1618 		return alloc_page_interleave(gfp, 0, nid);
1619 	}
1620 	zl = policy_zonelist(gfp, pol);
1621 	if (unlikely(mpol_needs_cond_ref(pol))) {
1622 		/*
1623 		 * slow path: ref counted shared policy
1624 		 */
1625 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1626 						zl, policy_nodemask(gfp, pol));
1627 		__mpol_put(pol);
1628 		return page;
1629 	}
1630 	/*
1631 	 * fast path:  default or task policy
1632 	 */
1633 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1634 }
1635 
1636 /**
1637  * 	alloc_pages_current - Allocate pages.
1638  *
1639  *	@gfp:
1640  *		%GFP_USER   user allocation,
1641  *      	%GFP_KERNEL kernel allocation,
1642  *      	%GFP_HIGHMEM highmem allocation,
1643  *      	%GFP_FS     don't call back into a file system.
1644  *      	%GFP_ATOMIC don't sleep.
1645  *	@order: Power of two of allocation size in pages. 0 is a single page.
1646  *
1647  *	Allocate a page from the kernel page pool and, when not in
1648  *	interrupt context, apply the current process' NUMA policy.
1649  *	Returns NULL when no page can be allocated.
1650  *
1651  *	Don't call cpuset_update_task_memory_state() unless
1652  *	1) it's ok to take cpuset_sem (can WAIT), and
1653  *	2) allocating for current task (not interrupt).
1654  */
1655 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1656 {
1657 	struct mempolicy *pol = current->mempolicy;
1658 
1659 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1660 		pol = &default_policy;
1661 
1662 	/*
1663 	 * No reference counting needed for current->mempolicy
1664 	 * nor system default_policy
1665 	 */
1666 	if (pol->mode == MPOL_INTERLEAVE)
1667 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1668 	return __alloc_pages_nodemask(gfp, order,
1669 			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1670 }
1671 EXPORT_SYMBOL(alloc_pages_current);
1672 
1673 /*
1674  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1675  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1676  * with the mems_allowed returned by cpuset_mems_allowed().  This
1677  * keeps mempolicies cpuset relative after its cpuset moves.  See
1678  * further kernel/cpuset.c update_nodemask().
1679  */
1680 
1681 /* Slow path of a mempolicy duplicate */
1682 struct mempolicy *__mpol_dup(struct mempolicy *old)
1683 {
1684 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1685 
1686 	if (!new)
1687 		return ERR_PTR(-ENOMEM);
1688 	if (current_cpuset_is_being_rebound()) {
1689 		nodemask_t mems = cpuset_mems_allowed(current);
1690 		mpol_rebind_policy(old, &mems);
1691 	}
1692 	*new = *old;
1693 	atomic_set(&new->refcnt, 1);
1694 	return new;
1695 }
1696 
1697 /*
1698  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1699  * eliminate the MPOL_F_* flags that require conditional ref and
1700  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1701  * after return.  Use the returned value.
1702  *
1703  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1704  * policy lookup, even if the policy needs/has extra ref on lookup.
1705  * shmem_readahead needs this.
1706  */
1707 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1708 						struct mempolicy *frompol)
1709 {
1710 	if (!mpol_needs_cond_ref(frompol))
1711 		return frompol;
1712 
1713 	*tompol = *frompol;
1714 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1715 	__mpol_put(frompol);
1716 	return tompol;
1717 }
1718 
1719 static int mpol_match_intent(const struct mempolicy *a,
1720 			     const struct mempolicy *b)
1721 {
1722 	if (a->flags != b->flags)
1723 		return 0;
1724 	if (!mpol_store_user_nodemask(a))
1725 		return 1;
1726 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1727 }
1728 
1729 /* Slow path of a mempolicy comparison */
1730 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1731 {
1732 	if (!a || !b)
1733 		return 0;
1734 	if (a->mode != b->mode)
1735 		return 0;
1736 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1737 		return 0;
1738 	switch (a->mode) {
1739 	case MPOL_BIND:
1740 		/* Fall through */
1741 	case MPOL_INTERLEAVE:
1742 		return nodes_equal(a->v.nodes, b->v.nodes);
1743 	case MPOL_PREFERRED:
1744 		return a->v.preferred_node == b->v.preferred_node &&
1745 			a->flags == b->flags;
1746 	default:
1747 		BUG();
1748 		return 0;
1749 	}
1750 }
1751 
1752 /*
1753  * Shared memory backing store policy support.
1754  *
1755  * Remember policies even when nobody has shared memory mapped.
1756  * The policies are kept in Red-Black tree linked from the inode.
1757  * They are protected by the sp->lock spinlock, which should be held
1758  * for any accesses to the tree.
1759  */
1760 
1761 /* lookup first element intersecting start-end */
1762 /* Caller holds sp->lock */
1763 static struct sp_node *
1764 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1765 {
1766 	struct rb_node *n = sp->root.rb_node;
1767 
1768 	while (n) {
1769 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1770 
1771 		if (start >= p->end)
1772 			n = n->rb_right;
1773 		else if (end <= p->start)
1774 			n = n->rb_left;
1775 		else
1776 			break;
1777 	}
1778 	if (!n)
1779 		return NULL;
1780 	for (;;) {
1781 		struct sp_node *w = NULL;
1782 		struct rb_node *prev = rb_prev(n);
1783 		if (!prev)
1784 			break;
1785 		w = rb_entry(prev, struct sp_node, nd);
1786 		if (w->end <= start)
1787 			break;
1788 		n = prev;
1789 	}
1790 	return rb_entry(n, struct sp_node, nd);
1791 }
1792 
1793 /* Insert a new shared policy into the list. */
1794 /* Caller holds sp->lock */
1795 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1796 {
1797 	struct rb_node **p = &sp->root.rb_node;
1798 	struct rb_node *parent = NULL;
1799 	struct sp_node *nd;
1800 
1801 	while (*p) {
1802 		parent = *p;
1803 		nd = rb_entry(parent, struct sp_node, nd);
1804 		if (new->start < nd->start)
1805 			p = &(*p)->rb_left;
1806 		else if (new->end > nd->end)
1807 			p = &(*p)->rb_right;
1808 		else
1809 			BUG();
1810 	}
1811 	rb_link_node(&new->nd, parent, p);
1812 	rb_insert_color(&new->nd, &sp->root);
1813 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1814 		 new->policy ? new->policy->mode : 0);
1815 }
1816 
1817 /* Find shared policy intersecting idx */
1818 struct mempolicy *
1819 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1820 {
1821 	struct mempolicy *pol = NULL;
1822 	struct sp_node *sn;
1823 
1824 	if (!sp->root.rb_node)
1825 		return NULL;
1826 	spin_lock(&sp->lock);
1827 	sn = sp_lookup(sp, idx, idx+1);
1828 	if (sn) {
1829 		mpol_get(sn->policy);
1830 		pol = sn->policy;
1831 	}
1832 	spin_unlock(&sp->lock);
1833 	return pol;
1834 }
1835 
1836 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1837 {
1838 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1839 	rb_erase(&n->nd, &sp->root);
1840 	mpol_put(n->policy);
1841 	kmem_cache_free(sn_cache, n);
1842 }
1843 
1844 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1845 				struct mempolicy *pol)
1846 {
1847 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1848 
1849 	if (!n)
1850 		return NULL;
1851 	n->start = start;
1852 	n->end = end;
1853 	mpol_get(pol);
1854 	pol->flags |= MPOL_F_SHARED;	/* for unref */
1855 	n->policy = pol;
1856 	return n;
1857 }
1858 
1859 /* Replace a policy range. */
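/*
 * Old entries overlapping start-end are trimmed or removed; an old entry
 * spanning the whole new range is split in two, with new2 carrying the
 * tail half.
 */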
1860 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1861 				 unsigned long end, struct sp_node *new)
1862 {
1863 	struct sp_node *n, *new2 = NULL;
1864 
1865 restart:
1866 	spin_lock(&sp->lock);
1867 	n = sp_lookup(sp, start, end);
1868 	/* Take care of old policies in the same range. */
1869 	while (n && n->start < end) {
1870 		struct rb_node *next = rb_next(&n->nd);
1871 		if (n->start >= start) {
1872 			if (n->end <= end)
1873 				sp_delete(sp, n);
1874 			else
1875 				n->start = end;
1876 		} else {
1877 			/* Old policy spanning whole new range. */
1878 			if (n->end > end) {
1879 				if (!new2) {
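					/* sp_alloc() can sleep, so drop the lock and retry */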
1880 					spin_unlock(&sp->lock);
1881 					new2 = sp_alloc(end, n->end, n->policy);
1882 					if (!new2)
1883 						return -ENOMEM;
1884 					goto restart;
1885 				}
1886 				n->end = start;
1887 				sp_insert(sp, new2);
1888 				new2 = NULL;
1889 				break;
1890 			} else
1891 				n->end = start;
1892 		}
1893 		if (!next)
1894 			break;
1895 		n = rb_entry(next, struct sp_node, nd);
1896 	}
1897 	if (new)
1898 		sp_insert(sp, new);
1899 	spin_unlock(&sp->lock);
1900 	if (new2) {
1901 		mpol_put(new2->policy);
1902 		kmem_cache_free(sn_cache, new2);
1903 	}
1904 	return 0;
1905 }
1906 
1907 /**
1908  * mpol_shared_policy_init - initialize shared policy for inode
1909  * @sp: pointer to inode shared policy
1910  * @mpol:  struct mempolicy to install
1911  *
1912  * Install non-NULL @mpol in inode's shared policy rb-tree.
1913  * On entry, the current task has a reference on a non-NULL @mpol.
1914  * This must be released on exit.
1915  * This is called during get_inode() calls, so we can use GFP_KERNEL.
1916  */
1917 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1918 {
1919 	int ret;
1920 
1921 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1922 	spin_lock_init(&sp->lock);
1923 
1924 	if (mpol) {
1925 		struct vm_area_struct pvma;
1926 		struct mempolicy *new;
1927 		NODEMASK_SCRATCH(scratch);
1928 
1929 		if (!scratch) {
			mpol_put(mpol);	/* drop our ref on sb mpol */
1930 			return;
		}
1931 		/* contextualize the tmpfs mount point mempolicy */
1932 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1933 		if (IS_ERR(new)) {
1934 			mpol_put(mpol);	/* drop our ref on sb mpol */
1935 			NODEMASK_SCRATCH_FREE(scratch);
1936 			return;		/* no valid nodemask intersection */
1937 		}
1938 
1939 		task_lock(current);
1940 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1941 		task_unlock(current);
1942 		mpol_put(mpol);	/* drop our ref on sb mpol */
1943 		if (ret) {
1944 			NODEMASK_SCRATCH_FREE(scratch);
1945 			mpol_put(new);
1946 			return;
1947 		}
1948 
1949 		/* Create pseudo-vma that contains just the policy */
1950 		memset(&pvma, 0, sizeof(struct vm_area_struct));
1951 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1952 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1953 		mpol_put(new);			/* drop initial ref */
1954 		NODEMASK_SCRATCH_FREE(scratch);
1955 	}
1956 }
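
/*
 * Illustrative caller sketch (not code from this file): tmpfs sets up a
 * new inode's shared policy roughly as
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where info/sbinfo are shmem's per-inode and per-superblock data and
 * shmem_get_sbmpol() returns a referenced copy of the mount's mempolicy
 * (or NULL); that reference is consumed here.
 */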
1957 
1958 int mpol_set_shared_policy(struct shared_policy *info,
1959 			struct vm_area_struct *vma, struct mempolicy *npol)
1960 {
1961 	int err;
1962 	struct sp_node *new = NULL;
1963 	unsigned long sz = vma_pages(vma);
1964 
1965 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1966 		 vma->vm_pgoff,
1967 		 sz, npol ? npol->mode : -1,
1968 		 npol ? npol->flags : -1,
1969 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1970 
1971 	if (npol) {
1972 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1973 		if (!new)
1974 			return -ENOMEM;
1975 	}
1976 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1977 	if (err && new)
1978 		kmem_cache_free(sn_cache, new);
1979 	return err;
1980 }
1981 
1982 /* Free a backing policy store on inode delete. */
1983 void mpol_free_shared_policy(struct shared_policy *p)
1984 {
1985 	struct sp_node *n;
1986 	struct rb_node *next;
1987 
1988 	if (!p->root.rb_node)
1989 		return;
1990 	spin_lock(&p->lock);
1991 	next = rb_first(&p->root);
1992 	while (next) {
1993 		n = rb_entry(next, struct sp_node, nd);
1994 		next = rb_next(&n->nd);
1995 		rb_erase(&n->nd, &p->root);
1996 		mpol_put(n->policy);
1997 		kmem_cache_free(sn_cache, n);
1998 	}
1999 	spin_unlock(&p->lock);
2000 }
2001 
2002 /* assumes fs == KERNEL_DS */
2003 void __init numa_policy_init(void)
2004 {
2005 	nodemask_t interleave_nodes;
2006 	unsigned long largest = 0;
2007 	int nid, prefer = 0;
2008 
2009 	policy_cache = kmem_cache_create("numa_policy",
2010 					 sizeof(struct mempolicy),
2011 					 0, SLAB_PANIC, NULL);
2012 
2013 	sn_cache = kmem_cache_create("shared_policy_node",
2014 				     sizeof(struct sp_node),
2015 				     0, SLAB_PANIC, NULL);
2016 
2017 	/*
2018 	 * Set interleaving policy for system init. Interleaving is only
2019 	 * enabled across suitably sized nodes (>= 16MB), falling back
2020 	 * to the largest node if they are all smaller.
2021 	 */
2022 	nodes_clear(interleave_nodes);
2023 	for_each_node_state(nid, N_HIGH_MEMORY) {
2024 		unsigned long total_pages = node_present_pages(nid);
2025 
2026 		/* Preserve the largest node */
2027 		if (largest < total_pages) {
2028 			largest = total_pages;
2029 			prefer = nid;
2030 		}
2031 
2032 		/* Interleave this node? */
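		/* i.e. >= 16MB of present memory (4096 pages with 4KB pages) */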
2033 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2034 			node_set(nid, interleave_nodes);
2035 	}
2036 
2037 	/* All too small, use the largest */
2038 	if (unlikely(nodes_empty(interleave_nodes)))
2039 		node_set(prefer, interleave_nodes);
2040 
2041 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2042 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2043 }
2044 
2045 /* Reset policy of current process to default */
2046 void numa_default_policy(void)
2047 {
2048 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2049 }
2050 
2051 /*
2052  * Parse and format mempolicy from/to strings
2053  */
2054 
2055 /*
2056  * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag.
2057  * Used only for mpol_parse_str() and mpol_to_str()
2058  */
2059 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2060 static const char * const policy_types[] =
2061 	{ "default", "prefer", "bind", "interleave", "local" };
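/* Note: indexed by MPOL_* mode values, with "local" in the MPOL_LOCAL slot */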
2062 
2063 
2064 #ifdef CONFIG_TMPFS
2065 /**
2066  * mpol_parse_str - parse string to mempolicy
2067  * @str:  string containing mempolicy to parse
2068  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2069  * @no_context:  flag whether to "contextualize" the mempolicy
2070  *
2071  * Format of input:
2072  *	<mode>[=<flags>][:<nodelist>]
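 *	e.g. "interleave:0-3", "interleave=relative:0-3" or "prefer=static:2"
 *	(illustrative examples of the accepted syntax)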
2073  *
2074  * If @no_context is true, save the input nodemask in w.user_nodemask in
2075  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2076  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2077  * mount option.  Note that if 'static' or 'relative' mode flags were
2078  * specified, the input nodemask will already have been saved.  Saving
2079  * it again is redundant, but safe.
2080  *
2081  * On success, returns 0, else 1
2082  */
2083 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2084 {
2085 	struct mempolicy *new = NULL;
2086 	unsigned short uninitialized_var(mode);
2087 	unsigned short uninitialized_var(mode_flags);
2088 	nodemask_t nodes;
2089 	char *nodelist = strchr(str, ':');
2090 	char *flags = strchr(str, '=');
2091 	int i;
2092 	int err = 1;
2093 
2094 	if (nodelist) {
2095 		/* NUL-terminate mode or flags string */
2096 		*nodelist++ = '\0';
2097 		if (nodelist_parse(nodelist, nodes))
2098 			goto out;
2099 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2100 			goto out;
2101 	} else
2102 		nodes_clear(nodes);
2103 
2104 	if (flags)
2105 		*flags++ = '\0';	/* terminate mode string */
2106 
2107 	for (i = 0; i <= MPOL_LOCAL; i++) {
2108 		if (!strcmp(str, policy_types[i])) {
2109 			mode = i;
2110 			break;
2111 		}
2112 	}
2113 	if (i > MPOL_LOCAL)
2114 		goto out;
2115 
2116 	switch (mode) {
2117 	case MPOL_PREFERRED:
2118 		/*
2119 		 * Insist on a nodelist of one node only
2120 		 */
2121 		if (nodelist) {
2122 			char *rest = nodelist;
2123 			while (isdigit(*rest))
2124 				rest++;
2125 			if (!*rest)
2126 				err = 0;
2127 		}
2128 		break;
2129 	case MPOL_INTERLEAVE:
2130 		/*
2131 		 * Default to online nodes with memory if no nodelist
2132 		 */
2133 		if (!nodelist)
2134 			nodes = node_states[N_HIGH_MEMORY];
2135 		err = 0;
2136 		break;
2137 	case MPOL_LOCAL:
2138 		/*
2139 		 * Don't allow a nodelist;  mpol_new() checks flags
2140 		 */
2141 		if (nodelist)
2142 			goto out;
2143 		mode = MPOL_PREFERRED;
		err = 0;
2144 		break;
2145 
	case MPOL_BIND:
		/* mpol_new() enforces a non-empty nodemask */
		err = 0;
		break;
	case MPOL_DEFAULT:
		/* mpol_new() enforces an empty nodemask and ignores flags */
		err = 0;
		break;
2150 	}
2151 
2152 	mode_flags = 0;
2153 	if (flags) {
2154 		/*
2155 		 * Currently, we only support two mutually exclusive
2156 		 * mode flags.
2157 		 */
2158 		if (!strcmp(flags, "static"))
2159 			mode_flags |= MPOL_F_STATIC_NODES;
2160 		else if (!strcmp(flags, "relative"))
2161 			mode_flags |= MPOL_F_RELATIVE_NODES;
2162 		else
2163 			err = 1;
2164 	}
2165 
2166 	new = mpol_new(mode, mode_flags, &nodes);
2167 	if (IS_ERR(new))
2168 		err = 1;
2169 	else {
2170 		int ret;
2171 		NODEMASK_SCRATCH(scratch);
2172 		if (scratch) {
2173 			task_lock(current);
2174 			ret = mpol_set_nodemask(new, &nodes, scratch);
2175 			task_unlock(current);
2176 		} else
2177 			ret = -ENOMEM;
2178 		NODEMASK_SCRATCH_FREE(scratch);
2179 		if (ret) {
2180 			err = 1;
2181 			mpol_put(new);
2182 		} else if (no_context && new) {
2183 			/* save for contextualization; new is NULL for "default" */
2184 			new->w.user_nodemask = nodes;
2185 		}
2186 	}
2187 
2188 out:
2189 	/* Restore string for error message */
2190 	if (nodelist)
2191 		*--nodelist = ':';
2192 	if (flags)
2193 		*--flags = '=';
2194 	if (!err)
2195 		*mpol = new;
2196 	return err;
2197 }
2198 #endif /* CONFIG_TMPFS */
2199 
2200 /**
2201  * mpol_to_str - format a mempolicy structure for printing
2202  * @buffer:  to contain formatted mempolicy string
2203  * @maxlen:  length of @buffer
2204  * @pol:  pointer to mempolicy to be formatted
2205  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2206  *
2207  * Convert a mempolicy into a string.
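 * The output uses the same <mode>[=<flags>][:<nodelist>] syntax that
 * mpol_parse_str() accepts, e.g. "interleave:0-3" (illustrative).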
2208  * Returns the number of characters in buffer (if positive)
2209  * or an error (negative)
2210  */
2211 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2212 {
2213 	char *p = buffer;
2214 	int l;
2215 	nodemask_t nodes;
2216 	unsigned short mode;
2217 	unsigned short flags = pol ? pol->flags : 0;
2218 
2219 	/*
2220 	 * Sanity check:  room for longest mode, flag and some nodes
2221 	 */
2222 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2223 
2224 	if (!pol || pol == &default_policy)
2225 		mode = MPOL_DEFAULT;
2226 	else
2227 		mode = pol->mode;
2228 
2229 	switch (mode) {
2230 	case MPOL_DEFAULT:
2231 		nodes_clear(nodes);
2232 		break;
2233 
2234 	case MPOL_PREFERRED:
2235 		nodes_clear(nodes);
2236 		if (flags & MPOL_F_LOCAL)
2237 			mode = MPOL_LOCAL;	/* pseudo-policy */
2238 		else
2239 			node_set(pol->v.preferred_node, nodes);
2240 		break;
2241 
2242 	case MPOL_BIND:
2243 		/* Fall through */
2244 	case MPOL_INTERLEAVE:
2245 		if (no_context)
2246 			nodes = pol->w.user_nodemask;
2247 		else
2248 			nodes = pol->v.nodes;
2249 		break;
2250 
2251 	default:
2252 		BUG();
2253 	}
2254 
2255 	l = strlen(policy_types[mode]);
2256 	if (buffer + maxlen < p + l + 1)
2257 		return -ENOSPC;
2258 
2259 	strcpy(p, policy_types[mode]);
2260 	p += l;
2261 
2262 	if (flags & MPOL_MODE_FLAGS) {
2263 		if (buffer + maxlen < p + 2)
2264 			return -ENOSPC;
2265 		*p++ = '=';
2266 
2267 		/*
2268 		 * Currently, the only defined flags are mutually exclusive
2269 		 */
2270 		if (flags & MPOL_F_STATIC_NODES)
2271 			p += snprintf(p, buffer + maxlen - p, "static");
2272 		else if (flags & MPOL_F_RELATIVE_NODES)
2273 			p += snprintf(p, buffer + maxlen - p, "relative");
2274 	}
2275 
2276 	if (!nodes_empty(nodes)) {
2277 		if (buffer + maxlen < p + 2)
2278 			return -ENOSPC;
2279 		*p++ = ':';
2280 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2281 	}
2282 	return p - buffer;
2283 }
2284 
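/* Per-vma counters gathered below for /proc/<pid>/numa_maps */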
2285 struct numa_maps {
2286 	unsigned long pages;
2287 	unsigned long anon;
2288 	unsigned long active;
2289 	unsigned long writeback;
2290 	unsigned long mapcount_max;
2291 	unsigned long dirty;
2292 	unsigned long swapcache;
2293 	unsigned long node[MAX_NUMNODES];
2294 };
2295 
2296 static void gather_stats(struct page *page, void *private, int pte_dirty)
2297 {
2298 	struct numa_maps *md = private;
2299 	int count = page_mapcount(page);
2300 
2301 	md->pages++;
2302 	if (pte_dirty || PageDirty(page))
2303 		md->dirty++;
2304 
2305 	if (PageSwapCache(page))
2306 		md->swapcache++;
2307 
2308 	if (PageActive(page) || PageUnevictable(page))
2309 		md->active++;
2310 
2311 	if (PageWriteback(page))
2312 		md->writeback++;
2313 
2314 	if (PageAnon(page))
2315 		md->anon++;
2316 
2317 	if (count > md->mapcount_max)
2318 		md->mapcount_max = count;
2319 
2320 	md->node[page_to_nid(page)]++;
2321 }
2322 
2323 #ifdef CONFIG_HUGETLB_PAGE
2324 static void check_huge_range(struct vm_area_struct *vma,
2325 		unsigned long start, unsigned long end,
2326 		struct numa_maps *md)
2327 {
2328 	unsigned long addr;
2329 	struct page *page;
2330 	struct hstate *h = hstate_vma(vma);
2331 	unsigned long sz = huge_page_size(h);
2332 
2333 	for (addr = start; addr < end; addr += sz) {
2334 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2335 						addr & huge_page_mask(h));
2336 		pte_t pte;
2337 
2338 		if (!ptep)
2339 			continue;
2340 
2341 		pte = *ptep;
2342 		if (pte_none(pte))
2343 			continue;
2344 
2345 		page = pte_page(pte);
2346 		if (!page)
2347 			continue;
2348 
2349 		gather_stats(page, md, pte_dirty(*ptep));
2350 	}
2351 }
2352 #else
2353 static inline void check_huge_range(struct vm_area_struct *vma,
2354 		unsigned long start, unsigned long end,
2355 		struct numa_maps *md)
2356 {
2357 }
2358 #endif
2359 
2360 /*
2361  * Display pages allocated per node and memory policy via /proc.
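 *
 * An output line looks roughly like (illustrative values):
 *	00400000 default file=/bin/cat mapped=9 mapmax=2 N0=9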
2362  */
2363 int show_numa_map(struct seq_file *m, void *v)
2364 {
2365 	struct proc_maps_private *priv = m->private;
2366 	struct vm_area_struct *vma = v;
2367 	struct numa_maps *md;
2368 	struct file *file = vma->vm_file;
2369 	struct mm_struct *mm = vma->vm_mm;
2370 	struct mempolicy *pol;
2371 	int n;
2372 	char buffer[50];
2373 
2374 	if (!mm)
2375 		return 0;
2376 
2377 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2378 	if (!md)
2379 		return 0;
2380 
2381 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2382 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2383 	mpol_cond_put(pol);
2384 
2385 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2386 
2387 	if (file) {
2388 		seq_printf(m, " file=");
2389 		seq_path(m, &file->f_path, "\n\t= ");
2390 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2391 		seq_printf(m, " heap");
2392 	} else if (vma->vm_start <= mm->start_stack &&
2393 			vma->vm_end >= mm->start_stack) {
2394 		seq_printf(m, " stack");
2395 	}
2396 
2397 	if (is_vm_hugetlb_page(vma)) {
2398 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2399 		seq_printf(m, " huge");
2400 	} else {
2401 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2402 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2403 	}
2404 
2405 	if (!md->pages)
2406 		goto out;
2407 
2408 	if (md->anon)
2409 		seq_printf(m, " anon=%lu", md->anon);
2410 
2411 	if (md->dirty)
2412 		seq_printf(m, " dirty=%lu", md->dirty);
2413 
2414 	if (md->pages != md->anon && md->pages != md->dirty)
2415 		seq_printf(m, " mapped=%lu", md->pages);
2416 
2417 	if (md->mapcount_max > 1)
2418 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2419 
2420 	if (md->swapcache)
2421 		seq_printf(m, " swapcache=%lu", md->swapcache);
2422 
2423 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2424 		seq_printf(m, " active=%lu", md->active);
2425 
2426 	if (md->writeback)
2427 		seq_printf(m, " writeback=%lu", md->writeback);
2428 
2429 	for_each_node_state(n, N_HIGH_MEMORY)
2430 		if (md->node[n])
2431 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2432 out:
2433 	seq_putc(m, '\n');
2434 	kfree(md);
2435 
2436 	if (m->count < m->size)
2437 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2438 	return 0;
2439 }
2440