xref: /linux/mm/mempolicy.c (revision c145211d1f9e2ef19e7b4c2b943f68366daa97af)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and proceeding to the last. It would be better if bind truly
26  *                restricted the allocation to the specified memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the node of the local CPU. This is normally identical
31  *                to default, but useful to set in a VMA when you have a
32  *                non-default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use the default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
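/*
 * Illustrative userspace sketch (a minimal example, assuming libnuma's
 * <numaif.h> wrappers are available and the machine has at least nodes 0
 * and 1; error handling omitted) of how the policies above are requested
 * through the set_mempolicy(2) and mbind(2) system calls:
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long both = 0x3;	// nodes 0 and 1
 *	unsigned long node0 = 0x1;	// node 0 only
 *
 *	// process policy: interleave new allocations across nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &both, sizeof(both) * 8);
 *
 *	// VMA policy: bind one anonymous mapping to node 0
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 */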
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/module.h>
79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h>
81 #include <linux/init.h>
82 #include <linux/compat.h>
83 #include <linux/swap.h>
84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h>
87 #include <linux/ksm.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h>
93 
94 #include <asm/tlbflush.h>
95 #include <asm/uaccess.h>
96 
97 #include "internal.h"
98 
99 /* Internal flags */
100 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
101 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
102 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
103 
104 static struct kmem_cache *policy_cache;
105 static struct kmem_cache *sn_cache;
106 
107 /* Highest zone. A specific allocation for a zone below that is not
108    policied. */
109 enum zone_type policy_zone = 0;
110 
111 /*
112  * run-time system-wide default policy => local allocation
113  */
114 struct mempolicy default_policy = {
115 	.refcnt = ATOMIC_INIT(1), /* never free it */
116 	.mode = MPOL_PREFERRED,
117 	.flags = MPOL_F_LOCAL,
118 };
119 
120 static const struct mempolicy_operations {
121 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
123 } mpol_ops[MPOL_MAX];
124 
125 /* Check that the nodemask contains at least one populated zone */
126 static int is_valid_nodemask(const nodemask_t *nodemask)
127 {
128 	int nd, k;
129 
130 	/* Check that there is something useful in this mask */
131 	k = policy_zone;
132 
133 	for_each_node_mask(nd, *nodemask) {
134 		struct zone *z;
135 
136 		for (k = 0; k <= policy_zone; k++) {
137 			z = &NODE_DATA(nd)->node_zones[k];
138 			if (z->present_pages > 0)
139 				return 1;
140 		}
141 	}
142 
143 	return 0;
144 }
145 
146 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
147 {
148 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
149 }
150 
151 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
152 				   const nodemask_t *rel)
153 {
154 	nodemask_t tmp;
155 	nodes_fold(tmp, *orig, nodes_weight(*rel));
156 	nodes_onto(*ret, tmp, *rel);
157 }
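/*
 * Example of mpol_relative_nodemask(): a relative user nodemask of {0,1}
 * applied against an allowed set of {4,5,6} is first folded modulo
 * nodes_weight(*rel) == 3 (still {0,1}) and then mapped onto the allowed
 * set, yielding {4,5}.
 */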
158 
159 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
160 {
161 	if (nodes_empty(*nodes))
162 		return -EINVAL;
163 	pol->v.nodes = *nodes;
164 	return 0;
165 }
166 
167 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (!nodes)
170 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
171 	else if (nodes_empty(*nodes))
172 		return -EINVAL;			/*  no allowed nodes */
173 	else
174 		pol->v.preferred_node = first_node(*nodes);
175 	return 0;
176 }
177 
178 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
179 {
180 	if (!is_valid_nodemask(nodes))
181 		return -EINVAL;
182 	pol->v.nodes = *nodes;
183 	return 0;
184 }
185 
186 /*
187  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
188  * any, for the new policy.  mpol_new() has already validated the nodes
189  * parameter with respect to the policy mode and flags.  But, we need to
190  * handle an empty nodemask with MPOL_PREFERRED here.
191  *
192  * Must be called holding task's alloc_lock to protect task's mems_allowed
193  * and mempolicy.  May also be called holding the mmap_semaphore for write.
194  */
195 static int mpol_set_nodemask(struct mempolicy *pol,
196 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
197 {
198 	int ret;
199 
200 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
201 	if (pol == NULL)
202 		return 0;
203 	/* Check N_HIGH_MEMORY */
204 	nodes_and(nsc->mask1,
205 		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
206 
207 	VM_BUG_ON(!nodes);
208 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
209 		nodes = NULL;	/* explicit local allocation */
210 	else {
211 		if (pol->flags & MPOL_F_RELATIVE_NODES)
212 			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
213 		else
214 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
215 
216 		if (mpol_store_user_nodemask(pol))
217 			pol->w.user_nodemask = *nodes;
218 		else
219 			pol->w.cpuset_mems_allowed =
220 						cpuset_current_mems_allowed;
221 	}
222 
223 	if (nodes)
224 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
225 	else
226 		ret = mpol_ops[pol->mode].create(pol, NULL);
227 	return ret;
228 }
229 
230 /*
231  * This function just creates a new policy, does some checks and simple
232  * initialization. You must invoke mpol_set_nodemask() to set nodes.
233  */
234 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
235 				  nodemask_t *nodes)
236 {
237 	struct mempolicy *policy;
238 
239 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
240 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
241 
242 	if (mode == MPOL_DEFAULT) {
243 		if (nodes && !nodes_empty(*nodes))
244 			return ERR_PTR(-EINVAL);
245 		return NULL;	/* simply delete any existing policy */
246 	}
247 	VM_BUG_ON(!nodes);
248 
249 	/*
250 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
251 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
252 	 * All other modes require a valid pointer to a non-empty nodemask.
253 	 */
254 	if (mode == MPOL_PREFERRED) {
255 		if (nodes_empty(*nodes)) {
256 			if (((flags & MPOL_F_STATIC_NODES) ||
257 			     (flags & MPOL_F_RELATIVE_NODES)))
258 				return ERR_PTR(-EINVAL);
259 		}
260 	} else if (nodes_empty(*nodes))
261 		return ERR_PTR(-EINVAL);
262 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
263 	if (!policy)
264 		return ERR_PTR(-ENOMEM);
265 	atomic_set(&policy->refcnt, 1);
266 	policy->mode = mode;
267 	policy->flags = flags;
268 
269 	return policy;
270 }
271 
272 /* Slow path of a mpol destructor. */
273 void __mpol_put(struct mempolicy *p)
274 {
275 	if (!atomic_dec_and_test(&p->refcnt))
276 		return;
277 	kmem_cache_free(policy_cache, p);
278 }
279 
280 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
281 {
282 }
283 
284 static void mpol_rebind_nodemask(struct mempolicy *pol,
285 				 const nodemask_t *nodes)
286 {
287 	nodemask_t tmp;
288 
289 	if (pol->flags & MPOL_F_STATIC_NODES)
290 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
291 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
292 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
293 	else {
294 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
295 			    *nodes);
296 		pol->w.cpuset_mems_allowed = *nodes;
297 	}
298 
299 	pol->v.nodes = tmp;
300 	if (!node_isset(current->il_next, tmp)) {
301 		current->il_next = next_node(current->il_next, tmp);
302 		if (current->il_next >= MAX_NUMNODES)
303 			current->il_next = first_node(tmp);
304 		if (current->il_next >= MAX_NUMNODES)
305 			current->il_next = numa_node_id();
306 	}
307 }
308 
309 static void mpol_rebind_preferred(struct mempolicy *pol,
310 				  const nodemask_t *nodes)
311 {
312 	nodemask_t tmp;
313 
314 	if (pol->flags & MPOL_F_STATIC_NODES) {
315 		int node = first_node(pol->w.user_nodemask);
316 
317 		if (node_isset(node, *nodes)) {
318 			pol->v.preferred_node = node;
319 			pol->flags &= ~MPOL_F_LOCAL;
320 		} else
321 			pol->flags |= MPOL_F_LOCAL;
322 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
323 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
324 		pol->v.preferred_node = first_node(tmp);
325 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
326 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
327 						   pol->w.cpuset_mems_allowed,
328 						   *nodes);
329 		pol->w.cpuset_mems_allowed = *nodes;
330 	}
331 }
332 
333 /* Migrate a policy to a different set of nodes */
334 static void mpol_rebind_policy(struct mempolicy *pol,
335 			       const nodemask_t *newmask)
336 {
337 	if (!pol)
338 		return;
339 	if (!mpol_store_user_nodemask(pol) &&
340 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
341 		return;
342 	mpol_ops[pol->mode].rebind(pol, newmask);
343 }
344 
345 /*
346  * Wrapper for mpol_rebind_policy() that just requires task
347  * pointer, and updates task mempolicy.
348  *
349  * Called with task's alloc_lock held.
350  */
351 
352 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
353 {
354 	mpol_rebind_policy(tsk->mempolicy, new);
355 }
356 
357 /*
358  * Rebind each vma in mm to new nodemask.
359  *
360  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
361  */
362 
363 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
364 {
365 	struct vm_area_struct *vma;
366 
367 	down_write(&mm->mmap_sem);
368 	for (vma = mm->mmap; vma; vma = vma->vm_next)
369 		mpol_rebind_policy(vma->vm_policy, new);
370 	up_write(&mm->mmap_sem);
371 }
372 
373 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
374 	[MPOL_DEFAULT] = {
375 		.rebind = mpol_rebind_default,
376 	},
377 	[MPOL_INTERLEAVE] = {
378 		.create = mpol_new_interleave,
379 		.rebind = mpol_rebind_nodemask,
380 	},
381 	[MPOL_PREFERRED] = {
382 		.create = mpol_new_preferred,
383 		.rebind = mpol_rebind_preferred,
384 	},
385 	[MPOL_BIND] = {
386 		.create = mpol_new_bind,
387 		.rebind = mpol_rebind_nodemask,
388 	},
389 };
390 
391 static void gather_stats(struct page *, void *, int pte_dirty);
392 static void migrate_page_add(struct page *page, struct list_head *pagelist,
393 				unsigned long flags);
394 
395 /* Scan through pages, checking whether they satisfy certain conditions. */
396 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
397 		unsigned long addr, unsigned long end,
398 		const nodemask_t *nodes, unsigned long flags,
399 		void *private)
400 {
401 	pte_t *orig_pte;
402 	pte_t *pte;
403 	spinlock_t *ptl;
404 
405 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
406 	do {
407 		struct page *page;
408 		int nid;
409 
410 		if (!pte_present(*pte))
411 			continue;
412 		page = vm_normal_page(vma, addr, *pte);
413 		if (!page)
414 			continue;
415 		/*
416 		 * vm_normal_page() filters out zero pages, but there might
417 		 * still be PageReserved pages to skip, perhaps in a VDSO.
418 		 * And we cannot move PageKsm pages sensibly or safely yet.
419 		 */
420 		if (PageReserved(page) || PageKsm(page))
421 			continue;
422 		nid = page_to_nid(page);
423 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
424 			continue;
425 
426 		if (flags & MPOL_MF_STATS)
427 			gather_stats(page, private, pte_dirty(*pte));
428 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
429 			migrate_page_add(page, private, flags);
430 		else
431 			break;
432 	} while (pte++, addr += PAGE_SIZE, addr != end);
433 	pte_unmap_unlock(orig_pte, ptl);
434 	return addr != end;
435 }
436 
437 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
438 		unsigned long addr, unsigned long end,
439 		const nodemask_t *nodes, unsigned long flags,
440 		void *private)
441 {
442 	pmd_t *pmd;
443 	unsigned long next;
444 
445 	pmd = pmd_offset(pud, addr);
446 	do {
447 		next = pmd_addr_end(addr, end);
448 		if (pmd_none_or_clear_bad(pmd))
449 			continue;
450 		if (check_pte_range(vma, pmd, addr, next, nodes,
451 				    flags, private))
452 			return -EIO;
453 	} while (pmd++, addr = next, addr != end);
454 	return 0;
455 }
456 
457 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
458 		unsigned long addr, unsigned long end,
459 		const nodemask_t *nodes, unsigned long flags,
460 		void *private)
461 {
462 	pud_t *pud;
463 	unsigned long next;
464 
465 	pud = pud_offset(pgd, addr);
466 	do {
467 		next = pud_addr_end(addr, end);
468 		if (pud_none_or_clear_bad(pud))
469 			continue;
470 		if (check_pmd_range(vma, pud, addr, next, nodes,
471 				    flags, private))
472 			return -EIO;
473 	} while (pud++, addr = next, addr != end);
474 	return 0;
475 }
476 
477 static inline int check_pgd_range(struct vm_area_struct *vma,
478 		unsigned long addr, unsigned long end,
479 		const nodemask_t *nodes, unsigned long flags,
480 		void *private)
481 {
482 	pgd_t *pgd;
483 	unsigned long next;
484 
485 	pgd = pgd_offset(vma->vm_mm, addr);
486 	do {
487 		next = pgd_addr_end(addr, end);
488 		if (pgd_none_or_clear_bad(pgd))
489 			continue;
490 		if (check_pud_range(vma, pgd, addr, next, nodes,
491 				    flags, private))
492 			return -EIO;
493 	} while (pgd++, addr = next, addr != end);
494 	return 0;
495 }
496 
497 /*
498  * Check if all pages in a range are on a set of nodes.
499  * If pagelist != NULL then isolate pages from the LRU and
500  * put them on the pagelist.
501  */
502 static struct vm_area_struct *
503 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
504 		const nodemask_t *nodes, unsigned long flags, void *private)
505 {
506 	int err;
507 	struct vm_area_struct *first, *vma, *prev;
508 
509 
510 	first = find_vma(mm, start);
511 	if (!first)
512 		return ERR_PTR(-EFAULT);
513 	prev = NULL;
514 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
515 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
516 			if (!vma->vm_next && vma->vm_end < end)
517 				return ERR_PTR(-EFAULT);
518 			if (prev && prev->vm_end < vma->vm_start)
519 				return ERR_PTR(-EFAULT);
520 		}
521 		if (!is_vm_hugetlb_page(vma) &&
522 		    ((flags & MPOL_MF_STRICT) ||
523 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
524 				vma_migratable(vma)))) {
525 			unsigned long endvma = vma->vm_end;
526 
527 			if (endvma > end)
528 				endvma = end;
529 			if (vma->vm_start > start)
530 				start = vma->vm_start;
531 			err = check_pgd_range(vma, start, endvma, nodes,
532 						flags, private);
533 			if (err) {
534 				first = ERR_PTR(err);
535 				break;
536 			}
537 		}
538 		prev = vma;
539 	}
540 	return first;
541 }
542 
543 /* Apply policy to a single VMA */
544 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
545 {
546 	int err = 0;
547 	struct mempolicy *old = vma->vm_policy;
548 
549 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
550 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
551 		 vma->vm_ops, vma->vm_file,
552 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
553 
554 	if (vma->vm_ops && vma->vm_ops->set_policy)
555 		err = vma->vm_ops->set_policy(vma, new);
556 	if (!err) {
557 		mpol_get(new);
558 		vma->vm_policy = new;
559 		mpol_put(old);
560 	}
561 	return err;
562 }
563 
564 /* Step 2: apply policy to a range and do splits. */
565 static int mbind_range(struct mm_struct *mm, unsigned long start,
566 		       unsigned long end, struct mempolicy *new_pol)
567 {
568 	struct vm_area_struct *next;
569 	struct vm_area_struct *prev;
570 	struct vm_area_struct *vma;
571 	int err = 0;
572 	pgoff_t pgoff;
573 	unsigned long vmstart;
574 	unsigned long vmend;
575 
576 	vma = find_vma_prev(mm, start, &prev);
577 	if (!vma || vma->vm_start > start)
578 		return -EFAULT;
579 
580 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
581 		next = vma->vm_next;
582 		vmstart = max(start, vma->vm_start);
583 		vmend   = min(end, vma->vm_end);
584 
585 		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
586 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
587 				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
588 		if (prev) {
589 			vma = prev;
590 			next = vma->vm_next;
591 			continue;
592 		}
593 		if (vma->vm_start != vmstart) {
594 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
595 			if (err)
596 				goto out;
597 		}
598 		if (vma->vm_end != vmend) {
599 			err = split_vma(vma->vm_mm, vma, vmend, 0);
600 			if (err)
601 				goto out;
602 		}
603 		err = policy_vma(vma, new_pol);
604 		if (err)
605 			goto out;
606 	}
607 
608  out:
609 	return err;
610 }
611 
612 /*
613  * Update task->flags PF_MEMPOLICY bit: set iff non-default
614  * mempolicy.  Allows more rapid checking of this (combined perhaps
615  * with other PF_* flag bits) on memory allocation hot code paths.
616  *
617  * If called from outside this file, the task 'p' should -only- be
618  * a newly forked child not yet visible on the task list, because
619  * manipulating the task flags of a visible task is not safe.
620  *
621  * The above limitation is why this routine has the funny name
622  * mpol_fix_fork_child_flag().
623  *
624  * It is also safe to call this with a task pointer of current,
625  * which the static wrapper mpol_set_task_struct_flag() does,
626  * for use within this file.
627  */
628 
629 void mpol_fix_fork_child_flag(struct task_struct *p)
630 {
631 	if (p->mempolicy)
632 		p->flags |= PF_MEMPOLICY;
633 	else
634 		p->flags &= ~PF_MEMPOLICY;
635 }
636 
637 static void mpol_set_task_struct_flag(void)
638 {
639 	mpol_fix_fork_child_flag(current);
640 }
641 
642 /* Set the process memory policy */
643 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
644 			     nodemask_t *nodes)
645 {
646 	struct mempolicy *new, *old;
647 	struct mm_struct *mm = current->mm;
648 	NODEMASK_SCRATCH(scratch);
649 	int ret;
650 
651 	if (!scratch)
652 		return -ENOMEM;
653 
654 	new = mpol_new(mode, flags, nodes);
655 	if (IS_ERR(new)) {
656 		ret = PTR_ERR(new);
657 		goto out;
658 	}
659 	/*
660 	 * prevent changing our mempolicy while show_numa_maps()
661 	 * is using it.
662 	 * Note:  do_set_mempolicy() can be called at init time
663 	 * with no 'mm'.
664 	 */
665 	if (mm)
666 		down_write(&mm->mmap_sem);
667 	task_lock(current);
668 	ret = mpol_set_nodemask(new, nodes, scratch);
669 	if (ret) {
670 		task_unlock(current);
671 		if (mm)
672 			up_write(&mm->mmap_sem);
673 		mpol_put(new);
674 		goto out;
675 	}
676 	old = current->mempolicy;
677 	current->mempolicy = new;
678 	mpol_set_task_struct_flag();
679 	if (new && new->mode == MPOL_INTERLEAVE &&
680 	    nodes_weight(new->v.nodes))
681 		current->il_next = first_node(new->v.nodes);
682 	task_unlock(current);
683 	if (mm)
684 		up_write(&mm->mmap_sem);
685 
686 	mpol_put(old);
687 	ret = 0;
688 out:
689 	NODEMASK_SCRATCH_FREE(scratch);
690 	return ret;
691 }
692 
693 /*
694  * Return nodemask for policy for get_mempolicy() query
695  *
696  * Called with task's alloc_lock held
697  */
698 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
699 {
700 	nodes_clear(*nodes);
701 	if (p == &default_policy)
702 		return;
703 
704 	switch (p->mode) {
705 	case MPOL_BIND:
706 		/* Fall through */
707 	case MPOL_INTERLEAVE:
708 		*nodes = p->v.nodes;
709 		break;
710 	case MPOL_PREFERRED:
711 		if (!(p->flags & MPOL_F_LOCAL))
712 			node_set(p->v.preferred_node, *nodes);
713 		/* else return empty node mask for local allocation */
714 		break;
715 	default:
716 		BUG();
717 	}
718 }
719 
720 static int lookup_node(struct mm_struct *mm, unsigned long addr)
721 {
722 	struct page *p;
723 	int err;
724 
725 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
726 	if (err >= 0) {
727 		err = page_to_nid(p);
728 		put_page(p);
729 	}
730 	return err;
731 }
732 
733 /* Retrieve NUMA policy */
734 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
735 			     unsigned long addr, unsigned long flags)
736 {
737 	int err;
738 	struct mm_struct *mm = current->mm;
739 	struct vm_area_struct *vma = NULL;
740 	struct mempolicy *pol = current->mempolicy;
741 
742 	if (flags &
743 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
744 		return -EINVAL;
745 
746 	if (flags & MPOL_F_MEMS_ALLOWED) {
747 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
748 			return -EINVAL;
749 		*policy = 0;	/* just so it's initialized */
750 		task_lock(current);
751 		*nmask  = cpuset_current_mems_allowed;
752 		task_unlock(current);
753 		return 0;
754 	}
755 
756 	if (flags & MPOL_F_ADDR) {
757 		/*
758 		 * Do NOT fall back to task policy if the
759 		 * vma/shared policy at addr is NULL.  We
760 		 * want to return MPOL_DEFAULT in this case.
761 		 */
762 		down_read(&mm->mmap_sem);
763 		vma = find_vma_intersection(mm, addr, addr+1);
764 		if (!vma) {
765 			up_read(&mm->mmap_sem);
766 			return -EFAULT;
767 		}
768 		if (vma->vm_ops && vma->vm_ops->get_policy)
769 			pol = vma->vm_ops->get_policy(vma, addr);
770 		else
771 			pol = vma->vm_policy;
772 	} else if (addr)
773 		return -EINVAL;
774 
775 	if (!pol)
776 		pol = &default_policy;	/* indicates default behavior */
777 
778 	if (flags & MPOL_F_NODE) {
779 		if (flags & MPOL_F_ADDR) {
780 			err = lookup_node(mm, addr);
781 			if (err < 0)
782 				goto out;
783 			*policy = err;
784 		} else if (pol == current->mempolicy &&
785 				pol->mode == MPOL_INTERLEAVE) {
786 			*policy = current->il_next;
787 		} else {
788 			err = -EINVAL;
789 			goto out;
790 		}
791 	} else {
792 		*policy = pol == &default_policy ? MPOL_DEFAULT :
793 						pol->mode;
794 		/*
795 		 * Internal mempolicy flags must be masked off before exposing
796 		 * the policy to userspace.
797 		 */
798 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
799 	}
800 
801 	if (vma) {
802 		up_read(&current->mm->mmap_sem);
803 		vma = NULL;
804 	}
805 
806 	err = 0;
807 	if (nmask) {
808 		if (mpol_store_user_nodemask(pol)) {
809 			*nmask = pol->w.user_nodemask;
810 		} else {
811 			task_lock(current);
812 			get_policy_nodemask(pol, nmask);
813 			task_unlock(current);
814 		}
815 	}
816 
817  out:
818 	mpol_cond_put(pol);
819 	if (vma)
820 		up_read(&current->mm->mmap_sem);
821 	return err;
822 }
823 
824 #ifdef CONFIG_MIGRATION
825 /*
826  * page migration
827  */
828 static void migrate_page_add(struct page *page, struct list_head *pagelist,
829 				unsigned long flags)
830 {
831 	/*
832 	 * Avoid migrating a page that is shared with others.
833 	 */
834 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
835 		if (!isolate_lru_page(page)) {
836 			list_add_tail(&page->lru, pagelist);
837 			inc_zone_page_state(page, NR_ISOLATED_ANON +
838 					    page_is_file_cache(page));
839 		}
840 	}
841 }
842 
843 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
844 {
845 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
846 }
847 
848 /*
849  * Migrate pages from one node to a target node.
850  * Returns error or the number of pages not migrated.
851  */
852 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
853 			   int flags)
854 {
855 	nodemask_t nmask;
856 	LIST_HEAD(pagelist);
857 	int err = 0;
858 
859 	nodes_clear(nmask);
860 	node_set(source, nmask);
861 
862 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
863 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
864 
865 	if (!list_empty(&pagelist))
866 		err = migrate_pages(&pagelist, new_node_page, dest, 0);
867 
868 	return err;
869 }
870 
871 /*
872  * Move pages between the two nodesets so as to preserve the physical
873  * layout as much as possible.
874  *
875  * Returns the number of pages that could not be moved.
876  */
877 int do_migrate_pages(struct mm_struct *mm,
878 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
879 {
880 	int busy = 0;
881 	int err;
882 	nodemask_t tmp;
883 
884 	err = migrate_prep();
885 	if (err)
886 		return err;
887 
888 	down_read(&mm->mmap_sem);
889 
890 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
891 	if (err)
892 		goto out;
893 
894 	/*
895 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
896 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
897 	 * bit in 'tmp', and return that <source, dest> pair for migration.
898 	 * The pair of nodemasks 'to' and 'from' define the map.
899 	 *
900 	 * If no pair of bits is found that way, fallback to picking some
901 	 * pair of 'source' and 'dest' bits that are not the same.  If the
902 	 * 'source' and 'dest' bits are the same, this represents a node
903 	 * that will be migrating to itself, so no pages need move.
904 	 *
905 	 * If no bits are left in 'tmp', or if all remaining bits left
906 	 * in 'tmp' correspond to the same bit in 'to', return false
907 	 * (nothing left to migrate).
908 	 *
909 	 * This lets us pick a pair of nodes to migrate between, such that
910 	 * if possible the dest node is not already occupied by some other
911 	 * source node, minimizing the risk of overloading the memory on a
912 	 * node that would happen if we migrated incoming memory to a node
913  * before migrating outgoing memory off that same node.
914 	 *
915 	 * A single scan of tmp is sufficient.  As we go, we remember the
916 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
917 	 * that not only moved, but what's better, moved to an empty slot
918 	 * (d is not set in tmp), then we break out then, with that pair.
919  * Otherwise when we finish scanning tmp, we at least have the
920 	 * most recent <s, d> pair that moved.  If we get all the way through
921 	 * the scan of tmp without finding any node that moved, much less
922 	 * moved to an empty node, then there is nothing left worth migrating.
923 	 */
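	/*
	 * A concrete instance of the scan described above: with
	 * from = {0,1,2} and to = {1,2,3}, the first pass picks <2,3>
	 * because node 3 is not a remaining source, so node 2 is drained
	 * into node 3 before any pages are moved onto node 2.  Later passes
	 * pick <1,2> and then <0,1>, so each destination node has already
	 * been emptied before it receives pages.
	 */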
924 
925 	tmp = *from_nodes;
926 	while (!nodes_empty(tmp)) {
927 		int s, d;
928 		int source = -1;
929 		int dest = 0;
930 
931 		for_each_node_mask(s, tmp) {
932 			d = node_remap(s, *from_nodes, *to_nodes);
933 			if (s == d)
934 				continue;
935 
936 			source = s;	/* Node moved. Memorize */
937 			dest = d;
938 
939 			/* dest not in remaining from nodes? */
940 			if (!node_isset(dest, tmp))
941 				break;
942 		}
943 		if (source == -1)
944 			break;
945 
946 		node_clear(source, tmp);
947 		err = migrate_to_node(mm, source, dest, flags);
948 		if (err > 0)
949 			busy += err;
950 		if (err < 0)
951 			break;
952 	}
953 out:
954 	up_read(&mm->mmap_sem);
955 	if (err < 0)
956 		return err;
957 	return busy;
958 
959 }
960 
961 /*
962  * Allocate a new page for page migration based on vma policy.
963  * Start assuming that page is mapped by vma pointed to by @private.
964  * Search forward from there, if not.  N.B., this assumes that the
965  * list of pages handed to migrate_pages()--which is how we get here--
966  * is in virtual address order.
967  */
968 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
969 {
970 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
971 	unsigned long uninitialized_var(address);
972 
973 	while (vma) {
974 		address = page_address_in_vma(page, vma);
975 		if (address != -EFAULT)
976 			break;
977 		vma = vma->vm_next;
978 	}
979 
980 	/*
981 	 * if !vma, alloc_page_vma() will use task or system default policy
982 	 */
983 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
984 }
985 #else
986 
987 static void migrate_page_add(struct page *page, struct list_head *pagelist,
988 				unsigned long flags)
989 {
990 }
991 
992 int do_migrate_pages(struct mm_struct *mm,
993 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
994 {
995 	return -ENOSYS;
996 }
997 
998 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
999 {
1000 	return NULL;
1001 }
1002 #endif
1003 
1004 static long do_mbind(unsigned long start, unsigned long len,
1005 		     unsigned short mode, unsigned short mode_flags,
1006 		     nodemask_t *nmask, unsigned long flags)
1007 {
1008 	struct vm_area_struct *vma;
1009 	struct mm_struct *mm = current->mm;
1010 	struct mempolicy *new;
1011 	unsigned long end;
1012 	int err;
1013 	LIST_HEAD(pagelist);
1014 
1015 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1016 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1017 		return -EINVAL;
1018 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1019 		return -EPERM;
1020 
1021 	if (start & ~PAGE_MASK)
1022 		return -EINVAL;
1023 
1024 	if (mode == MPOL_DEFAULT)
1025 		flags &= ~MPOL_MF_STRICT;
1026 
1027 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1028 	end = start + len;
1029 
1030 	if (end < start)
1031 		return -EINVAL;
1032 	if (end == start)
1033 		return 0;
1034 
1035 	new = mpol_new(mode, mode_flags, nmask);
1036 	if (IS_ERR(new))
1037 		return PTR_ERR(new);
1038 
1039 	/*
1040 	 * If we are using the default policy then operations
1041 	 * on discontinuous address spaces are okay after all.
1042 	 */
1043 	if (!new)
1044 		flags |= MPOL_MF_DISCONTIG_OK;
1045 
1046 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1047 		 start, start + len, mode, mode_flags,
1048 		 nmask ? nodes_addr(*nmask)[0] : -1);
1049 
1050 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1051 
1052 		err = migrate_prep();
1053 		if (err)
1054 			goto mpol_out;
1055 	}
1056 	{
1057 		NODEMASK_SCRATCH(scratch);
1058 		if (scratch) {
1059 			down_write(&mm->mmap_sem);
1060 			task_lock(current);
1061 			err = mpol_set_nodemask(new, nmask, scratch);
1062 			task_unlock(current);
1063 			if (err)
1064 				up_write(&mm->mmap_sem);
1065 		} else
1066 			err = -ENOMEM;
1067 		NODEMASK_SCRATCH_FREE(scratch);
1068 	}
1069 	if (err)
1070 		goto mpol_out;
1071 
1072 	vma = check_range(mm, start, end, nmask,
1073 			  flags | MPOL_MF_INVERT, &pagelist);
1074 
1075 	err = PTR_ERR(vma);
1076 	if (!IS_ERR(vma)) {
1077 		int nr_failed = 0;
1078 
1079 		err = mbind_range(mm, start, end, new);
1080 
1081 		if (!list_empty(&pagelist))
1082 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1083 						(unsigned long)vma, 0);
1084 
1085 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1086 			err = -EIO;
1087 	} else
1088 		putback_lru_pages(&pagelist);
1089 
1090 	up_write(&mm->mmap_sem);
1091  mpol_out:
1092 	mpol_put(new);
1093 	return err;
1094 }
1095 
1096 /*
1097  * User space interface with variable sized bitmaps for nodelists.
1098  */
1099 
1100 /* Copy a node mask from user space. */
1101 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1102 		     unsigned long maxnode)
1103 {
1104 	unsigned long k;
1105 	unsigned long nlongs;
1106 	unsigned long endmask;
1107 
1108 	--maxnode;
1109 	nodes_clear(*nodes);
1110 	if (maxnode == 0 || !nmask)
1111 		return 0;
1112 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1113 		return -EINVAL;
1114 
1115 	nlongs = BITS_TO_LONGS(maxnode);
1116 	if ((maxnode % BITS_PER_LONG) == 0)
1117 		endmask = ~0UL;
1118 	else
1119 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1120 
1121 	/* When the user specified more nodes than supported just check
1122 	   if the unsupported part is all zero. */
1123 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1124 		if (nlongs > PAGE_SIZE/sizeof(long))
1125 			return -EINVAL;
1126 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1127 			unsigned long t;
1128 			if (get_user(t, nmask + k))
1129 				return -EFAULT;
1130 			if (k == nlongs - 1) {
1131 				if (t & endmask)
1132 					return -EINVAL;
1133 			} else if (t)
1134 				return -EINVAL;
1135 		}
1136 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1137 		endmask = ~0UL;
1138 	}
1139 
1140 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1141 		return -EFAULT;
1142 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1143 	return 0;
1144 }
1145 
1146 /* Copy a kernel node mask to user space */
1147 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1148 			      nodemask_t *nodes)
1149 {
1150 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1151 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1152 
1153 	if (copy > nbytes) {
1154 		if (copy > PAGE_SIZE)
1155 			return -EINVAL;
1156 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1157 			return -EFAULT;
1158 		copy = nbytes;
1159 	}
1160 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1161 }
1162 
1163 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1164 		unsigned long, mode, unsigned long __user *, nmask,
1165 		unsigned long, maxnode, unsigned, flags)
1166 {
1167 	nodemask_t nodes;
1168 	int err;
1169 	unsigned short mode_flags;
1170 
1171 	mode_flags = mode & MPOL_MODE_FLAGS;
1172 	mode &= ~MPOL_MODE_FLAGS;
1173 	if (mode >= MPOL_MAX)
1174 		return -EINVAL;
1175 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1176 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1177 		return -EINVAL;
1178 	err = get_nodes(&nodes, nmask, maxnode);
1179 	if (err)
1180 		return err;
1181 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1182 }
1183 
1184 /* Set the process memory policy */
1185 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1186 		unsigned long, maxnode)
1187 {
1188 	int err;
1189 	nodemask_t nodes;
1190 	unsigned short flags;
1191 
1192 	flags = mode & MPOL_MODE_FLAGS;
1193 	mode &= ~MPOL_MODE_FLAGS;
1194 	if ((unsigned int)mode >= MPOL_MAX)
1195 		return -EINVAL;
1196 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1197 		return -EINVAL;
1198 	err = get_nodes(&nodes, nmask, maxnode);
1199 	if (err)
1200 		return err;
1201 	return do_set_mempolicy(mode, flags, &nodes);
1202 }
1203 
1204 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1205 		const unsigned long __user *, old_nodes,
1206 		const unsigned long __user *, new_nodes)
1207 {
1208 	const struct cred *cred = current_cred(), *tcred;
1209 	struct mm_struct *mm;
1210 	struct task_struct *task;
1211 	nodemask_t old;
1212 	nodemask_t new;
1213 	nodemask_t task_nodes;
1214 	int err;
1215 
1216 	err = get_nodes(&old, old_nodes, maxnode);
1217 	if (err)
1218 		return err;
1219 
1220 	err = get_nodes(&new, new_nodes, maxnode);
1221 	if (err)
1222 		return err;
1223 
1224 	/* Find the mm_struct */
1225 	read_lock(&tasklist_lock);
1226 	task = pid ? find_task_by_vpid(pid) : current;
1227 	if (!task) {
1228 		read_unlock(&tasklist_lock);
1229 		return -ESRCH;
1230 	}
1231 	mm = get_task_mm(task);
1232 	read_unlock(&tasklist_lock);
1233 
1234 	if (!mm)
1235 		return -EINVAL;
1236 
1237 	/*
1238 	 * Check if this process has the right to modify the specified
1239 	 * process. The right exists if the process has administrative
1240 	 * capabilities, superuser privileges or the same
1241 	 * userid as the target process.
1242 	 */
1243 	rcu_read_lock();
1244 	tcred = __task_cred(task);
1245 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1246 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1247 	    !capable(CAP_SYS_NICE)) {
1248 		rcu_read_unlock();
1249 		err = -EPERM;
1250 		goto out;
1251 	}
1252 	rcu_read_unlock();
1253 
1254 	task_nodes = cpuset_mems_allowed(task);
1255 	/* Is the user allowed to access the target nodes? */
1256 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1257 		err = -EPERM;
1258 		goto out;
1259 	}
1260 
1261 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1262 		err = -EINVAL;
1263 		goto out;
1264 	}
1265 
1266 	err = security_task_movememory(task);
1267 	if (err)
1268 		goto out;
1269 
1270 	err = do_migrate_pages(mm, &old, &new,
1271 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1272 out:
1273 	mmput(mm);
1274 	return err;
1275 }
1276 
1277 
1278 /* Retrieve NUMA policy */
1279 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1280 		unsigned long __user *, nmask, unsigned long, maxnode,
1281 		unsigned long, addr, unsigned long, flags)
1282 {
1283 	int err;
1284 	int uninitialized_var(pval);
1285 	nodemask_t nodes;
1286 
1287 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1288 		return -EINVAL;
1289 
1290 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1291 
1292 	if (err)
1293 		return err;
1294 
1295 	if (policy && put_user(pval, policy))
1296 		return -EFAULT;
1297 
1298 	if (nmask)
1299 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1300 
1301 	return err;
1302 }
1303 
1304 #ifdef CONFIG_COMPAT
1305 
1306 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1307 				     compat_ulong_t __user *nmask,
1308 				     compat_ulong_t maxnode,
1309 				     compat_ulong_t addr, compat_ulong_t flags)
1310 {
1311 	long err;
1312 	unsigned long __user *nm = NULL;
1313 	unsigned long nr_bits, alloc_size;
1314 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1315 
1316 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1317 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1318 
1319 	if (nmask)
1320 		nm = compat_alloc_user_space(alloc_size);
1321 
1322 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1323 
1324 	if (!err && nmask) {
1325 		err = copy_from_user(bm, nm, alloc_size);
1326 		/* ensure entire bitmap is zeroed */
1327 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1328 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1329 	}
1330 
1331 	return err;
1332 }
1333 
1334 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1335 				     compat_ulong_t maxnode)
1336 {
1337 	long err = 0;
1338 	unsigned long __user *nm = NULL;
1339 	unsigned long nr_bits, alloc_size;
1340 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1341 
1342 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1343 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1344 
1345 	if (nmask) {
1346 		err = compat_get_bitmap(bm, nmask, nr_bits);
1347 		nm = compat_alloc_user_space(alloc_size);
1348 		err |= copy_to_user(nm, bm, alloc_size);
1349 	}
1350 
1351 	if (err)
1352 		return -EFAULT;
1353 
1354 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1355 }
1356 
1357 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1358 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1359 			     compat_ulong_t maxnode, compat_ulong_t flags)
1360 {
1361 	long err = 0;
1362 	unsigned long __user *nm = NULL;
1363 	unsigned long nr_bits, alloc_size;
1364 	nodemask_t bm;
1365 
1366 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1367 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1368 
1369 	if (nmask) {
1370 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1371 		nm = compat_alloc_user_space(alloc_size);
1372 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1373 	}
1374 
1375 	if (err)
1376 		return -EFAULT;
1377 
1378 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1379 }
1380 
1381 #endif
1382 
1383 /*
1384  * get_vma_policy(@task, @vma, @addr)
1385  * @task - task for fallback if vma policy == default
1386  * @vma   - virtual memory area whose policy is sought
1387  * @addr  - address in @vma for shared policy lookup
1388  *
1389  * Returns effective policy for a VMA at specified address.
1390  * Falls back to @task or system default policy, as necessary.
1391  * Current or other task's task mempolicy and non-shared vma policies
1392  * are protected by the task's mmap_sem, which must be held for read by
1393  * the caller.
1394  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1395  * count--added by the get_policy() vm_op, as appropriate--to protect against
1396  * freeing by another task.  It is the caller's responsibility to free the
1397  * extra reference for shared policies.
1398  */
1399 static struct mempolicy *get_vma_policy(struct task_struct *task,
1400 		struct vm_area_struct *vma, unsigned long addr)
1401 {
1402 	struct mempolicy *pol = task->mempolicy;
1403 
1404 	if (vma) {
1405 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1406 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1407 									addr);
1408 			if (vpol)
1409 				pol = vpol;
1410 		} else if (vma->vm_policy)
1411 			pol = vma->vm_policy;
1412 	}
1413 	if (!pol)
1414 		pol = &default_policy;
1415 	return pol;
1416 }
1417 
1418 /*
1419  * Return a nodemask representing a mempolicy for filtering nodes for
1420  * page allocation
1421  */
1422 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1423 {
1424 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1425 	if (unlikely(policy->mode == MPOL_BIND) &&
1426 			gfp_zone(gfp) >= policy_zone &&
1427 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1428 		return &policy->v.nodes;
1429 
1430 	return NULL;
1431 }
1432 
1433 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1434 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1435 {
1436 	int nd = numa_node_id();
1437 
1438 	switch (policy->mode) {
1439 	case MPOL_PREFERRED:
1440 		if (!(policy->flags & MPOL_F_LOCAL))
1441 			nd = policy->v.preferred_node;
1442 		break;
1443 	case MPOL_BIND:
1444 		/*
1445 		 * Normally, MPOL_BIND allocations are node-local within the
1446 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1447 		 * current node is part of the mask, we use the zonelist for
1448 		 * the first node in the mask instead.
1449 		 */
1450 		if (unlikely(gfp & __GFP_THISNODE) &&
1451 				unlikely(!node_isset(nd, policy->v.nodes)))
1452 			nd = first_node(policy->v.nodes);
1453 		break;
1454 	case MPOL_INTERLEAVE: /* should not happen */
1455 		break;
1456 	default:
1457 		BUG();
1458 	}
1459 	return node_zonelist(nd, gfp);
1460 }
1461 
1462 /* Do dynamic interleaving for a process */
1463 static unsigned interleave_nodes(struct mempolicy *policy)
1464 {
1465 	unsigned nid, next;
1466 	struct task_struct *me = current;
1467 
1468 	nid = me->il_next;
1469 	next = next_node(nid, policy->v.nodes);
1470 	if (next >= MAX_NUMNODES)
1471 		next = first_node(policy->v.nodes);
1472 	if (next < MAX_NUMNODES)
1473 		me->il_next = next;
1474 	return nid;
1475 }
1476 
1477 /*
1478  * Depending on the memory policy provide a node from which to allocate the
1479  * next slab entry.
1480  * @policy must be protected against freeing by the caller.  If @policy is
1481  * the current task's mempolicy, this protection is implicit, as only the
1482  * task can change its policy.  The system default policy requires no
1483  * such protection.
1484  */
1485 unsigned slab_node(struct mempolicy *policy)
1486 {
1487 	if (!policy || policy->flags & MPOL_F_LOCAL)
1488 		return numa_node_id();
1489 
1490 	switch (policy->mode) {
1491 	case MPOL_PREFERRED:
1492 		/*
1493 		 * handled MPOL_F_LOCAL above
1494 		 */
1495 		return policy->v.preferred_node;
1496 
1497 	case MPOL_INTERLEAVE:
1498 		return interleave_nodes(policy);
1499 
1500 	case MPOL_BIND: {
1501 		/*
1502 		 * Follow bind policy behavior and start allocation at the
1503 		 * first node.
1504 		 */
1505 		struct zonelist *zonelist;
1506 		struct zone *zone;
1507 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1508 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1509 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1510 							&policy->v.nodes,
1511 							&zone);
1512 		return zone->node;
1513 	}
1514 
1515 	default:
1516 		BUG();
1517 	}
1518 }
1519 
1520 /* Do static interleaving for a VMA with known offset. */
1521 static unsigned offset_il_node(struct mempolicy *pol,
1522 		struct vm_area_struct *vma, unsigned long off)
1523 {
1524 	unsigned nnodes = nodes_weight(pol->v.nodes);
1525 	unsigned target;
1526 	int c;
1527 	int nid = -1;
1528 
1529 	if (!nnodes)
1530 		return numa_node_id();
1531 	target = (unsigned int)off % nnodes;
1532 	c = 0;
1533 	do {
1534 		nid = next_node(nid, pol->v.nodes);
1535 		c++;
1536 	} while (c <= target);
1537 	return nid;
1538 }
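/*
 * Example of offset_il_node(): with an interleave mask of nodes {0,2,4}
 * (nnodes == 3) and off == 7, target = 7 % 3 = 1, so the walk stops at the
 * second node in the mask and returns node 2.
 */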
1539 
1540 /* Determine a node number for interleave */
1541 static inline unsigned interleave_nid(struct mempolicy *pol,
1542 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1543 {
1544 	if (vma) {
1545 		unsigned long off;
1546 
1547 		/*
1548 		 * for small pages, there is no difference between
1549 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1550 		 * for huge pages, since vm_pgoff is in units of small
1551 		 * pages, we need to shift off the always 0 bits to get
1552 		 * a useful offset.
1553 		 */
1554 		BUG_ON(shift < PAGE_SHIFT);
1555 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1556 		off += (addr - vma->vm_start) >> shift;
1557 		return offset_il_node(pol, vma, off);
1558 	} else
1559 		return interleave_nodes(pol);
1560 }
1561 
1562 #ifdef CONFIG_HUGETLBFS
1563 /*
1564  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1565  * @vma = virtual memory area whose policy is sought
1566  * @addr = address in @vma for shared policy lookup and interleave policy
1567  * @gfp_flags = for requested zone
1568  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1569  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1570  *
1571  * Returns a zonelist suitable for a huge page allocation and a pointer
1572  * to the struct mempolicy for conditional unref after allocation.
1573  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1574  * @nodemask for filtering the zonelist.
1575  */
1576 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1577 				gfp_t gfp_flags, struct mempolicy **mpol,
1578 				nodemask_t **nodemask)
1579 {
1580 	struct zonelist *zl;
1581 
1582 	*mpol = get_vma_policy(current, vma, addr);
1583 	*nodemask = NULL;	/* assume !MPOL_BIND */
1584 
1585 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1586 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1587 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1588 	} else {
1589 		zl = policy_zonelist(gfp_flags, *mpol);
1590 		if ((*mpol)->mode == MPOL_BIND)
1591 			*nodemask = &(*mpol)->v.nodes;
1592 	}
1593 	return zl;
1594 }
1595 
1596 /*
1597  * init_nodemask_of_mempolicy
1598  *
1599  * If the current task's mempolicy is "default" [NULL], return 'false'
1600  * to indicate default policy.  Otherwise, extract the policy nodemask
1601  * for 'bind' or 'interleave' policy into the argument nodemask, or
1602  * initialize the argument nodemask to contain the single node for
1603  * 'preferred' or 'local' policy and return 'true' to indicate presence
1604  * of non-default mempolicy.
1605  *
1606  * We don't bother with reference counting the mempolicy [mpol_get/put]
1607  * because the current task is examining its own mempolicy and a task's
1608  * mempolicy is only ever changed by the task itself.
1609  *
1610  * N.B., it is the caller's responsibility to free a returned nodemask.
1611  */
1612 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1613 {
1614 	struct mempolicy *mempolicy;
1615 	int nid;
1616 
1617 	if (!(mask && current->mempolicy))
1618 		return false;
1619 
1620 	mempolicy = current->mempolicy;
1621 	switch (mempolicy->mode) {
1622 	case MPOL_PREFERRED:
1623 		if (mempolicy->flags & MPOL_F_LOCAL)
1624 			nid = numa_node_id();
1625 		else
1626 			nid = mempolicy->v.preferred_node;
1627 		init_nodemask_of_node(mask, nid);
1628 		break;
1629 
1630 	case MPOL_BIND:
1631 		/* Fall through */
1632 	case MPOL_INTERLEAVE:
1633 		*mask =  mempolicy->v.nodes;
1634 		break;
1635 
1636 	default:
1637 		BUG();
1638 	}
1639 
1640 	return true;
1641 }
1642 #endif
1643 
1644 /* Allocate a page in interleaved policy.
1645    Own path because it needs to do special accounting. */
1646 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1647 					unsigned nid)
1648 {
1649 	struct zonelist *zl;
1650 	struct page *page;
1651 
1652 	zl = node_zonelist(nid, gfp);
1653 	page = __alloc_pages(gfp, order, zl);
1654 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1655 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1656 	return page;
1657 }
1658 
1659 /**
1660  * 	alloc_page_vma	- Allocate a page for a VMA.
1661  *
1662  * 	@gfp:
1663  *      %GFP_USER    user allocation.
1664  *      %GFP_KERNEL  kernel allocations,
1665  *      %GFP_HIGHMEM highmem/user allocations,
1666  *      %GFP_FS      allocation should not call back into a file system.
1667  *      %GFP_ATOMIC  don't sleep.
1668  *
1669  * 	@vma:  Pointer to VMA or NULL if not available.
1670  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1671  *
1672  * 	This function allocates a page from the kernel page pool and applies
1673  *	a NUMA policy associated with the VMA or the current process.
1674  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1675  *	mm_struct of the VMA to prevent it from going away. Should be used for
1676  *	all allocations for pages that will be mapped into
1677  * 	user space. Returns NULL when no page can be allocated.
1678  *
1679  *	Should be called with the mmap_sem of the vma held.
1680  */
1681 struct page *
1682 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1683 {
1684 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1685 	struct zonelist *zl;
1686 
1687 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1688 		unsigned nid;
1689 
1690 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1691 		mpol_cond_put(pol);
1692 		return alloc_page_interleave(gfp, 0, nid);
1693 	}
1694 	zl = policy_zonelist(gfp, pol);
1695 	if (unlikely(mpol_needs_cond_ref(pol))) {
1696 		/*
1697 		 * slow path: ref counted shared policy
1698 		 */
1699 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1700 						zl, policy_nodemask(gfp, pol));
1701 		__mpol_put(pol);
1702 		return page;
1703 	}
1704 	/*
1705 	 * fast path:  default or task policy
1706 	 */
1707 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1708 }
1709 
1710 /**
1711  * 	alloc_pages_current - Allocate pages.
1712  *
1713  *	@gfp:
1714  *		%GFP_USER   user allocation,
1715  *      	%GFP_KERNEL kernel allocation,
1716  *      	%GFP_HIGHMEM highmem allocation,
1717  *      	%GFP_FS     don't call back into a file system.
1718  *      	%GFP_ATOMIC don't sleep.
1719  *	@order: Power of two of allocation size in pages. 0 is a single page.
1720  *
1721  *	Allocate a page from the kernel page pool.  When not in
1722  *	interrupt context, apply the current process' NUMA policy.
1723  *	Returns NULL when no page can be allocated.
1724  *
1725  *	Don't call cpuset_update_task_memory_state() unless
1726  *	1) it's ok to take cpuset_sem (can WAIT), and
1727  *	2) allocating for current task (not interrupt).
1728  */
1729 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1730 {
1731 	struct mempolicy *pol = current->mempolicy;
1732 
1733 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1734 		pol = &default_policy;
1735 
1736 	/*
1737 	 * No reference counting needed for current->mempolicy
1738 	 * nor system default_policy
1739 	 */
1740 	if (pol->mode == MPOL_INTERLEAVE)
1741 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1742 	return __alloc_pages_nodemask(gfp, order,
1743 			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1744 }
1745 EXPORT_SYMBOL(alloc_pages_current);
1746 
1747 /*
1748  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1749  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1750  * with the mems_allowed returned by cpuset_mems_allowed().  This
1751  * keeps mempolicies cpuset relative after its cpuset moves.  See
1752  * further kernel/cpuset.c update_nodemask().
1753  */
1754 
1755 /* Slow path of a mempolicy duplicate */
1756 struct mempolicy *__mpol_dup(struct mempolicy *old)
1757 {
1758 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1759 
1760 	if (!new)
1761 		return ERR_PTR(-ENOMEM);
1762 	rcu_read_lock();
1763 	if (current_cpuset_is_being_rebound()) {
1764 		nodemask_t mems = cpuset_mems_allowed(current);
1765 		mpol_rebind_policy(old, &mems);
1766 	}
1767 	rcu_read_unlock();
1768 	*new = *old;
1769 	atomic_set(&new->refcnt, 1);
1770 	return new;
1771 }
1772 
1773 /*
1774  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1775  * eliminate the MPOL_F_* flags that require conditional ref and
1776  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1777  * after return.  Use the returned value.
1778  *
1779  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1780  * policy lookup, even if the policy needs/has extra ref on lookup.
1781  * shmem_readahead needs this.
1782  */
1783 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1784 						struct mempolicy *frompol)
1785 {
1786 	if (!mpol_needs_cond_ref(frompol))
1787 		return frompol;
1788 
1789 	*tompol = *frompol;
1790 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1791 	__mpol_put(frompol);
1792 	return tompol;
1793 }
1794 
1795 static int mpol_match_intent(const struct mempolicy *a,
1796 			     const struct mempolicy *b)
1797 {
1798 	if (a->flags != b->flags)
1799 		return 0;
1800 	if (!mpol_store_user_nodemask(a))
1801 		return 1;
1802 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1803 }
1804 
1805 /* Slow path of a mempolicy comparison */
1806 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1807 {
1808 	if (!a || !b)
1809 		return 0;
1810 	if (a->mode != b->mode)
1811 		return 0;
1812 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1813 		return 0;
1814 	switch (a->mode) {
1815 	case MPOL_BIND:
1816 		/* Fall through */
1817 	case MPOL_INTERLEAVE:
1818 		return nodes_equal(a->v.nodes, b->v.nodes);
1819 	case MPOL_PREFERRED:
1820 		return a->v.preferred_node == b->v.preferred_node &&
1821 			a->flags == b->flags;
1822 	default:
1823 		BUG();
1824 		return 0;
1825 	}
1826 }
1827 
1828 /*
1829  * Shared memory backing store policy support.
1830  *
1831  * Remember policies even when nobody has shared memory mapped.
1832  * The policies are kept in Red-Black tree linked from the inode.
1833  * They are protected by the sp->lock spinlock, which should be held
1834  * for any accesses to the tree.
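 *
 * Each sp_node in the tree covers a half-open [start, end) range of file
 * page offsets (pgoff units); ranges in the tree never overlap.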
1835  */
1836 
1837 /* lookup first element intersecting start-end */
1838 /* Caller holds sp->lock */
1839 static struct sp_node *
1840 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1841 {
1842 	struct rb_node *n = sp->root.rb_node;
1843 
1844 	while (n) {
1845 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1846 
1847 		if (start >= p->end)
1848 			n = n->rb_right;
1849 		else if (end <= p->start)
1850 			n = n->rb_left;
1851 		else
1852 			break;
1853 	}
1854 	if (!n)
1855 		return NULL;
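	/* Walk back to the lowest node that still intersects [start, end) */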
1856 	for (;;) {
1857 		struct sp_node *w = NULL;
1858 		struct rb_node *prev = rb_prev(n);
1859 		if (!prev)
1860 			break;
1861 		w = rb_entry(prev, struct sp_node, nd);
1862 		if (w->end <= start)
1863 			break;
1864 		n = prev;
1865 	}
1866 	return rb_entry(n, struct sp_node, nd);
1867 }
1868 
1869 /* Insert a new shared policy into the list. */
1870 /* Caller holds sp->lock */
1871 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1872 {
1873 	struct rb_node **p = &sp->root.rb_node;
1874 	struct rb_node *parent = NULL;
1875 	struct sp_node *nd;
1876 
1877 	while (*p) {
1878 		parent = *p;
1879 		nd = rb_entry(parent, struct sp_node, nd);
1880 		if (new->start < nd->start)
1881 			p = &(*p)->rb_left;
1882 		else if (new->end > nd->end)
1883 			p = &(*p)->rb_right;
1884 		else
1885 			BUG();
1886 	}
1887 	rb_link_node(&new->nd, parent, p);
1888 	rb_insert_color(&new->nd, &sp->root);
1889 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1890 		 new->policy ? new->policy->mode : 0);
1891 }
1892 
1893 /* Find shared policy intersecting idx */
1894 struct mempolicy *
1895 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1896 {
1897 	struct mempolicy *pol = NULL;
1898 	struct sp_node *sn;
1899 
1900 	if (!sp->root.rb_node)
1901 		return NULL;
1902 	spin_lock(&sp->lock);
1903 	sn = sp_lookup(sp, idx, idx+1);
1904 	if (sn) {
1905 		mpol_get(sn->policy);
1906 		pol = sn->policy;
1907 	}
1908 	spin_unlock(&sp->lock);
1909 	return pol;
1910 }
1911 
1912 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1913 {
1914 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1915 	rb_erase(&n->nd, &sp->root);
1916 	mpol_put(n->policy);
1917 	kmem_cache_free(sn_cache, n);
1918 }
1919 
1920 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1921 				struct mempolicy *pol)
1922 {
1923 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1924 
1925 	if (!n)
1926 		return NULL;
1927 	n->start = start;
1928 	n->end = end;
1929 	mpol_get(pol);
1930 	pol->flags |= MPOL_F_SHARED;	/* for unref */
1931 	n->policy = pol;
1932 	return n;
1933 }
1934 
1935 /* Replace a policy range. */
1936 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1937 				 unsigned long end, struct sp_node *new)
1938 {
1939 	struct sp_node *n, *new2 = NULL;
1940 
1941 restart:
1942 	spin_lock(&sp->lock);
1943 	n = sp_lookup(sp, start, end);
1944 	/* Take care of old policies in the same range. */
1945 	while (n && n->start < end) {
1946 		struct rb_node *next = rb_next(&n->nd);
1947 		if (n->start >= start) {
1948 			if (n->end <= end)
1949 				sp_delete(sp, n);
1950 			else
1951 				n->start = end;
1952 		} else {
1953 			/* Old policy spanning whole new range. */
1954 			if (n->end > end) {
1955 				if (!new2) {
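					/*
					 * sp_alloc() can sleep (GFP_KERNEL), so
					 * drop the spinlock, allocate the node
					 * for the trailing part of the old
					 * range, then restart the scan.
					 */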
1956 					spin_unlock(&sp->lock);
1957 					new2 = sp_alloc(end, n->end, n->policy);
1958 					if (!new2)
1959 						return -ENOMEM;
1960 					goto restart;
1961 				}
1962 				n->end = start;
1963 				sp_insert(sp, new2);
1964 				new2 = NULL;
1965 				break;
1966 			} else
1967 				n->end = start;
1968 		}
1969 		if (!next)
1970 			break;
1971 		n = rb_entry(next, struct sp_node, nd);
1972 	}
1973 	if (new)
1974 		sp_insert(sp, new);
1975 	spin_unlock(&sp->lock);
1976 	if (new2) {
1977 		mpol_put(new2->policy);
1978 		kmem_cache_free(sn_cache, new2);
1979 	}
1980 	return 0;
1981 }
1982 
1983 /**
1984  * mpol_shared_policy_init - initialize shared policy for inode
1985  * @sp: pointer to inode shared policy
1986  * @mpol:  struct mempolicy to install
1987  *
1988  * Install non-NULL @mpol in inode's shared policy rb-tree.
1989  * On entry, the current task has a reference on a non-NULL @mpol.
1990  * This must be released on exit.
1991  * This is called during get_inode(), so we can use GFP_KERNEL.
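 * (e.g. a tmpfs mount using the "mpol=interleave:0-3" option ends up here
 * with that policy for each new inode -- illustrative example, not taken
 * from this file.)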
1992  */
1993 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1994 {
1995 	int ret;
1996 
1997 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1998 	spin_lock_init(&sp->lock);
1999 
2000 	if (mpol) {
2001 		struct vm_area_struct pvma;
2002 		struct mempolicy *new;
2003 		NODEMASK_SCRATCH(scratch);
2004 
2005 		if (!scratch)
2006 			return;
2007 		/* contextualize the tmpfs mount point mempolicy */
2008 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2009 		if (IS_ERR(new)) {
2010 			mpol_put(mpol);	/* drop our ref on sb mpol */
2011 			NODEMASK_SCRATCH_FREE(scratch);
2012 			return;		/* no valid nodemask intersection */
2013 		}
2014 
2015 		task_lock(current);
2016 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2017 		task_unlock(current);
2018 		mpol_put(mpol);	/* drop our ref on sb mpol */
2019 		if (ret) {
2020 			NODEMASK_SCRATCH_FREE(scratch);
2021 			mpol_put(new);
2022 			return;
2023 		}
2024 
2025 		/* Create pseudo-vma that contains just the policy */
2026 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2027 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2028 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2029 		mpol_put(new);			/* drop initial ref */
2030 		NODEMASK_SCRATCH_FREE(scratch);
2031 	}
2032 }
2033 
2034 int mpol_set_shared_policy(struct shared_policy *info,
2035 			struct vm_area_struct *vma, struct mempolicy *npol)
2036 {
2037 	int err;
2038 	struct sp_node *new = NULL;
2039 	unsigned long sz = vma_pages(vma);
2040 
2041 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2042 		 vma->vm_pgoff,
2043 		 sz, npol ? npol->mode : -1,
2044 		 npol ? npol->flags : -1,
2045 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2046 
2047 	if (npol) {
2048 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2049 		if (!new)
2050 			return -ENOMEM;
2051 	}
2052 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2053 	if (err && new)
2054 		kmem_cache_free(sn_cache, new);
2055 	return err;
2056 }
2057 
2058 /* Free a backing policy store on inode delete. */
2059 void mpol_free_shared_policy(struct shared_policy *p)
2060 {
2061 	struct sp_node *n;
2062 	struct rb_node *next;
2063 
2064 	if (!p->root.rb_node)
2065 		return;
2066 	spin_lock(&p->lock);
2067 	next = rb_first(&p->root);
2068 	while (next) {
2069 		n = rb_entry(next, struct sp_node, nd);
2070 		next = rb_next(&n->nd);
2071 		rb_erase(&n->nd, &p->root);
2072 		mpol_put(n->policy);
2073 		kmem_cache_free(sn_cache, n);
2074 	}
2075 	spin_unlock(&p->lock);
2076 }
2077 
2078 /* assumes fs == KERNEL_DS */
2079 void __init numa_policy_init(void)
2080 {
2081 	nodemask_t interleave_nodes;
2082 	unsigned long largest = 0;
2083 	int nid, prefer = 0;
2084 
2085 	policy_cache = kmem_cache_create("numa_policy",
2086 					 sizeof(struct mempolicy),
2087 					 0, SLAB_PANIC, NULL);
2088 
2089 	sn_cache = kmem_cache_create("shared_policy_node",
2090 				     sizeof(struct sp_node),
2091 				     0, SLAB_PANIC, NULL);
2092 
2093 	/*
2094 	 * Set interleaving policy for system init. Interleaving is only
2095 	 * enabled across suitably sized nodes (default is >= 16MB), or
2096 	 * fall back to the largest node if they're all smaller.
2097 	 */
2098 	nodes_clear(interleave_nodes);
2099 	for_each_node_state(nid, N_HIGH_MEMORY) {
2100 		unsigned long total_pages = node_present_pages(nid);
2101 
2102 		/* Preserve the largest node */
2103 		if (largest < total_pages) {
2104 			largest = total_pages;
2105 			prefer = nid;
2106 		}
2107 
2108 		/* Interleave this node? */
2109 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2110 			node_set(nid, interleave_nodes);
2111 	}
2112 
2113 	/* All too small, use the largest */
2114 	if (unlikely(nodes_empty(interleave_nodes)))
2115 		node_set(prefer, interleave_nodes);
2116 
2117 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2118 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2119 }
2120 
2121 /* Reset policy of current process to default */
2122 void numa_default_policy(void)
2123 {
2124 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2125 }
2126 
2127 /*
2128  * Parse and format mempolicy from/to strings
2129  */
2130 
2131 /*
2132  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2133  * Used only for mpol_parse_str() and mpol_to_str()
2134  */
2135 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2136 static const char * const policy_types[] =
2137 	{ "default", "prefer", "bind", "interleave", "local" };
2138 
2139 
2140 #ifdef CONFIG_TMPFS
2141 /**
2142  * mpol_parse_str - parse string to mempolicy
2143  * @str:  string containing mempolicy to parse
2144  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2145  * @no_context:  flag whether to "contextualize" the mempolicy
2146  *
2147  * Format of input:
2148  *	<mode>[=<flags>][:<nodelist>]
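 *	e.g. "interleave:0-3", "prefer:1", "bind=static:0,2" or "local"
 *	(illustrative strings matching the format above)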
2149  *
2150  * If @no_context is true, save the input nodemask in w.user_nodemask in
2151  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2152  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2153  * mount option.  Note that if 'static' or 'relative' mode flags were
2154  * specified, the input nodemask will already have been saved.  Saving
2155  * it again is redundant, but safe.
2156  *
2157  * On success, returns 0, else 1
2158  */
2159 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2160 {
2161 	struct mempolicy *new = NULL;
2162 	unsigned short uninitialized_var(mode);
2163 	unsigned short uninitialized_var(mode_flags);
2164 	nodemask_t nodes;
2165 	char *nodelist = strchr(str, ':');
2166 	char *flags = strchr(str, '=');
2167 	int i;
2168 	int err = 1;
2169 
2170 	if (nodelist) {
2171 		/* NUL-terminate mode or flags string */
2172 		*nodelist++ = '\0';
2173 		if (nodelist_parse(nodelist, nodes))
2174 			goto out;
2175 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2176 			goto out;
2177 	} else
2178 		nodes_clear(nodes);
2179 
2180 	if (flags)
2181 		*flags++ = '\0';	/* terminate mode string */
2182 
2183 	for (i = 0; i <= MPOL_LOCAL; i++) {
2184 		if (!strcmp(str, policy_types[i])) {
2185 			mode = i;
2186 			break;
2187 		}
2188 	}
2189 	if (i > MPOL_LOCAL)
2190 		goto out;
2191 
2192 	switch (mode) {
2193 	case MPOL_PREFERRED:
2194 		/*
2195 		 * Insist on a nodelist of one node only
2196 		 */
2197 		if (nodelist) {
2198 			char *rest = nodelist;
2199 			while (isdigit(*rest))
2200 				rest++;
2201 			if (*rest)
2202 				goto out;
2203 		}
2204 		break;
2205 	case MPOL_INTERLEAVE:
2206 		/*
2207 		 * Default to online nodes with memory if no nodelist
2208 		 */
2209 		if (!nodelist)
2210 			nodes = node_states[N_HIGH_MEMORY];
2211 		break;
2212 	case MPOL_LOCAL:
2213 		/*
2214 		 * Don't allow a nodelist;  mpol_new() checks flags
2215 		 */
2216 		if (nodelist)
2217 			goto out;
2218 		mode = MPOL_PREFERRED;
2219 		break;
2220 	case MPOL_DEFAULT:
2221 		/*
2222 		 * Insist on an empty nodelist
2223 		 */
2224 		if (!nodelist)
2225 			err = 0;
2226 		goto out;
2227 	case MPOL_BIND:
2228 		/*
2229 		 * Insist on a nodelist
2230 		 */
2231 		if (!nodelist)
2232 			goto out;
2233 	}
2234 
2235 	mode_flags = 0;
2236 	if (flags) {
2237 		/*
2238 		 * Currently, we only support two mutually exclusive
2239 		 * mode flags.
2240 		 */
2241 		if (!strcmp(flags, "static"))
2242 			mode_flags |= MPOL_F_STATIC_NODES;
2243 		else if (!strcmp(flags, "relative"))
2244 			mode_flags |= MPOL_F_RELATIVE_NODES;
2245 		else
2246 			goto out;
2247 	}
2248 
2249 	new = mpol_new(mode, mode_flags, &nodes);
2250 	if (IS_ERR(new))
2251 		goto out;
2252 
2253 	{
2254 		int ret;
2255 		NODEMASK_SCRATCH(scratch);
2256 		if (scratch) {
2257 			task_lock(current);
2258 			ret = mpol_set_nodemask(new, &nodes, scratch);
2259 			task_unlock(current);
2260 		} else
2261 			ret = -ENOMEM;
2262 		NODEMASK_SCRATCH_FREE(scratch);
2263 		if (ret) {
2264 			mpol_put(new);
2265 			goto out;
2266 		}
2267 	}
2268 	err = 0;
2269 	if (no_context) {
2270 		/* save for contextualization */
2271 		new->w.user_nodemask = nodes;
2272 	}
2273 
2274 out:
2275 	/* Restore string for error message */
2276 	if (nodelist)
2277 		*--nodelist = ':';
2278 	if (flags)
2279 		*--flags = '=';
2280 	if (!err)
2281 		*mpol = new;
2282 	return err;
2283 }
2284 #endif /* CONFIG_TMPFS */
2285 
2286 /**
2287  * mpol_to_str - format a mempolicy structure for printing
2288  * @buffer:  to contain formatted mempolicy string
2289  * @maxlen:  length of @buffer
2290  * @pol:  pointer to mempolicy to be formatted
2291  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2292  *
2293  * Convert a mempolicy into a string.
2294  * Returns the number of characters in buffer (if positive)
2295  * or an error (negative)
2296  */
2297 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2298 {
2299 	char *p = buffer;
2300 	int l;
2301 	nodemask_t nodes;
2302 	unsigned short mode;
2303 	unsigned short flags = pol ? pol->flags : 0;
2304 
2305 	/*
2306 	 * Sanity check:  room for longest mode, flag and some nodes
2307 	 */
2308 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2309 
2310 	if (!pol || pol == &default_policy)
2311 		mode = MPOL_DEFAULT;
2312 	else
2313 		mode = pol->mode;
2314 
2315 	switch (mode) {
2316 	case MPOL_DEFAULT:
2317 		nodes_clear(nodes);
2318 		break;
2319 
2320 	case MPOL_PREFERRED:
2321 		nodes_clear(nodes);
2322 		if (flags & MPOL_F_LOCAL)
2323 			mode = MPOL_LOCAL;	/* pseudo-policy */
2324 		else
2325 			node_set(pol->v.preferred_node, nodes);
2326 		break;
2327 
2328 	case MPOL_BIND:
2329 		/* Fall through */
2330 	case MPOL_INTERLEAVE:
2331 		if (no_context)
2332 			nodes = pol->w.user_nodemask;
2333 		else
2334 			nodes = pol->v.nodes;
2335 		break;
2336 
2337 	default:
2338 		BUG();
2339 	}
2340 
2341 	l = strlen(policy_types[mode]);
2342 	if (buffer + maxlen < p + l + 1)
2343 		return -ENOSPC;
2344 
2345 	strcpy(p, policy_types[mode]);
2346 	p += l;
2347 
2348 	if (flags & MPOL_MODE_FLAGS) {
2349 		if (buffer + maxlen < p + 2)
2350 			return -ENOSPC;
2351 		*p++ = '=';
2352 
2353 		/*
2354 		 * Currently, the only defined flags are mutually exclusive
2355 		 */
2356 		if (flags & MPOL_F_STATIC_NODES)
2357 			p += snprintf(p, buffer + maxlen - p, "static");
2358 		else if (flags & MPOL_F_RELATIVE_NODES)
2359 			p += snprintf(p, buffer + maxlen - p, "relative");
2360 	}
2361 
2362 	if (!nodes_empty(nodes)) {
2363 		if (buffer + maxlen < p + 2)
2364 			return -ENOSPC;
2365 		*p++ = ':';
2366 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2367 	}
2368 	return p - buffer;
2369 }
2370 
2371 struct numa_maps {
2372 	unsigned long pages;
2373 	unsigned long anon;
2374 	unsigned long active;
2375 	unsigned long writeback;
2376 	unsigned long mapcount_max;
2377 	unsigned long dirty;
2378 	unsigned long swapcache;
2379 	unsigned long node[MAX_NUMNODES];
2380 };
2381 
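/* Accumulate per-page statistics into the struct numa_maps passed as private */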
2382 static void gather_stats(struct page *page, void *private, int pte_dirty)
2383 {
2384 	struct numa_maps *md = private;
2385 	int count = page_mapcount(page);
2386 
2387 	md->pages++;
2388 	if (pte_dirty || PageDirty(page))
2389 		md->dirty++;
2390 
2391 	if (PageSwapCache(page))
2392 		md->swapcache++;
2393 
2394 	if (PageActive(page) || PageUnevictable(page))
2395 		md->active++;
2396 
2397 	if (PageWriteback(page))
2398 		md->writeback++;
2399 
2400 	if (PageAnon(page))
2401 		md->anon++;
2402 
2403 	if (count > md->mapcount_max)
2404 		md->mapcount_max = count;
2405 
2406 	md->node[page_to_nid(page)]++;
2407 }
2408 
2409 #ifdef CONFIG_HUGETLB_PAGE
2410 static void check_huge_range(struct vm_area_struct *vma,
2411 		unsigned long start, unsigned long end,
2412 		struct numa_maps *md)
2413 {
2414 	unsigned long addr;
2415 	struct page *page;
2416 	struct hstate *h = hstate_vma(vma);
2417 	unsigned long sz = huge_page_size(h);
2418 
2419 	for (addr = start; addr < end; addr += sz) {
2420 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2421 						addr & huge_page_mask(h));
2422 		pte_t pte;
2423 
2424 		if (!ptep)
2425 			continue;
2426 
2427 		pte = *ptep;
2428 		if (pte_none(pte))
2429 			continue;
2430 
2431 		page = pte_page(pte);
2432 		if (!page)
2433 			continue;
2434 
2435 		gather_stats(page, md, pte_dirty(*ptep));
2436 	}
2437 }
2438 #else
2439 static inline void check_huge_range(struct vm_area_struct *vma,
2440 		unsigned long start, unsigned long end,
2441 		struct numa_maps *md)
2442 {
2443 }
2444 #endif
2445 
2446 /*
2447  * Display pages allocated per node and memory policy via /proc.
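 *
 * Illustrative output line (format per the code below, values made up):
 *	00400000 default file=/bin/cat mapped=7 mapmax=2 N0=7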
2448  */
2449 int show_numa_map(struct seq_file *m, void *v)
2450 {
2451 	struct proc_maps_private *priv = m->private;
2452 	struct vm_area_struct *vma = v;
2453 	struct numa_maps *md;
2454 	struct file *file = vma->vm_file;
2455 	struct mm_struct *mm = vma->vm_mm;
2456 	struct mempolicy *pol;
2457 	int n;
2458 	char buffer[50];
2459 
2460 	if (!mm)
2461 		return 0;
2462 
2463 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2464 	if (!md)
2465 		return 0;
2466 
2467 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2468 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2469 	mpol_cond_put(pol);
2470 
2471 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2472 
2473 	if (file) {
2474 		seq_printf(m, " file=");
2475 		seq_path(m, &file->f_path, "\n\t= ");
2476 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2477 		seq_printf(m, " heap");
2478 	} else if (vma->vm_start <= mm->start_stack &&
2479 			vma->vm_end >= mm->start_stack) {
2480 		seq_printf(m, " stack");
2481 	}
2482 
2483 	if (is_vm_hugetlb_page(vma)) {
2484 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2485 		seq_printf(m, " huge");
2486 	} else {
2487 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2488 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2489 	}
2490 
2491 	if (!md->pages)
2492 		goto out;
2493 
2494 	if (md->anon)
2495 		seq_printf(m," anon=%lu",md->anon);
2496 
2497 	if (md->dirty)
2498 		seq_printf(m," dirty=%lu",md->dirty);
2499 
2500 	if (md->pages != md->anon && md->pages != md->dirty)
2501 		seq_printf(m, " mapped=%lu", md->pages);
2502 
2503 	if (md->mapcount_max > 1)
2504 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2505 
2506 	if (md->swapcache)
2507 		seq_printf(m," swapcache=%lu", md->swapcache);
2508 
2509 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2510 		seq_printf(m," active=%lu", md->active);
2511 
2512 	if (md->writeback)
2513 		seq_printf(m," writeback=%lu", md->writeback);
2514 
2515 	for_each_node_state(n, N_HIGH_MEMORY)
2516 		if (md->node[n])
2517 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2518 out:
2519 	seq_putc(m, '\n');
2520 	kfree(md);
2521 
2522 	if (m->count < m->size)
2523 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2524 	return 0;
2525 }
2526