xref: /linux/mm/mempolicy.c (revision 5bdef865eb358b6f3760e25e591ae115e9eeddef)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
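/*
 * Illustrative userspace usage (not part of this file; a minimal sketch
 * assuming the mbind()/set_mempolicy() wrappers from libnuma's <numaif.h>,
 * with addr/length standing in for an existing mapping):
 *
 *	#include <numaif.h>
 *
 *	unsigned long interleave_mask = (1UL << 0) | (1UL << 1);
 *	unsigned long bind_mask = 1UL << 0;
 *
 *	// Interleave this task's future allocations across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
 *		      8 * sizeof(interleave_mask));
 *
 *	// Restrict an existing mapping to node 0, moving misplaced pages.
 *	mbind(addr, length, MPOL_BIND, &bind_mask,
 *	      8 * sizeof(bind_mask), MPOL_MF_STRICT | MPOL_MF_MOVE);
 */
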
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66 */
67 
68 #include <linux/mempolicy.h>
69 #include <linux/mm.h>
70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h>
76 #include <linux/gfp.h>
77 #include <linux/slab.h>
78 #include <linux/string.h>
79 #include <linux/module.h>
80 #include <linux/nsproxy.h>
81 #include <linux/interrupt.h>
82 #include <linux/init.h>
83 #include <linux/compat.h>
84 #include <linux/swap.h>
85 #include <linux/seq_file.h>
86 #include <linux/proc_fs.h>
87 #include <linux/migrate.h>
88 #include <linux/rmap.h>
89 #include <linux/security.h>
90 #include <linux/syscalls.h>
91 #include <linux/ctype.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 #include "internal.h"
97 
98 /* Internal flags */
99 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
100 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
101 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
102 
103 static struct kmem_cache *policy_cache;
104 static struct kmem_cache *sn_cache;
105 
106 /* Highest zone. A specific allocation for a zone below that is not
107    policied. */
108 enum zone_type policy_zone = 0;
109 
110 /*
111  * run-time system-wide default policy => local allocation
112  */
113 struct mempolicy default_policy = {
114 	.refcnt = ATOMIC_INIT(1), /* never free it */
115 	.mode = MPOL_PREFERRED,
116 	.flags = MPOL_F_LOCAL,
117 };
118 
119 static const struct mempolicy_operations {
120 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
122 } mpol_ops[MPOL_MAX];
123 
124 /* Check that the nodemask contains at least one populated zone */
125 static int is_valid_nodemask(const nodemask_t *nodemask)
126 {
127 	int nd, k;
128 
129 	/* Check that there is something useful in this mask */
130 	k = policy_zone;
131 
132 	for_each_node_mask(nd, *nodemask) {
133 		struct zone *z;
134 
135 		for (k = 0; k <= policy_zone; k++) {
136 			z = &NODE_DATA(nd)->node_zones[k];
137 			if (z->present_pages > 0)
138 				return 1;
139 		}
140 	}
141 
142 	return 0;
143 }
144 
145 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
146 {
147 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
148 }
149 
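/*
 * Map a "relative" nodemask onto the currently allowed nodes: fold @orig
 * modulo the weight of @rel, then map bit i of the folded mask onto the
 * i-th set bit of @rel.  For example, a relative mask of {0,2} with
 * allowed nodes {4,5,6} yields {4,6}.
 */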
150 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
151 				   const nodemask_t *rel)
152 {
153 	nodemask_t tmp;
154 	nodes_fold(tmp, *orig, nodes_weight(*rel));
155 	nodes_onto(*ret, tmp, *rel);
156 }
157 
158 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
159 {
160 	if (nodes_empty(*nodes))
161 		return -EINVAL;
162 	pol->v.nodes = *nodes;
163 	return 0;
164 }
165 
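/*
 * A NULL @nodes pointer means explicit local allocation: mpol_set_nodemask()
 * passes NULL when the user supplied an empty nodemask for MPOL_PREFERRED.
 * A non-NULL but empty mask (e.g. after the cpuset intersection) is rejected.
 */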
166 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
167 {
168 	if (!nodes)
169 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
170 	else if (nodes_empty(*nodes))
171 		return -EINVAL;			/*  no allowed nodes */
172 	else
173 		pol->v.preferred_node = first_node(*nodes);
174 	return 0;
175 }
176 
177 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (!is_valid_nodemask(nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184 
185 /*
186  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187  * any, for the new policy.  mpol_new() has already validated the nodes
188  * parameter with respect to the policy mode and flags.  But, we need to
189  * handle an empty nodemask with MPOL_PREFERRED here.
190  *
191  * Must be called holding task's alloc_lock to protect task's mems_allowed
192  * and mempolicy.  May also be called holding the mmap_semaphore for write.
193  */
194 static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
195 {
196 	nodemask_t cpuset_context_nmask;
197 	int ret;
198 
199 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 	if (pol == NULL)
201 		return 0;
202 
203 	VM_BUG_ON(!nodes);
204 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 		nodes = NULL;	/* explicit local allocation */
206 	else {
207 		if (pol->flags & MPOL_F_RELATIVE_NODES)
208 			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
209 					       &cpuset_current_mems_allowed);
210 		else
211 			nodes_and(cpuset_context_nmask, *nodes,
212 				  cpuset_current_mems_allowed);
213 		if (mpol_store_user_nodemask(pol))
214 			pol->w.user_nodemask = *nodes;
215 		else
216 			pol->w.cpuset_mems_allowed =
217 						cpuset_current_mems_allowed;
218 	}
219 
220 	ret = mpol_ops[pol->mode].create(pol,
221 				nodes ? &cpuset_context_nmask : NULL);
222 	return ret;
223 }
224 
225 /*
226  * This function just creates a new policy, does some basic checks and simple
227  * initialization. The caller must invoke mpol_set_nodemask() to set the nodes.
228  */
229 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
230 				  nodemask_t *nodes)
231 {
232 	struct mempolicy *policy;
233 
234 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
235 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
236 
237 	if (mode == MPOL_DEFAULT) {
238 		if (nodes && !nodes_empty(*nodes))
239 			return ERR_PTR(-EINVAL);
240 		return NULL;	/* simply delete any existing policy */
241 	}
242 	VM_BUG_ON(!nodes);
243 
244 	/*
245 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
246 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
247 	 * All other modes require a valid pointer to a non-empty nodemask.
248 	 */
249 	if (mode == MPOL_PREFERRED) {
250 		if (nodes_empty(*nodes)) {
251 			if (((flags & MPOL_F_STATIC_NODES) ||
252 			     (flags & MPOL_F_RELATIVE_NODES)))
253 				return ERR_PTR(-EINVAL);
254 		}
255 	} else if (nodes_empty(*nodes))
256 		return ERR_PTR(-EINVAL);
257 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
258 	if (!policy)
259 		return ERR_PTR(-ENOMEM);
260 	atomic_set(&policy->refcnt, 1);
261 	policy->mode = mode;
262 	policy->flags = flags;
263 
264 	return policy;
265 }
266 
267 /* Slow path of a mpol destructor. */
268 void __mpol_put(struct mempolicy *p)
269 {
270 	if (!atomic_dec_and_test(&p->refcnt))
271 		return;
272 	kmem_cache_free(policy_cache, p);
273 }
274 
275 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
276 {
277 }
278 
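/*
 * Rebind an interleave or bind policy to a new set of allowed nodes:
 * MPOL_F_STATIC_NODES intersects the user's original mask with the new
 * set, MPOL_F_RELATIVE_NODES re-folds the user's mask onto the new set,
 * and the default behaviour remaps each node position from the old
 * cpuset placement to the new one.
 */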
279 static void mpol_rebind_nodemask(struct mempolicy *pol,
280 				 const nodemask_t *nodes)
281 {
282 	nodemask_t tmp;
283 
284 	if (pol->flags & MPOL_F_STATIC_NODES)
285 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
286 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
287 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
288 	else {
289 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
290 			    *nodes);
291 		pol->w.cpuset_mems_allowed = *nodes;
292 	}
293 
294 	pol->v.nodes = tmp;
295 	if (!node_isset(current->il_next, tmp)) {
296 		current->il_next = next_node(current->il_next, tmp);
297 		if (current->il_next >= MAX_NUMNODES)
298 			current->il_next = first_node(tmp);
299 		if (current->il_next >= MAX_NUMNODES)
300 			current->il_next = numa_node_id();
301 	}
302 }
303 
304 static void mpol_rebind_preferred(struct mempolicy *pol,
305 				  const nodemask_t *nodes)
306 {
307 	nodemask_t tmp;
308 
309 	if (pol->flags & MPOL_F_STATIC_NODES) {
310 		int node = first_node(pol->w.user_nodemask);
311 
312 		if (node_isset(node, *nodes)) {
313 			pol->v.preferred_node = node;
314 			pol->flags &= ~MPOL_F_LOCAL;
315 		} else
316 			pol->flags |= MPOL_F_LOCAL;
317 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
318 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
319 		pol->v.preferred_node = first_node(tmp);
320 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
321 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
322 						   pol->w.cpuset_mems_allowed,
323 						   *nodes);
324 		pol->w.cpuset_mems_allowed = *nodes;
325 	}
326 }
327 
328 /* Migrate a policy to a different set of nodes */
329 static void mpol_rebind_policy(struct mempolicy *pol,
330 			       const nodemask_t *newmask)
331 {
332 	if (!pol)
333 		return;
334 	if (!mpol_store_user_nodemask(pol) &&
335 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
336 		return;
337 	mpol_ops[pol->mode].rebind(pol, newmask);
338 }
339 
340 /*
341  * Wrapper for mpol_rebind_policy() that just requires a task
342  * pointer, and updates the task's mempolicy.
343  *
344  * Called with task's alloc_lock held.
345  */
346 
347 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
348 {
349 	mpol_rebind_policy(tsk->mempolicy, new);
350 }
351 
352 /*
353  * Rebind each vma in mm to new nodemask.
354  *
355  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
356  */
357 
358 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
359 {
360 	struct vm_area_struct *vma;
361 
362 	down_write(&mm->mmap_sem);
363 	for (vma = mm->mmap; vma; vma = vma->vm_next)
364 		mpol_rebind_policy(vma->vm_policy, new);
365 	up_write(&mm->mmap_sem);
366 }
367 
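/*
 * MPOL_DEFAULT has no create() method: mpol_new() returns NULL for it
 * rather than allocating a mempolicy, so only the no-op rebind callback
 * is provided here.
 */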
368 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
369 	[MPOL_DEFAULT] = {
370 		.rebind = mpol_rebind_default,
371 	},
372 	[MPOL_INTERLEAVE] = {
373 		.create = mpol_new_interleave,
374 		.rebind = mpol_rebind_nodemask,
375 	},
376 	[MPOL_PREFERRED] = {
377 		.create = mpol_new_preferred,
378 		.rebind = mpol_rebind_preferred,
379 	},
380 	[MPOL_BIND] = {
381 		.create = mpol_new_bind,
382 		.rebind = mpol_rebind_nodemask,
383 	},
384 };
385 
386 static void gather_stats(struct page *, void *, int pte_dirty);
387 static void migrate_page_add(struct page *page, struct list_head *pagelist,
388 				unsigned long flags);
389 
390 /* Scan a pte range, checking whether the pages satisfy the given node constraints. */
391 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
392 		unsigned long addr, unsigned long end,
393 		const nodemask_t *nodes, unsigned long flags,
394 		void *private)
395 {
396 	pte_t *orig_pte;
397 	pte_t *pte;
398 	spinlock_t *ptl;
399 
400 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
401 	do {
402 		struct page *page;
403 		int nid;
404 
405 		if (!pte_present(*pte))
406 			continue;
407 		page = vm_normal_page(vma, addr, *pte);
408 		if (!page)
409 			continue;
410 		/*
411 		 * The check for PageReserved here is important to avoid
412 		 * handling zero pages and other pages that may have been
413 		 * marked special by the system.
414 		 *
415 		 * If PageReserved were not checked here then, e.g.,
416 		 * the location of the zero page could influence
417 		 * MPOL_MF_STRICT, zero pages would be counted in
418 		 * the per-node stats, and there would be useless attempts
419 		 * to put zero pages on the migration list.
420 		 */
421 		if (PageReserved(page))
422 			continue;
423 		nid = page_to_nid(page);
424 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
425 			continue;
426 
427 		if (flags & MPOL_MF_STATS)
428 			gather_stats(page, private, pte_dirty(*pte));
429 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
430 			migrate_page_add(page, private, flags);
431 		else
432 			break;
433 	} while (pte++, addr += PAGE_SIZE, addr != end);
434 	pte_unmap_unlock(orig_pte, ptl);
435 	return addr != end;
436 }
437 
438 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
439 		unsigned long addr, unsigned long end,
440 		const nodemask_t *nodes, unsigned long flags,
441 		void *private)
442 {
443 	pmd_t *pmd;
444 	unsigned long next;
445 
446 	pmd = pmd_offset(pud, addr);
447 	do {
448 		next = pmd_addr_end(addr, end);
449 		if (pmd_none_or_clear_bad(pmd))
450 			continue;
451 		if (check_pte_range(vma, pmd, addr, next, nodes,
452 				    flags, private))
453 			return -EIO;
454 	} while (pmd++, addr = next, addr != end);
455 	return 0;
456 }
457 
458 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
459 		unsigned long addr, unsigned long end,
460 		const nodemask_t *nodes, unsigned long flags,
461 		void *private)
462 {
463 	pud_t *pud;
464 	unsigned long next;
465 
466 	pud = pud_offset(pgd, addr);
467 	do {
468 		next = pud_addr_end(addr, end);
469 		if (pud_none_or_clear_bad(pud))
470 			continue;
471 		if (check_pmd_range(vma, pud, addr, next, nodes,
472 				    flags, private))
473 			return -EIO;
474 	} while (pud++, addr = next, addr != end);
475 	return 0;
476 }
477 
478 static inline int check_pgd_range(struct vm_area_struct *vma,
479 		unsigned long addr, unsigned long end,
480 		const nodemask_t *nodes, unsigned long flags,
481 		void *private)
482 {
483 	pgd_t *pgd;
484 	unsigned long next;
485 
486 	pgd = pgd_offset(vma->vm_mm, addr);
487 	do {
488 		next = pgd_addr_end(addr, end);
489 		if (pgd_none_or_clear_bad(pgd))
490 			continue;
491 		if (check_pud_range(vma, pgd, addr, next, nodes,
492 				    flags, private))
493 			return -EIO;
494 	} while (pgd++, addr = next, addr != end);
495 	return 0;
496 }
497 
498 /*
499  * Check if all pages in a range are on a set of nodes.
500  * If pagelist != NULL then isolate pages from the LRU and
501  * put them on the pagelist.
502  */
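/*
 * Returns the first vma covering the range on success.  Without
 * MPOL_MF_DISCONTIG_OK, an unmapped hole in the range yields
 * ERR_PTR(-EFAULT); with MPOL_MF_STRICT and no move/stats flags, a page
 * found on a disallowed node yields ERR_PTR(-EIO).
 */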
503 static struct vm_area_struct *
504 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
505 		const nodemask_t *nodes, unsigned long flags, void *private)
506 {
507 	int err;
508 	struct vm_area_struct *first, *vma, *prev;
509 
510 
511 	first = find_vma(mm, start);
512 	if (!first)
513 		return ERR_PTR(-EFAULT);
514 	prev = NULL;
515 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
516 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
517 			if (!vma->vm_next && vma->vm_end < end)
518 				return ERR_PTR(-EFAULT);
519 			if (prev && prev->vm_end < vma->vm_start)
520 				return ERR_PTR(-EFAULT);
521 		}
522 		if (!is_vm_hugetlb_page(vma) &&
523 		    ((flags & MPOL_MF_STRICT) ||
524 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
525 				vma_migratable(vma)))) {
526 			unsigned long endvma = vma->vm_end;
527 
528 			if (endvma > end)
529 				endvma = end;
530 			if (vma->vm_start > start)
531 				start = vma->vm_start;
532 			err = check_pgd_range(vma, start, endvma, nodes,
533 						flags, private);
534 			if (err) {
535 				first = ERR_PTR(err);
536 				break;
537 			}
538 		}
539 		prev = vma;
540 	}
541 	return first;
542 }
543 
544 /* Apply policy to a single VMA */
545 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
546 {
547 	int err = 0;
548 	struct mempolicy *old = vma->vm_policy;
549 
550 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
551 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
552 		 vma->vm_ops, vma->vm_file,
553 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
554 
555 	if (vma->vm_ops && vma->vm_ops->set_policy)
556 		err = vma->vm_ops->set_policy(vma, new);
557 	if (!err) {
558 		mpol_get(new);
559 		vma->vm_policy = new;
560 		mpol_put(old);
561 	}
562 	return err;
563 }
564 
565 /* Step 2: apply policy to a range and do splits. */
566 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
567 		       unsigned long end, struct mempolicy *new)
568 {
569 	struct vm_area_struct *next;
570 	int err;
571 
572 	err = 0;
573 	for (; vma && vma->vm_start < end; vma = next) {
574 		next = vma->vm_next;
575 		if (vma->vm_start < start)
576 			err = split_vma(vma->vm_mm, vma, start, 1);
577 		if (!err && vma->vm_end > end)
578 			err = split_vma(vma->vm_mm, vma, end, 0);
579 		if (!err)
580 			err = policy_vma(vma, new);
581 		if (err)
582 			break;
583 	}
584 	return err;
585 }
586 
587 /*
588  * Update task->flags PF_MEMPOLICY bit: set iff non-default
589  * mempolicy.  Allows more rapid checking of this (combined perhaps
590  * with other PF_* flag bits) on memory allocation hot code paths.
591  *
592  * If called from outside this file, the task 'p' should -only- be
593  * a newly forked child not yet visible on the task list, because
594  * manipulating the task flags of a visible task is not safe.
595  *
596  * The above limitation is why this routine has the funny name
597  * mpol_fix_fork_child_flag().
598  *
599  * It is also safe to call this with a task pointer of current,
600  * which the static wrapper mpol_set_task_struct_flag() does,
601  * for use within this file.
602  */
603 
604 void mpol_fix_fork_child_flag(struct task_struct *p)
605 {
606 	if (p->mempolicy)
607 		p->flags |= PF_MEMPOLICY;
608 	else
609 		p->flags &= ~PF_MEMPOLICY;
610 }
611 
612 static void mpol_set_task_struct_flag(void)
613 {
614 	mpol_fix_fork_child_flag(current);
615 }
616 
617 /* Set the process memory policy */
618 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
619 			     nodemask_t *nodes)
620 {
621 	struct mempolicy *new, *old;
622 	struct mm_struct *mm = current->mm;
623 	int ret;
624 
625 	new = mpol_new(mode, flags, nodes);
626 	if (IS_ERR(new))
627 		return PTR_ERR(new);
628 
629 	/*
630 	 * prevent changing our mempolicy while show_numa_maps()
631 	 * is using it.
632 	 * Note:  do_set_mempolicy() can be called at init time
633 	 * with no 'mm'.
634 	 */
635 	if (mm)
636 		down_write(&mm->mmap_sem);
637 	task_lock(current);
638 	ret = mpol_set_nodemask(new, nodes);
639 	if (ret) {
640 		task_unlock(current);
641 		if (mm)
642 			up_write(&mm->mmap_sem);
643 		mpol_put(new);
644 		return ret;
645 	}
646 	old = current->mempolicy;
647 	current->mempolicy = new;
648 	mpol_set_task_struct_flag();
649 	if (new && new->mode == MPOL_INTERLEAVE &&
650 	    nodes_weight(new->v.nodes))
651 		current->il_next = first_node(new->v.nodes);
652 	task_unlock(current);
653 	if (mm)
654 		up_write(&mm->mmap_sem);
655 
656 	mpol_put(old);
657 	return 0;
658 }
659 
660 /*
661  * Return nodemask for policy for get_mempolicy() query
662  *
663  * Called with task's alloc_lock held
664  */
665 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
666 {
667 	nodes_clear(*nodes);
668 	if (p == &default_policy)
669 		return;
670 
671 	switch (p->mode) {
672 	case MPOL_BIND:
673 		/* Fall through */
674 	case MPOL_INTERLEAVE:
675 		*nodes = p->v.nodes;
676 		break;
677 	case MPOL_PREFERRED:
678 		if (!(p->flags & MPOL_F_LOCAL))
679 			node_set(p->v.preferred_node, *nodes);
680 		/* else return empty node mask for local allocation */
681 		break;
682 	default:
683 		BUG();
684 	}
685 }
686 
687 static int lookup_node(struct mm_struct *mm, unsigned long addr)
688 {
689 	struct page *p;
690 	int err;
691 
692 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
693 	if (err >= 0) {
694 		err = page_to_nid(p);
695 		put_page(p);
696 	}
697 	return err;
698 }
699 
700 /* Retrieve NUMA policy */
701 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
702 			     unsigned long addr, unsigned long flags)
703 {
704 	int err;
705 	struct mm_struct *mm = current->mm;
706 	struct vm_area_struct *vma = NULL;
707 	struct mempolicy *pol = current->mempolicy;
708 
709 	if (flags &
710 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
711 		return -EINVAL;
712 
713 	if (flags & MPOL_F_MEMS_ALLOWED) {
714 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
715 			return -EINVAL;
716 		*policy = 0;	/* just so it's initialized */
717 		task_lock(current);
718 		*nmask  = cpuset_current_mems_allowed;
719 		task_unlock(current);
720 		return 0;
721 	}
722 
723 	if (flags & MPOL_F_ADDR) {
724 		/*
725 		 * Do NOT fall back to task policy if the
726 		 * vma/shared policy at addr is NULL.  We
727 		 * want to return MPOL_DEFAULT in this case.
728 		 */
729 		down_read(&mm->mmap_sem);
730 		vma = find_vma_intersection(mm, addr, addr+1);
731 		if (!vma) {
732 			up_read(&mm->mmap_sem);
733 			return -EFAULT;
734 		}
735 		if (vma->vm_ops && vma->vm_ops->get_policy)
736 			pol = vma->vm_ops->get_policy(vma, addr);
737 		else
738 			pol = vma->vm_policy;
739 	} else if (addr)
740 		return -EINVAL;
741 
742 	if (!pol)
743 		pol = &default_policy;	/* indicates default behavior */
744 
745 	if (flags & MPOL_F_NODE) {
746 		if (flags & MPOL_F_ADDR) {
747 			err = lookup_node(mm, addr);
748 			if (err < 0)
749 				goto out;
750 			*policy = err;
751 		} else if (pol == current->mempolicy &&
752 				pol->mode == MPOL_INTERLEAVE) {
753 			*policy = current->il_next;
754 		} else {
755 			err = -EINVAL;
756 			goto out;
757 		}
758 	} else {
759 		*policy = pol == &default_policy ? MPOL_DEFAULT :
760 						pol->mode;
761 		/*
762 		 * Internal mempolicy flags must be masked off before exposing
763 		 * the policy to userspace.
764 		 */
765 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
766 	}
767 
768 	if (vma) {
769 		up_read(&current->mm->mmap_sem);
770 		vma = NULL;
771 	}
772 
773 	err = 0;
774 	if (nmask) {
775 		task_lock(current);
776 		get_policy_nodemask(pol, nmask);
777 		task_unlock(current);
778 	}
779 
780  out:
781 	mpol_cond_put(pol);
782 	if (vma)
783 		up_read(&current->mm->mmap_sem);
784 	return err;
785 }
786 
787 #ifdef CONFIG_MIGRATION
788 /*
789  * page migration
790  */
791 static void migrate_page_add(struct page *page, struct list_head *pagelist,
792 				unsigned long flags)
793 {
794 	/*
795 	 * Avoid migrating a page that is shared with others.
796 	 */
797 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
798 		if (!isolate_lru_page(page)) {
799 			list_add_tail(&page->lru, pagelist);
800 		}
801 	}
802 }
803 
804 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
805 {
806 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
807 }
808 
809 /*
810  * Migrate pages from one node to a target node.
811  * Returns error or the number of pages not migrated.
812  */
813 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
814 			   int flags)
815 {
816 	nodemask_t nmask;
817 	LIST_HEAD(pagelist);
818 	int err = 0;
819 
820 	nodes_clear(nmask);
821 	node_set(source, nmask);
822 
823 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
824 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
825 
826 	if (!list_empty(&pagelist))
827 		err = migrate_pages(&pagelist, new_node_page, dest);
828 
829 	return err;
830 }
831 
832 /*
833  * Move pages between the two nodesets so as to preserve the physical
834  * layout as much as possible.
835  *
836  * Returns the number of pages that could not be moved.
837  */
838 int do_migrate_pages(struct mm_struct *mm,
839 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
840 {
841 	int busy = 0;
842 	int err;
843 	nodemask_t tmp;
844 
845 	err = migrate_prep();
846 	if (err)
847 		return err;
848 
849 	down_read(&mm->mmap_sem);
850 
851 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
852 	if (err)
853 		goto out;
854 
855 /*
856  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
857  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
858  * bit in 'tmp', and return that <source, dest> pair for migration.
859  * The pair of nodemasks 'to' and 'from' define the map.
860  *
861  * If no pair of bits is found that way, fallback to picking some
862  * pair of 'source' and 'dest' bits that are not the same.  If the
863  * 'source' and 'dest' bits are the same, this represents a node
864  * that will be migrating to itself, so no pages need move.
865  *
866  * If no bits are left in 'tmp', or if all remaining bits left
867  * in 'tmp' correspond to the same bit in 'to', return false
868  * (nothing left to migrate).
869  *
870  * This lets us pick a pair of nodes to migrate between, such that
871  * if possible the dest node is not already occupied by some other
872  * source node, minimizing the risk of overloading the memory on a
873  * node, which would happen if we migrated incoming memory to a node
874  * before migrating the outgoing memory sourced from that same node.
875  *
876  * A single scan of tmp is sufficient.  As we go, we remember the
877  * most recent <s, d> pair that moved (s != d).  If we find a pair
878  * that not only moved, but what's better, moved to an empty slot
879  * (d is not set in tmp), then we break out then, with that pair.
880  * Otherwise when we finish scanning tmp, we at least have the
881  * most recent <s, d> pair that moved.  If we get all the way through
882  * the scan of tmp without finding any node that moved, much less
883  * moved to an empty node, then there is nothing left worth migrating.
884  */
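/*
 * For example, with from_nodes = {0,1} and to_nodes = {1,2}, the first
 * scan settles on the pair 1 -> 2 (node 2 is not a remaining source), so
 * node 1 is drained before node 0's pages are moved onto it; the second
 * scan then moves 0 -> 1.
 */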
885 
886 	tmp = *from_nodes;
887 	while (!nodes_empty(tmp)) {
888 		int s, d;
889 		int source = -1;
890 		int dest = 0;
891 
892 		for_each_node_mask(s, tmp) {
893 			d = node_remap(s, *from_nodes, *to_nodes);
894 			if (s == d)
895 				continue;
896 
897 			source = s;	/* Node moved. Memorize */
898 			dest = d;
899 
900 			/* dest not in remaining from nodes? */
901 			if (!node_isset(dest, tmp))
902 				break;
903 		}
904 		if (source == -1)
905 			break;
906 
907 		node_clear(source, tmp);
908 		err = migrate_to_node(mm, source, dest, flags);
909 		if (err > 0)
910 			busy += err;
911 		if (err < 0)
912 			break;
913 	}
914 out:
915 	up_read(&mm->mmap_sem);
916 	if (err < 0)
917 		return err;
918 	return busy;
919 
920 }
921 
922 /*
923  * Allocate a new page for page migration based on vma policy.
924  * Start by assuming the page is mapped by the vma pointed to by @private;
925  * if not, search forward from there.  N.B., this assumes that the
926  * list of pages handed to migrate_pages()--which is how we get here--
927  * is in virtual address order.
928  */
929 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
930 {
931 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
932 	unsigned long uninitialized_var(address);
933 
934 	while (vma) {
935 		address = page_address_in_vma(page, vma);
936 		if (address != -EFAULT)
937 			break;
938 		vma = vma->vm_next;
939 	}
940 
941 	/*
942 	 * if !vma, alloc_page_vma() will use task or system default policy
943 	 */
944 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
945 }
946 #else
947 
948 static void migrate_page_add(struct page *page, struct list_head *pagelist,
949 				unsigned long flags)
950 {
951 }
952 
953 int do_migrate_pages(struct mm_struct *mm,
954 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
955 {
956 	return -ENOSYS;
957 }
958 
959 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
960 {
961 	return NULL;
962 }
963 #endif
964 
965 static long do_mbind(unsigned long start, unsigned long len,
966 		     unsigned short mode, unsigned short mode_flags,
967 		     nodemask_t *nmask, unsigned long flags)
968 {
969 	struct vm_area_struct *vma;
970 	struct mm_struct *mm = current->mm;
971 	struct mempolicy *new;
972 	unsigned long end;
973 	int err;
974 	LIST_HEAD(pagelist);
975 
976 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
977 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
978 		return -EINVAL;
979 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
980 		return -EPERM;
981 
982 	if (start & ~PAGE_MASK)
983 		return -EINVAL;
984 
985 	if (mode == MPOL_DEFAULT)
986 		flags &= ~MPOL_MF_STRICT;
987 
988 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
989 	end = start + len;
990 
991 	if (end < start)
992 		return -EINVAL;
993 	if (end == start)
994 		return 0;
995 
996 	new = mpol_new(mode, mode_flags, nmask);
997 	if (IS_ERR(new))
998 		return PTR_ERR(new);
999 
1000 	/*
1001 	 * If we are using the default policy then operation
1002 	 * on discontinuous address spaces is okay after all
1003 	 */
1004 	if (!new)
1005 		flags |= MPOL_MF_DISCONTIG_OK;
1006 
1007 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1008 		 start, start + len, mode, mode_flags,
1009 		 nmask ? nodes_addr(*nmask)[0] : -1);
1010 
1011 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1012 
1013 		err = migrate_prep();
1014 		if (err)
1015 			return err;
1016 	}
1017 	down_write(&mm->mmap_sem);
1018 	task_lock(current);
1019 	err = mpol_set_nodemask(new, nmask);
1020 	task_unlock(current);
1021 	if (err) {
1022 		up_write(&mm->mmap_sem);
1023 		mpol_put(new);
1024 		return err;
1025 	}
1026 	vma = check_range(mm, start, end, nmask,
1027 			  flags | MPOL_MF_INVERT, &pagelist);
1028 
1029 	err = PTR_ERR(vma);
1030 	if (!IS_ERR(vma)) {
1031 		int nr_failed = 0;
1032 
1033 		err = mbind_range(vma, start, end, new);
1034 
1035 		if (!list_empty(&pagelist))
1036 			nr_failed = migrate_pages(&pagelist, new_vma_page,
1037 						(unsigned long)vma);
1038 
1039 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1040 			err = -EIO;
1041 	}
1042 
1043 	up_write(&mm->mmap_sem);
1044 	mpol_put(new);
1045 	return err;
1046 }
1047 
1048 /*
1049  * User space interface with variable sized bitmaps for nodelists.
1050  */
1051 
1052 /* Copy a node mask from user space. */
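/*
 * If the user's mask is wider than the kernel can store, the part that
 * does not fit (up to the declared maxnode) must be all zero; otherwise
 * the call fails with -EINVAL rather than silently dropping nodes.
 */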
1053 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1054 		     unsigned long maxnode)
1055 {
1056 	unsigned long k;
1057 	unsigned long nlongs;
1058 	unsigned long endmask;
1059 
1060 	--maxnode;
1061 	nodes_clear(*nodes);
1062 	if (maxnode == 0 || !nmask)
1063 		return 0;
1064 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1065 		return -EINVAL;
1066 
1067 	nlongs = BITS_TO_LONGS(maxnode);
1068 	if ((maxnode % BITS_PER_LONG) == 0)
1069 		endmask = ~0UL;
1070 	else
1071 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1072 
1073 	/* When the user specified more nodes than supported just check
1074 	   that the unsupported part is all zero. */
1075 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1076 		if (nlongs > PAGE_SIZE/sizeof(long))
1077 			return -EINVAL;
1078 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1079 			unsigned long t;
1080 			if (get_user(t, nmask + k))
1081 				return -EFAULT;
1082 			if (k == nlongs - 1) {
1083 				if (t & endmask)
1084 					return -EINVAL;
1085 			} else if (t)
1086 				return -EINVAL;
1087 		}
1088 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1089 		endmask = ~0UL;
1090 	}
1091 
1092 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1093 		return -EFAULT;
1094 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1095 	return 0;
1096 }
1097 
1098 /* Copy a kernel node mask to user space */
1099 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1100 			      nodemask_t *nodes)
1101 {
1102 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1103 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1104 
1105 	if (copy > nbytes) {
1106 		if (copy > PAGE_SIZE)
1107 			return -EINVAL;
1108 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1109 			return -EFAULT;
1110 		copy = nbytes;
1111 	}
1112 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1113 }
1114 
1115 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1116 		unsigned long, mode, unsigned long __user *, nmask,
1117 		unsigned long, maxnode, unsigned, flags)
1118 {
1119 	nodemask_t nodes;
1120 	int err;
1121 	unsigned short mode_flags;
1122 
1123 	mode_flags = mode & MPOL_MODE_FLAGS;
1124 	mode &= ~MPOL_MODE_FLAGS;
1125 	if (mode >= MPOL_MAX)
1126 		return -EINVAL;
1127 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1128 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1129 		return -EINVAL;
1130 	err = get_nodes(&nodes, nmask, maxnode);
1131 	if (err)
1132 		return err;
1133 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1134 }
1135 
1136 /* Set the process memory policy */
1137 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1138 		unsigned long, maxnode)
1139 {
1140 	int err;
1141 	nodemask_t nodes;
1142 	unsigned short flags;
1143 
1144 	flags = mode & MPOL_MODE_FLAGS;
1145 	mode &= ~MPOL_MODE_FLAGS;
1146 	if ((unsigned int)mode >= MPOL_MAX)
1147 		return -EINVAL;
1148 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1149 		return -EINVAL;
1150 	err = get_nodes(&nodes, nmask, maxnode);
1151 	if (err)
1152 		return err;
1153 	return do_set_mempolicy(mode, flags, &nodes);
1154 }
1155 
1156 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1157 		const unsigned long __user *, old_nodes,
1158 		const unsigned long __user *, new_nodes)
1159 {
1160 	const struct cred *cred = current_cred(), *tcred;
1161 	struct mm_struct *mm;
1162 	struct task_struct *task;
1163 	nodemask_t old;
1164 	nodemask_t new;
1165 	nodemask_t task_nodes;
1166 	int err;
1167 
1168 	err = get_nodes(&old, old_nodes, maxnode);
1169 	if (err)
1170 		return err;
1171 
1172 	err = get_nodes(&new, new_nodes, maxnode);
1173 	if (err)
1174 		return err;
1175 
1176 	/* Find the mm_struct */
1177 	read_lock(&tasklist_lock);
1178 	task = pid ? find_task_by_vpid(pid) : current;
1179 	if (!task) {
1180 		read_unlock(&tasklist_lock);
1181 		return -ESRCH;
1182 	}
1183 	mm = get_task_mm(task);
1184 	read_unlock(&tasklist_lock);
1185 
1186 	if (!mm)
1187 		return -EINVAL;
1188 
1189 	/*
1190 	 * Check if this process has the right to modify the specified
1191 	 * process. The right exists if the process has administrative
1192 	 * capabilities, superuser privileges or the same
1193 	 * userid as the target process.
1194 	 */
1195 	rcu_read_lock();
1196 	tcred = __task_cred(task);
1197 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1198 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1199 	    !capable(CAP_SYS_NICE)) {
1200 		rcu_read_unlock();
1201 		err = -EPERM;
1202 		goto out;
1203 	}
1204 	rcu_read_unlock();
1205 
1206 	task_nodes = cpuset_mems_allowed(task);
1207 	/* Is the user allowed to access the target nodes? */
1208 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1209 		err = -EPERM;
1210 		goto out;
1211 	}
1212 
1213 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1214 		err = -EINVAL;
1215 		goto out;
1216 	}
1217 
1218 	err = security_task_movememory(task);
1219 	if (err)
1220 		goto out;
1221 
1222 	err = do_migrate_pages(mm, &old, &new,
1223 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1224 out:
1225 	mmput(mm);
1226 	return err;
1227 }
1228 
1229 
1230 /* Retrieve NUMA policy */
1231 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1232 		unsigned long __user *, nmask, unsigned long, maxnode,
1233 		unsigned long, addr, unsigned long, flags)
1234 {
1235 	int err;
1236 	int uninitialized_var(pval);
1237 	nodemask_t nodes;
1238 
1239 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1240 		return -EINVAL;
1241 
1242 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1243 
1244 	if (err)
1245 		return err;
1246 
1247 	if (policy && put_user(pval, policy))
1248 		return -EFAULT;
1249 
1250 	if (nmask)
1251 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1252 
1253 	return err;
1254 }
1255 
1256 #ifdef CONFIG_COMPAT
1257 
1258 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1259 				     compat_ulong_t __user *nmask,
1260 				     compat_ulong_t maxnode,
1261 				     compat_ulong_t addr, compat_ulong_t flags)
1262 {
1263 	long err;
1264 	unsigned long __user *nm = NULL;
1265 	unsigned long nr_bits, alloc_size;
1266 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1267 
1268 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1269 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1270 
1271 	if (nmask)
1272 		nm = compat_alloc_user_space(alloc_size);
1273 
1274 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1275 
1276 	if (!err && nmask) {
1277 		err = copy_from_user(bm, nm, alloc_size);
1278 		/* ensure entire bitmap is zeroed */
1279 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1280 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1281 	}
1282 
1283 	return err;
1284 }
1285 
1286 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1287 				     compat_ulong_t maxnode)
1288 {
1289 	long err = 0;
1290 	unsigned long __user *nm = NULL;
1291 	unsigned long nr_bits, alloc_size;
1292 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1293 
1294 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1295 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1296 
1297 	if (nmask) {
1298 		err = compat_get_bitmap(bm, nmask, nr_bits);
1299 		nm = compat_alloc_user_space(alloc_size);
1300 		err |= copy_to_user(nm, bm, alloc_size);
1301 	}
1302 
1303 	if (err)
1304 		return -EFAULT;
1305 
1306 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1307 }
1308 
1309 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1310 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1311 			     compat_ulong_t maxnode, compat_ulong_t flags)
1312 {
1313 	long err = 0;
1314 	unsigned long __user *nm = NULL;
1315 	unsigned long nr_bits, alloc_size;
1316 	nodemask_t bm;
1317 
1318 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1319 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1320 
1321 	if (nmask) {
1322 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1323 		nm = compat_alloc_user_space(alloc_size);
1324 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1325 	}
1326 
1327 	if (err)
1328 		return -EFAULT;
1329 
1330 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1331 }
1332 
1333 #endif
1334 
1335 /*
1336  * get_vma_policy(@task, @vma, @addr)
1337  * @task - task for fallback if vma policy == default
1338  * @vma   - virtual memory area whose policy is sought
1339  * @addr  - address in @vma for shared policy lookup
1340  *
1341  * Returns effective policy for a VMA at specified address.
1342  * Falls back to @task or system default policy, as necessary.
1343  * Current or other task's task mempolicy and non-shared vma policies
1344  * are protected by the task's mmap_sem, which must be held for read by
1345  * the caller.
1346  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1347  * count--added by the get_policy() vm_op, as appropriate--to protect against
1348  * freeing by another task.  It is the caller's responsibility to free the
1349  * extra reference for shared policies.
1350  */
1351 static struct mempolicy *get_vma_policy(struct task_struct *task,
1352 		struct vm_area_struct *vma, unsigned long addr)
1353 {
1354 	struct mempolicy *pol = task->mempolicy;
1355 
1356 	if (vma) {
1357 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1358 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1359 									addr);
1360 			if (vpol)
1361 				pol = vpol;
1362 		} else if (vma->vm_policy)
1363 			pol = vma->vm_policy;
1364 	}
1365 	if (!pol)
1366 		pol = &default_policy;
1367 	return pol;
1368 }
1369 
1370 /*
1371  * Return a nodemask representing a mempolicy for filtering nodes for
1372  * page allocation
1373  */
1374 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1375 {
1376 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1377 	if (unlikely(policy->mode == MPOL_BIND) &&
1378 			gfp_zone(gfp) >= policy_zone &&
1379 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1380 		return &policy->v.nodes;
1381 
1382 	return NULL;
1383 }
1384 
1385 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1386 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1387 {
1388 	int nd = numa_node_id();
1389 
1390 	switch (policy->mode) {
1391 	case MPOL_PREFERRED:
1392 		if (!(policy->flags & MPOL_F_LOCAL))
1393 			nd = policy->v.preferred_node;
1394 		break;
1395 	case MPOL_BIND:
1396 		/*
1397 		 * Normally, MPOL_BIND allocations are node-local within the
1398 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1399 		 * current node is part of the mask, we use the zonelist for
1400 		 * the first node in the mask instead.
1401 		 */
1402 		if (unlikely(gfp & __GFP_THISNODE) &&
1403 				unlikely(!node_isset(nd, policy->v.nodes)))
1404 			nd = first_node(policy->v.nodes);
1405 		break;
1406 	case MPOL_INTERLEAVE: /* should not happen */
1407 		break;
1408 	default:
1409 		BUG();
1410 	}
1411 	return node_zonelist(nd, gfp);
1412 }
1413 
1414 /* Do dynamic interleaving for a process */
1415 static unsigned interleave_nodes(struct mempolicy *policy)
1416 {
1417 	unsigned nid, next;
1418 	struct task_struct *me = current;
1419 
1420 	nid = me->il_next;
1421 	next = next_node(nid, policy->v.nodes);
1422 	if (next >= MAX_NUMNODES)
1423 		next = first_node(policy->v.nodes);
1424 	if (next < MAX_NUMNODES)
1425 		me->il_next = next;
1426 	return nid;
1427 }
1428 
1429 /*
1430  * Depending on the memory policy provide a node from which to allocate the
1431  * next slab entry.
1432  * @policy must be protected from freeing by the caller.  If @policy is
1433  * the current task's mempolicy, this protection is implicit, as only the
1434  * task can change its policy.  The system default policy requires no
1435  * such protection.
1436  */
1437 unsigned slab_node(struct mempolicy *policy)
1438 {
1439 	if (!policy || policy->flags & MPOL_F_LOCAL)
1440 		return numa_node_id();
1441 
1442 	switch (policy->mode) {
1443 	case MPOL_PREFERRED:
1444 		/*
1445 		 * handled MPOL_F_LOCAL above
1446 		 */
1447 		return policy->v.preferred_node;
1448 
1449 	case MPOL_INTERLEAVE:
1450 		return interleave_nodes(policy);
1451 
1452 	case MPOL_BIND: {
1453 		/*
1454 		 * Follow bind policy behavior and start allocation at the
1455 		 * first node.
1456 		 */
1457 		struct zonelist *zonelist;
1458 		struct zone *zone;
1459 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1460 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1461 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1462 							&policy->v.nodes,
1463 							&zone);
1464 		return zone->node;
1465 	}
1466 
1467 	default:
1468 		BUG();
1469 	}
1470 }
1471 
1472 /* Do static interleaving for a VMA with known offset. */
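/*
 * The offset is reduced modulo the number of interleave nodes and then
 * walked out to the corresponding set bit.  For example, with an
 * interleave mask of {0,2,5} and off = 7, target = 7 % 3 = 1, so the
 * second set bit (node 2) is chosen.
 */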
1473 static unsigned offset_il_node(struct mempolicy *pol,
1474 		struct vm_area_struct *vma, unsigned long off)
1475 {
1476 	unsigned nnodes = nodes_weight(pol->v.nodes);
1477 	unsigned target;
1478 	int c;
1479 	int nid = -1;
1480 
1481 	if (!nnodes)
1482 		return numa_node_id();
1483 	target = (unsigned int)off % nnodes;
1484 	c = 0;
1485 	do {
1486 		nid = next_node(nid, pol->v.nodes);
1487 		c++;
1488 	} while (c <= target);
1489 	return nid;
1490 }
1491 
1492 /* Determine a node number for interleave */
1493 static inline unsigned interleave_nid(struct mempolicy *pol,
1494 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1495 {
1496 	if (vma) {
1497 		unsigned long off;
1498 
1499 		/*
1500 		 * for small pages, there is no difference between
1501 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1502 		 * for huge pages, since vm_pgoff is in units of small
1503 		 * pages, we need to shift off the always 0 bits to get
1504 		 * a useful offset.
1505 		 */
1506 		BUG_ON(shift < PAGE_SHIFT);
1507 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1508 		off += (addr - vma->vm_start) >> shift;
1509 		return offset_il_node(pol, vma, off);
1510 	} else
1511 		return interleave_nodes(pol);
1512 }
1513 
1514 #ifdef CONFIG_HUGETLBFS
1515 /*
1516  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1517  * @vma = virtual memory area whose policy is sought
1518  * @addr = address in @vma for shared policy lookup and interleave policy
1519  * @gfp_flags = for requested zone
1520  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1521  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1522  *
1523  * Returns a zonelist suitable for a huge page allocation and a pointer
1524  * to the struct mempolicy for conditional unref after allocation.
1525  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1526  * @nodemask for filtering the zonelist.
1527  */
1528 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1529 				gfp_t gfp_flags, struct mempolicy **mpol,
1530 				nodemask_t **nodemask)
1531 {
1532 	struct zonelist *zl;
1533 
1534 	*mpol = get_vma_policy(current, vma, addr);
1535 	*nodemask = NULL;	/* assume !MPOL_BIND */
1536 
1537 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1538 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1539 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1540 	} else {
1541 		zl = policy_zonelist(gfp_flags, *mpol);
1542 		if ((*mpol)->mode == MPOL_BIND)
1543 			*nodemask = &(*mpol)->v.nodes;
1544 	}
1545 	return zl;
1546 }
1547 #endif
1548 
1549 /* Allocate a page under an interleave policy.
1550    Separate path because it needs to do special NUMA_INTERLEAVE_HIT accounting. */
1551 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1552 					unsigned nid)
1553 {
1554 	struct zonelist *zl;
1555 	struct page *page;
1556 
1557 	zl = node_zonelist(nid, gfp);
1558 	page = __alloc_pages(gfp, order, zl);
1559 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1560 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1561 	return page;
1562 }
1563 
1564 /**
1565  * 	alloc_page_vma	- Allocate a page for a VMA.
1566  *
1567  * 	@gfp:
1568  *      %GFP_USER    user allocation.
1569  *      %GFP_KERNEL  kernel allocations,
1570  *      %GFP_HIGHMEM highmem/user allocations,
1571  *      %GFP_FS      allocation should not call back into a file system.
1572  *      %GFP_ATOMIC  don't sleep.
1573  *
1574  * 	@vma:  Pointer to VMA or NULL if not available.
1575  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1576  *
1577  * 	This function allocates a page from the kernel page pool and applies
1578  *	a NUMA policy associated with the VMA or the current process.
1579  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1580  *	mm_struct of the VMA to prevent it from going away. Should be used for
1581  *	all allocations for pages that will be mapped into
1582  * 	user space. Returns NULL when no page can be allocated.
1583  *
1584  *	Should be called with the mmap_sem of the vma's mm held.
1585  */
1586 struct page *
1587 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1588 {
1589 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1590 	struct zonelist *zl;
1591 
1592 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1593 		unsigned nid;
1594 
1595 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1596 		mpol_cond_put(pol);
1597 		return alloc_page_interleave(gfp, 0, nid);
1598 	}
1599 	zl = policy_zonelist(gfp, pol);
1600 	if (unlikely(mpol_needs_cond_ref(pol))) {
1601 		/*
1602 		 * slow path: ref counted shared policy
1603 		 */
1604 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1605 						zl, policy_nodemask(gfp, pol));
1606 		__mpol_put(pol);
1607 		return page;
1608 	}
1609 	/*
1610 	 * fast path:  default or task policy
1611 	 */
1612 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1613 }
1614 
1615 /**
1616  * 	alloc_pages_current - Allocate pages.
1617  *
1618  *	@gfp:
1619  *		%GFP_USER   user allocation,
1620  *      	%GFP_KERNEL kernel allocation,
1621  *      	%GFP_HIGHMEM highmem allocation,
1622  *      	%GFP_FS     don't call back into a file system.
1623  *      	%GFP_ATOMIC don't sleep.
1624  *	@order: Power of two of allocation size in pages. 0 is a single page.
1625  *
1626  *	Allocate a page from the kernel page pool.  When not in
1627  *	interrupt context, apply the current process' NUMA policy.
1628  *	Returns NULL when no page can be allocated.
1629  *
1630  *	Don't call cpuset_update_task_memory_state() unless
1631  *	1) it's ok to take cpuset_sem (can WAIT), and
1632  *	2) allocating for current task (not interrupt).
1633  */
1634 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1635 {
1636 	struct mempolicy *pol = current->mempolicy;
1637 
1638 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1639 		pol = &default_policy;
1640 
1641 	/*
1642 	 * No reference counting needed for current->mempolicy
1643 	 * nor system default_policy
1644 	 */
1645 	if (pol->mode == MPOL_INTERLEAVE)
1646 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1647 	return __alloc_pages_nodemask(gfp, order,
1648 			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1649 }
1650 EXPORT_SYMBOL(alloc_pages_current);
1651 
1652 /*
1653  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1654  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1655  * with the mems_allowed returned by cpuset_mems_allowed().  This
1656  * keeps mempolicies cpuset relative after its cpuset moves.  See
1657  * further kernel/cpuset.c update_nodemask().
1658  */
1659 
1660 /* Slow path of a mempolicy duplicate */
1661 struct mempolicy *__mpol_dup(struct mempolicy *old)
1662 {
1663 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1664 
1665 	if (!new)
1666 		return ERR_PTR(-ENOMEM);
1667 	if (current_cpuset_is_being_rebound()) {
1668 		nodemask_t mems = cpuset_mems_allowed(current);
1669 		mpol_rebind_policy(old, &mems);
1670 	}
1671 	*new = *old;
1672 	atomic_set(&new->refcnt, 1);
1673 	return new;
1674 }
1675 
1676 /*
1677  * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1678  * eliminate the MPOL_F_* flags that require a conditional ref and
1679  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1680  * after return.  Use the returned value.
1681  *
1682  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1683  * policy lookup, even if the policy needs/has extra ref on lookup.
1684  * shmem_readahead needs this.
1685  */
1686 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1687 						struct mempolicy *frompol)
1688 {
1689 	if (!mpol_needs_cond_ref(frompol))
1690 		return frompol;
1691 
1692 	*tompol = *frompol;
1693 	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1694 	__mpol_put(frompol);
1695 	return tompol;
1696 }
1697 
1698 static int mpol_match_intent(const struct mempolicy *a,
1699 			     const struct mempolicy *b)
1700 {
1701 	if (a->flags != b->flags)
1702 		return 0;
1703 	if (!mpol_store_user_nodemask(a))
1704 		return 1;
1705 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1706 }
1707 
1708 /* Slow path of a mempolicy comparison */
1709 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1710 {
1711 	if (!a || !b)
1712 		return 0;
1713 	if (a->mode != b->mode)
1714 		return 0;
1715 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1716 		return 0;
1717 	switch (a->mode) {
1718 	case MPOL_BIND:
1719 		/* Fall through */
1720 	case MPOL_INTERLEAVE:
1721 		return nodes_equal(a->v.nodes, b->v.nodes);
1722 	case MPOL_PREFERRED:
1723 		return a->v.preferred_node == b->v.preferred_node &&
1724 			a->flags == b->flags;
1725 	default:
1726 		BUG();
1727 		return 0;
1728 	}
1729 }
1730 
1731 /*
1732  * Shared memory backing store policy support.
1733  *
1734  * Remember policies even when nobody has shared memory mapped.
1735  * The policies are kept in a red-black tree linked from the inode.
1736  * They are protected by the sp->lock spinlock, which should be held
1737  * for any accesses to the tree.
1738  */
1739 
1740 /* lookup first element intersecting start-end */
1741 /* Caller holds sp->lock */
1742 static struct sp_node *
1743 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1744 {
1745 	struct rb_node *n = sp->root.rb_node;
1746 
1747 	while (n) {
1748 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1749 
1750 		if (start >= p->end)
1751 			n = n->rb_right;
1752 		else if (end <= p->start)
1753 			n = n->rb_left;
1754 		else
1755 			break;
1756 	}
1757 	if (!n)
1758 		return NULL;
1759 	for (;;) {
1760 		struct sp_node *w = NULL;
1761 		struct rb_node *prev = rb_prev(n);
1762 		if (!prev)
1763 			break;
1764 		w = rb_entry(prev, struct sp_node, nd);
1765 		if (w->end <= start)
1766 			break;
1767 		n = prev;
1768 	}
1769 	return rb_entry(n, struct sp_node, nd);
1770 }
1771 
1772 /* Insert a new shared policy into the list. */
1773 /* Caller holds sp->lock */
1774 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1775 {
1776 	struct rb_node **p = &sp->root.rb_node;
1777 	struct rb_node *parent = NULL;
1778 	struct sp_node *nd;
1779 
1780 	while (*p) {
1781 		parent = *p;
1782 		nd = rb_entry(parent, struct sp_node, nd);
1783 		if (new->start < nd->start)
1784 			p = &(*p)->rb_left;
1785 		else if (new->end > nd->end)
1786 			p = &(*p)->rb_right;
1787 		else
1788 			BUG();
1789 	}
1790 	rb_link_node(&new->nd, parent, p);
1791 	rb_insert_color(&new->nd, &sp->root);
1792 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1793 		 new->policy ? new->policy->mode : 0);
1794 }
1795 
1796 /* Find shared policy intersecting idx */
1797 struct mempolicy *
1798 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1799 {
1800 	struct mempolicy *pol = NULL;
1801 	struct sp_node *sn;
1802 
1803 	if (!sp->root.rb_node)
1804 		return NULL;
1805 	spin_lock(&sp->lock);
1806 	sn = sp_lookup(sp, idx, idx+1);
1807 	if (sn) {
1808 		mpol_get(sn->policy);
1809 		pol = sn->policy;
1810 	}
1811 	spin_unlock(&sp->lock);
1812 	return pol;
1813 }
1814 
1815 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1816 {
1817 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1818 	rb_erase(&n->nd, &sp->root);
1819 	mpol_put(n->policy);
1820 	kmem_cache_free(sn_cache, n);
1821 }
1822 
1823 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1824 				struct mempolicy *pol)
1825 {
1826 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1827 
1828 	if (!n)
1829 		return NULL;
1830 	n->start = start;
1831 	n->end = end;
1832 	mpol_get(pol);
1833 	pol->flags |= MPOL_F_SHARED;	/* for unref */
1834 	n->policy = pol;
1835 	return n;
1836 }
1837 
1838 /* Replace a policy range. */
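/*
 * Overlaps with [start, end) are resolved as follows: an old node lying
 * entirely inside the new range is deleted; one sticking out on a single
 * side is trimmed; one spanning the whole new range is split in two, with
 * the pre-allocated "new2" covering the part to the right of the range.
 */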
1839 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1840 				 unsigned long end, struct sp_node *new)
1841 {
1842 	struct sp_node *n, *new2 = NULL;
1843 
1844 restart:
1845 	spin_lock(&sp->lock);
1846 	n = sp_lookup(sp, start, end);
1847 	/* Take care of old policies in the same range. */
1848 	while (n && n->start < end) {
1849 		struct rb_node *next = rb_next(&n->nd);
1850 		if (n->start >= start) {
1851 			if (n->end <= end)
1852 				sp_delete(sp, n);
1853 			else
1854 				n->start = end;
1855 		} else {
1856 			/* Old policy spanning the whole new range. */
1857 			if (n->end > end) {
1858 				if (!new2) {
1859 					spin_unlock(&sp->lock);
1860 					new2 = sp_alloc(end, n->end, n->policy);
1861 					if (!new2)
1862 						return -ENOMEM;
1863 					goto restart;
1864 				}
1865 				n->end = start;
1866 				sp_insert(sp, new2);
1867 				new2 = NULL;
1868 				break;
1869 			} else
1870 				n->end = start;
1871 		}
1872 		if (!next)
1873 			break;
1874 		n = rb_entry(next, struct sp_node, nd);
1875 	}
1876 	if (new)
1877 		sp_insert(sp, new);
1878 	spin_unlock(&sp->lock);
1879 	if (new2) {
1880 		mpol_put(new2->policy);
1881 		kmem_cache_free(sn_cache, new2);
1882 	}
1883 	return 0;
1884 }
1885 
1886 /**
1887  * mpol_shared_policy_init - initialize shared policy for inode
1888  * @sp: pointer to inode shared policy
1889  * @mpol:  struct mempolicy to install
1890  *
1891  * Install non-NULL @mpol in inode's shared policy rb-tree.
1892  * On entry, the current task has a reference on a non-NULL @mpol.
1893  * This function drops that reference on both the success and error paths.
1894  */
1895 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1896 {
1897 	int ret;
1898 
1899 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1900 	spin_lock_init(&sp->lock);
1901 
1902 	if (mpol) {
1903 		struct vm_area_struct pvma;
1904 		struct mempolicy *new;
1905 
1906 		/* contextualize the tmpfs mount point mempolicy */
1907 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1908 		if (IS_ERR(new)) {
1909 			mpol_put(mpol);	/* drop our ref on sb mpol */
1910 			return;		/* no valid nodemask intersection */
1911 		}
1912 
1913 		task_lock(current);
1914 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
1915 		task_unlock(current);
1916 		mpol_put(mpol);	/* drop our ref on sb mpol */
1917 		if (ret) {
1918 			mpol_put(new);
1919 			return;
1920 		}
1921 
1922 		/* Create pseudo-vma that contains just the policy */
1923 		memset(&pvma, 0, sizeof(struct vm_area_struct));
1924 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1925 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1926 		mpol_put(new);			/* drop initial ref */
1927 	}
1928 }
1929 
1930 int mpol_set_shared_policy(struct shared_policy *info,
1931 			struct vm_area_struct *vma, struct mempolicy *npol)
1932 {
1933 	int err;
1934 	struct sp_node *new = NULL;
1935 	unsigned long sz = vma_pages(vma);
1936 
1937 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1938 		 vma->vm_pgoff,
1939 		 sz, npol ? npol->mode : -1,
1940 		 npol ? npol->flags : -1,
1941 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1942 
1943 	if (npol) {
1944 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1945 		if (!new)
1946 			return -ENOMEM;
1947 	}
1948 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1949 	if (err && new)
1950 		kmem_cache_free(sn_cache, new);
1951 	return err;
1952 }
1953 
1954 /* Free a backing policy store on inode delete. */
1955 void mpol_free_shared_policy(struct shared_policy *p)
1956 {
1957 	struct sp_node *n;
1958 	struct rb_node *next;
1959 
1960 	if (!p->root.rb_node)
1961 		return;
1962 	spin_lock(&p->lock);
1963 	next = rb_first(&p->root);
1964 	while (next) {
1965 		n = rb_entry(next, struct sp_node, nd);
1966 		next = rb_next(&n->nd);
1967 		rb_erase(&n->nd, &p->root);
1968 		mpol_put(n->policy);
1969 		kmem_cache_free(sn_cache, n);
1970 	}
1971 	spin_unlock(&p->lock);
1972 }
1973 
1974 /* assumes fs == KERNEL_DS */
1975 void __init numa_policy_init(void)
1976 {
1977 	nodemask_t interleave_nodes;
1978 	unsigned long largest = 0;
1979 	int nid, prefer = 0;
1980 
1981 	policy_cache = kmem_cache_create("numa_policy",
1982 					 sizeof(struct mempolicy),
1983 					 0, SLAB_PANIC, NULL);
1984 
1985 	sn_cache = kmem_cache_create("shared_policy_node",
1986 				     sizeof(struct sp_node),
1987 				     0, SLAB_PANIC, NULL);
1988 
1989 	/*
1990 	 * Set interleaving policy for system init. Interleaving is only
1991 	 * enabled across nodes with at least 16MB of present memory; if
1992 	 * every node is smaller than that, fall back to the largest node.
1993 	 */
1994 	nodes_clear(interleave_nodes);
1995 	for_each_node_state(nid, N_HIGH_MEMORY) {
1996 		unsigned long total_pages = node_present_pages(nid);
1997 
1998 		/* Preserve the largest node */
1999 		if (largest < total_pages) {
2000 			largest = total_pages;
2001 			prefer = nid;
2002 		}
2003 
2004 		/* Interleave this node? */
2005 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2006 			node_set(nid, interleave_nodes);
2007 	}
2008 
2009 	/* All too small, use the largest */
2010 	if (unlikely(nodes_empty(interleave_nodes)))
2011 		node_set(prefer, interleave_nodes);
2012 
2013 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2014 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2015 }
2016 
2017 /* Reset policy of current process to default */
2018 void numa_default_policy(void)
2019 {
2020 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2021 }
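
/*
 * The userspace analogue of the reset above, for reference: a sketch using
 * libnuma's <numaif.h> declaration of the set_mempolicy() syscall (link
 * with -lnuma).  MPOL_DEFAULT ignores the nodemask, so NULL/0 is valid:
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0)
 *			perror("set_mempolicy");
 *		return 0;
 *	}
 */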
2022 
2023 /*
2024  * Parse and format mempolicy from/to strings
2025  */
2026 
2027 /*
2028  * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag
2029  * Used only for mpol_parse_str() and mpol_to_str()
2030  */
2031 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2032 static const char * const policy_types[] =
2033 	{ "default", "prefer", "bind", "interleave", "local" };
2034 
2035 
2036 #ifdef CONFIG_TMPFS
2037 /**
2038  * mpol_parse_str - parse string to mempolicy
2039  * @str:  string containing mempolicy to parse
2040  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2041  * @no_context:  flag whether to "contextualize" the mempolicy
2042  *
2043  * Format of input:
2044  *	<mode>[=<flags>][:<nodelist>]
2045  *
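 *
 * Examples accepted by this parser (illustrative; the nodes named must
 * have memory on the running system):
 *	"default"			system default policy
 *	"prefer:1"			prefer node 1
 *	"bind:0-3"			allocate only on nodes 0-3
 *	"interleave=static:0,2"		interleave over nodes 0 and 2
 *	"local"				allocate on the faulting node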
2046  * if @no_context is true, save the input nodemask in w.user_nodemask in
2047  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2048  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2049  * mount option.  Note that if 'static' or 'relative' mode flags were
2050  * specified, the input nodemask will already have been saved.  Saving
2051  * it again is redundant, but safe.
2052  *
2053  * On success, returns 0, else 1
2054  */
2055 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2056 {
2057 	struct mempolicy *new = NULL;
2058 	unsigned short uninitialized_var(mode);
2059 	unsigned short uninitialized_var(mode_flags);
2060 	nodemask_t nodes;
2061 	char *nodelist = strchr(str, ':');
2062 	char *flags = strchr(str, '=');
2063 	int i;
2064 	int err = 1;
2065 
2066 	if (nodelist) {
2067 		/* NUL-terminate mode or flags string */
2068 		*nodelist++ = '\0';
2069 		if (nodelist_parse(nodelist, nodes))
2070 			goto out;
2071 		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2072 			goto out;
2073 	} else
2074 		nodes_clear(nodes);
2075 
2076 	if (flags)
2077 		*flags++ = '\0';	/* terminate mode string */
2078 
2079 	for (i = 0; i <= MPOL_LOCAL; i++) {
2080 		if (!strcmp(str, policy_types[i])) {
2081 			mode = i;
2082 			break;
2083 		}
2084 	}
2085 	if (i > MPOL_LOCAL)
2086 		goto out;
2087 
2088 	switch (mode) {
2089 	case MPOL_PREFERRED:
2090 		/*
2091 		 * Insist on a nodelist of one node only
2092 		 */
		if (!nodelist)
			goto out;
		else {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			err = 0;
		}
2100 		break;
2101 	case MPOL_INTERLEAVE:
2102 		/*
2103 		 * Default to online nodes with memory if no nodelist
2104 		 */
2105 		if (!nodelist)
2106 			nodes = node_states[N_HIGH_MEMORY];
2107 		err = 0;
2108 		break;
2109 	case MPOL_LOCAL:
2110 		/*
2111 		 * Don't allow a nodelist;  mpol_new() checks flags
2112 		 */
2113 		if (nodelist)
2114 			goto out;
2115 		mode = MPOL_PREFERRED;
2116 		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist; flags are ignored.  We skip
		 * mpol_new() and return success with *mpol == NULL, which
		 * callers treat as the default policy.
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist; mpol_new() enforces a non-empty
		 * nodemask.
		 */
		if (!nodelist)
			goto out;
	}
2123 
2124 	mode_flags = 0;
2125 	if (flags) {
2126 		/*
2127 		 * Currently, we only support two mutually exclusive
2128 		 * mode flags.
2129 		 */
2130 		if (!strcmp(flags, "static"))
2131 			mode_flags |= MPOL_F_STATIC_NODES;
2132 		else if (!strcmp(flags, "relative"))
2133 			mode_flags |= MPOL_F_RELATIVE_NODES;
		else {
			/* unknown flag */
			err = 1;
			goto out;
		}
2136 	}
2137 
	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;
	else {
		int ret;

		task_lock(current);
		ret = mpol_set_nodemask(new, &nodes);
		task_unlock(current);
		if (ret) {
			/* don't leak a policy we cannot return */
			mpol_put(new);
			goto out;
		}
		if (no_context) {
			/* save for contextualization */
			new->w.user_nodemask = nodes;
		}
	}
	err = 0;
2154 
2155 out:
2156 	/* Restore string for error message */
2157 	if (nodelist)
2158 		*--nodelist = ':';
2159 	if (flags)
2160 		*--flags = '=';
2161 	if (!err)
2162 		*mpol = new;
2163 	return err;
2164 }
2165 #endif /* CONFIG_TMPFS */
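
/*
 * These strings normally reach the parser above via the tmpfs "mpol="
 * mount option.  A userspace sketch (illustrative mount point and sizes;
 * requires CAP_SYS_ADMIN):
 *
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (mount("tmpfs", "/mnt/numa", "tmpfs", 0,
 *			  "size=64m,mpol=interleave:0-3") != 0)
 *			perror("mount");
 *		return 0;
 *	}
 */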
2166 
2167 /**
2168  * mpol_to_str - format a mempolicy structure for printing
2169  * @buffer:  to contain formatted mempolicy string
2170  * @maxlen:  length of @buffer
2171  * @pol:  pointer to mempolicy to be formatted
2172  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2173  *
2174  * Convert a mempolicy into a string.
2175  * Returns the number of characters in buffer (if positive)
2176  * or an error (negative)
2177  */
2178 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2179 {
2180 	char *p = buffer;
2181 	int l;
2182 	nodemask_t nodes;
2183 	unsigned short mode;
2184 	unsigned short flags = pol ? pol->flags : 0;
2185 
2186 	/*
2187 	 * Sanity check:  room for longest mode, flag and some nodes
2188 	 */
2189 	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2190 
2191 	if (!pol || pol == &default_policy)
2192 		mode = MPOL_DEFAULT;
2193 	else
2194 		mode = pol->mode;
2195 
2196 	switch (mode) {
2197 	case MPOL_DEFAULT:
2198 		nodes_clear(nodes);
2199 		break;
2200 
2201 	case MPOL_PREFERRED:
2202 		nodes_clear(nodes);
2203 		if (flags & MPOL_F_LOCAL)
2204 			mode = MPOL_LOCAL;	/* pseudo-policy */
2205 		else
2206 			node_set(pol->v.preferred_node, nodes);
2207 		break;
2208 
2209 	case MPOL_BIND:
2210 		/* Fall through */
2211 	case MPOL_INTERLEAVE:
2212 		if (no_context)
2213 			nodes = pol->w.user_nodemask;
2214 		else
2215 			nodes = pol->v.nodes;
2216 		break;
2217 
2218 	default:
2219 		BUG();
2220 	}
2221 
2222 	l = strlen(policy_types[mode]);
2223 	if (buffer + maxlen < p + l + 1)
2224 		return -ENOSPC;
2225 
2226 	strcpy(p, policy_types[mode]);
2227 	p += l;
2228 
2229 	if (flags & MPOL_MODE_FLAGS) {
2230 		if (buffer + maxlen < p + 2)
2231 			return -ENOSPC;
2232 		*p++ = '=';
2233 
2234 		/*
2235 		 * Currently, the only defined flags are mutually exclusive
2236 		 */
2237 		if (flags & MPOL_F_STATIC_NODES)
2238 			p += snprintf(p, buffer + maxlen - p, "static");
2239 		else if (flags & MPOL_F_RELATIVE_NODES)
2240 			p += snprintf(p, buffer + maxlen - p, "relative");
2241 	}
2242 
2243 	if (!nodes_empty(nodes)) {
2244 		if (buffer + maxlen < p + 2)
2245 			return -ENOSPC;
2246 		*p++ = ':';
2247 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2248 	}
2249 	return p - buffer;
2250 }
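
/*
 * Example results from mpol_to_str() (following directly from the code
 * above): an MPOL_INTERLEAVE policy over nodes 0-3 with MPOL_F_STATIC_NODES
 * formats as "interleave=static:0-3"; MPOL_PREFERRED with MPOL_F_LOCAL
 * formats as just "local"; a NULL or default policy formats as "default".
 */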
2251 
2252 struct numa_maps {
2253 	unsigned long pages;
2254 	unsigned long anon;
2255 	unsigned long active;
2256 	unsigned long writeback;
2257 	unsigned long mapcount_max;
2258 	unsigned long dirty;
2259 	unsigned long swapcache;
2260 	unsigned long node[MAX_NUMNODES];
2261 };
2262 
2263 static void gather_stats(struct page *page, void *private, int pte_dirty)
2264 {
2265 	struct numa_maps *md = private;
2266 	int count = page_mapcount(page);
2267 
2268 	md->pages++;
2269 	if (pte_dirty || PageDirty(page))
2270 		md->dirty++;
2271 
2272 	if (PageSwapCache(page))
2273 		md->swapcache++;
2274 
2275 	if (PageActive(page) || PageUnevictable(page))
2276 		md->active++;
2277 
2278 	if (PageWriteback(page))
2279 		md->writeback++;
2280 
2281 	if (PageAnon(page))
2282 		md->anon++;
2283 
2284 	if (count > md->mapcount_max)
2285 		md->mapcount_max = count;
2286 
2287 	md->node[page_to_nid(page)]++;
2288 }
2289 
2290 #ifdef CONFIG_HUGETLB_PAGE
2291 static void check_huge_range(struct vm_area_struct *vma,
2292 		unsigned long start, unsigned long end,
2293 		struct numa_maps *md)
2294 {
2295 	unsigned long addr;
2296 	struct page *page;
2297 	struct hstate *h = hstate_vma(vma);
2298 	unsigned long sz = huge_page_size(h);
2299 
2300 	for (addr = start; addr < end; addr += sz) {
2301 		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2302 						addr & huge_page_mask(h));
2303 		pte_t pte;
2304 
2305 		if (!ptep)
2306 			continue;
2307 
2308 		pte = *ptep;
2309 		if (pte_none(pte))
2310 			continue;
2311 
2312 		page = pte_page(pte);
2313 		if (!page)
2314 			continue;
2315 
2316 		gather_stats(page, md, pte_dirty(*ptep));
2317 	}
2318 }
2319 #else
2320 static inline void check_huge_range(struct vm_area_struct *vma,
2321 		unsigned long start, unsigned long end,
2322 		struct numa_maps *md)
2323 {
2324 }
2325 #endif
2326 
2327 /*
2328  * Display pages allocated per node and memory policy via /proc.
2329  */
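/*
 * One line of /proc/<pid>/numa_maps output looks roughly like this
 * (illustrative values):
 *
 *	00400000 default file=/bin/cat mapped=3 mapmax=2 N0=3
 *
 * i.e. the VMA start address, the policy string from mpol_to_str(), and
 * then the counters gathered below (most are printed only when non-zero).
 */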
2330 int show_numa_map(struct seq_file *m, void *v)
2331 {
2332 	struct proc_maps_private *priv = m->private;
2333 	struct vm_area_struct *vma = v;
2334 	struct numa_maps *md;
2335 	struct file *file = vma->vm_file;
2336 	struct mm_struct *mm = vma->vm_mm;
2337 	struct mempolicy *pol;
2338 	int n;
2339 	char buffer[50];
2340 
2341 	if (!mm)
2342 		return 0;
2343 
2344 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2345 	if (!md)
2346 		return 0;
2347 
2348 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2349 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2350 	mpol_cond_put(pol);
2351 
2352 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2353 
2354 	if (file) {
2355 		seq_printf(m, " file=");
2356 		seq_path(m, &file->f_path, "\n\t= ");
2357 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2358 		seq_printf(m, " heap");
2359 	} else if (vma->vm_start <= mm->start_stack &&
2360 			vma->vm_end >= mm->start_stack) {
2361 		seq_printf(m, " stack");
2362 	}
2363 
2364 	if (is_vm_hugetlb_page(vma)) {
2365 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2366 		seq_printf(m, " huge");
2367 	} else {
2368 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2369 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2370 	}
2371 
2372 	if (!md->pages)
2373 		goto out;
2374 
2375 	if (md->anon)
2376 		seq_printf(m, " anon=%lu", md->anon);
2377 
2378 	if (md->dirty)
2379 		seq_printf(m, " dirty=%lu", md->dirty);
2380 
2381 	if (md->pages != md->anon && md->pages != md->dirty)
2382 		seq_printf(m, " mapped=%lu", md->pages);
2383 
2384 	if (md->mapcount_max > 1)
2385 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2386 
2387 	if (md->swapcache)
2388 		seq_printf(m, " swapcache=%lu", md->swapcache);
2389 
2390 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2391 		seq_printf(m, " active=%lu", md->active);
2392 
2393 	if (md->writeback)
2394 		seq_printf(m, " writeback=%lu", md->writeback);
2395 
2396 	for_each_node_state(n, N_HIGH_MEMORY)
2397 		if (md->node[n])
2398 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2399 out:
2400 	seq_putc(m, '\n');
2401 	kfree(md);
2402 
2403 	if (m->count < m->size)
2404 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2405 	return 0;
2406 }
2407