xref: /linux/mm/mempolicy.c (revision 2b8232ce512105e28453f301d1510de8363bccd1)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory
9  * should be allocated on.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For the process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the given memory nodes instead.
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case, node -1 here means do the allocation
30  *                on the local node. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied to most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When the process policy
45  * is used it is not preserved across swap out/swap in.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * The same applies to GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
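/*
 * Illustrative userspace sketch (not part of this file): the policies above
 * are normally requested through the mbind()/set_mempolicy() syscalls, e.g.
 * via <numaif.h> from libnuma.  Node numbers and "addr"/"len" below are
 * placeholders, not values taken from this code.
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0   = 1UL << 0;
 *
 *	// Interleave this task's future allocations across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	// Restrict one mapping to node 0; fail with EIO if existing pages
 *	// in the range already live elsewhere.
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0), MPOL_MF_STRICT);
 */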
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
87 #include <linux/seq_file.h>
88 #include <linux/proc_fs.h>
89 #include <linux/migrate.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 enum zone_type policy_zone = 0;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 /* Do sanity checking on a policy */
114 static int mpol_check_policy(int mode, nodemask_t *nodes)
115 {
116 	int empty = nodes_empty(*nodes);
117 
118 	switch (mode) {
119 	case MPOL_DEFAULT:
120 		if (!empty)
121 			return -EINVAL;
122 		break;
123 	case MPOL_BIND:
124 	case MPOL_INTERLEAVE:
125 		/* Preferred will only use the first bit, but allow
126 		   more for now. */
127 		if (empty)
128 			return -EINVAL;
129 		break;
130 	}
131 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
132 }
133 
134 /* Generate a custom zonelist for the BIND policy. */
135 static struct zonelist *bind_zonelist(nodemask_t *nodes)
136 {
137 	struct zonelist *zl;
138 	int num, max, nd;
139 	enum zone_type k;
140 
141 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
142 	max++;			/* space for zlcache_ptr (see mmzone.h) */
143 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
144 	if (!zl)
145 		return ERR_PTR(-ENOMEM);
146 	zl->zlcache_ptr = NULL;
147 	num = 0;
148 	/* First put in the highest zones from all nodes, then all the next
149 	   lower zones etc. Avoid empty zones because the memory allocator
150 	   doesn't like them. If you implement node hot removal you
151 	   have to fix that. */
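	/*
	 * Illustrative example (assuming two nodes, 0 and 1, that each have
	 * ZONE_NORMAL and ZONE_DMA populated): the loop below produces
	 *	Normal(0), Normal(1), DMA(0), DMA(1), NULL
	 * i.e. the highest zone type is exhausted on every allowed node
	 * before falling back to a lower zone type.
	 */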
152 	k = MAX_NR_ZONES - 1;
153 	while (1) {
154 		for_each_node_mask(nd, *nodes) {
155 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
156 			if (z->present_pages > 0)
157 				zl->zones[num++] = z;
158 		}
159 		if (k == 0)
160 			break;
161 		k--;
162 	}
163 	if (num == 0) {
164 		kfree(zl);
165 		return ERR_PTR(-EINVAL);
166 	}
167 	zl->zones[num] = NULL;
168 	return zl;
169 }
170 
171 /* Create a new policy */
172 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
173 {
174 	struct mempolicy *policy;
175 
176 	pr_debug("setting mode %d nodes[0] %lx\n",
177 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
178 
179 	if (mode == MPOL_DEFAULT)
180 		return NULL;
181 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
182 	if (!policy)
183 		return ERR_PTR(-ENOMEM);
184 	atomic_set(&policy->refcnt, 1);
185 	switch (mode) {
186 	case MPOL_INTERLEAVE:
187 		policy->v.nodes = *nodes;
188 		if (nodes_weight(*nodes) == 0) {
189 			kmem_cache_free(policy_cache, policy);
190 			return ERR_PTR(-EINVAL);
191 		}
192 		break;
193 	case MPOL_PREFERRED:
194 		policy->v.preferred_node = first_node(*nodes);
195 		if (policy->v.preferred_node >= MAX_NUMNODES)
196 			policy->v.preferred_node = -1;
197 		break;
198 	case MPOL_BIND:
199 		policy->v.zonelist = bind_zonelist(nodes);
200 		if (IS_ERR(policy->v.zonelist)) {
201 			void *error_code = policy->v.zonelist;
202 			kmem_cache_free(policy_cache, policy);
203 			return error_code;
204 		}
205 		break;
206 	}
207 	policy->policy = mode;
208 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
209 	return policy;
210 }
211 
212 static void gather_stats(struct page *, void *, int pte_dirty);
213 static void migrate_page_add(struct page *page, struct list_head *pagelist,
214 				unsigned long flags);
215 
216 /* Scan through pages checking if pages follow certain conditions. */
217 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
218 		unsigned long addr, unsigned long end,
219 		const nodemask_t *nodes, unsigned long flags,
220 		void *private)
221 {
222 	pte_t *orig_pte;
223 	pte_t *pte;
224 	spinlock_t *ptl;
225 
226 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
227 	do {
228 		struct page *page;
229 		int nid;
230 
231 		if (!pte_present(*pte))
232 			continue;
233 		page = vm_normal_page(vma, addr, *pte);
234 		if (!page)
235 			continue;
236 		/*
237 		 * The check for PageReserved here is important to avoid
238 		 * handling zero pages and other pages that may have been
239 		 * marked special by the system.
240 		 *
241 		 * If PageReserved were not checked here then e.g.
242 		 * the location of the zero page could have an influence
243 		 * on MPOL_MF_STRICT, zero pages would be counted for
244 		 * the per node stats, and there would be useless attempts
245 		 * to put zero pages on the migration list.
246 		 */
247 		if (PageReserved(page))
248 			continue;
249 		nid = page_to_nid(page);
250 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
251 			continue;
252 
253 		if (flags & MPOL_MF_STATS)
254 			gather_stats(page, private, pte_dirty(*pte));
255 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
256 			migrate_page_add(page, private, flags);
257 		else
258 			break;
259 	} while (pte++, addr += PAGE_SIZE, addr != end);
260 	pte_unmap_unlock(orig_pte, ptl);
261 	return addr != end;
262 }
263 
264 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
265 		unsigned long addr, unsigned long end,
266 		const nodemask_t *nodes, unsigned long flags,
267 		void *private)
268 {
269 	pmd_t *pmd;
270 	unsigned long next;
271 
272 	pmd = pmd_offset(pud, addr);
273 	do {
274 		next = pmd_addr_end(addr, end);
275 		if (pmd_none_or_clear_bad(pmd))
276 			continue;
277 		if (check_pte_range(vma, pmd, addr, next, nodes,
278 				    flags, private))
279 			return -EIO;
280 	} while (pmd++, addr = next, addr != end);
281 	return 0;
282 }
283 
284 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 		unsigned long addr, unsigned long end,
286 		const nodemask_t *nodes, unsigned long flags,
287 		void *private)
288 {
289 	pud_t *pud;
290 	unsigned long next;
291 
292 	pud = pud_offset(pgd, addr);
293 	do {
294 		next = pud_addr_end(addr, end);
295 		if (pud_none_or_clear_bad(pud))
296 			continue;
297 		if (check_pmd_range(vma, pud, addr, next, nodes,
298 				    flags, private))
299 			return -EIO;
300 	} while (pud++, addr = next, addr != end);
301 	return 0;
302 }
303 
304 static inline int check_pgd_range(struct vm_area_struct *vma,
305 		unsigned long addr, unsigned long end,
306 		const nodemask_t *nodes, unsigned long flags,
307 		void *private)
308 {
309 	pgd_t *pgd;
310 	unsigned long next;
311 
312 	pgd = pgd_offset(vma->vm_mm, addr);
313 	do {
314 		next = pgd_addr_end(addr, end);
315 		if (pgd_none_or_clear_bad(pgd))
316 			continue;
317 		if (check_pud_range(vma, pgd, addr, next, nodes,
318 				    flags, private))
319 			return -EIO;
320 	} while (pgd++, addr = next, addr != end);
321 	return 0;
322 }
323 
324 /*
325  * Check if all pages in a range are on a set of nodes.
326  * If pagelist != NULL then isolate pages from the LRU and
327  * put them on the pagelist.
328  */
329 static struct vm_area_struct *
330 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 		const nodemask_t *nodes, unsigned long flags, void *private)
332 {
333 	int err;
334 	struct vm_area_struct *first, *vma, *prev;
335 
336 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
337 
338 		err = migrate_prep();
339 		if (err)
340 			return ERR_PTR(err);
341 	}
342 
343 	first = find_vma(mm, start);
344 	if (!first)
345 		return ERR_PTR(-EFAULT);
346 	prev = NULL;
347 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
348 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
349 			if (!vma->vm_next && vma->vm_end < end)
350 				return ERR_PTR(-EFAULT);
351 			if (prev && prev->vm_end < vma->vm_start)
352 				return ERR_PTR(-EFAULT);
353 		}
354 		if (!is_vm_hugetlb_page(vma) &&
355 		    ((flags & MPOL_MF_STRICT) ||
356 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
357 				vma_migratable(vma)))) {
358 			unsigned long endvma = vma->vm_end;
359 
360 			if (endvma > end)
361 				endvma = end;
362 			if (vma->vm_start > start)
363 				start = vma->vm_start;
364 			err = check_pgd_range(vma, start, endvma, nodes,
365 						flags, private);
366 			if (err) {
367 				first = ERR_PTR(err);
368 				break;
369 			}
370 		}
371 		prev = vma;
372 	}
373 	return first;
374 }
375 
376 /* Apply policy to a single VMA */
377 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
378 {
379 	int err = 0;
380 	struct mempolicy *old = vma->vm_policy;
381 
382 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
383 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
384 		 vma->vm_ops, vma->vm_file,
385 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
386 
387 	if (vma->vm_ops && vma->vm_ops->set_policy)
388 		err = vma->vm_ops->set_policy(vma, new);
389 	if (!err) {
390 		mpol_get(new);
391 		vma->vm_policy = new;
392 		mpol_free(old);
393 	}
394 	return err;
395 }
396 
397 /* Step 2: apply policy to a range and do splits. */
398 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
399 		       unsigned long end, struct mempolicy *new)
400 {
401 	struct vm_area_struct *next;
402 	int err;
403 
404 	err = 0;
405 	for (; vma && vma->vm_start < end; vma = next) {
406 		next = vma->vm_next;
407 		if (vma->vm_start < start)
408 			err = split_vma(vma->vm_mm, vma, start, 1);
409 		if (!err && vma->vm_end > end)
410 			err = split_vma(vma->vm_mm, vma, end, 0);
411 		if (!err)
412 			err = policy_vma(vma, new);
413 		if (err)
414 			break;
415 	}
416 	return err;
417 }
418 
419 static int contextualize_policy(int mode, nodemask_t *nodes)
420 {
421 	if (!nodes)
422 		return 0;
423 
424 	cpuset_update_task_memory_state();
425 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
426 		return -EINVAL;
427 	return mpol_check_policy(mode, nodes);
428 }
429 
430 
431 /*
432  * Update task->flags PF_MEMPOLICY bit: set iff non-default
433  * mempolicy.  Allows more rapid checking of this (combined perhaps
434  * with other PF_* flag bits) on memory allocation hot code paths.
435  *
436  * If called from outside this file, the task 'p' should -only- be
437  * a newly forked child not yet visible on the task list, because
438  * manipulating the task flags of a visible task is not safe.
439  *
440  * The above limitation is why this routine has the funny name
441  * mpol_fix_fork_child_flag().
442  *
443  * It is also safe to call this with a task pointer of current,
444  * which the static wrapper mpol_set_task_struct_flag() does,
445  * for use within this file.
446  */
447 
448 void mpol_fix_fork_child_flag(struct task_struct *p)
449 {
450 	if (p->mempolicy)
451 		p->flags |= PF_MEMPOLICY;
452 	else
453 		p->flags &= ~PF_MEMPOLICY;
454 }
455 
456 static void mpol_set_task_struct_flag(void)
457 {
458 	mpol_fix_fork_child_flag(current);
459 }
460 
461 /* Set the process memory policy */
462 long do_set_mempolicy(int mode, nodemask_t *nodes)
463 {
464 	struct mempolicy *new;
465 
466 	if (contextualize_policy(mode, nodes))
467 		return -EINVAL;
468 	new = mpol_new(mode, nodes);
469 	if (IS_ERR(new))
470 		return PTR_ERR(new);
471 	mpol_free(current->mempolicy);
472 	current->mempolicy = new;
473 	mpol_set_task_struct_flag();
474 	if (new && new->policy == MPOL_INTERLEAVE)
475 		current->il_next = first_node(new->v.nodes);
476 	return 0;
477 }
478 
479 /* Fill a zone bitmap for a policy */
480 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
481 {
482 	int i;
483 
484 	nodes_clear(*nodes);
485 	switch (p->policy) {
486 	case MPOL_BIND:
487 		for (i = 0; p->v.zonelist->zones[i]; i++)
488 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
489 				*nodes);
490 		break;
491 	case MPOL_DEFAULT:
492 		break;
493 	case MPOL_INTERLEAVE:
494 		*nodes = p->v.nodes;
495 		break;
496 	case MPOL_PREFERRED:
497 		/* or use current node instead of online map? */
498 		if (p->v.preferred_node < 0)
499 			*nodes = node_online_map;
500 		else
501 			node_set(p->v.preferred_node, *nodes);
502 		break;
503 	default:
504 		BUG();
505 	}
506 }
507 
508 static int lookup_node(struct mm_struct *mm, unsigned long addr)
509 {
510 	struct page *p;
511 	int err;
512 
513 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
514 	if (err >= 0) {
515 		err = page_to_nid(p);
516 		put_page(p);
517 	}
518 	return err;
519 }
520 
521 /* Retrieve NUMA policy */
522 long do_get_mempolicy(int *policy, nodemask_t *nmask,
523 			unsigned long addr, unsigned long flags)
524 {
525 	int err;
526 	struct mm_struct *mm = current->mm;
527 	struct vm_area_struct *vma = NULL;
528 	struct mempolicy *pol = current->mempolicy;
529 
530 	cpuset_update_task_memory_state();
531 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
532 		return -EINVAL;
533 	if (flags & MPOL_F_ADDR) {
534 		down_read(&mm->mmap_sem);
535 		vma = find_vma_intersection(mm, addr, addr+1);
536 		if (!vma) {
537 			up_read(&mm->mmap_sem);
538 			return -EFAULT;
539 		}
540 		if (vma->vm_ops && vma->vm_ops->get_policy)
541 			pol = vma->vm_ops->get_policy(vma, addr);
542 		else
543 			pol = vma->vm_policy;
544 	} else if (addr)
545 		return -EINVAL;
546 
547 	if (!pol)
548 		pol = &default_policy;
549 
550 	if (flags & MPOL_F_NODE) {
551 		if (flags & MPOL_F_ADDR) {
552 			err = lookup_node(mm, addr);
553 			if (err < 0)
554 				goto out;
555 			*policy = err;
556 		} else if (pol == current->mempolicy &&
557 				pol->policy == MPOL_INTERLEAVE) {
558 			*policy = current->il_next;
559 		} else {
560 			err = -EINVAL;
561 			goto out;
562 		}
563 	} else
564 		*policy = pol->policy;
565 
566 	if (vma) {
567 		up_read(&current->mm->mmap_sem);
568 		vma = NULL;
569 	}
570 
571 	err = 0;
572 	if (nmask)
573 		get_zonemask(pol, nmask);
574 
575  out:
576 	if (vma)
577 		up_read(&current->mm->mmap_sem);
578 	return err;
579 }
580 
581 #ifdef CONFIG_MIGRATION
582 /*
583  * page migration
584  */
585 static void migrate_page_add(struct page *page, struct list_head *pagelist,
586 				unsigned long flags)
587 {
588 	/*
589 	 * Avoid migrating a page that is shared with others.
590 	 */
591 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
592 		isolate_lru_page(page, pagelist);
593 }
594 
595 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
596 {
597 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
598 }
599 
600 /*
601  * Migrate pages from one node to a target node.
602  * Returns error or the number of pages not migrated.
603  */
604 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
605 {
606 	nodemask_t nmask;
607 	LIST_HEAD(pagelist);
608 	int err = 0;
609 
610 	nodes_clear(nmask);
611 	node_set(source, nmask);
612 
613 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
614 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
615 
616 	if (!list_empty(&pagelist))
617 		err = migrate_pages(&pagelist, new_node_page, dest);
618 
619 	return err;
620 }
621 
622 /*
623  * Move pages between the two nodesets so as to preserve the physical
624  * layout as much as possible.
625  *
626  * Returns the number of pages that could not be moved.
627  */
628 int do_migrate_pages(struct mm_struct *mm,
629 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
630 {
631 	LIST_HEAD(pagelist);
632 	int busy = 0;
633 	int err = 0;
634 	nodemask_t tmp;
635 
636   	down_read(&mm->mmap_sem);
637 
638 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
639 	if (err)
640 		goto out;
641 
642 /*
643  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
644  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
645  * bit in 'tmp', and return that <source, dest> pair for migration.
646  * The pair of nodemasks 'to' and 'from' define the map.
647  *
648  * If no pair of bits is found that way, fallback to picking some
649  * pair of 'source' and 'dest' bits that are not the same.  If the
650  * 'source' and 'dest' bits are the same, this represents a node
651  * that will be migrating to itself, so no pages need move.
652  *
653  * If no bits are left in 'tmp', or if all remaining bits left
654  * in 'tmp' correspond to the same bit in 'to', return false
655  * (nothing left to migrate).
656  *
657  * This lets us pick a pair of nodes to migrate between, such that
658  * if possible the dest node is not already occupied by some other
659  * source node, minimizing the risk of overloading the memory on a
660  * node that would happen if we migrated incoming memory to a node
661  * before migrating outgoing memory from that same node.
662  *
663  * A single scan of tmp is sufficient.  As we go, we remember the
664  * most recent <s, d> pair that moved (s != d).  If we find a pair
665  * that not only moved, but what's better, moved to an empty slot
666  * (d is not set in tmp), then we break out then, with that pair.
667  * Otherwise when we finish scanning tmp, we at least have the
668  * most recent <s, d> pair that moved.  If we get all the way through
669  * the scan of tmp without finding any node that moved, much less
670  * moved to an empty node, then there is nothing left worth migrating.
671  */
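/*
 * Worked example (illustrative): from = {0,1}, to = {1,2}, so the remap is
 * 0 -> 1 and 1 -> 2.  Scanning tmp = {0,1}: s = 0 gives d = 1, but node 1 is
 * still set in tmp, so remember <0,1> and continue; s = 1 gives d = 2, which
 * is not in tmp, so migrate 1 -> 2 first.  On the next pass tmp = {0} and
 * 0 -> 1 runs, after node 1 has already been drained.
 */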
672 
673 	tmp = *from_nodes;
674 	while (!nodes_empty(tmp)) {
675 		int s, d;
676 		int source = -1;
677 		int dest = 0;
678 
679 		for_each_node_mask(s, tmp) {
680 			d = node_remap(s, *from_nodes, *to_nodes);
681 			if (s == d)
682 				continue;
683 
684 			source = s;	/* Node moved. Memorize */
685 			dest = d;
686 
687 			/* dest not in remaining from nodes? */
688 			if (!node_isset(dest, tmp))
689 				break;
690 		}
691 		if (source == -1)
692 			break;
693 
694 		node_clear(source, tmp);
695 		err = migrate_to_node(mm, source, dest, flags);
696 		if (err > 0)
697 			busy += err;
698 		if (err < 0)
699 			break;
700 	}
701 out:
702 	up_read(&mm->mmap_sem);
703 	if (err < 0)
704 		return err;
705 	return busy;
706 
707 }
708 
709 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
710 {
711 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
712 
713 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
714 					page_address_in_vma(page, vma));
715 }
716 #else
717 
718 static void migrate_page_add(struct page *page, struct list_head *pagelist,
719 				unsigned long flags)
720 {
721 }
722 
723 int do_migrate_pages(struct mm_struct *mm,
724 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
725 {
726 	return -ENOSYS;
727 }
728 
729 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
730 {
731 	return NULL;
732 }
733 #endif
734 
735 long do_mbind(unsigned long start, unsigned long len,
736 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
737 {
738 	struct vm_area_struct *vma;
739 	struct mm_struct *mm = current->mm;
740 	struct mempolicy *new;
741 	unsigned long end;
742 	int err;
743 	LIST_HEAD(pagelist);
744 
745 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
746 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
747 	    || mode > MPOL_MAX)
748 		return -EINVAL;
749 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
750 		return -EPERM;
751 
752 	if (start & ~PAGE_MASK)
753 		return -EINVAL;
754 
755 	if (mode == MPOL_DEFAULT)
756 		flags &= ~MPOL_MF_STRICT;
757 
758 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
759 	end = start + len;
760 
761 	if (end < start)
762 		return -EINVAL;
763 	if (end == start)
764 		return 0;
765 
766 	if (mpol_check_policy(mode, nmask))
767 		return -EINVAL;
768 
769 	new = mpol_new(mode, nmask);
770 	if (IS_ERR(new))
771 		return PTR_ERR(new);
772 
773 	/*
774 	 * If we are using the default policy then operation
775 	 * on discontinuous address spaces is okay after all
776 	 */
777 	if (!new)
778 		flags |= MPOL_MF_DISCONTIG_OK;
779 
780 	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
781 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
782 
783 	down_write(&mm->mmap_sem);
784 	vma = check_range(mm, start, end, nmask,
785 			  flags | MPOL_MF_INVERT, &pagelist);
786 
787 	err = PTR_ERR(vma);
788 	if (!IS_ERR(vma)) {
789 		int nr_failed = 0;
790 
791 		err = mbind_range(vma, start, end, new);
792 
793 		if (!list_empty(&pagelist))
794 			nr_failed = migrate_pages(&pagelist, new_vma_page,
795 						(unsigned long)vma);
796 
797 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
798 			err = -EIO;
799 	}
800 
801 	up_write(&mm->mmap_sem);
802 	mpol_free(new);
803 	return err;
804 }
805 
806 /*
807  * User space interface with variable sized bitmaps for nodelists.
808  */
809 
810 /* Copy a node mask from user space. */
811 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
812 		     unsigned long maxnode)
813 {
814 	unsigned long k;
815 	unsigned long nlongs;
816 	unsigned long endmask;
817 
818 	--maxnode;
819 	nodes_clear(*nodes);
820 	if (maxnode == 0 || !nmask)
821 		return 0;
822 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
823 		return -EINVAL;
824 
825 	nlongs = BITS_TO_LONGS(maxnode);
826 	if ((maxnode % BITS_PER_LONG) == 0)
827 		endmask = ~0UL;
828 	else
829 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
830 
831 	/* When the user specified more nodes than supported just check
832 	   if the unsupported part is all zero. */
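	/* Illustrative example: with MAX_NUMNODES == 64, a caller passing
	   maxnode == 257 hands in a 256-bit mask; it is accepted only if
	   bits 64..255 are all clear, otherwise -EINVAL is returned. */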
833 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
834 		if (nlongs > PAGE_SIZE/sizeof(long))
835 			return -EINVAL;
836 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
837 			unsigned long t;
838 			if (get_user(t, nmask + k))
839 				return -EFAULT;
840 			if (k == nlongs - 1) {
841 				if (t & endmask)
842 					return -EINVAL;
843 			} else if (t)
844 				return -EINVAL;
845 		}
846 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
847 		endmask = ~0UL;
848 	}
849 
850 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
851 		return -EFAULT;
852 	nodes_addr(*nodes)[nlongs-1] &= endmask;
853 	return 0;
854 }
855 
856 /* Copy a kernel node mask to user space */
857 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
858 			      nodemask_t *nodes)
859 {
860 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
861 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
862 
863 	if (copy > nbytes) {
864 		if (copy > PAGE_SIZE)
865 			return -EINVAL;
866 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
867 			return -EFAULT;
868 		copy = nbytes;
869 	}
870 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
871 }
872 
873 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
874 			unsigned long mode,
875 			unsigned long __user *nmask, unsigned long maxnode,
876 			unsigned flags)
877 {
878 	nodemask_t nodes;
879 	int err;
880 
881 	err = get_nodes(&nodes, nmask, maxnode);
882 	if (err)
883 		return err;
884 #ifdef CONFIG_CPUSETS
885 	/* Restrict the nodes to the allowed nodes in the cpuset */
886 	nodes_and(nodes, nodes, current->mems_allowed);
887 #endif
888 	return do_mbind(start, len, mode, &nodes, flags);
889 }
890 
891 /* Set the process memory policy */
892 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
893 		unsigned long maxnode)
894 {
895 	int err;
896 	nodemask_t nodes;
897 
898 	if (mode < 0 || mode > MPOL_MAX)
899 		return -EINVAL;
900 	err = get_nodes(&nodes, nmask, maxnode);
901 	if (err)
902 		return err;
903 	return do_set_mempolicy(mode, &nodes);
904 }
905 
906 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
907 		const unsigned long __user *old_nodes,
908 		const unsigned long __user *new_nodes)
909 {
910 	struct mm_struct *mm;
911 	struct task_struct *task;
912 	nodemask_t old;
913 	nodemask_t new;
914 	nodemask_t task_nodes;
915 	int err;
916 
917 	err = get_nodes(&old, old_nodes, maxnode);
918 	if (err)
919 		return err;
920 
921 	err = get_nodes(&new, new_nodes, maxnode);
922 	if (err)
923 		return err;
924 
925 	/* Find the mm_struct */
926 	read_lock(&tasklist_lock);
927 	task = pid ? find_task_by_pid(pid) : current;
928 	if (!task) {
929 		read_unlock(&tasklist_lock);
930 		return -ESRCH;
931 	}
932 	mm = get_task_mm(task);
933 	read_unlock(&tasklist_lock);
934 
935 	if (!mm)
936 		return -EINVAL;
937 
938 	/*
939 	 * Check if this process has the right to modify the specified
940 	 * process. The right exists if the process has administrative
941 	 * capabilities, superuser privileges or the same
942 	 * userid as the target process.
943 	 */
944 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
945 	    (current->uid != task->suid) && (current->uid != task->uid) &&
946 	    !capable(CAP_SYS_NICE)) {
947 		err = -EPERM;
948 		goto out;
949 	}
950 
951 	task_nodes = cpuset_mems_allowed(task);
952 	/* Is the user allowed to access the target nodes? */
953 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
954 		err = -EPERM;
955 		goto out;
956 	}
957 
958 	if (!nodes_subset(new, node_online_map)) {
959 		err = -EINVAL;
960 		goto out;
961 	}
962 
963 	err = security_task_movememory(task);
964 	if (err)
965 		goto out;
966 
967 	err = do_migrate_pages(mm, &old, &new,
968 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
969 out:
970 	mmput(mm);
971 	return err;
972 }
973 
974 
975 /* Retrieve NUMA policy */
976 asmlinkage long sys_get_mempolicy(int __user *policy,
977 				unsigned long __user *nmask,
978 				unsigned long maxnode,
979 				unsigned long addr, unsigned long flags)
980 {
981 	int err, pval;
982 	nodemask_t nodes;
983 
984 	if (nmask != NULL && maxnode < MAX_NUMNODES)
985 		return -EINVAL;
986 
987 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
988 
989 	if (err)
990 		return err;
991 
992 	if (policy && put_user(pval, policy))
993 		return -EFAULT;
994 
995 	if (nmask)
996 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
997 
998 	return err;
999 }
1000 
1001 #ifdef CONFIG_COMPAT
1002 
1003 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1004 				     compat_ulong_t __user *nmask,
1005 				     compat_ulong_t maxnode,
1006 				     compat_ulong_t addr, compat_ulong_t flags)
1007 {
1008 	long err;
1009 	unsigned long __user *nm = NULL;
1010 	unsigned long nr_bits, alloc_size;
1011 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1012 
1013 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1014 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1015 
1016 	if (nmask)
1017 		nm = compat_alloc_user_space(alloc_size);
1018 
1019 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1020 
1021 	if (!err && nmask) {
1022 		err = copy_from_user(bm, nm, alloc_size);
1023 		/* ensure entire bitmap is zeroed */
1024 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1025 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1026 	}
1027 
1028 	return err;
1029 }
1030 
1031 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1032 				     compat_ulong_t maxnode)
1033 {
1034 	long err = 0;
1035 	unsigned long __user *nm = NULL;
1036 	unsigned long nr_bits, alloc_size;
1037 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1038 
1039 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1040 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1041 
1042 	if (nmask) {
1043 		err = compat_get_bitmap(bm, nmask, nr_bits);
1044 		nm = compat_alloc_user_space(alloc_size);
1045 		err |= copy_to_user(nm, bm, alloc_size);
1046 	}
1047 
1048 	if (err)
1049 		return -EFAULT;
1050 
1051 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1052 }
1053 
1054 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1055 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1056 			     compat_ulong_t maxnode, compat_ulong_t flags)
1057 {
1058 	long err = 0;
1059 	unsigned long __user *nm = NULL;
1060 	unsigned long nr_bits, alloc_size;
1061 	nodemask_t bm;
1062 
1063 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1064 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1065 
1066 	if (nmask) {
1067 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1068 		nm = compat_alloc_user_space(alloc_size);
1069 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1070 	}
1071 
1072 	if (err)
1073 		return -EFAULT;
1074 
1075 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1076 }
1077 
1078 #endif
1079 
1080 /*
1081  * get_vma_policy(@task, @vma, @addr)
1082  * @task - task for fallback if vma policy == default
1083  * @vma   - virtual memory area whose policy is sought
1084  * @addr  - address in @vma for shared policy lookup
1085  *
1086  * Returns effective policy for a VMA at specified address.
1087  * Falls back to @task or system default policy, as necessary.
1088  * Returned policy has extra reference count if shared, vma,
1089  * or some other task's policy [show_numa_maps() can pass
1090  * @task != current].  It is the caller's responsibility to
1091  * free the reference in these cases.
1092  */
1093 static struct mempolicy * get_vma_policy(struct task_struct *task,
1094 		struct vm_area_struct *vma, unsigned long addr)
1095 {
1096 	struct mempolicy *pol = task->mempolicy;
1097 	int shared_pol = 0;
1098 
1099 	if (vma) {
1100 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1101 			pol = vma->vm_ops->get_policy(vma, addr);
1102 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1103 		} else if (vma->vm_policy &&
1104 				vma->vm_policy->policy != MPOL_DEFAULT)
1105 			pol = vma->vm_policy;
1106 	}
1107 	if (!pol)
1108 		pol = &default_policy;
1109 	else if (!shared_pol && pol != current->mempolicy)
1110 		mpol_get(pol);	/* vma or other task's policy */
1111 	return pol;
1112 }
1113 
1114 /* Return a zonelist representing a mempolicy */
1115 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1116 {
1117 	int nd;
1118 
1119 	switch (policy->policy) {
1120 	case MPOL_PREFERRED:
1121 		nd = policy->v.preferred_node;
1122 		if (nd < 0)
1123 			nd = numa_node_id();
1124 		break;
1125 	case MPOL_BIND:
1126 		/* Lower zones don't get a policy applied */
1127 		/* Careful: current->mems_allowed might have moved */
1128 		if (gfp_zone(gfp) >= policy_zone)
1129 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1130 				return policy->v.zonelist;
1131 		/*FALL THROUGH*/
1132 	case MPOL_INTERLEAVE: /* should not happen */
1133 	case MPOL_DEFAULT:
1134 		nd = numa_node_id();
1135 		break;
1136 	default:
1137 		nd = 0;
1138 		BUG();
1139 	}
1140 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1141 }
1142 
1143 /* Do dynamic interleaving for a process */
1144 static unsigned interleave_nodes(struct mempolicy *policy)
1145 {
1146 	unsigned nid, next;
1147 	struct task_struct *me = current;
1148 
1149 	nid = me->il_next;
1150 	next = next_node(nid, policy->v.nodes);
1151 	if (next >= MAX_NUMNODES)
1152 		next = first_node(policy->v.nodes);
1153 	me->il_next = next;
1154 	return nid;
1155 }
1156 
1157 /*
1158  * Depending on the memory policy provide a node from which to allocate the
1159  * next slab entry.
1160  */
1161 unsigned slab_node(struct mempolicy *policy)
1162 {
1163 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1164 
1165 	switch (pol) {
1166 	case MPOL_INTERLEAVE:
1167 		return interleave_nodes(policy);
1168 
1169 	case MPOL_BIND:
1170 		/*
1171 		 * Follow bind policy behavior and start allocation at the
1172 		 * first node.
1173 		 */
1174 		return zone_to_nid(policy->v.zonelist->zones[0]);
1175 
1176 	case MPOL_PREFERRED:
1177 		if (policy->v.preferred_node >= 0)
1178 			return policy->v.preferred_node;
1179 		/* Fall through */
1180 
1181 	default:
1182 		return numa_node_id();
1183 	}
1184 }
1185 
1186 /* Do static interleaving for a VMA with known offset. */
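/*
 * Example (illustrative): with pol->v.nodes = {1,3,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the walk below returns node 3,
 * the second node set in the mask.
 */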
1187 static unsigned offset_il_node(struct mempolicy *pol,
1188 		struct vm_area_struct *vma, unsigned long off)
1189 {
1190 	unsigned nnodes = nodes_weight(pol->v.nodes);
1191 	unsigned target = (unsigned)off % nnodes;
1192 	int c;
1193 	int nid = -1;
1194 
1195 	c = 0;
1196 	do {
1197 		nid = next_node(nid, pol->v.nodes);
1198 		c++;
1199 	} while (c <= target);
1200 	return nid;
1201 }
1202 
1203 /* Determine a node number for interleave */
1204 static inline unsigned interleave_nid(struct mempolicy *pol,
1205 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1206 {
1207 	if (vma) {
1208 		unsigned long off;
1209 
1210 		/*
1211 		 * for small pages, there is no difference between
1212 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1213 		 * for huge pages, since vm_pgoff is in units of small
1214 		 * pages, we need to shift off the always 0 bits to get
1215 		 * a useful offset.
1216 		 */
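		/*
		 * Worked example (illustrative): with 2 MB huge pages
		 * (shift == 21) and 4 KB base pages (PAGE_SHIFT == 12),
		 * vm_pgoff >> 9 converts the file offset to huge-page units
		 * and (addr - vm_start) >> 21 adds the huge-page index
		 * within the mapping.
		 */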
1217 		BUG_ON(shift < PAGE_SHIFT);
1218 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1219 		off += (addr - vma->vm_start) >> shift;
1220 		return offset_il_node(pol, vma, off);
1221 	} else
1222 		return interleave_nodes(pol);
1223 }
1224 
1225 #ifdef CONFIG_HUGETLBFS
1226 /*
1227  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1228  * @vma = virtual memory area whose policy is sought
1229  * @addr = address in @vma for shared policy lookup and interleave policy
1230  * @gfp_flags = for requested zone
1231  * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1232  *
1233  * Returns a zonelist suitable for a huge page allocation.
1234  * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1235  * If it is also a policy for which get_vma_policy() returns an extra
1236  * reference, we must hold that reference until after allocation.
1237  * In that case, return policy via @mpol so hugetlb allocation can drop
1238  * the reference.  For non-'BIND referenced policies, we can/do drop the
1239  * reference here, so the caller doesn't need to know about the special case
1240  * for default and current task policy.
1241  */
1242 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1243 				gfp_t gfp_flags, struct mempolicy **mpol)
1244 {
1245 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1246 	struct zonelist *zl;
1247 
1248 	*mpol = NULL;		/* probably no unref needed */
1249 	if (pol->policy == MPOL_INTERLEAVE) {
1250 		unsigned nid;
1251 
1252 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1253 		__mpol_free(pol);		/* finished with pol */
1254 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1255 	}
1256 
1257 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1258 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1259 		if (pol->policy != MPOL_BIND)
1260 			__mpol_free(pol);	/* finished with pol */
1261 		else
1262 			*mpol = pol;	/* unref needed after allocation */
1263 	}
1264 	return zl;
1265 }
1266 #endif
1267 
1268 /* Allocate a page in interleaved policy.
1269    Own path because it needs to do special accounting. */
1270 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1271 					unsigned nid)
1272 {
1273 	struct zonelist *zl;
1274 	struct page *page;
1275 
1276 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1277 	page = __alloc_pages(gfp, order, zl);
1278 	if (page && page_zone(page) == zl->zones[0])
1279 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1280 	return page;
1281 }
1282 
1283 /**
1284  * 	alloc_page_vma	- Allocate a page for a VMA.
1285  *
1286  * 	@gfp:
1287  *      %GFP_USER    user allocation.
1288  *      %GFP_KERNEL  kernel allocations,
1289  *      %GFP_HIGHMEM highmem/user allocations,
1290  *      %GFP_FS      allocation should not call back into a file system.
1291  *      %GFP_ATOMIC  don't sleep.
1292  *
1293  * 	@vma:  Pointer to VMA or NULL if not available.
1294  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1295  *
1296  * 	This function allocates a page from the kernel page pool and applies
1297  *	a NUMA policy associated with the VMA or the current process.
1298  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1299  *	mm_struct of the VMA to prevent it from going away. Should be used for
1300  *	all allocations for pages that will be mapped into
1301  * 	user space. Returns NULL when no page can be allocated.
1302  *
1303  *	Should be called with the mmap_sem of the vma's mm held.
1304  */
1305 struct page *
1306 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1307 {
1308 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1309 	struct zonelist *zl;
1310 
1311 	cpuset_update_task_memory_state();
1312 
1313 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1314 		unsigned nid;
1315 
1316 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1317 		return alloc_page_interleave(gfp, 0, nid);
1318 	}
1319 	zl = zonelist_policy(gfp, pol);
1320 	if (pol != &default_policy && pol != current->mempolicy) {
1321 		/*
1322 		 * slow path: ref counted policy -- shared or vma
1323 		 */
1324 		struct page *page =  __alloc_pages(gfp, 0, zl);
1325 		__mpol_free(pol);
1326 		return page;
1327 	}
1328 	/*
1329 	 * fast path:  default or task policy
1330 	 */
1331 	return __alloc_pages(gfp, 0, zl);
1332 }
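/*
 * Illustrative call site (a sketch, not taken from this file): a fault
 * handler that already holds down_read(&mm->mmap_sem) would allocate with
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, address);
 *
 * so that the VMA's mempolicy (or the task policy) selects the node.
 */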
1333 
1334 /**
1335  * 	alloc_pages_current - Allocate pages.
1336  *
1337  *	@gfp:
1338  *		%GFP_USER   user allocation,
1339  *      	%GFP_KERNEL kernel allocation,
1340  *      	%GFP_HIGHMEM highmem allocation,
1341  *      	%GFP_FS     don't call back into a file system.
1342  *      	%GFP_ATOMIC don't sleep.
1343  *	@order: Power of two of allocation size in pages. 0 is a single page.
1344  *
1345  *	Allocate a page from the kernel page pool.  When not in
1346  *	interrupt context, apply the current process' NUMA policy.
1347  *	Returns NULL when no page can be allocated.
1348  *
1349  *	Don't call cpuset_update_task_memory_state() unless
1350  *	1) it's ok to take cpuset_sem (can WAIT), and
1351  *	2) allocating for current task (not interrupt).
1352  */
1353 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1354 {
1355 	struct mempolicy *pol = current->mempolicy;
1356 
1357 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1358 		cpuset_update_task_memory_state();
1359 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1360 		pol = &default_policy;
1361 	if (pol->policy == MPOL_INTERLEAVE)
1362 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1363 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1364 }
1365 EXPORT_SYMBOL(alloc_pages_current);
1366 
1367 /*
1368  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1369  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1370  * with the mems_allowed returned by cpuset_mems_allowed().  This
1371  * keeps mempolicies cpuset relative after its cpuset moves.  See
1372  * further kernel/cpuset.c update_nodemask().
1373  */
1374 void *cpuset_being_rebound;
1375 
1376 /* Slow path of a mempolicy copy */
1377 struct mempolicy *__mpol_copy(struct mempolicy *old)
1378 {
1379 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1380 
1381 	if (!new)
1382 		return ERR_PTR(-ENOMEM);
1383 	if (current_cpuset_is_being_rebound()) {
1384 		nodemask_t mems = cpuset_mems_allowed(current);
1385 		mpol_rebind_policy(old, &mems);
1386 	}
1387 	*new = *old;
1388 	atomic_set(&new->refcnt, 1);
1389 	if (new->policy == MPOL_BIND) {
1390 		int sz = ksize(old->v.zonelist);
1391 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1392 		if (!new->v.zonelist) {
1393 			kmem_cache_free(policy_cache, new);
1394 			return ERR_PTR(-ENOMEM);
1395 		}
1396 	}
1397 	return new;
1398 }
1399 
1400 /* Slow path of a mempolicy comparison */
1401 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1402 {
1403 	if (!a || !b)
1404 		return 0;
1405 	if (a->policy != b->policy)
1406 		return 0;
1407 	switch (a->policy) {
1408 	case MPOL_DEFAULT:
1409 		return 1;
1410 	case MPOL_INTERLEAVE:
1411 		return nodes_equal(a->v.nodes, b->v.nodes);
1412 	case MPOL_PREFERRED:
1413 		return a->v.preferred_node == b->v.preferred_node;
1414 	case MPOL_BIND: {
1415 		int i;
1416 		for (i = 0; a->v.zonelist->zones[i]; i++)
1417 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1418 				return 0;
1419 		return b->v.zonelist->zones[i] == NULL;
1420 	}
1421 	default:
1422 		BUG();
1423 		return 0;
1424 	}
1425 }
1426 
1427 /* Slow path of a mpol destructor. */
1428 void __mpol_free(struct mempolicy *p)
1429 {
1430 	if (!atomic_dec_and_test(&p->refcnt))
1431 		return;
1432 	if (p->policy == MPOL_BIND)
1433 		kfree(p->v.zonelist);
1434 	p->policy = MPOL_DEFAULT;
1435 	kmem_cache_free(policy_cache, p);
1436 }
1437 
1438 /*
1439  * Shared memory backing store policy support.
1440  *
1441  * Remember policies even when nobody has shared memory mapped.
1442  * The policies are kept in Red-Black tree linked from the inode.
1443  * They are protected by the sp->lock spinlock, which should be held
1444  * for any accesses to the tree.
1445  */
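/*
 * Illustrative use from outside this file (a sketch): tmpfs embeds a
 * struct shared_policy in its per-inode info and does, roughly,
 *
 *	mpol_shared_policy_init(&info->policy, mode, &nodes);	(at inode setup)
 *	pol = mpol_shared_policy_lookup(&info->policy, idx);	(at fault time)
 *
 * so every task mapping the file sees the same per-index policy.
 */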
1446 
1447 /* lookup first element intersecting start-end */
1448 /* Caller holds sp->lock */
1449 static struct sp_node *
1450 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1451 {
1452 	struct rb_node *n = sp->root.rb_node;
1453 
1454 	while (n) {
1455 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1456 
1457 		if (start >= p->end)
1458 			n = n->rb_right;
1459 		else if (end <= p->start)
1460 			n = n->rb_left;
1461 		else
1462 			break;
1463 	}
1464 	if (!n)
1465 		return NULL;
1466 	for (;;) {
1467 		struct sp_node *w = NULL;
1468 		struct rb_node *prev = rb_prev(n);
1469 		if (!prev)
1470 			break;
1471 		w = rb_entry(prev, struct sp_node, nd);
1472 		if (w->end <= start)
1473 			break;
1474 		n = prev;
1475 	}
1476 	return rb_entry(n, struct sp_node, nd);
1477 }
1478 
1479 /* Insert a new shared policy into the list. */
1480 /* Caller holds sp->lock */
1481 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1482 {
1483 	struct rb_node **p = &sp->root.rb_node;
1484 	struct rb_node *parent = NULL;
1485 	struct sp_node *nd;
1486 
1487 	while (*p) {
1488 		parent = *p;
1489 		nd = rb_entry(parent, struct sp_node, nd);
1490 		if (new->start < nd->start)
1491 			p = &(*p)->rb_left;
1492 		else if (new->end > nd->end)
1493 			p = &(*p)->rb_right;
1494 		else
1495 			BUG();
1496 	}
1497 	rb_link_node(&new->nd, parent, p);
1498 	rb_insert_color(&new->nd, &sp->root);
1499 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1500 		 new->policy ? new->policy->policy : 0);
1501 }
1502 
1503 /* Find shared policy intersecting idx */
1504 struct mempolicy *
1505 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1506 {
1507 	struct mempolicy *pol = NULL;
1508 	struct sp_node *sn;
1509 
1510 	if (!sp->root.rb_node)
1511 		return NULL;
1512 	spin_lock(&sp->lock);
1513 	sn = sp_lookup(sp, idx, idx+1);
1514 	if (sn) {
1515 		mpol_get(sn->policy);
1516 		pol = sn->policy;
1517 	}
1518 	spin_unlock(&sp->lock);
1519 	return pol;
1520 }
1521 
1522 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1523 {
1524 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1525 	rb_erase(&n->nd, &sp->root);
1526 	mpol_free(n->policy);
1527 	kmem_cache_free(sn_cache, n);
1528 }
1529 
1530 struct sp_node *
1531 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1532 {
1533 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1534 
1535 	if (!n)
1536 		return NULL;
1537 	n->start = start;
1538 	n->end = end;
1539 	mpol_get(pol);
1540 	n->policy = pol;
1541 	return n;
1542 }
1543 
1544 /* Replace a policy range. */
1545 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1546 				 unsigned long end, struct sp_node *new)
1547 {
1548 	struct sp_node *n, *new2 = NULL;
1549 
1550 restart:
1551 	spin_lock(&sp->lock);
1552 	n = sp_lookup(sp, start, end);
1553 	/* Take care of old policies in the same range. */
1554 	while (n && n->start < end) {
1555 		struct rb_node *next = rb_next(&n->nd);
1556 		if (n->start >= start) {
1557 			if (n->end <= end)
1558 				sp_delete(sp, n);
1559 			else
1560 				n->start = end;
1561 		} else {
1562 			/* Old policy spanning whole new range. */
1563 			if (n->end > end) {
1564 				if (!new2) {
1565 					spin_unlock(&sp->lock);
1566 					new2 = sp_alloc(end, n->end, n->policy);
1567 					if (!new2)
1568 						return -ENOMEM;
1569 					goto restart;
1570 				}
1571 				n->end = start;
1572 				sp_insert(sp, new2);
1573 				new2 = NULL;
1574 				break;
1575 			} else
1576 				n->end = start;
1577 		}
1578 		if (!next)
1579 			break;
1580 		n = rb_entry(next, struct sp_node, nd);
1581 	}
1582 	if (new)
1583 		sp_insert(sp, new);
1584 	spin_unlock(&sp->lock);
1585 	if (new2) {
1586 		mpol_free(new2->policy);
1587 		kmem_cache_free(sn_cache, new2);
1588 	}
1589 	return 0;
1590 }
1591 
1592 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1593 				nodemask_t *policy_nodes)
1594 {
1595 	info->root = RB_ROOT;
1596 	spin_lock_init(&info->lock);
1597 
1598 	if (policy != MPOL_DEFAULT) {
1599 		struct mempolicy *newpol;
1600 
1601 		/* Falls back to MPOL_DEFAULT on any error */
1602 		newpol = mpol_new(policy, policy_nodes);
1603 		if (!IS_ERR(newpol)) {
1604 			/* Create pseudo-vma that contains just the policy */
1605 			struct vm_area_struct pvma;
1606 
1607 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1608 			/* Policy covers entire file */
1609 			pvma.vm_end = TASK_SIZE;
1610 			mpol_set_shared_policy(info, &pvma, newpol);
1611 			mpol_free(newpol);
1612 		}
1613 	}
1614 }
1615 
1616 int mpol_set_shared_policy(struct shared_policy *info,
1617 			struct vm_area_struct *vma, struct mempolicy *npol)
1618 {
1619 	int err;
1620 	struct sp_node *new = NULL;
1621 	unsigned long sz = vma_pages(vma);
1622 
1623 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1624 		 vma->vm_pgoff,
1625 		 sz, npol? npol->policy : -1,
1626 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1627 
1628 	if (npol) {
1629 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1630 		if (!new)
1631 			return -ENOMEM;
1632 	}
1633 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1634 	if (err && new)
1635 		kmem_cache_free(sn_cache, new);
1636 	return err;
1637 }
1638 
1639 /* Free a backing policy store on inode delete. */
1640 void mpol_free_shared_policy(struct shared_policy *p)
1641 {
1642 	struct sp_node *n;
1643 	struct rb_node *next;
1644 
1645 	if (!p->root.rb_node)
1646 		return;
1647 	spin_lock(&p->lock);
1648 	next = rb_first(&p->root);
1649 	while (next) {
1650 		n = rb_entry(next, struct sp_node, nd);
1651 		next = rb_next(&n->nd);
1652 		rb_erase(&n->nd, &p->root);
1653 		mpol_free(n->policy);
1654 		kmem_cache_free(sn_cache, n);
1655 	}
1656 	spin_unlock(&p->lock);
1657 }
1658 
1659 /* assumes fs == KERNEL_DS */
1660 void __init numa_policy_init(void)
1661 {
1662 	nodemask_t interleave_nodes;
1663 	unsigned long largest = 0;
1664 	int nid, prefer = 0;
1665 
1666 	policy_cache = kmem_cache_create("numa_policy",
1667 					 sizeof(struct mempolicy),
1668 					 0, SLAB_PANIC, NULL);
1669 
1670 	sn_cache = kmem_cache_create("shared_policy_node",
1671 				     sizeof(struct sp_node),
1672 				     0, SLAB_PANIC, NULL);
1673 
1674 	/*
1675 	 * Set interleaving policy for system init. Interleaving is only
1676 	 * enabled across suitably sized nodes (default is >= 16MB), or
1677 	 * fall back to the largest node if they're all smaller.
1678 	 */
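	/*
	 * Illustrative arithmetic: with 4 KB pages the 16 MB cutoff below
	 * corresponds to 4096 present pages per node.
	 */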
1679 	nodes_clear(interleave_nodes);
1680 	for_each_online_node(nid) {
1681 		unsigned long total_pages = node_present_pages(nid);
1682 
1683 		/* Preserve the largest node */
1684 		if (largest < total_pages) {
1685 			largest = total_pages;
1686 			prefer = nid;
1687 		}
1688 
1689 		/* Interleave this node? */
1690 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1691 			node_set(nid, interleave_nodes);
1692 	}
1693 
1694 	/* All too small, use the largest */
1695 	if (unlikely(nodes_empty(interleave_nodes)))
1696 		node_set(prefer, interleave_nodes);
1697 
1698 	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1699 		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
1700 }
1701 
1702 /* Reset policy of current process to default */
1703 void numa_default_policy(void)
1704 {
1705 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1706 }
1707 
1708 /* Migrate a policy to a different set of nodes */
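/*
 * Example (illustrative): an MPOL_INTERLEAVE policy over nodes {0,1} whose
 * cpuset's mems_allowed moves from {0,1} to {2,3} is remapped below to
 * interleave over {2,3}.
 */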
1709 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1710 {
1711 	nodemask_t *mpolmask;
1712 	nodemask_t tmp;
1713 
1714 	if (!pol)
1715 		return;
1716 	mpolmask = &pol->cpuset_mems_allowed;
1717 	if (nodes_equal(*mpolmask, *newmask))
1718 		return;
1719 
1720 	switch (pol->policy) {
1721 	case MPOL_DEFAULT:
1722 		break;
1723 	case MPOL_INTERLEAVE:
1724 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1725 		pol->v.nodes = tmp;
1726 		*mpolmask = *newmask;
1727 		current->il_next = node_remap(current->il_next,
1728 						*mpolmask, *newmask);
1729 		break;
1730 	case MPOL_PREFERRED:
1731 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1732 						*mpolmask, *newmask);
1733 		*mpolmask = *newmask;
1734 		break;
1735 	case MPOL_BIND: {
1736 		nodemask_t nodes;
1737 		struct zone **z;
1738 		struct zonelist *zonelist;
1739 
1740 		nodes_clear(nodes);
1741 		for (z = pol->v.zonelist->zones; *z; z++)
1742 			node_set(zone_to_nid(*z), nodes);
1743 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1744 		nodes = tmp;
1745 
1746 		zonelist = bind_zonelist(&nodes);
1747 
1748 		/* If bind_zonelist() fails, we keep the old zonelist.
1749 		 * If that old zonelist has no remaining mems_allowed nodes,
1750 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1751 		 */
1752 
1753 		if (!IS_ERR(zonelist)) {
1754 			/* Good - got mem - substitute new zonelist */
1755 			kfree(pol->v.zonelist);
1756 			pol->v.zonelist = zonelist;
1757 		}
1758 		*mpolmask = *newmask;
1759 		break;
1760 	}
1761 	default:
1762 		BUG();
1763 		break;
1764 	}
1765 }
1766 
1767 /*
1768  * Wrapper for mpol_rebind_policy() that just requires task
1769  * pointer, and updates task mempolicy.
1770  */
1771 
1772 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1773 {
1774 	mpol_rebind_policy(tsk->mempolicy, new);
1775 }
1776 
1777 /*
1778  * Rebind each vma in mm to new nodemask.
1779  *
1780  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1781  */
1782 
1783 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1784 {
1785 	struct vm_area_struct *vma;
1786 
1787 	down_write(&mm->mmap_sem);
1788 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1789 		mpol_rebind_policy(vma->vm_policy, new);
1790 	up_write(&mm->mmap_sem);
1791 }
1792 
1793 /*
1794  * Display pages allocated per node and memory policy via /proc.
1795  */
1796 
1797 static const char * const policy_types[] =
1798 	{ "default", "prefer", "bind", "interleave" };
1799 
1800 /*
1801  * Convert a mempolicy into a string.
1802  * Returns the number of characters in buffer (if positive)
1803  * or an error (negative)
1804  */
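/*
 * Example output (illustrative): "interleave=0-3", "prefer=1" or "default".
 */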
1805 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1806 {
1807 	char *p = buffer;
1808 	int l;
1809 	nodemask_t nodes;
1810 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1811 
1812 	switch (mode) {
1813 	case MPOL_DEFAULT:
1814 		nodes_clear(nodes);
1815 		break;
1816 
1817 	case MPOL_PREFERRED:
1818 		nodes_clear(nodes);
1819 		node_set(pol->v.preferred_node, nodes);
1820 		break;
1821 
1822 	case MPOL_BIND:
1823 		get_zonemask(pol, &nodes);
1824 		break;
1825 
1826 	case MPOL_INTERLEAVE:
1827 		nodes = pol->v.nodes;
1828 		break;
1829 
1830 	default:
1831 		BUG();
1832 		return -EFAULT;
1833 	}
1834 
1835 	l = strlen(policy_types[mode]);
1836  	if (buffer + maxlen < p + l + 1)
1837  		return -ENOSPC;
1838 
1839 	strcpy(p, policy_types[mode]);
1840 	p += l;
1841 
1842 	if (!nodes_empty(nodes)) {
1843 		if (buffer + maxlen < p + 2)
1844 			return -ENOSPC;
1845 		*p++ = '=';
1846 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1847 	}
1848 	return p - buffer;
1849 }
1850 
1851 struct numa_maps {
1852 	unsigned long pages;
1853 	unsigned long anon;
1854 	unsigned long active;
1855 	unsigned long writeback;
1856 	unsigned long mapcount_max;
1857 	unsigned long dirty;
1858 	unsigned long swapcache;
1859 	unsigned long node[MAX_NUMNODES];
1860 };
1861 
1862 static void gather_stats(struct page *page, void *private, int pte_dirty)
1863 {
1864 	struct numa_maps *md = private;
1865 	int count = page_mapcount(page);
1866 
1867 	md->pages++;
1868 	if (pte_dirty || PageDirty(page))
1869 		md->dirty++;
1870 
1871 	if (PageSwapCache(page))
1872 		md->swapcache++;
1873 
1874 	if (PageActive(page))
1875 		md->active++;
1876 
1877 	if (PageWriteback(page))
1878 		md->writeback++;
1879 
1880 	if (PageAnon(page))
1881 		md->anon++;
1882 
1883 	if (count > md->mapcount_max)
1884 		md->mapcount_max = count;
1885 
1886 	md->node[page_to_nid(page)]++;
1887 }
1888 
1889 #ifdef CONFIG_HUGETLB_PAGE
1890 static void check_huge_range(struct vm_area_struct *vma,
1891 		unsigned long start, unsigned long end,
1892 		struct numa_maps *md)
1893 {
1894 	unsigned long addr;
1895 	struct page *page;
1896 
1897 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1898 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1899 		pte_t pte;
1900 
1901 		if (!ptep)
1902 			continue;
1903 
1904 		pte = *ptep;
1905 		if (pte_none(pte))
1906 			continue;
1907 
1908 		page = pte_page(pte);
1909 		if (!page)
1910 			continue;
1911 
1912 		gather_stats(page, md, pte_dirty(*ptep));
1913 	}
1914 }
1915 #else
1916 static inline void check_huge_range(struct vm_area_struct *vma,
1917 		unsigned long start, unsigned long end,
1918 		struct numa_maps *md)
1919 {
1920 }
1921 #endif
1922 
1923 int show_numa_map(struct seq_file *m, void *v)
1924 {
1925 	struct proc_maps_private *priv = m->private;
1926 	struct vm_area_struct *vma = v;
1927 	struct numa_maps *md;
1928 	struct file *file = vma->vm_file;
1929 	struct mm_struct *mm = vma->vm_mm;
1930 	struct mempolicy *pol;
1931 	int n;
1932 	char buffer[50];
1933 
1934 	if (!mm)
1935 		return 0;
1936 
1937 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1938 	if (!md)
1939 		return 0;
1940 
1941 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
1942 	mpol_to_str(buffer, sizeof(buffer), pol);
1943 	/*
1944 	 * unref shared or other task's mempolicy
1945 	 */
1946 	if (pol != &default_policy && pol != current->mempolicy)
1947 		__mpol_free(pol);
1948 
1949 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1950 
1951 	if (file) {
1952 		seq_printf(m, " file=");
1953 		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1954 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1955 		seq_printf(m, " heap");
1956 	} else if (vma->vm_start <= mm->start_stack &&
1957 			vma->vm_end >= mm->start_stack) {
1958 		seq_printf(m, " stack");
1959 	}
1960 
1961 	if (is_vm_hugetlb_page(vma)) {
1962 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1963 		seq_printf(m, " huge");
1964 	} else {
1965 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1966 				&node_online_map, MPOL_MF_STATS, md);
1967 	}
1968 
1969 	if (!md->pages)
1970 		goto out;
1971 
1972 	if (md->anon)
1973 		seq_printf(m," anon=%lu",md->anon);
1974 
1975 	if (md->dirty)
1976 		seq_printf(m," dirty=%lu",md->dirty);
1977 
1978 	if (md->pages != md->anon && md->pages != md->dirty)
1979 		seq_printf(m, " mapped=%lu", md->pages);
1980 
1981 	if (md->mapcount_max > 1)
1982 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1983 
1984 	if (md->swapcache)
1985 		seq_printf(m," swapcache=%lu", md->swapcache);
1986 
1987 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1988 		seq_printf(m," active=%lu", md->active);
1989 
1990 	if (md->writeback)
1991 		seq_printf(m," writeback=%lu", md->writeback);
1992 
1993 	for_each_online_node(n)
1994 		if (md->node[n])
1995 			seq_printf(m, " N%d=%lu", n, md->node[n]);
1996 out:
1997 	seq_putc(m, '\n');
1998 	kfree(md);
1999 
2000 	if (m->count < m->size)
2001 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2002 	return 0;
2003 }
2004 
2005