xref: /linux/mm/mempolicy.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about the node(s) from which
9  * memory should be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
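
/*
 * Example (user space): a minimal sketch of how the policies above are
 * typically set through the set_mempolicy(2) and mbind(2) system calls
 * implemented below.  It assumes libnuma's <numaif.h> wrappers (link with
 * -lnuma) and is illustrative only, not part of the kernel source.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	// Interleave all future allocations of this process across
 *	// nodes 0 and 1 (bit i of the mask selects node i).
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Bind one specific mapping to node 0 only; MPOL_MF_STRICT makes
 *	// mbind() fail if already-faulted pages violate the policy.
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 1UL << 0;
 *	mbind(buf, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8,
 *	      MPOL_MF_STRICT);
 */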
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
87 #include <linux/seq_file.h>
88 #include <linux/proc_fs.h>
89 #include <linux/migrate.h>
90 
91 #include <asm/tlbflush.h>
92 #include <asm/uaccess.h>
93 
94 /* Internal flags */
95 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
96 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
97 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
98 
99 static struct kmem_cache *policy_cache;
100 static struct kmem_cache *sn_cache;
101 
102 #define PDprintk(fmt...)
103 
104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 int policy_zone = ZONE_DMA;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 /* Do sanity checking on a policy */
114 static int mpol_check_policy(int mode, nodemask_t *nodes)
115 {
116 	int empty = nodes_empty(*nodes);
117 
118 	switch (mode) {
119 	case MPOL_DEFAULT:
120 		if (!empty)
121 			return -EINVAL;
122 		break;
123 	case MPOL_BIND:
124 	case MPOL_INTERLEAVE:
125 		/* Preferred will only use the first bit, but allow
126 		   more for now. */
127 		if (empty)
128 			return -EINVAL;
129 		break;
130 	}
131 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
132 }
133 
134 /* Generate a custom zonelist for the BIND policy. */
135 static struct zonelist *bind_zonelist(nodemask_t *nodes)
136 {
137 	struct zonelist *zl;
138 	int num, max, nd, k;
139 
140 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
141 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
142 	if (!zl)
143 		return NULL;
144 	num = 0;
145 	/* First put in the highest zones from all nodes, then all the next
146 	   lower zones etc. Avoid empty zones because the memory allocator
147 	   doesn't like them. If you implement node hot removal you
148 	   have to fix that. */
149 	for (k = policy_zone; k >= 0; k--) {
150 		for_each_node_mask(nd, *nodes) {
151 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
152 			if (z->present_pages > 0)
153 				zl->zones[num++] = z;
154 		}
155 	}
156 	zl->zones[num] = NULL;
157 	return zl;
158 }
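
/*
 * Example of the resulting ordering (a sketch, assuming a machine whose
 * nodes have only DMA and Normal zones and policy_zone == ZONE_NORMAL):
 * for nodes = {0,1} the zonelist built above is
 *
 *	Normal(node 0), Normal(node 1), DMA(node 0), DMA(node 1)
 *
 * i.e. every node is tried at the highest zone before falling back to a
 * lower zone, and zones with no present pages are skipped entirely.
 */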
159 
160 /* Create a new policy */
161 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
162 {
163 	struct mempolicy *policy;
164 
165 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
166 	if (mode == MPOL_DEFAULT)
167 		return NULL;
168 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
169 	if (!policy)
170 		return ERR_PTR(-ENOMEM);
171 	atomic_set(&policy->refcnt, 1);
172 	switch (mode) {
173 	case MPOL_INTERLEAVE:
174 		policy->v.nodes = *nodes;
175 		if (nodes_weight(*nodes) == 0) {
176 			kmem_cache_free(policy_cache, policy);
177 			return ERR_PTR(-EINVAL);
178 		}
179 		break;
180 	case MPOL_PREFERRED:
181 		policy->v.preferred_node = first_node(*nodes);
182 		if (policy->v.preferred_node >= MAX_NUMNODES)
183 			policy->v.preferred_node = -1;
184 		break;
185 	case MPOL_BIND:
186 		policy->v.zonelist = bind_zonelist(nodes);
187 		if (policy->v.zonelist == NULL) {
188 			kmem_cache_free(policy_cache, policy);
189 			return ERR_PTR(-ENOMEM);
190 		}
191 		break;
192 	}
193 	policy->policy = mode;
194 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
195 	return policy;
196 }
197 
198 static void gather_stats(struct page *, void *, int pte_dirty);
199 static void migrate_page_add(struct page *page, struct list_head *pagelist,
200 				unsigned long flags);
201 
202 /* Scan through pages checking if pages follow certain conditions. */
203 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
204 		unsigned long addr, unsigned long end,
205 		const nodemask_t *nodes, unsigned long flags,
206 		void *private)
207 {
208 	pte_t *orig_pte;
209 	pte_t *pte;
210 	spinlock_t *ptl;
211 
212 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
213 	do {
214 		struct page *page;
215 		unsigned int nid;
216 
217 		if (!pte_present(*pte))
218 			continue;
219 		page = vm_normal_page(vma, addr, *pte);
220 		if (!page)
221 			continue;
222 		/*
223 		 * The check for PageReserved here is important to avoid
224 		 * handling zero pages and other pages that may have been
225 		 * marked special by the system.
226 		 *
227 		 * If PageReserved were not checked here then e.g.
228 		 * the location of the zero page could have an influence
229 		 * on MPOL_MF_STRICT, zero pages would be counted for
230 		 * the per node stats, and there would be useless attempts
231 		 * to put zero pages on the migration list.
232 		 */
233 		if (PageReserved(page))
234 			continue;
235 		nid = page_to_nid(page);
236 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
237 			continue;
238 
239 		if (flags & MPOL_MF_STATS)
240 			gather_stats(page, private, pte_dirty(*pte));
241 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
242 			migrate_page_add(page, private, flags);
243 		else
244 			break;
245 	} while (pte++, addr += PAGE_SIZE, addr != end);
246 	pte_unmap_unlock(orig_pte, ptl);
247 	return addr != end;
248 }
249 
250 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
251 		unsigned long addr, unsigned long end,
252 		const nodemask_t *nodes, unsigned long flags,
253 		void *private)
254 {
255 	pmd_t *pmd;
256 	unsigned long next;
257 
258 	pmd = pmd_offset(pud, addr);
259 	do {
260 		next = pmd_addr_end(addr, end);
261 		if (pmd_none_or_clear_bad(pmd))
262 			continue;
263 		if (check_pte_range(vma, pmd, addr, next, nodes,
264 				    flags, private))
265 			return -EIO;
266 	} while (pmd++, addr = next, addr != end);
267 	return 0;
268 }
269 
270 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
271 		unsigned long addr, unsigned long end,
272 		const nodemask_t *nodes, unsigned long flags,
273 		void *private)
274 {
275 	pud_t *pud;
276 	unsigned long next;
277 
278 	pud = pud_offset(pgd, addr);
279 	do {
280 		next = pud_addr_end(addr, end);
281 		if (pud_none_or_clear_bad(pud))
282 			continue;
283 		if (check_pmd_range(vma, pud, addr, next, nodes,
284 				    flags, private))
285 			return -EIO;
286 	} while (pud++, addr = next, addr != end);
287 	return 0;
288 }
289 
290 static inline int check_pgd_range(struct vm_area_struct *vma,
291 		unsigned long addr, unsigned long end,
292 		const nodemask_t *nodes, unsigned long flags,
293 		void *private)
294 {
295 	pgd_t *pgd;
296 	unsigned long next;
297 
298 	pgd = pgd_offset(vma->vm_mm, addr);
299 	do {
300 		next = pgd_addr_end(addr, end);
301 		if (pgd_none_or_clear_bad(pgd))
302 			continue;
303 		if (check_pud_range(vma, pgd, addr, next, nodes,
304 				    flags, private))
305 			return -EIO;
306 	} while (pgd++, addr = next, addr != end);
307 	return 0;
308 }
309 
310 /* Check if a vma is migratable */
311 static inline int vma_migratable(struct vm_area_struct *vma)
312 {
313 	if (vma->vm_flags & (
314 		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
315 		return 0;
316 	return 1;
317 }
318 
319 /*
320  * Check if all pages in a range are on a set of nodes.
321  * If pagelist != NULL then isolate pages from the LRU and
322  * put them on the pagelist.
323  */
324 static struct vm_area_struct *
325 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
326 		const nodemask_t *nodes, unsigned long flags, void *private)
327 {
328 	int err;
329 	struct vm_area_struct *first, *vma, *prev;
330 
331 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
332 
333 		err = migrate_prep();
334 		if (err)
335 			return ERR_PTR(err);
336 	}
337 
338 	first = find_vma(mm, start);
339 	if (!first)
340 		return ERR_PTR(-EFAULT);
341 	prev = NULL;
342 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
343 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
344 			if (!vma->vm_next && vma->vm_end < end)
345 				return ERR_PTR(-EFAULT);
346 			if (prev && prev->vm_end < vma->vm_start)
347 				return ERR_PTR(-EFAULT);
348 		}
349 		if (!is_vm_hugetlb_page(vma) &&
350 		    ((flags & MPOL_MF_STRICT) ||
351 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
352 				vma_migratable(vma)))) {
353 			unsigned long endvma = vma->vm_end;
354 
355 			if (endvma > end)
356 				endvma = end;
357 			if (vma->vm_start > start)
358 				start = vma->vm_start;
359 			err = check_pgd_range(vma, start, endvma, nodes,
360 						flags, private);
361 			if (err) {
362 				first = ERR_PTR(err);
363 				break;
364 			}
365 		}
366 		prev = vma;
367 	}
368 	return first;
369 }
370 
371 /* Apply policy to a single VMA */
372 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
373 {
374 	int err = 0;
375 	struct mempolicy *old = vma->vm_policy;
376 
377 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
378 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
379 		 vma->vm_ops, vma->vm_file,
380 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
381 
382 	if (vma->vm_ops && vma->vm_ops->set_policy)
383 		err = vma->vm_ops->set_policy(vma, new);
384 	if (!err) {
385 		mpol_get(new);
386 		vma->vm_policy = new;
387 		mpol_free(old);
388 	}
389 	return err;
390 }
391 
392 /* Step 2: apply policy to a range and do splits. */
393 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
394 		       unsigned long end, struct mempolicy *new)
395 {
396 	struct vm_area_struct *next;
397 	int err;
398 
399 	err = 0;
400 	for (; vma && vma->vm_start < end; vma = next) {
401 		next = vma->vm_next;
402 		if (vma->vm_start < start)
403 			err = split_vma(vma->vm_mm, vma, start, 1);
404 		if (!err && vma->vm_end > end)
405 			err = split_vma(vma->vm_mm, vma, end, 0);
406 		if (!err)
407 			err = policy_vma(vma, new);
408 		if (err)
409 			break;
410 	}
411 	return err;
412 }
413 
414 static int contextualize_policy(int mode, nodemask_t *nodes)
415 {
416 	if (!nodes)
417 		return 0;
418 
419 	cpuset_update_task_memory_state();
420 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
421 		return -EINVAL;
422 	return mpol_check_policy(mode, nodes);
423 }
424 
425 
426 /*
427  * Update task->flags PF_MEMPOLICY bit: set iff non-default
428  * mempolicy.  Allows more rapid checking of this (combined perhaps
429  * with other PF_* flag bits) on memory allocation hot code paths.
430  *
431  * If called from outside this file, the task 'p' should -only- be
432  * a newly forked child not yet visible on the task list, because
433  * manipulating the task flags of a visible task is not safe.
434  *
435  * The above limitation is why this routine has the funny name
436  * mpol_fix_fork_child_flag().
437  *
438  * It is also safe to call this with a task pointer of current,
439  * which the static wrapper mpol_set_task_struct_flag() does,
440  * for use within this file.
441  */
442 
443 void mpol_fix_fork_child_flag(struct task_struct *p)
444 {
445 	if (p->mempolicy)
446 		p->flags |= PF_MEMPOLICY;
447 	else
448 		p->flags &= ~PF_MEMPOLICY;
449 }
450 
451 static void mpol_set_task_struct_flag(void)
452 {
453 	mpol_fix_fork_child_flag(current);
454 }
455 
456 /* Set the process memory policy */
457 long do_set_mempolicy(int mode, nodemask_t *nodes)
458 {
459 	struct mempolicy *new;
460 
461 	if (contextualize_policy(mode, nodes))
462 		return -EINVAL;
463 	new = mpol_new(mode, nodes);
464 	if (IS_ERR(new))
465 		return PTR_ERR(new);
466 	mpol_free(current->mempolicy);
467 	current->mempolicy = new;
468 	mpol_set_task_struct_flag();
469 	if (new && new->policy == MPOL_INTERLEAVE)
470 		current->il_next = first_node(new->v.nodes);
471 	return 0;
472 }
473 
474 /* Fill a node mask for a policy */
475 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
476 {
477 	int i;
478 
479 	nodes_clear(*nodes);
480 	switch (p->policy) {
481 	case MPOL_BIND:
482 		for (i = 0; p->v.zonelist->zones[i]; i++)
483 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
484 				*nodes);
485 		break;
486 	case MPOL_DEFAULT:
487 		break;
488 	case MPOL_INTERLEAVE:
489 		*nodes = p->v.nodes;
490 		break;
491 	case MPOL_PREFERRED:
492 		/* or use current node instead of online map? */
493 		if (p->v.preferred_node < 0)
494 			*nodes = node_online_map;
495 		else
496 			node_set(p->v.preferred_node, *nodes);
497 		break;
498 	default:
499 		BUG();
500 	}
501 }
502 
503 static int lookup_node(struct mm_struct *mm, unsigned long addr)
504 {
505 	struct page *p;
506 	int err;
507 
508 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
509 	if (err >= 0) {
510 		err = page_to_nid(p);
511 		put_page(p);
512 	}
513 	return err;
514 }
515 
516 /* Retrieve NUMA policy */
517 long do_get_mempolicy(int *policy, nodemask_t *nmask,
518 			unsigned long addr, unsigned long flags)
519 {
520 	int err;
521 	struct mm_struct *mm = current->mm;
522 	struct vm_area_struct *vma = NULL;
523 	struct mempolicy *pol = current->mempolicy;
524 
525 	cpuset_update_task_memory_state();
526 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
527 		return -EINVAL;
528 	if (flags & MPOL_F_ADDR) {
529 		down_read(&mm->mmap_sem);
530 		vma = find_vma_intersection(mm, addr, addr+1);
531 		if (!vma) {
532 			up_read(&mm->mmap_sem);
533 			return -EFAULT;
534 		}
535 		if (vma->vm_ops && vma->vm_ops->get_policy)
536 			pol = vma->vm_ops->get_policy(vma, addr);
537 		else
538 			pol = vma->vm_policy;
539 	} else if (addr)
540 		return -EINVAL;
541 
542 	if (!pol)
543 		pol = &default_policy;
544 
545 	if (flags & MPOL_F_NODE) {
546 		if (flags & MPOL_F_ADDR) {
547 			err = lookup_node(mm, addr);
548 			if (err < 0)
549 				goto out;
550 			*policy = err;
551 		} else if (pol == current->mempolicy &&
552 				pol->policy == MPOL_INTERLEAVE) {
553 			*policy = current->il_next;
554 		} else {
555 			err = -EINVAL;
556 			goto out;
557 		}
558 	} else
559 		*policy = pol->policy;
560 
561 	if (vma) {
562 		up_read(&current->mm->mmap_sem);
563 		vma = NULL;
564 	}
565 
566 	err = 0;
567 	if (nmask)
568 		get_zonemask(pol, nmask);
569 
570  out:
571 	if (vma)
572 		up_read(&current->mm->mmap_sem);
573 	return err;
574 }
575 
576 #ifdef CONFIG_MIGRATION
577 /*
578  * page migration
579  */
580 static void migrate_page_add(struct page *page, struct list_head *pagelist,
581 				unsigned long flags)
582 {
583 	/*
584 	 * Avoid migrating a page that is shared with others.
585 	 */
586 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
587 		isolate_lru_page(page, pagelist);
588 }
589 
590 /*
591  * Migrate pages from one node to a target node.
592  * Returns error or the number of pages not migrated.
593  */
594 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
595 {
596 	nodemask_t nmask;
597 	LIST_HEAD(pagelist);
598 	int err = 0;
599 
600 	nodes_clear(nmask);
601 	node_set(source, nmask);
602 
603 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
604 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
605 
606 	if (!list_empty(&pagelist)) {
607 		err = migrate_pages_to(&pagelist, NULL, dest);
608 		if (!list_empty(&pagelist))
609 			putback_lru_pages(&pagelist);
610 	}
611 	return err;
612 }
613 
614 /*
615  * Move pages between the two nodesets so as to preserve the physical
616  * layout as much as possible.
617  *
618  * Returns the number of pages that could not be moved.
619  */
620 int do_migrate_pages(struct mm_struct *mm,
621 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
622 {
623 	LIST_HEAD(pagelist);
624 	int busy = 0;
625 	int err = 0;
626 	nodemask_t tmp;
627 
628   	down_read(&mm->mmap_sem);
629 
630 /*
631  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
632  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
633  * bit in 'tmp', and return that <source, dest> pair for migration.
634  * The pair of nodemasks 'to' and 'from' define the map.
635  *
636  * If no pair of bits is found that way, fallback to picking some
637  * pair of 'source' and 'dest' bits that are not the same.  If the
638  * 'source' and 'dest' bits are the same, this represents a node
639  * that will be migrating to itself, so no pages need move.
640  *
641  * If no bits are left in 'tmp', or if all remaining bits left
642  * in 'tmp' correspond to the same bit in 'to', return false
643  * (nothing left to migrate).
644  *
645  * This lets us pick a pair of nodes to migrate between, such that
646  * if possible the dest node is not already occupied by some other
647  * source node, minimizing the risk of overloading the memory on a
648  * node, which would happen if we migrated incoming memory to a node
649  * before migrating outgoing memory from that same node.
650  *
651  * A single scan of tmp is sufficient.  As we go, we remember the
652  * most recent <s, d> pair that moved (s != d).  If we find a pair
653  * that not only moved, but what's better, moved to an empty slot
654  * (d is not set in tmp), then we break out early, with that pair.
655  * Otherwise when we finish scanning tmp, we at least have the
656  * most recent <s, d> pair that moved.  If we get all the way through
657  * the scan of tmp without finding any node that moved, much less
658  * moved to an empty node, then there is nothing left worth migrating.
659  */
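
/*
 * Worked example (a sketch, not from the original source): with
 * from_nodes = {0,1} and to_nodes = {2,3}, node_remap() maps 0->2 and
 * 1->3.  The first scan finds s=0, d=2; since 2 is not set in tmp we
 * break out immediately, migrate 0->2 and clear 0 from tmp.  The next
 * pass finds s=1, d=3 and migrates 1->3, after which tmp is empty and
 * the loop ends.
 */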
660 
661 	tmp = *from_nodes;
662 	while (!nodes_empty(tmp)) {
663 		int s,d;
664 		int source = -1;
665 		int dest = 0;
666 
667 		for_each_node_mask(s, tmp) {
668 			d = node_remap(s, *from_nodes, *to_nodes);
669 			if (s == d)
670 				continue;
671 
672 			source = s;	/* Node moved. Memorize */
673 			dest = d;
674 
675 			/* dest not in remaining from nodes? */
676 			if (!node_isset(dest, tmp))
677 				break;
678 		}
679 		if (source == -1)
680 			break;
681 
682 		node_clear(source, tmp);
683 		err = migrate_to_node(mm, source, dest, flags);
684 		if (err > 0)
685 			busy += err;
686 		if (err < 0)
687 			break;
688 	}
689 
690 	up_read(&mm->mmap_sem);
691 	if (err < 0)
692 		return err;
693 	return busy;
694 
695 }
696 
697 #else
698 
699 static void migrate_page_add(struct page *page, struct list_head *pagelist,
700 				unsigned long flags)
701 {
702 }
703 
704 int do_migrate_pages(struct mm_struct *mm,
705 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
706 {
707 	return -ENOSYS;
708 }
709 #endif
710 
711 long do_mbind(unsigned long start, unsigned long len,
712 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
713 {
714 	struct vm_area_struct *vma;
715 	struct mm_struct *mm = current->mm;
716 	struct mempolicy *new;
717 	unsigned long end;
718 	int err;
719 	LIST_HEAD(pagelist);
720 
721 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
722 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
723 	    || mode > MPOL_MAX)
724 		return -EINVAL;
725 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
726 		return -EPERM;
727 
728 	if (start & ~PAGE_MASK)
729 		return -EINVAL;
730 
731 	if (mode == MPOL_DEFAULT)
732 		flags &= ~MPOL_MF_STRICT;
733 
734 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
735 	end = start + len;
736 
737 	if (end < start)
738 		return -EINVAL;
739 	if (end == start)
740 		return 0;
741 
742 	if (mpol_check_policy(mode, nmask))
743 		return -EINVAL;
744 
745 	new = mpol_new(mode, nmask);
746 	if (IS_ERR(new))
747 		return PTR_ERR(new);
748 
749 	/*
750 	 * If we are using the default policy then operation
751 	 * on discontinuous address spaces is okay after all
752 	 */
753 	if (!new)
754 		flags |= MPOL_MF_DISCONTIG_OK;
755 
756 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
757 			mode,nodes_addr(nodes)[0]);
758 
759 	down_write(&mm->mmap_sem);
760 	vma = check_range(mm, start, end, nmask,
761 			  flags | MPOL_MF_INVERT, &pagelist);
762 
763 	err = PTR_ERR(vma);
764 	if (!IS_ERR(vma)) {
765 		int nr_failed = 0;
766 
767 		err = mbind_range(vma, start, end, new);
768 
769 		if (!list_empty(&pagelist))
770 			nr_failed = migrate_pages_to(&pagelist, vma, -1);
771 
772 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
773 			err = -EIO;
774 	}
775 
776 	if (!list_empty(&pagelist))
777 		putback_lru_pages(&pagelist);
778 
779 	up_write(&mm->mmap_sem);
780 	mpol_free(new);
781 	return err;
782 }
783 
784 /*
785  * User space interface with variable sized bitmaps for nodelists.
786  */
787 
788 /* Copy a node mask from user space. */
789 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
790 		     unsigned long maxnode)
791 {
792 	unsigned long k;
793 	unsigned long nlongs;
794 	unsigned long endmask;
795 
796 	--maxnode;
797 	nodes_clear(*nodes);
798 	if (maxnode == 0 || !nmask)
799 		return 0;
800 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
801 		return -EINVAL;
802 
803 	nlongs = BITS_TO_LONGS(maxnode);
804 	if ((maxnode % BITS_PER_LONG) == 0)
805 		endmask = ~0UL;
806 	else
807 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
808 
809 	/* When the user specified more nodes than supported just check
810 	   if the unsupported part is all zero. */
811 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
812 		if (nlongs > PAGE_SIZE/sizeof(long))
813 			return -EINVAL;
814 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
815 			unsigned long t;
816 			if (get_user(t, nmask + k))
817 				return -EFAULT;
818 			if (k == nlongs - 1) {
819 				if (t & endmask)
820 					return -EINVAL;
821 			} else if (t)
822 				return -EINVAL;
823 		}
824 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
825 		endmask = ~0UL;
826 	}
827 
828 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
829 		return -EFAULT;
830 	nodes_addr(*nodes)[nlongs-1] &= endmask;
831 	return 0;
832 }
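
/*
 * Worked example (illustrative, 64-bit longs assumed): a caller passing
 * maxnode = 65 leaves maxnode = 64 after the decrement, so nlongs = 1
 * and endmask = ~0UL, i.e. bits 0..63 are accepted.  With maxnode = 33
 * the usable part is bits 0..31, giving nlongs = 1 and
 * endmask = (1UL << 32) - 1, so any higher bit in the copied word is
 * masked off.
 */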
833 
834 /* Copy a kernel node mask to user space */
835 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
836 			      nodemask_t *nodes)
837 {
838 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
839 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
840 
841 	if (copy > nbytes) {
842 		if (copy > PAGE_SIZE)
843 			return -EINVAL;
844 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
845 			return -EFAULT;
846 		copy = nbytes;
847 	}
848 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
849 }
850 
851 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
852 			unsigned long mode,
853 			unsigned long __user *nmask, unsigned long maxnode,
854 			unsigned flags)
855 {
856 	nodemask_t nodes;
857 	int err;
858 
859 	err = get_nodes(&nodes, nmask, maxnode);
860 	if (err)
861 		return err;
862 	return do_mbind(start, len, mode, &nodes, flags);
863 }
864 
865 /* Set the process memory policy */
866 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
867 		unsigned long maxnode)
868 {
869 	int err;
870 	nodemask_t nodes;
871 
872 	if (mode < 0 || mode > MPOL_MAX)
873 		return -EINVAL;
874 	err = get_nodes(&nodes, nmask, maxnode);
875 	if (err)
876 		return err;
877 	return do_set_mempolicy(mode, &nodes);
878 }
879 
880 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
881 		const unsigned long __user *old_nodes,
882 		const unsigned long __user *new_nodes)
883 {
884 	struct mm_struct *mm;
885 	struct task_struct *task;
886 	nodemask_t old;
887 	nodemask_t new;
888 	nodemask_t task_nodes;
889 	int err;
890 
891 	err = get_nodes(&old, old_nodes, maxnode);
892 	if (err)
893 		return err;
894 
895 	err = get_nodes(&new, new_nodes, maxnode);
896 	if (err)
897 		return err;
898 
899 	/* Find the mm_struct */
900 	read_lock(&tasklist_lock);
901 	task = pid ? find_task_by_pid(pid) : current;
902 	if (!task) {
903 		read_unlock(&tasklist_lock);
904 		return -ESRCH;
905 	}
906 	mm = get_task_mm(task);
907 	read_unlock(&tasklist_lock);
908 
909 	if (!mm)
910 		return -EINVAL;
911 
912 	/*
913 	 * Check if this process has the right to modify the specified
914 	 * process. The right exists if the process has administrative
915 	 * capabilities, superuser privileges or the same
916 	 * userid as the target process.
917 	 */
918 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
919 	    (current->uid != task->suid) && (current->uid != task->uid) &&
920 	    !capable(CAP_SYS_NICE)) {
921 		err = -EPERM;
922 		goto out;
923 	}
924 
925 	task_nodes = cpuset_mems_allowed(task);
926 	/* Is the user allowed to access the target nodes? */
927 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
928 		err = -EPERM;
929 		goto out;
930 	}
931 
932 	err = do_migrate_pages(mm, &old, &new,
933 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
934 out:
935 	mmput(mm);
936 	return err;
937 }
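
/*
 * User-space sketch (illustrative only; assumes __NR_migrate_pages is
 * defined for the architecture and uses raw syscall(2) since glibc may
 * not provide a wrapper): move all of the calling process' pages from
 * node 0 to node 1.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *	syscall(__NR_migrate_pages, 0, sizeof(old_nodes) * 8,
 *		&old_nodes, &new_nodes);
 */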
938 
939 
940 /* Retrieve NUMA policy */
941 asmlinkage long sys_get_mempolicy(int __user *policy,
942 				unsigned long __user *nmask,
943 				unsigned long maxnode,
944 				unsigned long addr, unsigned long flags)
945 {
946 	int err, pval;
947 	nodemask_t nodes;
948 
949 	if (nmask != NULL && maxnode < MAX_NUMNODES)
950 		return -EINVAL;
951 
952 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
953 
954 	if (err)
955 		return err;
956 
957 	if (policy && put_user(pval, policy))
958 		return -EFAULT;
959 
960 	if (nmask)
961 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
962 
963 	return err;
964 }
965 
966 #ifdef CONFIG_COMPAT
967 
968 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
969 				     compat_ulong_t __user *nmask,
970 				     compat_ulong_t maxnode,
971 				     compat_ulong_t addr, compat_ulong_t flags)
972 {
973 	long err;
974 	unsigned long __user *nm = NULL;
975 	unsigned long nr_bits, alloc_size;
976 	DECLARE_BITMAP(bm, MAX_NUMNODES);
977 
978 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
979 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
980 
981 	if (nmask)
982 		nm = compat_alloc_user_space(alloc_size);
983 
984 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
985 
986 	if (!err && nmask) {
987 		err = copy_from_user(bm, nm, alloc_size);
988 		/* ensure entire bitmap is zeroed */
989 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
990 		err |= compat_put_bitmap(nmask, bm, nr_bits);
991 	}
992 
993 	return err;
994 }
995 
996 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
997 				     compat_ulong_t maxnode)
998 {
999 	long err = 0;
1000 	unsigned long __user *nm = NULL;
1001 	unsigned long nr_bits, alloc_size;
1002 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1003 
1004 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1005 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1006 
1007 	if (nmask) {
1008 		err = compat_get_bitmap(bm, nmask, nr_bits);
1009 		nm = compat_alloc_user_space(alloc_size);
1010 		err |= copy_to_user(nm, bm, alloc_size);
1011 	}
1012 
1013 	if (err)
1014 		return -EFAULT;
1015 
1016 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1017 }
1018 
1019 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1020 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1021 			     compat_ulong_t maxnode, compat_ulong_t flags)
1022 {
1023 	long err = 0;
1024 	unsigned long __user *nm = NULL;
1025 	unsigned long nr_bits, alloc_size;
1026 	nodemask_t bm;
1027 
1028 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1029 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1030 
1031 	if (nmask) {
1032 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1033 		nm = compat_alloc_user_space(alloc_size);
1034 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1035 	}
1036 
1037 	if (err)
1038 		return -EFAULT;
1039 
1040 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1041 }
1042 
1043 #endif
1044 
1045 /* Return effective policy for a VMA */
1046 static struct mempolicy * get_vma_policy(struct task_struct *task,
1047 		struct vm_area_struct *vma, unsigned long addr)
1048 {
1049 	struct mempolicy *pol = task->mempolicy;
1050 
1051 	if (vma) {
1052 		if (vma->vm_ops && vma->vm_ops->get_policy)
1053 			pol = vma->vm_ops->get_policy(vma, addr);
1054 		else if (vma->vm_policy &&
1055 				vma->vm_policy->policy != MPOL_DEFAULT)
1056 			pol = vma->vm_policy;
1057 	}
1058 	if (!pol)
1059 		pol = &default_policy;
1060 	return pol;
1061 }
1062 
1063 /* Return a zonelist representing a mempolicy */
1064 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1065 {
1066 	int nd;
1067 
1068 	switch (policy->policy) {
1069 	case MPOL_PREFERRED:
1070 		nd = policy->v.preferred_node;
1071 		if (nd < 0)
1072 			nd = numa_node_id();
1073 		break;
1074 	case MPOL_BIND:
1075 		/* Lower zones don't get a policy applied */
1076 		/* Careful: current->mems_allowed might have moved */
1077 		if (gfp_zone(gfp) >= policy_zone)
1078 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1079 				return policy->v.zonelist;
1080 		/*FALL THROUGH*/
1081 	case MPOL_INTERLEAVE: /* should not happen */
1082 	case MPOL_DEFAULT:
1083 		nd = numa_node_id();
1084 		break;
1085 	default:
1086 		nd = 0;
1087 		BUG();
1088 	}
1089 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1090 }
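
/*
 * Example (illustrative): with MPOL_PREFERRED and preferred_node = 3, a
 * GFP_HIGHUSER allocation gets node 3's zonelist for its gfp_zone().
 * Under MPOL_BIND, a GFP_DMA request whose gfp_zone() is below
 * policy_zone falls through and uses the local node's zonelist instead,
 * which is the "only the highest zone gets policied" rule from the
 * comment at the top of this file.
 */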
1091 
1092 /* Do dynamic interleaving for a process */
1093 static unsigned interleave_nodes(struct mempolicy *policy)
1094 {
1095 	unsigned nid, next;
1096 	struct task_struct *me = current;
1097 
1098 	nid = me->il_next;
1099 	next = next_node(nid, policy->v.nodes);
1100 	if (next >= MAX_NUMNODES)
1101 		next = first_node(policy->v.nodes);
1102 	me->il_next = next;
1103 	return nid;
1104 }
1105 
1106 /*
1107  * Depending on the memory policy provide a node from which to allocate the
1108  * next slab entry.
1109  */
1110 unsigned slab_node(struct mempolicy *policy)
1111 {
1112 	switch (policy->policy) {
1113 	case MPOL_INTERLEAVE:
1114 		return interleave_nodes(policy);
1115 
1116 	case MPOL_BIND:
1117 		/*
1118 		 * Follow bind policy behavior and start allocation at the
1119 		 * first node.
1120 		 */
1121 		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1122 
1123 	case MPOL_PREFERRED:
1124 		if (policy->v.preferred_node >= 0)
1125 			return policy->v.preferred_node;
1126 		/* Fall through */
1127 
1128 	default:
1129 		return numa_node_id();
1130 	}
1131 }
1132 
1133 /* Do static interleaving for a VMA with known offset. */
1134 static unsigned offset_il_node(struct mempolicy *pol,
1135 		struct vm_area_struct *vma, unsigned long off)
1136 {
1137 	unsigned nnodes = nodes_weight(pol->v.nodes);
1138 	unsigned target = (unsigned)off % nnodes;
1139 	int c;
1140 	int nid = -1;
1141 
1142 	c = 0;
1143 	do {
1144 		nid = next_node(nid, pol->v.nodes);
1145 		c++;
1146 	} while (c <= target);
1147 	return nid;
1148 }
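
/*
 * Worked example (a sketch): with pol->v.nodes = {0,2,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop walks to the second
 * set node and returns node 2.  Equal offsets always map to the same
 * node, which is what makes the VMA interleaving static.
 */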
1149 
1150 /* Determine a node number for interleave */
1151 static inline unsigned interleave_nid(struct mempolicy *pol,
1152 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1153 {
1154 	if (vma) {
1155 		unsigned long off;
1156 
1157 		off = vma->vm_pgoff;
1158 		off += (addr - vma->vm_start) >> shift;
1159 		return offset_il_node(pol, vma, off);
1160 	} else
1161 		return interleave_nodes(pol);
1162 }
1163 
1164 #ifdef CONFIG_HUGETLBFS
1165 /* Return a zonelist suitable for a huge page allocation. */
1166 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1167 {
1168 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1169 
1170 	if (pol->policy == MPOL_INTERLEAVE) {
1171 		unsigned nid;
1172 
1173 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1174 		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1175 	}
1176 	return zonelist_policy(GFP_HIGHUSER, pol);
1177 }
1178 #endif
1179 
1180 /* Allocate a page in interleaved policy.
1181    Own path because it needs to do special accounting. */
1182 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1183 					unsigned nid)
1184 {
1185 	struct zonelist *zl;
1186 	struct page *page;
1187 
1188 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1189 	page = __alloc_pages(gfp, order, zl);
1190 	if (page && page_zone(page) == zl->zones[0]) {
1191 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1192 		put_cpu();
1193 	}
1194 	return page;
1195 }
1196 
1197 /**
1198  * 	alloc_page_vma	- Allocate a page for a VMA.
1199  *
1200  * 	@gfp:
1201  *      %GFP_USER    user allocation.
1202  *      %GFP_KERNEL  kernel allocations,
1203  *      %GFP_HIGHMEM highmem/user allocations,
1204  *      %GFP_FS      allocation should not call back into a file system.
1205  *      %GFP_ATOMIC  don't sleep.
1206  *
1207  * 	@vma:  Pointer to VMA or NULL if not available.
1208  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1209  *
1210  * 	This function allocates a page from the kernel page pool and applies
1211  *	a NUMA policy associated with the VMA or the current process.
1212  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1213  *	mm_struct of the VMA to prevent it from going away. Should be used for
1214  *	all allocations for pages that will be mapped into
1215  * 	user space. Returns NULL when no page can be allocated.
1216  *
1217  *	Should be called with the mmap_sem of the vma's mm held.
1218  */
1219 struct page *
1220 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1221 {
1222 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1223 
1224 	cpuset_update_task_memory_state();
1225 
1226 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1227 		unsigned nid;
1228 
1229 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1230 		return alloc_page_interleave(gfp, 0, nid);
1231 	}
1232 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1233 }
1234 
1235 /**
1236  * 	alloc_pages_current - Allocate pages.
1237  *
1238  *	@gfp:
1239  *		%GFP_USER   user allocation,
1240  *      	%GFP_KERNEL kernel allocation,
1241  *      	%GFP_HIGHMEM highmem allocation,
1242  *      	%GFP_FS     don't call back into a file system.
1243  *      	%GFP_ATOMIC don't sleep.
1244  *	@order: Power of two of allocation size in pages. 0 is a single page.
1245  *
1246  *	Allocate a page from the kernel page pool.  When not in
1247  *	interrupt context, apply the current process' NUMA policy.
1248  *	Returns NULL when no page can be allocated.
1249  *
1250  *	Don't call cpuset_update_task_memory_state() unless
1251  *	1) it's ok to take cpuset_sem (can WAIT), and
1252  *	2) allocating for current task (not interrupt).
1253  */
1254 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1255 {
1256 	struct mempolicy *pol = current->mempolicy;
1257 
1258 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1259 		cpuset_update_task_memory_state();
1260 	if (!pol || in_interrupt())
1261 		pol = &default_policy;
1262 	if (pol->policy == MPOL_INTERLEAVE)
1263 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1264 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1265 }
1266 EXPORT_SYMBOL(alloc_pages_current);
1267 
1268 /*
1269  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1270  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1271  * with the mems_allowed returned by cpuset_mems_allowed().  This
1272  * keeps mempolicies cpuset relative after its cpuset moves.  See
1273  * further kernel/cpuset.c update_nodemask().
1274  */
1275 void *cpuset_being_rebound;
1276 
1277 /* Slow path of a mempolicy copy */
1278 struct mempolicy *__mpol_copy(struct mempolicy *old)
1279 {
1280 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1281 
1282 	if (!new)
1283 		return ERR_PTR(-ENOMEM);
1284 	if (current_cpuset_is_being_rebound()) {
1285 		nodemask_t mems = cpuset_mems_allowed(current);
1286 		mpol_rebind_policy(old, &mems);
1287 	}
1288 	*new = *old;
1289 	atomic_set(&new->refcnt, 1);
1290 	if (new->policy == MPOL_BIND) {
1291 		int sz = ksize(old->v.zonelist);
1292 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1293 		if (!new->v.zonelist) {
1294 			kmem_cache_free(policy_cache, new);
1295 			return ERR_PTR(-ENOMEM);
1296 		}
1297 		memcpy(new->v.zonelist, old->v.zonelist, sz);
1298 	}
1299 	return new;
1300 }
1301 
1302 /* Slow path of a mempolicy comparison */
1303 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1304 {
1305 	if (!a || !b)
1306 		return 0;
1307 	if (a->policy != b->policy)
1308 		return 0;
1309 	switch (a->policy) {
1310 	case MPOL_DEFAULT:
1311 		return 1;
1312 	case MPOL_INTERLEAVE:
1313 		return nodes_equal(a->v.nodes, b->v.nodes);
1314 	case MPOL_PREFERRED:
1315 		return a->v.preferred_node == b->v.preferred_node;
1316 	case MPOL_BIND: {
1317 		int i;
1318 		for (i = 0; a->v.zonelist->zones[i]; i++)
1319 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1320 				return 0;
1321 		return b->v.zonelist->zones[i] == NULL;
1322 	}
1323 	default:
1324 		BUG();
1325 		return 0;
1326 	}
1327 }
1328 
1329 /* Slow path of a mpol destructor. */
1330 void __mpol_free(struct mempolicy *p)
1331 {
1332 	if (!atomic_dec_and_test(&p->refcnt))
1333 		return;
1334 	if (p->policy == MPOL_BIND)
1335 		kfree(p->v.zonelist);
1336 	p->policy = MPOL_DEFAULT;
1337 	kmem_cache_free(policy_cache, p);
1338 }
1339 
1340 /*
1341  * Shared memory backing store policy support.
1342  *
1343  * Remember policies even when nobody has shared memory mapped.
1344  * The policies are kept in a red-black tree linked from the inode.
1345  * They are protected by the sp->lock spinlock, which should be held
1346  * for any accesses to the tree.
1347  */
1348 
1349 /* lookup first element intersecting start-end */
1350 /* Caller holds sp->lock */
1351 static struct sp_node *
1352 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1353 {
1354 	struct rb_node *n = sp->root.rb_node;
1355 
1356 	while (n) {
1357 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1358 
1359 		if (start >= p->end)
1360 			n = n->rb_right;
1361 		else if (end <= p->start)
1362 			n = n->rb_left;
1363 		else
1364 			break;
1365 	}
1366 	if (!n)
1367 		return NULL;
1368 	for (;;) {
1369 		struct sp_node *w = NULL;
1370 		struct rb_node *prev = rb_prev(n);
1371 		if (!prev)
1372 			break;
1373 		w = rb_entry(prev, struct sp_node, nd);
1374 		if (w->end <= start)
1375 			break;
1376 		n = prev;
1377 	}
1378 	return rb_entry(n, struct sp_node, nd);
1379 }
1380 
1381 /* Insert a new shared policy into the list. */
1382 /* Caller holds sp->lock */
1383 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1384 {
1385 	struct rb_node **p = &sp->root.rb_node;
1386 	struct rb_node *parent = NULL;
1387 	struct sp_node *nd;
1388 
1389 	while (*p) {
1390 		parent = *p;
1391 		nd = rb_entry(parent, struct sp_node, nd);
1392 		if (new->start < nd->start)
1393 			p = &(*p)->rb_left;
1394 		else if (new->end > nd->end)
1395 			p = &(*p)->rb_right;
1396 		else
1397 			BUG();
1398 	}
1399 	rb_link_node(&new->nd, parent, p);
1400 	rb_insert_color(&new->nd, &sp->root);
1401 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1402 		 new->policy ? new->policy->policy : 0);
1403 }
1404 
1405 /* Find shared policy intersecting idx */
1406 struct mempolicy *
1407 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1408 {
1409 	struct mempolicy *pol = NULL;
1410 	struct sp_node *sn;
1411 
1412 	if (!sp->root.rb_node)
1413 		return NULL;
1414 	spin_lock(&sp->lock);
1415 	sn = sp_lookup(sp, idx, idx+1);
1416 	if (sn) {
1417 		mpol_get(sn->policy);
1418 		pol = sn->policy;
1419 	}
1420 	spin_unlock(&sp->lock);
1421 	return pol;
1422 }
1423 
1424 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1425 {
1426 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1427 	rb_erase(&n->nd, &sp->root);
1428 	mpol_free(n->policy);
1429 	kmem_cache_free(sn_cache, n);
1430 }
1431 
1432 struct sp_node *
1433 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1434 {
1435 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1436 
1437 	if (!n)
1438 		return NULL;
1439 	n->start = start;
1440 	n->end = end;
1441 	mpol_get(pol);
1442 	n->policy = pol;
1443 	return n;
1444 }
1445 
1446 /* Replace a policy range. */
1447 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1448 				 unsigned long end, struct sp_node *new)
1449 {
1450 	struct sp_node *n, *new2 = NULL;
1451 
1452 restart:
1453 	spin_lock(&sp->lock);
1454 	n = sp_lookup(sp, start, end);
1455 	/* Take care of old policies in the same range. */
1456 	while (n && n->start < end) {
1457 		struct rb_node *next = rb_next(&n->nd);
1458 		if (n->start >= start) {
1459 			if (n->end <= end)
1460 				sp_delete(sp, n);
1461 			else
1462 				n->start = end;
1463 		} else {
1464 			/* Old policy spanning whole new range. */
1465 			if (n->end > end) {
1466 				if (!new2) {
1467 					spin_unlock(&sp->lock);
1468 					new2 = sp_alloc(end, n->end, n->policy);
1469 					if (!new2)
1470 						return -ENOMEM;
1471 					goto restart;
1472 				}
1473 				n->end = start;
1474 				sp_insert(sp, new2);
1475 				new2 = NULL;
1476 				break;
1477 			} else
1478 				n->end = start;
1479 		}
1480 		if (!next)
1481 			break;
1482 		n = rb_entry(next, struct sp_node, nd);
1483 	}
1484 	if (new)
1485 		sp_insert(sp, new);
1486 	spin_unlock(&sp->lock);
1487 	if (new2) {
1488 		mpol_free(new2->policy);
1489 		kmem_cache_free(sn_cache, new2);
1490 	}
1491 	return 0;
1492 }
1493 
1494 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1495 				nodemask_t *policy_nodes)
1496 {
1497 	info->root = RB_ROOT;
1498 	spin_lock_init(&info->lock);
1499 
1500 	if (policy != MPOL_DEFAULT) {
1501 		struct mempolicy *newpol;
1502 
1503 		/* Falls back to MPOL_DEFAULT on any error */
1504 		newpol = mpol_new(policy, policy_nodes);
1505 		if (!IS_ERR(newpol)) {
1506 			/* Create pseudo-vma that contains just the policy */
1507 			struct vm_area_struct pvma;
1508 
1509 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1510 			/* Policy covers entire file */
1511 			pvma.vm_end = TASK_SIZE;
1512 			mpol_set_shared_policy(info, &pvma, newpol);
1513 			mpol_free(newpol);
1514 		}
1515 	}
1516 }
1517 
1518 int mpol_set_shared_policy(struct shared_policy *info,
1519 			struct vm_area_struct *vma, struct mempolicy *npol)
1520 {
1521 	int err;
1522 	struct sp_node *new = NULL;
1523 	unsigned long sz = vma_pages(vma);
1524 
1525 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1526 		 vma->vm_pgoff,
1527 		 sz, npol? npol->policy : -1,
1528 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1529 
1530 	if (npol) {
1531 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1532 		if (!new)
1533 			return -ENOMEM;
1534 	}
1535 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1536 	if (err && new)
1537 		kmem_cache_free(sn_cache, new);
1538 	return err;
1539 }
1540 
1541 /* Free a backing policy store on inode delete. */
1542 void mpol_free_shared_policy(struct shared_policy *p)
1543 {
1544 	struct sp_node *n;
1545 	struct rb_node *next;
1546 
1547 	if (!p->root.rb_node)
1548 		return;
1549 	spin_lock(&p->lock);
1550 	next = rb_first(&p->root);
1551 	while (next) {
1552 		n = rb_entry(next, struct sp_node, nd);
1553 		next = rb_next(&n->nd);
1554 		rb_erase(&n->nd, &p->root);
1555 		mpol_free(n->policy);
1556 		kmem_cache_free(sn_cache, n);
1557 	}
1558 	spin_unlock(&p->lock);
1559 }
1560 
1561 /* assumes fs == KERNEL_DS */
1562 void __init numa_policy_init(void)
1563 {
1564 	policy_cache = kmem_cache_create("numa_policy",
1565 					 sizeof(struct mempolicy),
1566 					 0, SLAB_PANIC, NULL, NULL);
1567 
1568 	sn_cache = kmem_cache_create("shared_policy_node",
1569 				     sizeof(struct sp_node),
1570 				     0, SLAB_PANIC, NULL, NULL);
1571 
1572 	/* Set interleaving policy for system init. This way not all
1573 	   the data structures allocated at system boot end up in node zero. */
1574 
1575 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1576 		printk("numa_policy_init: interleaving failed\n");
1577 }
1578 
1579 /* Reset policy of current process to default */
1580 void numa_default_policy(void)
1581 {
1582 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1583 }
1584 
1585 /* Migrate a policy to a different set of nodes */
1586 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1587 {
1588 	nodemask_t *mpolmask;
1589 	nodemask_t tmp;
1590 
1591 	if (!pol)
1592 		return;
1593 	mpolmask = &pol->cpuset_mems_allowed;
1594 	if (nodes_equal(*mpolmask, *newmask))
1595 		return;
1596 
1597 	switch (pol->policy) {
1598 	case MPOL_DEFAULT:
1599 		break;
1600 	case MPOL_INTERLEAVE:
1601 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1602 		pol->v.nodes = tmp;
1603 		*mpolmask = *newmask;
1604 		current->il_next = node_remap(current->il_next,
1605 						*mpolmask, *newmask);
1606 		break;
1607 	case MPOL_PREFERRED:
1608 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1609 						*mpolmask, *newmask);
1610 		*mpolmask = *newmask;
1611 		break;
1612 	case MPOL_BIND: {
1613 		nodemask_t nodes;
1614 		struct zone **z;
1615 		struct zonelist *zonelist;
1616 
1617 		nodes_clear(nodes);
1618 		for (z = pol->v.zonelist->zones; *z; z++)
1619 			node_set((*z)->zone_pgdat->node_id, nodes);
1620 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1621 		nodes = tmp;
1622 
1623 		zonelist = bind_zonelist(&nodes);
1624 
1625 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1626 		 * If that old zonelist has no remaining mems_allowed nodes,
1627 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1628 		 */
1629 
1630 		if (zonelist) {
1631 			/* Good - got mem - substitute new zonelist */
1632 			kfree(pol->v.zonelist);
1633 			pol->v.zonelist = zonelist;
1634 		}
1635 		*mpolmask = *newmask;
1636 		break;
1637 	}
1638 	default:
1639 		BUG();
1640 		break;
1641 	}
1642 }
1643 
1644 /*
1645  * Wrapper for mpol_rebind_policy() that just requires task
1646  * pointer, and updates task mempolicy.
1647  */
1648 
1649 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1650 {
1651 	mpol_rebind_policy(tsk->mempolicy, new);
1652 }
1653 
1654 /*
1655  * Rebind each vma in mm to new nodemask.
1656  *
1657  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1658  */
1659 
1660 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1661 {
1662 	struct vm_area_struct *vma;
1663 
1664 	down_write(&mm->mmap_sem);
1665 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1666 		mpol_rebind_policy(vma->vm_policy, new);
1667 	up_write(&mm->mmap_sem);
1668 }
1669 
1670 /*
1671  * Display pages allocated per node and memory policy via /proc.
1672  */
1673 
1674 static const char *policy_types[] = { "default", "prefer", "bind",
1675 				      "interleave" };
1676 
1677 /*
1678  * Convert a mempolicy into a string.
1679  * Returns the number of characters in buffer (if positive)
1680  * or an error (negative)
1681  */
1682 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1683 {
1684 	char *p = buffer;
1685 	int l;
1686 	nodemask_t nodes;
1687 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1688 
1689 	switch (mode) {
1690 	case MPOL_DEFAULT:
1691 		nodes_clear(nodes);
1692 		break;
1693 
1694 	case MPOL_PREFERRED:
1695 		nodes_clear(nodes);
1696 		node_set(pol->v.preferred_node, nodes);
1697 		break;
1698 
1699 	case MPOL_BIND:
1700 		get_zonemask(pol, &nodes);
1701 		break;
1702 
1703 	case MPOL_INTERLEAVE:
1704 		nodes = pol->v.nodes;
1705 		break;
1706 
1707 	default:
1708 		BUG();
1709 		return -EFAULT;
1710 	}
1711 
1712 	l = strlen(policy_types[mode]);
1713  	if (buffer + maxlen < p + l + 1)
1714  		return -ENOSPC;
1715 
1716 	strcpy(p, policy_types[mode]);
1717 	p += l;
1718 
1719 	if (!nodes_empty(nodes)) {
1720 		if (buffer + maxlen < p + 2)
1721 			return -ENOSPC;
1722 		*p++ = '=';
1723 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1724 	}
1725 	return p - buffer;
1726 }
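
/*
 * Example output (illustrative): "default" for MPOL_DEFAULT,
 * "prefer=1" for MPOL_PREFERRED on node 1, "bind=0,2" for MPOL_BIND
 * over nodes 0 and 2, and "interleave=0-3" for MPOL_INTERLEAVE over
 * nodes 0-3, matching the "<policy>=<nodelist>" format built above.
 */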
1727 
1728 struct numa_maps {
1729 	unsigned long pages;
1730 	unsigned long anon;
1731 	unsigned long active;
1732 	unsigned long writeback;
1733 	unsigned long mapcount_max;
1734 	unsigned long dirty;
1735 	unsigned long swapcache;
1736 	unsigned long node[MAX_NUMNODES];
1737 };
1738 
1739 static void gather_stats(struct page *page, void *private, int pte_dirty)
1740 {
1741 	struct numa_maps *md = private;
1742 	int count = page_mapcount(page);
1743 
1744 	md->pages++;
1745 	if (pte_dirty || PageDirty(page))
1746 		md->dirty++;
1747 
1748 	if (PageSwapCache(page))
1749 		md->swapcache++;
1750 
1751 	if (PageActive(page))
1752 		md->active++;
1753 
1754 	if (PageWriteback(page))
1755 		md->writeback++;
1756 
1757 	if (PageAnon(page))
1758 		md->anon++;
1759 
1760 	if (count > md->mapcount_max)
1761 		md->mapcount_max = count;
1762 
1763 	md->node[page_to_nid(page)]++;
1764 }
1765 
1766 #ifdef CONFIG_HUGETLB_PAGE
1767 static void check_huge_range(struct vm_area_struct *vma,
1768 		unsigned long start, unsigned long end,
1769 		struct numa_maps *md)
1770 {
1771 	unsigned long addr;
1772 	struct page *page;
1773 
1774 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1775 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1776 		pte_t pte;
1777 
1778 		if (!ptep)
1779 			continue;
1780 
1781 		pte = *ptep;
1782 		if (pte_none(pte))
1783 			continue;
1784 
1785 		page = pte_page(pte);
1786 		if (!page)
1787 			continue;
1788 
1789 		gather_stats(page, md, pte_dirty(*ptep));
1790 	}
1791 }
1792 #else
1793 static inline void check_huge_range(struct vm_area_struct *vma,
1794 		unsigned long start, unsigned long end,
1795 		struct numa_maps *md)
1796 {
1797 }
1798 #endif
1799 
1800 int show_numa_map(struct seq_file *m, void *v)
1801 {
1802 	struct task_struct *task = m->private;
1803 	struct vm_area_struct *vma = v;
1804 	struct numa_maps *md;
1805 	struct file *file = vma->vm_file;
1806 	struct mm_struct *mm = vma->vm_mm;
1807 	int n;
1808 	char buffer[50];
1809 
1810 	if (!mm)
1811 		return 0;
1812 
1813 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1814 	if (!md)
1815 		return 0;
1816 
1817 	mpol_to_str(buffer, sizeof(buffer),
1818 			get_vma_policy(task, vma, vma->vm_start));
1819 
1820 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1821 
1822 	if (file) {
1823 		seq_printf(m, " file=");
1824 		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1825 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1826 		seq_printf(m, " heap");
1827 	} else if (vma->vm_start <= mm->start_stack &&
1828 			vma->vm_end >= mm->start_stack) {
1829 		seq_printf(m, " stack");
1830 	}
1831 
1832 	if (is_vm_hugetlb_page(vma)) {
1833 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1834 		seq_printf(m, " huge");
1835 	} else {
1836 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1837 				&node_online_map, MPOL_MF_STATS, md);
1838 	}
1839 
1840 	if (!md->pages)
1841 		goto out;
1842 
1843 	if (md->anon)
1844 		seq_printf(m," anon=%lu",md->anon);
1845 
1846 	if (md->dirty)
1847 		seq_printf(m," dirty=%lu",md->dirty);
1848 
1849 	if (md->pages != md->anon && md->pages != md->dirty)
1850 		seq_printf(m, " mapped=%lu", md->pages);
1851 
1852 	if (md->mapcount_max > 1)
1853 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1854 
1855 	if (md->swapcache)
1856 		seq_printf(m," swapcache=%lu", md->swapcache);
1857 
1858 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1859 		seq_printf(m," active=%lu", md->active);
1860 
1861 	if (md->writeback)
1862 		seq_printf(m," writeback=%lu", md->writeback);
1863 
1864 	for_each_online_node(n)
1865 		if (md->node[n])
1866 			seq_printf(m, " N%d=%lu", n, md->node[n]);
1867 out:
1868 	seq_putc(m, '\n');
1869 	kfree(md);
1870 
1871 	if (m->count < m->size)
1872 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1873 	return 0;
1874 }
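
/*
 * Example /proc/<pid>/numa_maps line produced by the code above
 * (illustrative values): for a 16-page anonymous VMA interleaved over
 * four nodes one might see
 *
 *	2aaaaaab3000 interleave=0-3 anon=16 dirty=16 N0=4 N1=4 N2=4 N3=4
 *
 * where most of the fields after the policy string only appear when
 * they are non-zero.
 */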
1875 
1876