xref: /linux/mm/mempolicy.c (revision d8327c784b51b57dac2c26cfad87dce0d68dfd98)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the specified memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case, node -1 here means do the allocation
30  *                on the node of the current CPU. This is normally identical
31  *                to default, but useful to set in a VMA when you have a
32  *                non-default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * The same goes for GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
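/*
 * Illustrative sketch (not part of this file): how a userspace program
 * might select these policies through the mbind()/set_mempolicy() system
 * calls, here via the <numaif.h> wrappers shipped with libnuma.  Error
 * handling is trimmed and the node numbers are assumptions.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	int example(void)
 *	{
 *		unsigned long both = 0x3, node0 = 0x1;	// nodemasks {0,1} and {0}
 *		size_t len = 1 << 20;
 *		void *buf;
 *
 *		// Process policy: interleave future allocations over nodes 0-1.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &both, sizeof(both) * 8) < 0)
 *			return -1;
 *
 *		// VMA policy: bind one anonymous mapping to node 0 only.
 *		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (buf == MAP_FAILED)
 *			return -1;
 *		if (mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8, 0) < 0)
 *			return -1;
 *		return 0;
 *	}
 */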
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
87 #include <linux/seq_file.h>
88 #include <linux/proc_fs.h>
89 
90 #include <asm/tlbflush.h>
91 #include <asm/uaccess.h>
92 
93 /* Internal flags */
94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
97 
98 /* The number of pages to migrate per call to migrate_pages() */
99 #define MIGRATE_CHUNK_SIZE 256
100 
101 static kmem_cache_t *policy_cache;
102 static kmem_cache_t *sn_cache;
103 
104 #define PDprintk(fmt...)
105 
106 /* Highest zone. A specific allocation for a zone below that is not
107    policied. */
108 int policy_zone = ZONE_DMA;
109 
110 struct mempolicy default_policy = {
111 	.refcnt = ATOMIC_INIT(1), /* never free it */
112 	.policy = MPOL_DEFAULT,
113 };
114 
115 /* Do sanity checking on a policy */
116 static int mpol_check_policy(int mode, nodemask_t *nodes)
117 {
118 	int empty = nodes_empty(*nodes);
119 
120 	switch (mode) {
121 	case MPOL_DEFAULT:
122 		if (!empty)
123 			return -EINVAL;
124 		break;
125 	case MPOL_BIND:
126 	case MPOL_INTERLEAVE:
127 		/* Preferred will only use the first bit, but allow
128 		   more for now. */
129 		if (empty)
130 			return -EINVAL;
131 		break;
132 	}
133 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
134 }
135 
136 /* Generate a custom zonelist for the BIND policy. */
137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
138 {
139 	struct zonelist *zl;
140 	int num, max, nd, k;
141 
142 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
143 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
144 	if (!zl)
145 		return NULL;
146 	num = 0;
147 	/* First put in the highest zones from all nodes, then all the next
148 	   lower zones etc. Avoid empty zones because the memory allocator
149 	   doesn't like them. If you implement node hot removal you
150 	   have to fix that. */
151 	for (k = policy_zone; k >= 0; k--) {
152 		for_each_node_mask(nd, *nodes) {
153 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
154 			if (z->present_pages > 0)
155 				zl->zones[num++] = z;
156 		}
157 	}
158 	zl->zones[num] = NULL;
159 	return zl;
160 }
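/*
 * For illustration (assuming a hypothetical two-node machine where each
 * node has a NORMAL and a DMA zone): bind_zonelist() for the nodemask
 * {0,1} produces the order
 *
 *	node0/NORMAL, node1/NORMAL, node0/DMA, node1/DMA, NULL
 *
 * i.e. the highest zone of every bound node comes first, then the next
 * lower zone of every node, so the allocator exhausts high zones across
 * the whole mask before falling back to lower ones.
 */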
161 
162 /* Create a new policy */
163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
164 {
165 	struct mempolicy *policy;
166 
167 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
168 	if (mode == MPOL_DEFAULT)
169 		return NULL;
170 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
171 	if (!policy)
172 		return ERR_PTR(-ENOMEM);
173 	atomic_set(&policy->refcnt, 1);
174 	switch (mode) {
175 	case MPOL_INTERLEAVE:
176 		policy->v.nodes = *nodes;
177 		if (nodes_weight(*nodes) == 0) {
178 			kmem_cache_free(policy_cache, policy);
179 			return ERR_PTR(-EINVAL);
180 		}
181 		break;
182 	case MPOL_PREFERRED:
183 		policy->v.preferred_node = first_node(*nodes);
184 		if (policy->v.preferred_node >= MAX_NUMNODES)
185 			policy->v.preferred_node = -1;
186 		break;
187 	case MPOL_BIND:
188 		policy->v.zonelist = bind_zonelist(nodes);
189 		if (policy->v.zonelist == NULL) {
190 			kmem_cache_free(policy_cache, policy);
191 			return ERR_PTR(-ENOMEM);
192 		}
193 		break;
194 	}
195 	policy->policy = mode;
196 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
197 	return policy;
198 }
199 
200 static void gather_stats(struct page *, void *, int pte_dirty);
201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
202 				unsigned long flags);
203 
204 /* Scan through the pages, checking whether they satisfy the given conditions. */
205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
206 		unsigned long addr, unsigned long end,
207 		const nodemask_t *nodes, unsigned long flags,
208 		void *private)
209 {
210 	pte_t *orig_pte;
211 	pte_t *pte;
212 	spinlock_t *ptl;
213 
214 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
215 	do {
216 		struct page *page;
217 		unsigned int nid;
218 
219 		if (!pte_present(*pte))
220 			continue;
221 		page = vm_normal_page(vma, addr, *pte);
222 		if (!page)
223 			continue;
224 		/*
225 		 * The check for PageReserved here is important to avoid
226 		 * handling zero pages and other pages that may have been
227 		 * marked special by the system.
228 		 *
229 		 * If PageReserved were not checked here then e.g. the
230 		 * location of the zero page could have an influence
231 		 * on MPOL_MF_STRICT, zero pages would be counted in
232 		 * the per-node stats, and there would be useless attempts
233 		 * to put zero pages on the migration list.
234 		 */
235 		if (PageReserved(page))
236 			continue;
237 		nid = page_to_nid(page);
238 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
239 			continue;
240 
241 		if (flags & MPOL_MF_STATS)
242 			gather_stats(page, private, pte_dirty(*pte));
243 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
244 			migrate_page_add(page, private, flags);
245 		else
246 			break;
247 	} while (pte++, addr += PAGE_SIZE, addr != end);
248 	pte_unmap_unlock(orig_pte, ptl);
249 	return addr != end;
250 }
251 
252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
253 		unsigned long addr, unsigned long end,
254 		const nodemask_t *nodes, unsigned long flags,
255 		void *private)
256 {
257 	pmd_t *pmd;
258 	unsigned long next;
259 
260 	pmd = pmd_offset(pud, addr);
261 	do {
262 		next = pmd_addr_end(addr, end);
263 		if (pmd_none_or_clear_bad(pmd))
264 			continue;
265 		if (check_pte_range(vma, pmd, addr, next, nodes,
266 				    flags, private))
267 			return -EIO;
268 	} while (pmd++, addr = next, addr != end);
269 	return 0;
270 }
271 
272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
273 		unsigned long addr, unsigned long end,
274 		const nodemask_t *nodes, unsigned long flags,
275 		void *private)
276 {
277 	pud_t *pud;
278 	unsigned long next;
279 
280 	pud = pud_offset(pgd, addr);
281 	do {
282 		next = pud_addr_end(addr, end);
283 		if (pud_none_or_clear_bad(pud))
284 			continue;
285 		if (check_pmd_range(vma, pud, addr, next, nodes,
286 				    flags, private))
287 			return -EIO;
288 	} while (pud++, addr = next, addr != end);
289 	return 0;
290 }
291 
292 static inline int check_pgd_range(struct vm_area_struct *vma,
293 		unsigned long addr, unsigned long end,
294 		const nodemask_t *nodes, unsigned long flags,
295 		void *private)
296 {
297 	pgd_t *pgd;
298 	unsigned long next;
299 
300 	pgd = pgd_offset(vma->vm_mm, addr);
301 	do {
302 		next = pgd_addr_end(addr, end);
303 		if (pgd_none_or_clear_bad(pgd))
304 			continue;
305 		if (check_pud_range(vma, pgd, addr, next, nodes,
306 				    flags, private))
307 			return -EIO;
308 	} while (pgd++, addr = next, addr != end);
309 	return 0;
310 }
311 
312 /* Check if a vma is migratable */
313 static inline int vma_migratable(struct vm_area_struct *vma)
314 {
315 	if (vma->vm_flags & (
316 		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
317 		return 0;
318 	return 1;
319 }
320 
321 /*
322  * Check if all pages in a range are on a set of nodes.
323  * If pagelist != NULL then isolate pages from the LRU and
324  * put them on the pagelist.
325  */
326 static struct vm_area_struct *
327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
328 		const nodemask_t *nodes, unsigned long flags, void *private)
329 {
330 	int err;
331 	struct vm_area_struct *first, *vma, *prev;
332 
333 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
334 		/* Must have swap device for migration */
335 		if (nr_swap_pages <= 0)
336 			return ERR_PTR(-ENODEV);
337 
338 		/*
339 		 * Clear the LRU lists so pages can be isolated.
340 		 * Note that pages may be moved off the LRU after we have
341 		 * drained them. Those pages will fail to migrate like other
342 		 * pages that may be busy.
343 		 */
344 		lru_add_drain_all();
345 	}
346 
347 	first = find_vma(mm, start);
348 	if (!first)
349 		return ERR_PTR(-EFAULT);
350 	prev = NULL;
351 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
352 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
353 			if (!vma->vm_next && vma->vm_end < end)
354 				return ERR_PTR(-EFAULT);
355 			if (prev && prev->vm_end < vma->vm_start)
356 				return ERR_PTR(-EFAULT);
357 		}
358 		if (!is_vm_hugetlb_page(vma) &&
359 		    ((flags & MPOL_MF_STRICT) ||
360 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
361 				vma_migratable(vma)))) {
362 			unsigned long endvma = vma->vm_end;
363 
364 			if (endvma > end)
365 				endvma = end;
366 			if (vma->vm_start > start)
367 				start = vma->vm_start;
368 			err = check_pgd_range(vma, start, endvma, nodes,
369 						flags, private);
370 			if (err) {
371 				first = ERR_PTR(err);
372 				break;
373 			}
374 		}
375 		prev = vma;
376 	}
377 	return first;
378 }
379 
380 /* Apply policy to a single VMA */
381 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
382 {
383 	int err = 0;
384 	struct mempolicy *old = vma->vm_policy;
385 
386 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
387 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
388 		 vma->vm_ops, vma->vm_file,
389 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
390 
391 	if (vma->vm_ops && vma->vm_ops->set_policy)
392 		err = vma->vm_ops->set_policy(vma, new);
393 	if (!err) {
394 		mpol_get(new);
395 		vma->vm_policy = new;
396 		mpol_free(old);
397 	}
398 	return err;
399 }
400 
401 /* Step 2: apply policy to a range and do splits. */
402 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
403 		       unsigned long end, struct mempolicy *new)
404 {
405 	struct vm_area_struct *next;
406 	int err;
407 
408 	err = 0;
409 	for (; vma && vma->vm_start < end; vma = next) {
410 		next = vma->vm_next;
411 		if (vma->vm_start < start)
412 			err = split_vma(vma->vm_mm, vma, start, 1);
413 		if (!err && vma->vm_end > end)
414 			err = split_vma(vma->vm_mm, vma, end, 0);
415 		if (!err)
416 			err = policy_vma(vma, new);
417 		if (err)
418 			break;
419 	}
420 	return err;
421 }
422 
423 static int contextualize_policy(int mode, nodemask_t *nodes)
424 {
425 	if (!nodes)
426 		return 0;
427 
428 	cpuset_update_task_memory_state();
429 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
430 		return -EINVAL;
431 	return mpol_check_policy(mode, nodes);
432 }
433 
434 /* Set the process memory policy */
435 long do_set_mempolicy(int mode, nodemask_t *nodes)
436 {
437 	struct mempolicy *new;
438 
439 	if (contextualize_policy(mode, nodes))
440 		return -EINVAL;
441 	new = mpol_new(mode, nodes);
442 	if (IS_ERR(new))
443 		return PTR_ERR(new);
444 	mpol_free(current->mempolicy);
445 	current->mempolicy = new;
446 	if (new && new->policy == MPOL_INTERLEAVE)
447 		current->il_next = first_node(new->v.nodes);
448 	return 0;
449 }
450 
451 /* Fill a node mask with the nodes used by a policy */
452 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
453 {
454 	int i;
455 
456 	nodes_clear(*nodes);
457 	switch (p->policy) {
458 	case MPOL_BIND:
459 		for (i = 0; p->v.zonelist->zones[i]; i++)
460 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
461 				*nodes);
462 		break;
463 	case MPOL_DEFAULT:
464 		break;
465 	case MPOL_INTERLEAVE:
466 		*nodes = p->v.nodes;
467 		break;
468 	case MPOL_PREFERRED:
469 		/* or use current node instead of online map? */
470 		if (p->v.preferred_node < 0)
471 			*nodes = node_online_map;
472 		else
473 			node_set(p->v.preferred_node, *nodes);
474 		break;
475 	default:
476 		BUG();
477 	}
478 }
479 
480 static int lookup_node(struct mm_struct *mm, unsigned long addr)
481 {
482 	struct page *p;
483 	int err;
484 
485 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
486 	if (err >= 0) {
487 		err = page_to_nid(p);
488 		put_page(p);
489 	}
490 	return err;
491 }
492 
493 /* Retrieve NUMA policy */
494 long do_get_mempolicy(int *policy, nodemask_t *nmask,
495 			unsigned long addr, unsigned long flags)
496 {
497 	int err;
498 	struct mm_struct *mm = current->mm;
499 	struct vm_area_struct *vma = NULL;
500 	struct mempolicy *pol = current->mempolicy;
501 
502 	cpuset_update_task_memory_state();
503 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
504 		return -EINVAL;
505 	if (flags & MPOL_F_ADDR) {
506 		down_read(&mm->mmap_sem);
507 		vma = find_vma_intersection(mm, addr, addr+1);
508 		if (!vma) {
509 			up_read(&mm->mmap_sem);
510 			return -EFAULT;
511 		}
512 		if (vma->vm_ops && vma->vm_ops->get_policy)
513 			pol = vma->vm_ops->get_policy(vma, addr);
514 		else
515 			pol = vma->vm_policy;
516 	} else if (addr)
517 		return -EINVAL;
518 
519 	if (!pol)
520 		pol = &default_policy;
521 
522 	if (flags & MPOL_F_NODE) {
523 		if (flags & MPOL_F_ADDR) {
524 			err = lookup_node(mm, addr);
525 			if (err < 0)
526 				goto out;
527 			*policy = err;
528 		} else if (pol == current->mempolicy &&
529 				pol->policy == MPOL_INTERLEAVE) {
530 			*policy = current->il_next;
531 		} else {
532 			err = -EINVAL;
533 			goto out;
534 		}
535 	} else
536 		*policy = pol->policy;
537 
538 	if (vma) {
539 		up_read(&current->mm->mmap_sem);
540 		vma = NULL;
541 	}
542 
543 	err = 0;
544 	if (nmask)
545 		get_zonemask(pol, nmask);
546 
547  out:
548 	if (vma)
549 		up_read(&current->mm->mmap_sem);
550 	return err;
551 }
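/*
 * Illustrative sketch (not part of this file): querying the policy of an
 * address from userspace with the get_mempolicy() wrapper in <numaif.h>.
 * Note the kernel requires maxnode >= MAX_NUMNODES whenever a nodemask is
 * requested; the 1024-bit buffer below is an assumption about the config.
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	unsigned long mask[16];		// 1024 bits
 *
 *	// Which policy governs 'buf', and over which nodes?
 *	get_mempolicy(&mode, mask, sizeof(mask) * 8, buf, MPOL_F_ADDR);
 *
 *	// On which node does the page backing 'buf' live right now?
 *	// (This faults the page in if necessary.)
 *	get_mempolicy(&mode, NULL, 0, buf, MPOL_F_ADDR | MPOL_F_NODE);
 */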
552 
553 /*
554  * page migration
555  */
556 
557 static void migrate_page_add(struct page *page, struct list_head *pagelist,
558 				unsigned long flags)
559 {
560 	/*
561 	 * Avoid migrating a page that is shared with others.
562 	 */
563 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
564 		if (isolate_lru_page(page))
565 			list_add_tail(&page->lru, pagelist);
566 	}
567 }
568 
569 /*
570  * Migrate the list 'pagelist' of pages to a certain destination.
571  *
572  * Specify the destination with either a non-NULL vma or dest >= 0.
573  * Returns the number of pages not migrated, or an error code.
574  */
575 static int migrate_pages_to(struct list_head *pagelist,
576 			struct vm_area_struct *vma, int dest)
577 {
578 	LIST_HEAD(newlist);
579 	LIST_HEAD(moved);
580 	LIST_HEAD(failed);
581 	int err = 0;
582 	unsigned long offset = 0;
583 	int nr_pages;
584 	struct page *page;
585 	struct list_head *p;
586 
587 redo:
588 	nr_pages = 0;
589 	list_for_each(p, pagelist) {
590 		if (vma) {
591 			/*
592 			 * The address passed to alloc_page_vma is used to
593 			 * generate the proper interleave behavior. We fake
594 			 * the address here by an increasing offset in order
595 			 * to get the proper distribution of pages.
596 			 *
597 			 * No decision has been made as to which page
598 			 * a certain old page is moved to so we cannot
599 			 * specify the correct address.
600 			 */
601 			page = alloc_page_vma(GFP_HIGHUSER, vma,
602 					offset + vma->vm_start);
603 			offset += PAGE_SIZE;
604 		}
605 		else
606 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
607 
608 		if (!page) {
609 			err = -ENOMEM;
610 			goto out;
611 		}
612 		list_add_tail(&page->lru, &newlist);
613 		nr_pages++;
614 		if (nr_pages > MIGRATE_CHUNK_SIZE)
615 			break;
616 	}
617 	err = migrate_pages(pagelist, &newlist, &moved, &failed);
618 
619 	putback_lru_pages(&moved);	/* Call release_pages() instead? */
620 
621 	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
622 		goto redo;
623 out:
624 	/* Return leftover allocated pages */
625 	while (!list_empty(&newlist)) {
626 		page = list_entry(newlist.next, struct page, lru);
627 		list_del(&page->lru);
628 		__free_page(page);
629 	}
630 	list_splice(&failed, pagelist);
631 	if (err < 0)
632 		return err;
633 
634 	/* Calculate number of leftover pages */
635 	nr_pages = 0;
636 	list_for_each(p, pagelist)
637 		nr_pages++;
638 	return nr_pages;
639 }
640 
641 /*
642  * Migrate pages from one node to a target node.
643  * Returns error or the number of pages not migrated.
644  */
645 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
646 {
647 	nodemask_t nmask;
648 	LIST_HEAD(pagelist);
649 	int err = 0;
650 
651 	nodes_clear(nmask);
652 	node_set(source, nmask);
653 
654 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
655 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
656 
657 	if (!list_empty(&pagelist)) {
658 		err = migrate_pages_to(&pagelist, NULL, dest);
659 		if (!list_empty(&pagelist))
660 			putback_lru_pages(&pagelist);
661 	}
662 	return err;
663 }
664 
665 /*
666  * Move pages between the two nodesets so as to preserve the physical
667  * layout as much as possible.
668  *
669  * Returns the number of pages that could not be moved.
670  */
671 int do_migrate_pages(struct mm_struct *mm,
672 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
673 {
674 	LIST_HEAD(pagelist);
675 	int busy = 0;
676 	int err = 0;
677 	nodemask_t tmp;
678 
679 	down_read(&mm->mmap_sem);
680 
681 /*
682  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
683  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
684  * bit in 'tmp', and return that <source, dest> pair for migration.
685  * The pair of nodemasks 'to' and 'from' define the map.
686  *
687  * If no pair of bits is found that way, fallback to picking some
688  * pair of 'source' and 'dest' bits that are not the same.  If the
689  * 'source' and 'dest' bits are the same, this represents a node
690  * that will be migrating to itself, so no pages need move.
691  *
692  * If no bits are left in 'tmp', or if all remaining bits left
693  * in 'tmp' correspond to the same bit in 'to', return false
694  * (nothing left to migrate).
695  *
696  * This lets us pick a pair of nodes to migrate between, such that
697  * if possible the dest node is not already occupied by some other
698  * source node, minimizing the risk of overloading the memory on a
699  * node that would happen if we migrated incoming memory to a node
700  * before migrating outgoing memory source that same node.
701  *
702  * A single scan of tmp is sufficient.  As we go, we remember the
703  * most recent <s, d> pair that moved (s != d).  If we find a pair
704  * that not only moved, but what's better, moved to an empty slot
705  * (d is not set in tmp), then we break out with that pair.
706  * Otherwise when we finish scanning tmp, we at least have the
707  * most recent <s, d> pair that moved.  If we get all the way through
708  * the scan of tmp without finding any node that moved, much less
709  * moved to an empty node, then there is nothing left worth migrating.
710  */
711 
712 	tmp = *from_nodes;
713 	while (!nodes_empty(tmp)) {
714 		int s, d;
715 		int source = -1;
716 		int dest = 0;
717 
718 		for_each_node_mask(s, tmp) {
719 			d = node_remap(s, *from_nodes, *to_nodes);
720 			if (s == d)
721 				continue;
722 
723 			source = s;	/* Node moved. Memorize */
724 			dest = d;
725 
726 			/* dest not in remaining from nodes? */
727 			if (!node_isset(dest, tmp))
728 				break;
729 		}
730 		if (source == -1)
731 			break;
732 
733 		node_clear(source, tmp);
734 		err = migrate_to_node(mm, source, dest, flags);
735 		if (err > 0)
736 			busy += err;
737 		if (err < 0)
738 			break;
739 	}
740 
741 	up_read(&mm->mmap_sem);
742 	if (err < 0)
743 		return err;
744 	return busy;
745 }
746 
747 long do_mbind(unsigned long start, unsigned long len,
748 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
749 {
750 	struct vm_area_struct *vma;
751 	struct mm_struct *mm = current->mm;
752 	struct mempolicy *new;
753 	unsigned long end;
754 	int err;
755 	LIST_HEAD(pagelist);
756 
757 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
758 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
759 	    || mode > MPOL_MAX)
760 		return -EINVAL;
761 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
762 		return -EPERM;
763 
764 	if (start & ~PAGE_MASK)
765 		return -EINVAL;
766 
767 	if (mode == MPOL_DEFAULT)
768 		flags &= ~MPOL_MF_STRICT;
769 
770 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
771 	end = start + len;
772 
773 	if (end < start)
774 		return -EINVAL;
775 	if (end == start)
776 		return 0;
777 
778 	if (mpol_check_policy(mode, nmask))
779 		return -EINVAL;
780 
781 	new = mpol_new(mode, nmask);
782 	if (IS_ERR(new))
783 		return PTR_ERR(new);
784 
785 	/*
786 	 * If we are using the default policy then operating
787 	 * on discontinuous address ranges is okay after all.
788 	 */
789 	if (!new)
790 		flags |= MPOL_MF_DISCONTIG_OK;
791 
792 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
793 			mode, nmask ? nodes_addr(*nmask)[0] : -1);
794 
795 	down_write(&mm->mmap_sem);
796 	vma = check_range(mm, start, end, nmask,
797 			  flags | MPOL_MF_INVERT, &pagelist);
798 
799 	err = PTR_ERR(vma);
800 	if (!IS_ERR(vma)) {
801 		int nr_failed = 0;
802 
803 		err = mbind_range(vma, start, end, new);
804 
805 		if (!list_empty(&pagelist))
806 			nr_failed = migrate_pages_to(&pagelist, vma, -1);
807 
808 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
809 			err = -EIO;
810 	}
811 	if (!list_empty(&pagelist))
812 		putback_lru_pages(&pagelist);
813 
814 	up_write(&mm->mmap_sem);
815 	mpol_free(new);
816 	return err;
817 }
818 
819 /*
820  * User space interface with variable sized bitmaps for nodelists.
821  */
822 
823 /* Copy a node mask from user space. */
824 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
825 		     unsigned long maxnode)
826 {
827 	unsigned long k;
828 	unsigned long nlongs;
829 	unsigned long endmask;
830 
831 	--maxnode;
832 	nodes_clear(*nodes);
833 	if (maxnode == 0 || !nmask)
834 		return 0;
835 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
836 		return -EINVAL;
837 
838 	nlongs = BITS_TO_LONGS(maxnode);
839 	if ((maxnode % BITS_PER_LONG) == 0)
840 		endmask = ~0UL;
841 	else
842 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
843 
844 	/* When the user specified more nodes than supported just check
845 	   if the unsupported part is all zero. */
846 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
847 		if (nlongs > PAGE_SIZE/sizeof(long))
848 			return -EINVAL;
849 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
850 			unsigned long t;
851 			if (get_user(t, nmask + k))
852 				return -EFAULT;
853 			if (k == nlongs - 1) {
854 				if (t & endmask)
855 					return -EINVAL;
856 			} else if (t)
857 				return -EINVAL;
858 		}
859 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
860 		endmask = ~0UL;
861 	}
862 
863 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
864 		return -EFAULT;
865 	nodes_addr(*nodes)[nlongs-1] &= endmask;
866 	return 0;
867 }
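/*
 * Worked example (values are illustrative, 64-bit longs assumed): a caller
 * passing maxnode = 9 describes an 8-bit mask, so nlongs = 1 and
 * endmask = 0xff; only bits 0-7 of the copied word survive the final
 * "&= endmask".  With maxnode = 65 the mask is exactly one long wide, so
 * endmask = ~0UL and the word is taken as-is.  A mask wider than
 * MAX_NUMNODES is accepted only if all the excess bits are zero.
 */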
868 
869 /* Copy a kernel node mask to user space */
870 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
871 			      nodemask_t *nodes)
872 {
873 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
874 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
875 
876 	if (copy > nbytes) {
877 		if (copy > PAGE_SIZE)
878 			return -EINVAL;
879 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
880 			return -EFAULT;
881 		copy = nbytes;
882 	}
883 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
884 }
885 
886 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
887 			unsigned long mode,
888 			unsigned long __user *nmask, unsigned long maxnode,
889 			unsigned flags)
890 {
891 	nodemask_t nodes;
892 	int err;
893 
894 	err = get_nodes(&nodes, nmask, maxnode);
895 	if (err)
896 		return err;
897 	return do_mbind(start, len, mode, &nodes, flags);
898 }
899 
900 /* Set the process memory policy */
901 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
902 		unsigned long maxnode)
903 {
904 	int err;
905 	nodemask_t nodes;
906 
907 	if (mode < 0 || mode > MPOL_MAX)
908 		return -EINVAL;
909 	err = get_nodes(&nodes, nmask, maxnode);
910 	if (err)
911 		return err;
912 	return do_set_mempolicy(mode, &nodes);
913 }
914 
915 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
916 		const unsigned long __user *old_nodes,
917 		const unsigned long __user *new_nodes)
918 {
919 	struct mm_struct *mm;
920 	struct task_struct *task;
921 	nodemask_t old;
922 	nodemask_t new;
923 	nodemask_t task_nodes;
924 	int err;
925 
926 	err = get_nodes(&old, old_nodes, maxnode);
927 	if (err)
928 		return err;
929 
930 	err = get_nodes(&new, new_nodes, maxnode);
931 	if (err)
932 		return err;
933 
934 	/* Find the mm_struct */
935 	read_lock(&tasklist_lock);
936 	task = pid ? find_task_by_pid(pid) : current;
937 	if (!task) {
938 		read_unlock(&tasklist_lock);
939 		return -ESRCH;
940 	}
941 	mm = get_task_mm(task);
942 	read_unlock(&tasklist_lock);
943 
944 	if (!mm)
945 		return -EINVAL;
946 
947 	/*
948 	 * Check if this process has the right to modify the specified
949 	 * process. The right exists if the process has administrative
950 	 * capabilities, superuser privileges or the same
951 	 * userid as the target process.
952 	 */
953 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
954 	    (current->uid != task->suid) && (current->uid != task->uid) &&
955 	    !capable(CAP_SYS_NICE)) {
956 		err = -EPERM;
957 		goto out;
958 	}
959 
960 	task_nodes = cpuset_mems_allowed(task);
961 	/* Is the user allowed to access the target nodes? */
962 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
963 		err = -EPERM;
964 		goto out;
965 	}
966 
967 	err = do_migrate_pages(mm, &old, &new,
968 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
969 out:
970 	mmput(mm);
971 	return err;
972 }
973 
974 
975 /* Retrieve NUMA policy */
976 asmlinkage long sys_get_mempolicy(int __user *policy,
977 				unsigned long __user *nmask,
978 				unsigned long maxnode,
979 				unsigned long addr, unsigned long flags)
980 {
981 	int err, pval;
982 	nodemask_t nodes;
983 
984 	if (nmask != NULL && maxnode < MAX_NUMNODES)
985 		return -EINVAL;
986 
987 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
988 
989 	if (err)
990 		return err;
991 
992 	if (policy && put_user(pval, policy))
993 		return -EFAULT;
994 
995 	if (nmask)
996 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
997 
998 	return err;
999 }
1000 
1001 #ifdef CONFIG_COMPAT
1002 
1003 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1004 				     compat_ulong_t __user *nmask,
1005 				     compat_ulong_t maxnode,
1006 				     compat_ulong_t addr, compat_ulong_t flags)
1007 {
1008 	long err;
1009 	unsigned long __user *nm = NULL;
1010 	unsigned long nr_bits, alloc_size;
1011 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1012 
1013 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1014 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1015 
1016 	if (nmask)
1017 		nm = compat_alloc_user_space(alloc_size);
1018 
1019 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1020 
1021 	if (!err && nmask) {
1022 		err = copy_from_user(bm, nm, alloc_size);
1023 		/* ensure entire bitmap is zeroed */
1024 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1025 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1026 	}
1027 
1028 	return err;
1029 }
1030 
1031 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1032 				     compat_ulong_t maxnode)
1033 {
1034 	long err = 0;
1035 	unsigned long __user *nm = NULL;
1036 	unsigned long nr_bits, alloc_size;
1037 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1038 
1039 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1040 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1041 
1042 	if (nmask) {
1043 		err = compat_get_bitmap(bm, nmask, nr_bits);
1044 		nm = compat_alloc_user_space(alloc_size);
1045 		err |= copy_to_user(nm, bm, alloc_size);
1046 	}
1047 
1048 	if (err)
1049 		return -EFAULT;
1050 
1051 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1052 }
1053 
1054 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1055 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1056 			     compat_ulong_t maxnode, compat_ulong_t flags)
1057 {
1058 	long err = 0;
1059 	unsigned long __user *nm = NULL;
1060 	unsigned long nr_bits, alloc_size;
1061 	nodemask_t bm;
1062 
1063 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1064 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1065 
1066 	if (nmask) {
1067 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1068 		nm = compat_alloc_user_space(alloc_size);
1069 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1070 	}
1071 
1072 	if (err)
1073 		return -EFAULT;
1074 
1075 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1076 }
1077 
1078 #endif
1079 
1080 /* Return effective policy for a VMA */
1081 static struct mempolicy * get_vma_policy(struct task_struct *task,
1082 		struct vm_area_struct *vma, unsigned long addr)
1083 {
1084 	struct mempolicy *pol = task->mempolicy;
1085 
1086 	if (vma) {
1087 		if (vma->vm_ops && vma->vm_ops->get_policy)
1088 			pol = vma->vm_ops->get_policy(vma, addr);
1089 		else if (vma->vm_policy &&
1090 				vma->vm_policy->policy != MPOL_DEFAULT)
1091 			pol = vma->vm_policy;
1092 	}
1093 	if (!pol)
1094 		pol = &default_policy;
1095 	return pol;
1096 }
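/*
 * For example (an illustrative walk through the fallback order above): a
 * fault in a tmpfs mapping asks the file's shared policy via
 * vm_ops->get_policy(); an anonymous VMA that was mbind()ed uses its own
 * vm_policy; any other VMA inherits the task's policy; and a task that
 * never called set_mempolicy() ends up with default_policy.
 */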
1097 
1098 /* Return a zonelist representing a mempolicy */
1099 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1100 {
1101 	int nd;
1102 
1103 	switch (policy->policy) {
1104 	case MPOL_PREFERRED:
1105 		nd = policy->v.preferred_node;
1106 		if (nd < 0)
1107 			nd = numa_node_id();
1108 		break;
1109 	case MPOL_BIND:
1110 		/* Lower zones don't get a policy applied */
1111 		/* Careful: current->mems_allowed might have moved */
1112 		if (gfp_zone(gfp) >= policy_zone)
1113 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1114 				return policy->v.zonelist;
1115 		/*FALL THROUGH*/
1116 	case MPOL_INTERLEAVE: /* should not happen */
1117 	case MPOL_DEFAULT:
1118 		nd = numa_node_id();
1119 		break;
1120 	default:
1121 		nd = 0;
1122 		BUG();
1123 	}
1124 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1125 }
1126 
1127 /* Do dynamic interleaving for a process */
1128 static unsigned interleave_nodes(struct mempolicy *policy)
1129 {
1130 	unsigned nid, next;
1131 	struct task_struct *me = current;
1132 
1133 	nid = me->il_next;
1134 	next = next_node(nid, policy->v.nodes);
1135 	if (next >= MAX_NUMNODES)
1136 		next = first_node(policy->v.nodes);
1137 	me->il_next = next;
1138 	return nid;
1139 }
1140 
1141 /*
1142  * Depending on the memory policy provide a node from which to allocate the
1143  * next slab entry.
1144  */
1145 unsigned slab_node(struct mempolicy *policy)
1146 {
1147 	switch (policy->policy) {
1148 	case MPOL_INTERLEAVE:
1149 		return interleave_nodes(policy);
1150 
1151 	case MPOL_BIND:
1152 		/*
1153 		 * Follow bind policy behavior and start allocation at the
1154 		 * first node.
1155 		 */
1156 		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1157 
1158 	case MPOL_PREFERRED:
1159 		if (policy->v.preferred_node >= 0)
1160 			return policy->v.preferred_node;
1161 		/* Fall through */
1162 
1163 	default:
1164 		return numa_node_id();
1165 	}
1166 }
1167 
1168 /* Do static interleaving for a VMA with known offset. */
1169 static unsigned offset_il_node(struct mempolicy *pol,
1170 		struct vm_area_struct *vma, unsigned long off)
1171 {
1172 	unsigned nnodes = nodes_weight(pol->v.nodes);
1173 	unsigned target = (unsigned)off % nnodes;
1174 	int c;
1175 	int nid = -1;
1176 
1177 	c = 0;
1178 	do {
1179 		nid = next_node(nid, pol->v.nodes);
1180 		c++;
1181 	} while (c <= target);
1182 	return nid;
1183 }
1184 
1185 /* Determine a node number for interleave */
1186 static inline unsigned interleave_nid(struct mempolicy *pol,
1187 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1188 {
1189 	if (vma) {
1190 		unsigned long off;
1191 
1192 		off = vma->vm_pgoff;
1193 		off += (addr - vma->vm_start) >> shift;
1194 		return offset_il_node(pol, vma, off);
1195 	} else
1196 		return interleave_nodes(pol);
1197 }
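/*
 * Worked example (numbers are illustrative): for a file mapping with
 * vm_pgoff = 3, an interleave mask of {0, 2, 4} (nnodes = 3) and a fault
 * 5 pages into the VMA, off = 3 + 5 = 8 and target = 8 % 3 = 2, so the
 * third node in the mask, node 4, is chosen.  Because the node depends
 * only on the offset into the object, the same file page always lands on
 * the same node no matter where or when it is mapped.
 */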
1198 
1199 #ifdef CONFIG_HUGETLBFS
1200 /* Return a zonelist suitable for a huge page allocation. */
1201 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1202 {
1203 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1204 
1205 	if (pol->policy == MPOL_INTERLEAVE) {
1206 		unsigned nid;
1207 
1208 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1209 		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1210 	}
1211 	return zonelist_policy(GFP_HIGHUSER, pol);
1212 }
1213 #endif
1214 
1215 /* Allocate a page under the interleave policy.
1216    Separate path because it needs to do special accounting. */
1217 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1218 					unsigned nid)
1219 {
1220 	struct zonelist *zl;
1221 	struct page *page;
1222 
1223 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1224 	page = __alloc_pages(gfp, order, zl);
1225 	if (page && page_zone(page) == zl->zones[0]) {
1226 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1227 		put_cpu();
1228 	}
1229 	return page;
1230 }
1231 
1232 /**
1233  * 	alloc_page_vma	- Allocate a page for a VMA.
1234  *
1235  * 	@gfp:
1236  *      %GFP_USER    user allocation.
1237  *      %GFP_KERNEL  kernel allocations,
1238  *      %GFP_HIGHMEM highmem/user allocations,
1239  *      %GFP_FS      allocation should not call back into a file system.
1240  *      %GFP_ATOMIC  don't sleep.
1241  *
1242  * 	@vma:  Pointer to VMA or NULL if not available.
1243  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1244  *
1245  * 	This function allocates a page from the kernel page pool and applies
1246  *	a NUMA policy associated with the VMA or the current process.
1247  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1248  *	mm_struct of the VMA to prevent it from going away. Should be used for
1249  *	all allocations for pages that will be mapped into
1250  * 	user space. Returns NULL when no page can be allocated.
1251  *
1252  *	Should be called with the mmap_sem of the vma's mm held.
1253  */
1254 struct page *
1255 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1256 {
1257 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1258 
1259 	cpuset_update_task_memory_state();
1260 
1261 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1262 		unsigned nid;
1263 
1264 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1265 		return alloc_page_interleave(gfp, 0, nid);
1266 	}
1267 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1268 }
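/*
 * Illustrative sketch (an assumption modeled on the anonymous fault path,
 * not copied from it): a typical caller already holds the mmap_sem for
 * read and allocates the page that will back a faulting address:
 *
 *	struct page *page;
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *	clear_user_highpage(page, address);
 *	// ... then map it with the usual pte and rmap bookkeeping ...
 */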
1269 
1270 /**
1271  * 	alloc_pages_current - Allocate pages.
1272  *
1273  *	@gfp:
1274  *		%GFP_USER   user allocation,
1275  *      	%GFP_KERNEL kernel allocation,
1276  *      	%GFP_HIGHMEM highmem allocation,
1277  *      	%GFP_FS     don't call back into a file system.
1278  *      	%GFP_ATOMIC don't sleep.
1279  *	@order: Power of two of allocation size in pages. 0 is a single page.
1280  *
1281  *	Allocate a page from the kernel page pool.  When not in
1282  *	interrupt context, apply the current process' NUMA policy.
1283  *	Returns NULL when no page can be allocated.
1284  *
1285  *	Don't call cpuset_update_task_memory_state() unless
1286  *	1) it's ok to take cpuset_sem (can WAIT), and
1287  *	2) allocating for current task (not interrupt).
1288  */
1289 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1290 {
1291 	struct mempolicy *pol = current->mempolicy;
1292 
1293 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1294 		cpuset_update_task_memory_state();
1295 	if (!pol || in_interrupt())
1296 		pol = &default_policy;
1297 	if (pol->policy == MPOL_INTERLEAVE)
1298 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1299 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1300 }
1301 EXPORT_SYMBOL(alloc_pages_current);
1302 
1303 /*
1304  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1305  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1306  * with the mems_allowed returned by cpuset_mems_allowed().  This
1307  * keeps mempolicies cpuset-relative after the cpuset moves.  See
1308  * kernel/cpuset.c update_nodemask() for details.
1309  */
1310 void *cpuset_being_rebound;
1311 
1312 /* Slow path of a mempolicy copy */
1313 struct mempolicy *__mpol_copy(struct mempolicy *old)
1314 {
1315 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1316 
1317 	if (!new)
1318 		return ERR_PTR(-ENOMEM);
1319 	if (current_cpuset_is_being_rebound()) {
1320 		nodemask_t mems = cpuset_mems_allowed(current);
1321 		mpol_rebind_policy(old, &mems);
1322 	}
1323 	*new = *old;
1324 	atomic_set(&new->refcnt, 1);
1325 	if (new->policy == MPOL_BIND) {
1326 		int sz = ksize(old->v.zonelist);
1327 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1328 		if (!new->v.zonelist) {
1329 			kmem_cache_free(policy_cache, new);
1330 			return ERR_PTR(-ENOMEM);
1331 		}
1332 		memcpy(new->v.zonelist, old->v.zonelist, sz);
1333 	}
1334 	return new;
1335 }
1336 
1337 /* Slow path of a mempolicy comparison */
1338 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1339 {
1340 	if (!a || !b)
1341 		return 0;
1342 	if (a->policy != b->policy)
1343 		return 0;
1344 	switch (a->policy) {
1345 	case MPOL_DEFAULT:
1346 		return 1;
1347 	case MPOL_INTERLEAVE:
1348 		return nodes_equal(a->v.nodes, b->v.nodes);
1349 	case MPOL_PREFERRED:
1350 		return a->v.preferred_node == b->v.preferred_node;
1351 	case MPOL_BIND: {
1352 		int i;
1353 		for (i = 0; a->v.zonelist->zones[i]; i++)
1354 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1355 				return 0;
1356 		return b->v.zonelist->zones[i] == NULL;
1357 	}
1358 	default:
1359 		BUG();
1360 		return 0;
1361 	}
1362 }
1363 
1364 /* Slow path of a mpol destructor. */
1365 void __mpol_free(struct mempolicy *p)
1366 {
1367 	if (!atomic_dec_and_test(&p->refcnt))
1368 		return;
1369 	if (p->policy == MPOL_BIND)
1370 		kfree(p->v.zonelist);
1371 	p->policy = MPOL_DEFAULT;
1372 	kmem_cache_free(policy_cache, p);
1373 }
1374 
1375 /*
1376  * Shared memory backing store policy support.
1377  *
1378  * Remember policies even when nobody has shared memory mapped.
1379  * The policies are kept in a red-black tree linked from the inode.
1380  * They are protected by the sp->lock spinlock, which should be held
1381  * for any accesses to the tree.
1382  */
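/*
 * Worked example (offsets are illustrative): after mbind() of the first
 * 16 pages of a tmpfs file to MPOL_INTERLEAVE and the next 16 pages to
 * MPOL_BIND, the tree holds two sp_nodes keyed by file page offset,
 * [0,16) -> interleave and [16,32) -> bind.  A later lookup for index 20
 * returns the bind policy, whichever task faults the page and wherever
 * the file happens to be mapped.
 */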
1383 
1384 /* lookup first element intersecting start-end */
1385 /* Caller holds sp->lock */
1386 static struct sp_node *
1387 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1388 {
1389 	struct rb_node *n = sp->root.rb_node;
1390 
1391 	while (n) {
1392 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1393 
1394 		if (start >= p->end)
1395 			n = n->rb_right;
1396 		else if (end <= p->start)
1397 			n = n->rb_left;
1398 		else
1399 			break;
1400 	}
1401 	if (!n)
1402 		return NULL;
1403 	for (;;) {
1404 		struct sp_node *w = NULL;
1405 		struct rb_node *prev = rb_prev(n);
1406 		if (!prev)
1407 			break;
1408 		w = rb_entry(prev, struct sp_node, nd);
1409 		if (w->end <= start)
1410 			break;
1411 		n = prev;
1412 	}
1413 	return rb_entry(n, struct sp_node, nd);
1414 }
1415 
1416 /* Insert a new shared policy into the list. */
1417 /* Caller holds sp->lock */
1418 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1419 {
1420 	struct rb_node **p = &sp->root.rb_node;
1421 	struct rb_node *parent = NULL;
1422 	struct sp_node *nd;
1423 
1424 	while (*p) {
1425 		parent = *p;
1426 		nd = rb_entry(parent, struct sp_node, nd);
1427 		if (new->start < nd->start)
1428 			p = &(*p)->rb_left;
1429 		else if (new->end > nd->end)
1430 			p = &(*p)->rb_right;
1431 		else
1432 			BUG();
1433 	}
1434 	rb_link_node(&new->nd, parent, p);
1435 	rb_insert_color(&new->nd, &sp->root);
1436 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1437 		 new->policy ? new->policy->policy : 0);
1438 }
1439 
1440 /* Find shared policy intersecting idx */
1441 struct mempolicy *
1442 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1443 {
1444 	struct mempolicy *pol = NULL;
1445 	struct sp_node *sn;
1446 
1447 	if (!sp->root.rb_node)
1448 		return NULL;
1449 	spin_lock(&sp->lock);
1450 	sn = sp_lookup(sp, idx, idx+1);
1451 	if (sn) {
1452 		mpol_get(sn->policy);
1453 		pol = sn->policy;
1454 	}
1455 	spin_unlock(&sp->lock);
1456 	return pol;
1457 }
1458 
1459 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1460 {
1461 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1462 	rb_erase(&n->nd, &sp->root);
1463 	mpol_free(n->policy);
1464 	kmem_cache_free(sn_cache, n);
1465 }
1466 
1467 struct sp_node *
1468 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1469 {
1470 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1471 
1472 	if (!n)
1473 		return NULL;
1474 	n->start = start;
1475 	n->end = end;
1476 	mpol_get(pol);
1477 	n->policy = pol;
1478 	return n;
1479 }
1480 
1481 /* Replace a policy range. */
1482 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1483 				 unsigned long end, struct sp_node *new)
1484 {
1485 	struct sp_node *n, *new2 = NULL;
1486 
1487 restart:
1488 	spin_lock(&sp->lock);
1489 	n = sp_lookup(sp, start, end);
1490 	/* Take care of old policies in the same range. */
1491 	while (n && n->start < end) {
1492 		struct rb_node *next = rb_next(&n->nd);
1493 		if (n->start >= start) {
1494 			if (n->end <= end)
1495 				sp_delete(sp, n);
1496 			else
1497 				n->start = end;
1498 		} else {
1499 			/* Old policy spanning whole new range. */
1500 			if (n->end > end) {
1501 				if (!new2) {
1502 					spin_unlock(&sp->lock);
1503 					new2 = sp_alloc(end, n->end, n->policy);
1504 					if (!new2)
1505 						return -ENOMEM;
1506 					goto restart;
1507 				}
1508 				n->end = start;
1509 				sp_insert(sp, new2);
1510 				new2 = NULL;
1511 				break;
1512 			} else
1513 				n->end = start;
1514 		}
1515 		if (!next)
1516 			break;
1517 		n = rb_entry(next, struct sp_node, nd);
1518 	}
1519 	if (new)
1520 		sp_insert(sp, new);
1521 	spin_unlock(&sp->lock);
1522 	if (new2) {
1523 		mpol_free(new2->policy);
1524 		kmem_cache_free(sn_cache, new2);
1525 	}
1526 	return 0;
1527 }
1528 
1529 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1530 				nodemask_t *policy_nodes)
1531 {
1532 	info->root = RB_ROOT;
1533 	spin_lock_init(&info->lock);
1534 
1535 	if (policy != MPOL_DEFAULT) {
1536 		struct mempolicy *newpol;
1537 
1538 		/* Falls back to MPOL_DEFAULT on any error */
1539 		newpol = mpol_new(policy, policy_nodes);
1540 		if (!IS_ERR(newpol)) {
1541 			/* Create pseudo-vma that contains just the policy */
1542 			struct vm_area_struct pvma;
1543 
1544 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1545 			/* Policy covers entire file */
1546 			pvma.vm_end = TASK_SIZE;
1547 			mpol_set_shared_policy(info, &pvma, newpol);
1548 			mpol_free(newpol);
1549 		}
1550 	}
1551 }
1552 
1553 int mpol_set_shared_policy(struct shared_policy *info,
1554 			struct vm_area_struct *vma, struct mempolicy *npol)
1555 {
1556 	int err;
1557 	struct sp_node *new = NULL;
1558 	unsigned long sz = vma_pages(vma);
1559 
1560 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1561 		 vma->vm_pgoff,
1562 		 sz, npol? npol->policy : -1,
1563 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1564 
1565 	if (npol) {
1566 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1567 		if (!new)
1568 			return -ENOMEM;
1569 	}
1570 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1571 	if (err && new)
1572 		kmem_cache_free(sn_cache, new);
1573 	return err;
1574 }
1575 
1576 /* Free a backing policy store on inode delete. */
1577 void mpol_free_shared_policy(struct shared_policy *p)
1578 {
1579 	struct sp_node *n;
1580 	struct rb_node *next;
1581 
1582 	if (!p->root.rb_node)
1583 		return;
1584 	spin_lock(&p->lock);
1585 	next = rb_first(&p->root);
1586 	while (next) {
1587 		n = rb_entry(next, struct sp_node, nd);
1588 		next = rb_next(&n->nd);
1589 		rb_erase(&n->nd, &p->root);
1590 		mpol_free(n->policy);
1591 		kmem_cache_free(sn_cache, n);
1592 	}
1593 	spin_unlock(&p->lock);
1594 }
1595 
1596 /* assumes fs == KERNEL_DS */
1597 void __init numa_policy_init(void)
1598 {
1599 	policy_cache = kmem_cache_create("numa_policy",
1600 					 sizeof(struct mempolicy),
1601 					 0, SLAB_PANIC, NULL, NULL);
1602 
1603 	sn_cache = kmem_cache_create("shared_policy_node",
1604 				     sizeof(struct sp_node),
1605 				     0, SLAB_PANIC, NULL, NULL);
1606 
1607 	/* Set interleaving policy for system init. This way not all
1608 	   the data structures allocated at system boot end up on node zero. */
1609 
1610 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1611 		printk("numa_policy_init: interleaving failed\n");
1612 }
1613 
1614 /* Reset policy of current process to default */
1615 void numa_default_policy(void)
1616 {
1617 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1618 }
1619 
1620 /* Migrate a policy to a different set of nodes */
1621 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1622 {
1623 	nodemask_t *mpolmask;
1624 	nodemask_t tmp;
1625 
1626 	if (!pol)
1627 		return;
1628 	mpolmask = &pol->cpuset_mems_allowed;
1629 	if (nodes_equal(*mpolmask, *newmask))
1630 		return;
1631 
1632 	switch (pol->policy) {
1633 	case MPOL_DEFAULT:
1634 		break;
1635 	case MPOL_INTERLEAVE:
1636 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1637 		pol->v.nodes = tmp;
1638 		*mpolmask = *newmask;
1639 		current->il_next = node_remap(current->il_next,
1640 						*mpolmask, *newmask);
1641 		break;
1642 	case MPOL_PREFERRED:
1643 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1644 						*mpolmask, *newmask);
1645 		*mpolmask = *newmask;
1646 		break;
1647 	case MPOL_BIND: {
1648 		nodemask_t nodes;
1649 		struct zone **z;
1650 		struct zonelist *zonelist;
1651 
1652 		nodes_clear(nodes);
1653 		for (z = pol->v.zonelist->zones; *z; z++)
1654 			node_set((*z)->zone_pgdat->node_id, nodes);
1655 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1656 		nodes = tmp;
1657 
1658 		zonelist = bind_zonelist(&nodes);
1659 
1660 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1661 		 * If that old zonelist has no remaining mems_allowed nodes,
1662 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1663 		 */
1664 
1665 		if (zonelist) {
1666 			/* Good - got mem - substitute new zonelist */
1667 			kfree(pol->v.zonelist);
1668 			pol->v.zonelist = zonelist;
1669 		}
1670 		*mpolmask = *newmask;
1671 		break;
1672 	}
1673 	default:
1674 		BUG();
1675 		break;
1676 	}
1677 }
1678 
1679 /*
1680  * Wrapper for mpol_rebind_policy() that just requires the task
1681  * pointer, and updates the task's mempolicy.
1682  */
1683 
1684 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1685 {
1686 	mpol_rebind_policy(tsk->mempolicy, new);
1687 }
1688 
1689 /*
1690  * Rebind each vma in mm to new nodemask.
1691  *
1692  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1693  */
1694 
1695 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1696 {
1697 	struct vm_area_struct *vma;
1698 
1699 	down_write(&mm->mmap_sem);
1700 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1701 		mpol_rebind_policy(vma->vm_policy, new);
1702 	up_write(&mm->mmap_sem);
1703 }
1704 
1705 /*
1706  * Display pages allocated per node and memory policy via /proc.
1707  */
1708 
1709 static const char *policy_types[] = { "default", "prefer", "bind",
1710 				      "interleave" };
1711 
1712 /*
1713  * Convert a mempolicy into a string.
1714  * Returns the number of characters in buffer (if positive)
1715  * or an error (negative)
1716  */
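/*
 * For example (node numbers are illustrative): an interleave policy over
 * nodes 0-3 is rendered as "interleave=0-3", a preferred policy for node 1
 * as "prefer=1", and the default policy as just "default".
 */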
1717 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1718 {
1719 	char *p = buffer;
1720 	int l;
1721 	nodemask_t nodes;
1722 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1723 
1724 	switch (mode) {
1725 	case MPOL_DEFAULT:
1726 		nodes_clear(nodes);
1727 		break;
1728 
1729 	case MPOL_PREFERRED:
1730 		nodes_clear(nodes);
1731 		node_set(pol->v.preferred_node, nodes);
1732 		break;
1733 
1734 	case MPOL_BIND:
1735 		get_zonemask(pol, &nodes);
1736 		break;
1737 
1738 	case MPOL_INTERLEAVE:
1739 		nodes = pol->v.nodes;
1740 		break;
1741 
1742 	default:
1743 		BUG();
1744 		return -EFAULT;
1745 	}
1746 
1747 	l = strlen(policy_types[mode]);
1748 	if (buffer + maxlen < p + l + 1)
1749 		return -ENOSPC;
1750 
1751 	strcpy(p, policy_types[mode]);
1752 	p += l;
1753 
1754 	if (!nodes_empty(nodes)) {
1755 		if (buffer + maxlen < p + 2)
1756 			return -ENOSPC;
1757 		*p++ = '=';
1758 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1759 	}
1760 	return p - buffer;
1761 }
1762 
1763 struct numa_maps {
1764 	unsigned long pages;
1765 	unsigned long anon;
1766 	unsigned long active;
1767 	unsigned long writeback;
1768 	unsigned long mapcount_max;
1769 	unsigned long dirty;
1770 	unsigned long swapcache;
1771 	unsigned long node[MAX_NUMNODES];
1772 };
1773 
1774 static void gather_stats(struct page *page, void *private, int pte_dirty)
1775 {
1776 	struct numa_maps *md = private;
1777 	int count = page_mapcount(page);
1778 
1779 	md->pages++;
1780 	if (pte_dirty || PageDirty(page))
1781 		md->dirty++;
1782 
1783 	if (PageSwapCache(page))
1784 		md->swapcache++;
1785 
1786 	if (PageActive(page))
1787 		md->active++;
1788 
1789 	if (PageWriteback(page))
1790 		md->writeback++;
1791 
1792 	if (PageAnon(page))
1793 		md->anon++;
1794 
1795 	if (count > md->mapcount_max)
1796 		md->mapcount_max = count;
1797 
1798 	md->node[page_to_nid(page)]++;
1799 	cond_resched();
1800 }
1801 
1802 #ifdef CONFIG_HUGETLB_PAGE
1803 static void check_huge_range(struct vm_area_struct *vma,
1804 		unsigned long start, unsigned long end,
1805 		struct numa_maps *md)
1806 {
1807 	unsigned long addr;
1808 	struct page *page;
1809 
1810 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1811 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1812 		pte_t pte;
1813 
1814 		if (!ptep)
1815 			continue;
1816 
1817 		pte = *ptep;
1818 		if (pte_none(pte))
1819 			continue;
1820 
1821 		page = pte_page(pte);
1822 		if (!page)
1823 			continue;
1824 
1825 		gather_stats(page, md, pte_dirty(*ptep));
1826 	}
1827 }
1828 #else
1829 static inline void check_huge_range(struct vm_area_struct *vma,
1830 		unsigned long start, unsigned long end,
1831 		struct numa_maps *md)
1832 {
1833 }
1834 #endif
1835 
1836 int show_numa_map(struct seq_file *m, void *v)
1837 {
1838 	struct task_struct *task = m->private;
1839 	struct vm_area_struct *vma = v;
1840 	struct numa_maps *md;
1841 	struct file *file = vma->vm_file;
1842 	struct mm_struct *mm = vma->vm_mm;
1843 	int n;
1844 	char buffer[50];
1845 
1846 	if (!mm)
1847 		return 0;
1848 
1849 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850 	if (!md)
1851 		return 0;
1852 
1853 	mpol_to_str(buffer, sizeof(buffer),
1854 			get_vma_policy(task, vma, vma->vm_start));
1855 
1856 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857 
1858 	if (file) {
1859 		seq_printf(m, " file=");
1860 		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1861 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862 		seq_printf(m, " heap");
1863 	} else if (vma->vm_start <= mm->start_stack &&
1864 			vma->vm_end >= mm->start_stack) {
1865 		seq_printf(m, " stack");
1866 	}
1867 
1868 	if (is_vm_hugetlb_page(vma)) {
1869 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870 		seq_printf(m, " huge");
1871 	} else {
1872 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873 				&node_online_map, MPOL_MF_STATS, md);
1874 	}
1875 
1876 	if (!md->pages)
1877 		goto out;
1878 
1879 	if (md->anon)
1880 		seq_printf(m," anon=%lu",md->anon);
1881 
1882 	if (md->dirty)
1883 		seq_printf(m," dirty=%lu",md->dirty);
1884 
1885 	if (md->pages != md->anon && md->pages != md->dirty)
1886 		seq_printf(m, " mapped=%lu", md->pages);
1887 
1888 	if (md->mapcount_max > 1)
1889 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890 
1891 	if (md->swapcache)
1892 		seq_printf(m," swapcache=%lu", md->swapcache);
1893 
1894 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895 		seq_printf(m," active=%lu", md->active);
1896 
1897 	if (md->writeback)
1898 		seq_printf(m," writeback=%lu", md->writeback);
1899 
1900 	for_each_online_node(n)
1901 		if (md->node[n])
1902 			seq_printf(m, " N%d=%lu", n, md->node[n]);
1903 out:
1904 	seq_putc(m, '\n');
1905 	kfree(md);
1906 
1907 	if (m->count < m->size)
1908 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1909 	return 0;
1910 }
1911 
1912