xref: /linux/mm/mempolicy.c (revision 60b2737de1b1ddfdb90f3ba622634eb49d6f3603)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * Subject to the GNU Public License, version 2.
6  *
7  * NUMA policy allows the user to give hints in which node(s) memory should
8  * be allocated.
9  *
10  * Support four policies per VMA and per process:
11  *
12  * The VMA policy has priority over the process policy for a page fault.
13  *
14  * interleave     Allocate memory interleaved over a set of nodes,
15  *                with normal fallback if it fails.
16  *                For VMA based allocations this interleaves based on the
17  *                offset into the backing object or offset into the mapping
18  *                for anonymous memory. For process policy a per-process
19  *                counter is used.
20  * bind           Only allocate memory on a specific set of nodes,
21  *                no fallback.
22  * preferred      Try a specific node first before normal fallback.
23  *                As a special case node -1 here means do the allocation
24  *                on the local node. This is normally identical to default,
25  *                but useful to set in a VMA when you have a non-default
26  *                process policy.
27  * default        Allocate on the local node first, or when on a VMA
28  *                use the process policy. This is what Linux always did
29  *		  in a NUMA aware kernel and still does by, ahem, default.
30  *
31  * The process policy is applied for most non-interrupt memory allocations
32  * in that process' context. Interrupts ignore the policies and always
33  * try to allocate on the local node. The VMA policy is only applied for memory
34  * allocations for a VMA in the VM.
35  *
36  * Currently there are a few corner cases in swapping where the policy
37  * is not applied, but the majority should be handled. When process policy
38  * is used it is not remembered over swap outs/swap ins.
39  *
40  * Only the highest zone in the zone hierarchy gets policied. Allocations
41  * requesting a lower zone just use default policy. This implies that
42  * on systems with highmem, kernel lowmem allocations don't get policied.
43  * Same with GFP_DMA allocations.
44  *
45  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46  * all users and remembered even when nobody has the memory mapped.
47  */
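/*
 * Illustrative userspace usage (a sketch, not part of this file; it assumes
 * the set_mempolicy()/mbind() syscall wrappers declared in libnuma's
 * <numaif.h> and a hypothetical mapping p of len bytes):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	Interleave this process' future allocations over nodes 0 and 1:
 *		set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	Restrict an existing mapping to node 0 only, with no fallback:
 *		unsigned long node0 = 1UL << 0;
 *		mbind(p, len, MPOL_BIND, &node0, 8 * sizeof(node0),
 *		      MPOL_MF_STRICT);
 */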
48 
49 /* Notebook:
50    fix mmap readahead to honour policy and enable policy for any page cache
51    object
52    statistics for bigpages
53    global policy for page cache? currently it uses process policy. Requires
54    first item above.
55    handle mremap for shared memory (currently ignored for the policy)
56    grows down?
57    make bind policy root only? It can trigger OOM much faster and the
58    kernel is not always graceful with that.
59    could replace all the switch()es with a mempolicy_ops structure.
60 */
61 
62 #include <linux/mempolicy.h>
63 #include <linux/mm.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
81 
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
84 
85 #define PDprintk(fmt...)
86 
87 /* Highest zone. A specific allocation for a zone below that is not
88    policied. */
89 static int policy_zone;
90 
91 static struct mempolicy default_policy = {
92 	.refcnt = ATOMIC_INIT(1), /* never free it */
93 	.policy = MPOL_DEFAULT,
94 };
95 
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
98 {
99 	DECLARE_BITMAP(online2, MAX_NUMNODES);
100 
101 	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 	if (bitmap_empty(online2, MAX_NUMNODES))
103 		set_bit(0, online2);
104 	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 		return -EINVAL;
106 	return 0;
107 }
108 
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
111 {
112 	int empty = bitmap_empty(nodes, MAX_NUMNODES);
113 
114 	switch (mode) {
115 	case MPOL_DEFAULT:
116 		if (!empty)
117 			return -EINVAL;
118 		break;
119 	case MPOL_BIND:
120 	case MPOL_INTERLEAVE:
121 		/* Preferred will only use the first bit, but allow
122 		   more for now. */
123 		if (empty)
124 			return -EINVAL;
125 		break;
126 	}
127 	return nodes_online(nodes);
128 }
129 
130 /* Copy a node mask from user space. */
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 		     unsigned long maxnode, int mode)
133 {
134 	unsigned long k;
135 	unsigned long nlongs;
136 	unsigned long endmask;
137 
138 	--maxnode;
139 	bitmap_zero(nodes, MAX_NUMNODES);
140 	if (maxnode == 0 || !nmask)
141 		return 0;
142 
143 	nlongs = BITS_TO_LONGS(maxnode);
144 	if ((maxnode % BITS_PER_LONG) == 0)
145 		endmask = ~0UL;
146 	else
147 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148 
149 	/* When the user specified more nodes than supported just check
150 	   that the unsupported part is all zero. */
151 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 		if (nlongs > PAGE_SIZE/sizeof(long))
153 			return -EINVAL;
154 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 			unsigned long t;
156 			if (get_user(t,  nmask + k))
157 				return -EFAULT;
158 			if (k == nlongs - 1) {
159 				if (t & endmask)
160 					return -EINVAL;
161 			} else if (t)
162 				return -EINVAL;
163 		}
164 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 		endmask = ~0UL;
166 	}
167 
168 	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 		return -EFAULT;
170 	nodes[nlongs-1] &= endmask;
171 	/* Update current mems_allowed */
172 	cpuset_update_current_mems_allowed();
173 	/* Ignore nodes not set in current->mems_allowed */
174 	cpuset_restrict_to_mems_allowed(nodes);
175 	return mpol_check_policy(mode, nodes);
176 }
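/*
 * Worked example for get_nodes() (illustrative): a caller passing
 * maxnode = 17 describes a 16 bit mask after the decrement above, so on a
 * 64 bit kernel nlongs = 1 and endmask = (1UL << 16) - 1 = 0xffff; one long
 * is copied from user space and bits 16..63 are cleared.  With maxnode = 65
 * all 64 bits of the first long are kept (endmask = ~0UL).
 */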
177 
178 /* Generate a custom zonelist for the BIND policy. */
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
180 {
181 	struct zonelist *zl;
182 	int num, max, nd;
183 
184 	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 	if (!zl)
187 		return NULL;
188 	num = 0;
189 	for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 	     nd < MAX_NUMNODES;
191 	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 		int k;
193 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 			if (!z->present_pages)
196 				continue;
197 			zl->zones[num++] = z;
198 			if (k > policy_zone)
199 				policy_zone = k;
200 		}
201 	}
202 	BUG_ON(num >= max);
203 	zl->zones[num] = NULL;
204 	return zl;
205 }
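/*
 * For illustration (assuming a configuration where every node has its
 * DMA, NORMAL and HIGHMEM zones populated): bind_zonelist() for the node
 * set {0, 2} builds the zone order
 *
 *	node 0 HIGHMEM, node 0 NORMAL, node 0 DMA,
 *	node 2 HIGHMEM, node 2 NORMAL, node 2 DMA, NULL terminator
 *
 * i.e. zones run from highest to lowest within a node and nodes are
 * visited in ascending node id order; empty zones are skipped.
 */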
206 
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209 {
210 	struct mempolicy *policy;
211 
212 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 	if (mode == MPOL_DEFAULT)
214 		return NULL;
215 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 	if (!policy)
217 		return ERR_PTR(-ENOMEM);
218 	atomic_set(&policy->refcnt, 1);
219 	switch (mode) {
220 	case MPOL_INTERLEAVE:
221 		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 		break;
223 	case MPOL_PREFERRED:
224 		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 		if (policy->v.preferred_node >= MAX_NUMNODES)
226 			policy->v.preferred_node = -1;
227 		break;
228 	case MPOL_BIND:
229 		policy->v.zonelist = bind_zonelist(nodes);
230 		if (policy->v.zonelist == NULL) {
231 			kmem_cache_free(policy_cache, policy);
232 			return ERR_PTR(-ENOMEM);
233 		}
234 		break;
235 	}
236 	policy->policy = mode;
237 	return policy;
238 }
239 
240 /* Check that all existing pages in the range follow the policy. */
241 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242 		unsigned long addr, unsigned long end, unsigned long *nodes)
243 {
244 	pte_t *orig_pte;
245 	pte_t *pte;
246 
247 	spin_lock(&mm->page_table_lock);
248 	orig_pte = pte = pte_offset_map(pmd, addr);
249 	do {
250 		unsigned long pfn;
251 		unsigned int nid;
252 
253 		if (!pte_present(*pte))
254 			continue;
255 		pfn = pte_pfn(*pte);
256 		if (!pfn_valid(pfn))
257 			continue;
258 		nid = pfn_to_nid(pfn);
259 		if (!test_bit(nid, nodes))
260 			break;
261 	} while (pte++, addr += PAGE_SIZE, addr != end);
262 	pte_unmap(orig_pte);
263 	spin_unlock(&mm->page_table_lock);
264 	return addr != end;
265 }
266 
267 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 		unsigned long addr, unsigned long end, unsigned long *nodes)
269 {
270 	pmd_t *pmd;
271 	unsigned long next;
272 
273 	pmd = pmd_offset(pud, addr);
274 	do {
275 		next = pmd_addr_end(addr, end);
276 		if (pmd_none_or_clear_bad(pmd))
277 			continue;
278 		if (check_pte_range(mm, pmd, addr, next, nodes))
279 			return -EIO;
280 	} while (pmd++, addr = next, addr != end);
281 	return 0;
282 }
283 
284 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
285 		unsigned long addr, unsigned long end, unsigned long *nodes)
286 {
287 	pud_t *pud;
288 	unsigned long next;
289 
290 	pud = pud_offset(pgd, addr);
291 	do {
292 		next = pud_addr_end(addr, end);
293 		if (pud_none_or_clear_bad(pud))
294 			continue;
295 		if (check_pmd_range(mm, pud, addr, next, nodes))
296 			return -EIO;
297 	} while (pud++, addr = next, addr != end);
298 	return 0;
299 }
300 
301 static inline int check_pgd_range(struct mm_struct *mm,
302 		unsigned long addr, unsigned long end, unsigned long *nodes)
303 {
304 	pgd_t *pgd;
305 	unsigned long next;
306 
307 	pgd = pgd_offset(mm, addr);
308 	do {
309 		next = pgd_addr_end(addr, end);
310 		if (pgd_none_or_clear_bad(pgd))
311 			continue;
312 		if (check_pud_range(mm, pgd, addr, next, nodes))
313 			return -EIO;
314 	} while (pgd++, addr = next, addr != end);
315 	return 0;
316 }
317 
318 /* Step 1: check the range */
319 static struct vm_area_struct *
320 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 	    unsigned long *nodes, unsigned long flags)
322 {
323 	int err;
324 	struct vm_area_struct *first, *vma, *prev;
325 
326 	first = find_vma(mm, start);
327 	if (!first)
328 		return ERR_PTR(-EFAULT);
329 	prev = NULL;
330 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
331 		if (!vma->vm_next && vma->vm_end < end)
332 			return ERR_PTR(-EFAULT);
333 		if (prev && prev->vm_end < vma->vm_start)
334 			return ERR_PTR(-EFAULT);
335 		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
336 			err = check_pgd_range(vma->vm_mm,
337 					   vma->vm_start, vma->vm_end, nodes);
338 			if (err) {
339 				first = ERR_PTR(err);
340 				break;
341 			}
342 		}
343 		prev = vma;
344 	}
345 	return first;
346 }
347 
348 /* Apply policy to a single VMA */
349 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
350 {
351 	int err = 0;
352 	struct mempolicy *old = vma->vm_policy;
353 
354 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
355 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
356 		 vma->vm_ops, vma->vm_file,
357 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
358 
359 	if (vma->vm_ops && vma->vm_ops->set_policy)
360 		err = vma->vm_ops->set_policy(vma, new);
361 	if (!err) {
362 		mpol_get(new);
363 		vma->vm_policy = new;
364 		mpol_free(old);
365 	}
366 	return err;
367 }
368 
369 /* Step 2: apply policy to a range and do splits. */
370 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
371 		       unsigned long end, struct mempolicy *new)
372 {
373 	struct vm_area_struct *next;
374 	int err;
375 
376 	err = 0;
377 	for (; vma && vma->vm_start < end; vma = next) {
378 		next = vma->vm_next;
379 		if (vma->vm_start < start)
380 			err = split_vma(vma->vm_mm, vma, start, 1);
381 		if (!err && vma->vm_end > end)
382 			err = split_vma(vma->vm_mm, vma, end, 0);
383 		if (!err)
384 			err = policy_vma(vma, new);
385 		if (err)
386 			break;
387 	}
388 	return err;
389 }
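/*
 * Worked example (illustrative): for a single VMA covering
 * [0x1000, 0x8000) and an mbind() on [0x3000, 0x5000), mbind_range()
 * splits the VMA twice and applies the new policy only to the middle
 * piece, leaving
 *
 *	[0x1000, 0x3000) old policy
 *	[0x3000, 0x5000) new policy
 *	[0x5000, 0x8000) old policy
 */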
390 
391 /* Change policy for a memory range */
392 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
393 			  unsigned long mode,
394 			  unsigned long __user *nmask, unsigned long maxnode,
395 			  unsigned flags)
396 {
397 	struct vm_area_struct *vma;
398 	struct mm_struct *mm = current->mm;
399 	struct mempolicy *new;
400 	unsigned long end;
401 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
402 	int err;
403 
404 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
405 		return -EINVAL;
406 	if (start & ~PAGE_MASK)
407 		return -EINVAL;
408 	if (mode == MPOL_DEFAULT)
409 		flags &= ~MPOL_MF_STRICT;
410 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
411 	end = start + len;
412 	if (end < start)
413 		return -EINVAL;
414 	if (end == start)
415 		return 0;
416 
417 	err = get_nodes(nodes, nmask, maxnode, mode);
418 	if (err)
419 		return err;
420 
421 	new = mpol_new(mode, nodes);
422 	if (IS_ERR(new))
423 		return PTR_ERR(new);
424 
425 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
426 			mode,nodes[0]);
427 
428 	down_write(&mm->mmap_sem);
429 	vma = check_range(mm, start, end, nodes, flags);
430 	err = PTR_ERR(vma);
431 	if (!IS_ERR(vma))
432 		err = mbind_range(vma, start, end, new);
433 	up_write(&mm->mmap_sem);
434 	mpol_free(new);
435 	return err;
436 }
437 
438 /* Set the process memory policy */
439 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
440 				   unsigned long maxnode)
441 {
442 	int err;
443 	struct mempolicy *new;
444 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
445 
446 	if (mode > MPOL_MAX)
447 		return -EINVAL;
448 	err = get_nodes(nodes, nmask, maxnode, mode);
449 	if (err)
450 		return err;
451 	new = mpol_new(mode, nodes);
452 	if (IS_ERR(new))
453 		return PTR_ERR(new);
454 	mpol_free(current->mempolicy);
455 	current->mempolicy = new;
456 	if (new && new->policy == MPOL_INTERLEAVE)
457 		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
458 	return 0;
459 }
460 
461 /* Fill a zone bitmap for a policy */
462 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
463 {
464 	int i;
465 
466 	bitmap_zero(nodes, MAX_NUMNODES);
467 	switch (p->policy) {
468 	case MPOL_BIND:
469 		for (i = 0; p->v.zonelist->zones[i]; i++)
470 			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
471 		break;
472 	case MPOL_DEFAULT:
473 		break;
474 	case MPOL_INTERLEAVE:
475 		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
476 		break;
477 	case MPOL_PREFERRED:
478 		/* or use current node instead of online map? */
479 		if (p->v.preferred_node < 0)
480 			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
481 		else
482 			__set_bit(p->v.preferred_node, nodes);
483 		break;
484 	default:
485 		BUG();
486 	}
487 }
488 
489 static int lookup_node(struct mm_struct *mm, unsigned long addr)
490 {
491 	struct page *p;
492 	int err;
493 
494 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
495 	if (err >= 0) {
496 		err = page_to_nid(p);
497 		put_page(p);
498 	}
499 	return err;
500 }
501 
502 /* Copy a kernel node mask to user space */
503 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
504 			      void *nodes, unsigned nbytes)
505 {
506 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
507 
508 	if (copy > nbytes) {
509 		if (copy > PAGE_SIZE)
510 			return -EINVAL;
511 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
512 			return -EFAULT;
513 		copy = nbytes;
514 	}
515 	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
516 }
517 
518 /* Retrieve NUMA policy */
519 asmlinkage long sys_get_mempolicy(int __user *policy,
520 				  unsigned long __user *nmask,
521 				  unsigned long maxnode,
522 				  unsigned long addr, unsigned long flags)
523 {
524 	int err, pval;
525 	struct mm_struct *mm = current->mm;
526 	struct vm_area_struct *vma = NULL;
527 	struct mempolicy *pol = current->mempolicy;
528 
529 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
530 		return -EINVAL;
531 	if (nmask != NULL && maxnode < MAX_NUMNODES)
532 		return -EINVAL;
533 	if (flags & MPOL_F_ADDR) {
534 		down_read(&mm->mmap_sem);
535 		vma = find_vma_intersection(mm, addr, addr+1);
536 		if (!vma) {
537 			up_read(&mm->mmap_sem);
538 			return -EFAULT;
539 		}
540 		if (vma->vm_ops && vma->vm_ops->get_policy)
541 			pol = vma->vm_ops->get_policy(vma, addr);
542 		else
543 			pol = vma->vm_policy;
544 	} else if (addr)
545 		return -EINVAL;
546 
547 	if (!pol)
548 		pol = &default_policy;
549 
550 	if (flags & MPOL_F_NODE) {
551 		if (flags & MPOL_F_ADDR) {
552 			err = lookup_node(mm, addr);
553 			if (err < 0)
554 				goto out;
555 			pval = err;
556 		} else if (pol == current->mempolicy &&
557 				pol->policy == MPOL_INTERLEAVE) {
558 			pval = current->il_next;
559 		} else {
560 			err = -EINVAL;
561 			goto out;
562 		}
563 	} else
564 		pval = pol->policy;
565 
566 	if (vma) {
567 		up_read(&current->mm->mmap_sem);
568 		vma = NULL;
569 	}
570 
571 	if (policy && put_user(pval, policy))
572 		return -EFAULT;
573 
574 	err = 0;
575 	if (nmask) {
576 		DECLARE_BITMAP(nodes, MAX_NUMNODES);
577 		get_zonemask(pol, nodes);
578 		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
579 	}
580 
581  out:
582 	if (vma)
583 		up_read(&current->mm->mmap_sem);
584 	return err;
585 }
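/*
 * Illustrative query from userspace (a sketch assuming libnuma's
 * <numaif.h> wrapper and some mapped address addr): ask which node
 * currently backs the page at addr
 *
 *	int node = -1;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * With MPOL_F_NODE | MPOL_F_ADDR the policy output is reused to return the
 * node id of the page, faulting it in through lookup_node() if needed.
 */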
586 
587 #ifdef CONFIG_COMPAT
588 
589 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
590 				     compat_ulong_t __user *nmask,
591 				     compat_ulong_t maxnode,
592 				     compat_ulong_t addr, compat_ulong_t flags)
593 {
594 	long err;
595 	unsigned long __user *nm = NULL;
596 	unsigned long nr_bits, alloc_size;
597 	DECLARE_BITMAP(bm, MAX_NUMNODES);
598 
599 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
600 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
601 
602 	if (nmask)
603 		nm = compat_alloc_user_space(alloc_size);
604 
605 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
606 
607 	if (!err && nmask) {
608 		err = copy_from_user(bm, nm, alloc_size);
609 		/* ensure entire bitmap is zeroed */
610 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
611 		err |= compat_put_bitmap(nmask, bm, nr_bits);
612 	}
613 
614 	return err;
615 }
616 
617 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
618 				     compat_ulong_t maxnode)
619 {
620 	long err = 0;
621 	unsigned long __user *nm = NULL;
622 	unsigned long nr_bits, alloc_size;
623 	DECLARE_BITMAP(bm, MAX_NUMNODES);
624 
625 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
626 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
627 
628 	if (nmask) {
629 		err = compat_get_bitmap(bm, nmask, nr_bits);
630 		nm = compat_alloc_user_space(alloc_size);
631 		err |= copy_to_user(nm, bm, alloc_size);
632 	}
633 
634 	if (err)
635 		return -EFAULT;
636 
637 	return sys_set_mempolicy(mode, nm, nr_bits+1);
638 }
639 
640 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
641 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
642 			     compat_ulong_t maxnode, compat_ulong_t flags)
643 {
644 	long err = 0;
645 	unsigned long __user *nm = NULL;
646 	unsigned long nr_bits, alloc_size;
647 	DECLARE_BITMAP(bm, MAX_NUMNODES);
648 
649 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
650 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
651 
652 	if (nmask) {
653 		err = compat_get_bitmap(bm, nmask, nr_bits);
654 		nm = compat_alloc_user_space(alloc_size);
655 		err |= copy_to_user(nm, bm, alloc_size);
656 	}
657 
658 	if (err)
659 		return -EFAULT;
660 
661 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
662 }
663 
664 #endif
665 
666 /* Return effective policy for a VMA */
667 static struct mempolicy *
668 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
669 {
670 	struct mempolicy *pol = current->mempolicy;
671 
672 	if (vma) {
673 		if (vma->vm_ops && vma->vm_ops->get_policy)
674 		        pol = vma->vm_ops->get_policy(vma, addr);
675 		else if (vma->vm_policy &&
676 				vma->vm_policy->policy != MPOL_DEFAULT)
677 			pol = vma->vm_policy;
678 	}
679 	if (!pol)
680 		pol = &default_policy;
681 	return pol;
682 }
683 
684 /* Return a zonelist representing a mempolicy */
685 static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
686 {
687 	int nd;
688 
689 	switch (policy->policy) {
690 	case MPOL_PREFERRED:
691 		nd = policy->v.preferred_node;
692 		if (nd < 0)
693 			nd = numa_node_id();
694 		break;
695 	case MPOL_BIND:
696 		/* Lower zones don't get a policy applied */
697 		/* Careful: current->mems_allowed might have moved */
698 		if ((gfp & GFP_ZONEMASK) >= policy_zone)
699 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
700 				return policy->v.zonelist;
701 		/*FALL THROUGH*/
702 	case MPOL_INTERLEAVE: /* should not happen */
703 	case MPOL_DEFAULT:
704 		nd = numa_node_id();
705 		break;
706 	default:
707 		nd = 0;
708 		BUG();
709 	}
710 	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
711 }
712 
713 /* Do dynamic interleaving for a process */
714 static unsigned interleave_nodes(struct mempolicy *policy)
715 {
716 	unsigned nid, next;
717 	struct task_struct *me = current;
718 
719 	nid = me->il_next;
720 	BUG_ON(nid >= MAX_NUMNODES);
721 	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
722 	if (next >= MAX_NUMNODES)
723 		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
724 	me->il_next = next;
725 	return nid;
726 }
727 
728 /* Do static interleaving for a VMA with known offset. */
729 static unsigned offset_il_node(struct mempolicy *pol,
730 		struct vm_area_struct *vma, unsigned long off)
731 {
732 	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
733 	unsigned target = (unsigned)off % nnodes;
734 	int c;
735 	int nid = -1;
736 
737 	c = 0;
738 	do {
739 		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
740 		c++;
741 	} while (c <= target);
742 	BUG_ON(nid >= MAX_NUMNODES);
743 	BUG_ON(!test_bit(nid, pol->v.nodes));
744 	return nid;
745 }
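/*
 * Worked example (illustrative): with pol->v.nodes = {1, 3, 5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop above walks past one set
 * bit and returns the second one, node 3.  The same offset always maps to
 * the same node, which keeps VMA interleaving stable across repeated
 * faults of the same page.
 */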
746 
747 /* Allocate a page in interleaved policy.
748    Own path because it needs to do special accounting. */
749 static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
750 {
751 	struct zonelist *zl;
752 	struct page *page;
753 
754 	BUG_ON(!node_online(nid));
755 	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
756 	page = __alloc_pages(gfp, order, zl);
757 	if (page && page_zone(page) == zl->zones[0]) {
758 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
759 		put_cpu();
760 	}
761 	return page;
762 }
763 
764 /**
765  * 	alloc_page_vma	- Allocate a page for a VMA.
766  *
767  * 	@gfp:
768  *      %GFP_USER    user allocation.
769  *      %GFP_KERNEL  kernel allocations,
770  *      %GFP_HIGHMEM highmem/user allocations,
771  *      %GFP_FS      allocation should not call back into a file system.
772  *      %GFP_ATOMIC  don't sleep.
773  *
774  * 	@vma:  Pointer to VMA or NULL if not available.
775  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
776  *
777  * 	This function allocates a page from the kernel page pool and applies
778  *	a NUMA policy associated with the VMA or the current process.
779  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
780  *	mm_struct of the VMA to prevent it from going away. Should be used for
781  *	all allocations for pages that will be mapped into
782  * 	user space. Returns NULL when no page can be allocated.
783  *
784  *	Should be called with the mm_sem of the vma hold.
785  *	Should be called with the mmap_sem of the vma held.
786 struct page *
787 alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
788 {
789 	struct mempolicy *pol = get_vma_policy(vma, addr);
790 
791 	cpuset_update_current_mems_allowed();
792 
793 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
794 		unsigned nid;
795 		if (vma) {
796 			unsigned long off;
797 			BUG_ON(addr >= vma->vm_end);
798 			BUG_ON(addr < vma->vm_start);
799 			off = vma->vm_pgoff;
800 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
801 			nid = offset_il_node(pol, vma, off);
802 		} else {
803 			/* fall back to process interleaving */
804 			nid = interleave_nodes(pol);
805 		}
806 		return alloc_page_interleave(gfp, 0, nid);
807 	}
808 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
809 }
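/*
 * Typical use from a fault path (a sketch; the caller already holds the
 * mmap_sem for read as required above, and the error handling is only
 * indicative):
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * The effective policy comes from get_vma_policy(), so a tmpfs file with a
 * shared policy or an interleaved VMA is honoured instead of the plain
 * process policy.
 */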
810 
811 /**
812  * 	alloc_pages_current - Allocate pages.
813  *
814  *	@gfp:
815  *		%GFP_USER   user allocation,
816  *      	%GFP_KERNEL kernel allocation,
817  *      	%GFP_HIGHMEM highmem allocation,
818  *      	%GFP_FS     don't call back into a file system.
819  *      	%GFP_ATOMIC don't sleep.
820  *	@order: Power of two of allocation size in pages. 0 is a single page.
821  *
822  *	Allocate a page from the kernel page pool.  When not in
823  *	interrupt context, apply the current process' NUMA policy.
824  *	Returns NULL when no page can be allocated.
825  *
826  *	Don't call cpuset_update_current_mems_allowed() unless
827  *	1) it's ok to take cpuset_sem (can WAIT), and
828  *	2) allocating for current task (not interrupt).
829  */
830 struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
831 {
832 	struct mempolicy *pol = current->mempolicy;
833 
834 	if ((gfp & __GFP_WAIT) && !in_interrupt())
835 		cpuset_update_current_mems_allowed();
836 	if (!pol || in_interrupt())
837 		pol = &default_policy;
838 	if (pol->policy == MPOL_INTERLEAVE)
839 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
840 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
841 }
842 EXPORT_SYMBOL(alloc_pages_current);
843 
844 /* Slow path of a mempolicy copy */
845 struct mempolicy *__mpol_copy(struct mempolicy *old)
846 {
847 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
848 
849 	if (!new)
850 		return ERR_PTR(-ENOMEM);
851 	*new = *old;
852 	atomic_set(&new->refcnt, 1);
853 	if (new->policy == MPOL_BIND) {
854 		int sz = ksize(old->v.zonelist);
855 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
856 		if (!new->v.zonelist) {
857 			kmem_cache_free(policy_cache, new);
858 			return ERR_PTR(-ENOMEM);
859 		}
860 		memcpy(new->v.zonelist, old->v.zonelist, sz);
861 	}
862 	return new;
863 }
864 
865 /* Slow path of a mempolicy comparison */
866 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
867 {
868 	if (!a || !b)
869 		return 0;
870 	if (a->policy != b->policy)
871 		return 0;
872 	switch (a->policy) {
873 	case MPOL_DEFAULT:
874 		return 1;
875 	case MPOL_INTERLEAVE:
876 		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
877 	case MPOL_PREFERRED:
878 		return a->v.preferred_node == b->v.preferred_node;
879 	case MPOL_BIND: {
880 		int i;
881 		for (i = 0; a->v.zonelist->zones[i]; i++)
882 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
883 				return 0;
884 		return b->v.zonelist->zones[i] == NULL;
885 	}
886 	default:
887 		BUG();
888 		return 0;
889 	}
890 }
891 
892 /* Slow path of a mpol destructor. */
893 void __mpol_free(struct mempolicy *p)
894 {
895 	if (!atomic_dec_and_test(&p->refcnt))
896 		return;
897 	if (p->policy == MPOL_BIND)
898 		kfree(p->v.zonelist);
899 	p->policy = MPOL_DEFAULT;
900 	kmem_cache_free(policy_cache, p);
901 }
902 
903 /*
904  * Hugetlb policy. Same as above, just works with node numbers instead of
905  * zonelists.
906  */
907 
908 /* Find first node suitable for an allocation */
909 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
910 {
911 	struct mempolicy *pol = get_vma_policy(vma, addr);
912 
913 	switch (pol->policy) {
914 	case MPOL_DEFAULT:
915 		return numa_node_id();
916 	case MPOL_BIND:
917 		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
918 	case MPOL_INTERLEAVE:
919 		return interleave_nodes(pol);
920 	case MPOL_PREFERRED:
921 		return pol->v.preferred_node >= 0 ?
922 				pol->v.preferred_node : numa_node_id();
923 	}
924 	BUG();
925 	return 0;
926 }
927 
928 /* Check whether a node is a valid fallback node for an allocation */
929 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
930 {
931 	struct mempolicy *pol = get_vma_policy(vma, addr);
932 
933 	switch (pol->policy) {
934 	case MPOL_PREFERRED:
935 	case MPOL_DEFAULT:
936 	case MPOL_INTERLEAVE:
937 		return 1;
938 	case MPOL_BIND: {
939 		struct zone **z;
940 		for (z = pol->v.zonelist->zones; *z; z++)
941 			if ((*z)->zone_pgdat->node_id == nid)
942 				return 1;
943 		return 0;
944 	}
945 	default:
946 		BUG();
947 		return 0;
948 	}
949 }
950 
951 /*
952  * Shared memory backing store policy support.
953  *
954  * Remember policies even when nobody has shared memory mapped.
955  * The policies are kept in a red-black tree linked from the inode.
956  * They are protected by the sp->lock spinlock, which should be held
957  * for any accesses to the tree.
958  */
959 
960 /* lookup first element intersecting start-end */
961 /* Caller holds sp->lock */
962 static struct sp_node *
963 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
964 {
965 	struct rb_node *n = sp->root.rb_node;
966 
967 	while (n) {
968 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
969 
970 		if (start >= p->end)
971 			n = n->rb_right;
972 		else if (end <= p->start)
973 			n = n->rb_left;
974 		else
975 			break;
976 	}
977 	if (!n)
978 		return NULL;
979 	for (;;) {
980 		struct sp_node *w = NULL;
981 		struct rb_node *prev = rb_prev(n);
982 		if (!prev)
983 			break;
984 		w = rb_entry(prev, struct sp_node, nd);
985 		if (w->end <= start)
986 			break;
987 		n = prev;
988 	}
989 	return rb_entry(n, struct sp_node, nd);
990 }
991 
992 /* Insert a new shared policy into the list. */
993 /* Caller holds sp->lock */
994 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
995 {
996 	struct rb_node **p = &sp->root.rb_node;
997 	struct rb_node *parent = NULL;
998 	struct sp_node *nd;
999 
1000 	while (*p) {
1001 		parent = *p;
1002 		nd = rb_entry(parent, struct sp_node, nd);
1003 		if (new->start < nd->start)
1004 			p = &(*p)->rb_left;
1005 		else if (new->end > nd->end)
1006 			p = &(*p)->rb_right;
1007 		else
1008 			BUG();
1009 	}
1010 	rb_link_node(&new->nd, parent, p);
1011 	rb_insert_color(&new->nd, &sp->root);
1012 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1013 		 new->policy ? new->policy->policy : 0);
1014 }
1015 
1016 /* Find shared policy intersecting idx */
1017 struct mempolicy *
1018 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1019 {
1020 	struct mempolicy *pol = NULL;
1021 	struct sp_node *sn;
1022 
1023 	if (!sp->root.rb_node)
1024 		return NULL;
1025 	spin_lock(&sp->lock);
1026 	sn = sp_lookup(sp, idx, idx+1);
1027 	if (sn) {
1028 		mpol_get(sn->policy);
1029 		pol = sn->policy;
1030 	}
1031 	spin_unlock(&sp->lock);
1032 	return pol;
1033 }
1034 
1035 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1036 {
1037 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1038 	rb_erase(&n->nd, &sp->root);
1039 	mpol_free(n->policy);
1040 	kmem_cache_free(sn_cache, n);
1041 }
1042 
1043 struct sp_node *
1044 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1045 {
1046 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1047 
1048 	if (!n)
1049 		return NULL;
1050 	n->start = start;
1051 	n->end = end;
1052 	mpol_get(pol);
1053 	n->policy = pol;
1054 	return n;
1055 }
1056 
1057 /* Replace a policy range. */
1058 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1059 				 unsigned long end, struct sp_node *new)
1060 {
1061 	struct sp_node *n, *new2 = NULL;
1062 
1063 restart:
1064 	spin_lock(&sp->lock);
1065 	n = sp_lookup(sp, start, end);
1066 	/* Take care of old policies in the same range. */
1067 	while (n && n->start < end) {
1068 		struct rb_node *next = rb_next(&n->nd);
1069 		if (n->start >= start) {
1070 			if (n->end <= end)
1071 				sp_delete(sp, n);
1072 			else
1073 				n->start = end;
1074 		} else {
1075 			/* Old policy spanning whole new range. */
1076 			if (n->end > end) {
1077 				if (!new2) {
1078 					spin_unlock(&sp->lock);
1079 					new2 = sp_alloc(end, n->end, n->policy);
1080 					if (!new2)
1081 						return -ENOMEM;
1082 					goto restart;
1083 				}
1084 				n->end = start;
1085 				sp_insert(sp, new2);
1086 				new2 = NULL;
1087 				break;
1088 			} else
1089 				n->end = start;
1090 		}
1091 		if (!next)
1092 			break;
1093 		n = rb_entry(next, struct sp_node, nd);
1094 	}
1095 	if (new)
1096 		sp_insert(sp, new);
1097 	spin_unlock(&sp->lock);
1098 	if (new2) {
1099 		mpol_free(new2->policy);
1100 		kmem_cache_free(sn_cache, new2);
1101 	}
1102 	return 0;
1103 }
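/*
 * Worked example (illustrative): if the tree holds a single node covering
 * pgoff [0, 10) with policy A and shared_policy_replace() runs for [3, 6)
 * with policy B, the old node is shrunk to [0, 3), a new2 node carrying A
 * is inserted for [6, 10), and the new [3, 6) node with B goes in last,
 * leaving [0,3)=A, [3,6)=B, [6,10)=A.
 */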
1104 
1105 int mpol_set_shared_policy(struct shared_policy *info,
1106 			struct vm_area_struct *vma, struct mempolicy *npol)
1107 {
1108 	int err;
1109 	struct sp_node *new = NULL;
1110 	unsigned long sz = vma_pages(vma);
1111 
1112 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1113 		 vma->vm_pgoff,
1114 		 sz, npol? npol->policy : -1,
1115 		npol ? npol->v.nodes[0] : -1);
1116 
1117 	if (npol) {
1118 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1119 		if (!new)
1120 			return -ENOMEM;
1121 	}
1122 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1123 	if (err && new)
1124 		kmem_cache_free(sn_cache, new);
1125 	return err;
1126 }
1127 
1128 /* Free a backing policy store on inode delete. */
1129 void mpol_free_shared_policy(struct shared_policy *p)
1130 {
1131 	struct sp_node *n;
1132 	struct rb_node *next;
1133 
1134 	if (!p->root.rb_node)
1135 		return;
1136 	spin_lock(&p->lock);
1137 	next = rb_first(&p->root);
1138 	while (next) {
1139 		n = rb_entry(next, struct sp_node, nd);
1140 		next = rb_next(&n->nd);
1141 		mpol_free(n->policy);
1142 		kmem_cache_free(sn_cache, n);
1143 	}
1144 	spin_unlock(&p->lock);
1145 	p->root = RB_ROOT;
1146 }
1147 
1148 /* assumes fs == KERNEL_DS */
1149 void __init numa_policy_init(void)
1150 {
1151 	policy_cache = kmem_cache_create("numa_policy",
1152 					 sizeof(struct mempolicy),
1153 					 0, SLAB_PANIC, NULL, NULL);
1154 
1155 	sn_cache = kmem_cache_create("shared_policy_node",
1156 				     sizeof(struct sp_node),
1157 				     0, SLAB_PANIC, NULL, NULL);
1158 
1159 	/* Set interleaving policy for system init. This way not all
1160 	   the data structures allocated at system boot end up in node zero. */
1161 
1162 	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1163 							MAX_NUMNODES) < 0)
1164 		printk("numa_policy_init: interleaving failed\n");
1165 }
1166 
1167 /* Reset policy of current process to default.
1168  * Assumes fs == KERNEL_DS */
1169 void numa_default_policy(void)
1170 {
1171 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1172 }
1173