xref: /linux/mm/mempolicy.c (revision 54a8a2220c936a47840c9a3d74910c5a56fae2ed)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * Subject to the GNU Public License, version 2.
6  *
7  * NUMA policy allows the user to give hints in which node(s) memory should
8  * be allocated.
9  *
10  * Support four policies per VMA and per process:
11  *
12  * The VMA policy has priority over the process policy for a page fault.
13  *
14  * interleave     Allocate memory interleaved over a set of nodes,
15  *                with normal fallback if it fails.
16  *                For VMA based allocations this interleaves based on the
17  *                offset into the backing object or offset into the mapping
18  *                for anonymous memory. For process policy a process counter
19  *                is used.
20  * bind           Only allocate memory on a specific set of nodes,
21  *                no fallback.
22  * preferred      Try a specific node first before normal fallback.
23  *                As a special case node -1 here means do the allocation
24  *                on the local CPU. This is normally identical to default,
25  *                but useful to set in a VMA when you have a non default
26  *                process policy.
27  * default        Allocate on the local node first, or when on a VMA
28  *                use the process policy. This is what Linux always did
29  *		  in a NUMA aware kernel and still does by, ahem, default.
30  *
31  * The process policy is applied for most non interrupt memory allocations
32  * in that process' context. Interrupts ignore the policies and always
33  * try to allocate on the local CPU. The VMA policy is only applied for memory
34  * allocations for a VMA in the VM.
35  *
36  * Currently there are a few corner cases in swapping where the policy
37  * is not applied, but the majority should be handled. When process policy
38  * is used it is not remembered over swap outs/swap ins.
39  *
40  * Only the highest zone in the zone hierarchy gets policied. Allocations
41  * requesting a lower zone just use default policy. This implies that
42  * on systems with highmem, kernel lowmem allocations don't get policied.
43  * Same with GFP_DMA allocations.
44  *
45  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46  * all users and remembered even when nobody has memory mapped.
47  */
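/*
 * Illustrative userspace sketch (not part of the kernel source proper):
 * one way a process might ask for interleaving over nodes 0 and 1 on an
 * anonymous mapping, using the raw mbind() syscall implemented below.
 * __NR_mbind comes from the architecture's unistd.h and "len" is just a
 * placeholder for the mapping size; the snippet only shows how the mode,
 * nodemask, maxnode and flags arguments fit together.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	void *p = mmap(NULL, len, PROT_READ|PROT_WRITE,
 *		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 *	if (syscall(__NR_mbind, p, len, MPOL_INTERLEAVE, &mask,
 *		    sizeof(mask) * 8, MPOL_MF_STRICT) < 0)
 *		perror("mbind");
 */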
48 
49 /* Notebook:
50    fix mmap readahead to honour policy and enable policy for any page cache
51    object
52    statistics for bigpages
53    global policy for page cache? currently it uses process policy. Requires
54    first item above.
55    handle mremap for shared memory (currently ignored for the policy)
56    grows down?
57    make bind policy root only? It can trigger oom much faster and the
58    kernel is not always graceful about that.
59    could replace all the switch()es with a mempolicy_ops structure.
60 */
61 
62 #include <linux/mempolicy.h>
63 #include <linux/mm.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
81 
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
84 
85 #define PDprintk(fmt...)
86 
87 /* Highest zone. A specific allocation for a zone below that is not
88    policied. */
89 static int policy_zone;
90 
91 struct mempolicy default_policy = {
92 	.refcnt = ATOMIC_INIT(1), /* never free it */
93 	.policy = MPOL_DEFAULT,
94 };
95 
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
98 {
99 	DECLARE_BITMAP(online2, MAX_NUMNODES);
100 
101 	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 	if (bitmap_empty(online2, MAX_NUMNODES))
103 		set_bit(0, online2);
104 	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 		return -EINVAL;
106 	return 0;
107 }
108 
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
111 {
112 	int empty = bitmap_empty(nodes, MAX_NUMNODES);
113 
114 	switch (mode) {
115 	case MPOL_DEFAULT:
116 		if (!empty)
117 			return -EINVAL;
118 		break;
119 	case MPOL_BIND:
120 	case MPOL_INTERLEAVE:
121 		/* Preferred will only use the first bit, but allow
122 		   more for now. */
123 		if (empty)
124 			return -EINVAL;
125 		break;
126 	}
127 	return nodes_online(nodes);
128 }
129 
130 /* Copy a node mask from user space. */
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 		     unsigned long maxnode, int mode)
133 {
134 	unsigned long k;
135 	unsigned long nlongs;
136 	unsigned long endmask;
137 
138 	--maxnode;
139 	bitmap_zero(nodes, MAX_NUMNODES);
140 	if (maxnode == 0 || !nmask)
141 		return 0;
142 
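	/* endmask selects the valid low bits in the last word of the user mask. */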
143 	nlongs = BITS_TO_LONGS(maxnode);
144 	if ((maxnode % BITS_PER_LONG) == 0)
145 		endmask = ~0UL;
146 	else
147 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148 
149 	/* When the user specified more nodes than supported, just check
150 	   that the unsupported part is all zero. */
151 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 		if (nlongs > PAGE_SIZE/sizeof(long))
153 			return -EINVAL;
154 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 			unsigned long t;
156 			if (get_user(t,  nmask + k))
157 				return -EFAULT;
158 			if (k == nlongs - 1) {
159 				if (t & endmask)
160 					return -EINVAL;
161 			} else if (t)
162 				return -EINVAL;
163 		}
164 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 		endmask = ~0UL;
166 	}
167 
168 	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 		return -EFAULT;
170 	nodes[nlongs-1] &= endmask;
171 	/* Update current mems_allowed */
172 	cpuset_update_current_mems_allowed();
173 	/* Ignore nodes not set in current->mems_allowed */
174 	cpuset_restrict_to_mems_allowed(nodes);
175 	return mpol_check_policy(mode, nodes);
176 }
177 
178 /* Generate a custom zonelist for the BIND policy. */
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
180 {
181 	struct zonelist *zl;
182 	int num, max, nd;
183 
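	/* One slot per zone of every allowed node, plus the terminating NULL. */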
184 	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 	if (!zl)
187 		return NULL;
188 	num = 0;
189 	for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 	     nd < MAX_NUMNODES;
191 	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 		int k;
193 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 			if (!z->present_pages)
196 				continue;
197 			zl->zones[num++] = z;
198 			if (k > policy_zone)
199 				policy_zone = k;
200 		}
201 	}
202 	BUG_ON(num >= max);
203 	zl->zones[num] = NULL;
204 	return zl;
205 }
206 
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209 {
210 	struct mempolicy *policy;
211 
212 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 	if (mode == MPOL_DEFAULT)
214 		return NULL;
215 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 	if (!policy)
217 		return ERR_PTR(-ENOMEM);
218 	atomic_set(&policy->refcnt, 1);
219 	switch (mode) {
220 	case MPOL_INTERLEAVE:
221 		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 		break;
223 	case MPOL_PREFERRED:
224 		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 		if (policy->v.preferred_node >= MAX_NUMNODES)
226 			policy->v.preferred_node = -1;
227 		break;
228 	case MPOL_BIND:
229 		policy->v.zonelist = bind_zonelist(nodes);
230 		if (policy->v.zonelist == NULL) {
231 			kmem_cache_free(policy_cache, policy);
232 			return ERR_PTR(-ENOMEM);
233 		}
234 		break;
235 	}
236 	policy->policy = mode;
237 	return policy;
238 }
239 
240 /* Ensure all existing pages follow the policy. */
241 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242 		unsigned long addr, unsigned long end, unsigned long *nodes)
243 {
244 	pte_t *orig_pte;
245 	pte_t *pte;
246 
247 	spin_lock(&mm->page_table_lock);
248 	orig_pte = pte = pte_offset_map(pmd, addr);
249 	do {
250 		unsigned long pfn;
251 		unsigned int nid;
252 
253 		if (!pte_present(*pte))
254 			continue;
255 		pfn = pte_pfn(*pte);
256 		if (!pfn_valid(pfn))
257 			continue;
258 		nid = pfn_to_nid(pfn);
259 		if (!test_bit(nid, nodes))
260 			break;
261 	} while (pte++, addr += PAGE_SIZE, addr != end);
262 	pte_unmap(orig_pte);
263 	spin_unlock(&mm->page_table_lock);
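	/* addr != end means the walk stopped early at a page on a disallowed node. */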
264 	return addr != end;
265 }
266 
267 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 		unsigned long addr, unsigned long end, unsigned long *nodes)
269 {
270 	pmd_t *pmd;
271 	unsigned long next;
272 
273 	pmd = pmd_offset(pud, addr);
274 	do {
275 		next = pmd_addr_end(addr, end);
276 		if (pmd_none_or_clear_bad(pmd))
277 			continue;
278 		if (check_pte_range(mm, pmd, addr, next, nodes))
279 			return -EIO;
280 	} while (pmd++, addr = next, addr != end);
281 	return 0;
282 }
283 
284 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
285 		unsigned long addr, unsigned long end, unsigned long *nodes)
286 {
287 	pud_t *pud;
288 	unsigned long next;
289 
290 	pud = pud_offset(pgd, addr);
291 	do {
292 		next = pud_addr_end(addr, end);
293 		if (pud_none_or_clear_bad(pud))
294 			continue;
295 		if (check_pmd_range(mm, pud, addr, next, nodes))
296 			return -EIO;
297 	} while (pud++, addr = next, addr != end);
298 	return 0;
299 }
300 
301 static inline int check_pgd_range(struct mm_struct *mm,
302 		unsigned long addr, unsigned long end, unsigned long *nodes)
303 {
304 	pgd_t *pgd;
305 	unsigned long next;
306 
307 	pgd = pgd_offset(mm, addr);
308 	do {
309 		next = pgd_addr_end(addr, end);
310 		if (pgd_none_or_clear_bad(pgd))
311 			continue;
312 		if (check_pud_range(mm, pgd, addr, next, nodes))
313 			return -EIO;
314 	} while (pgd++, addr = next, addr != end);
315 	return 0;
316 }
317 
318 /* Step 1: check the range */
319 static struct vm_area_struct *
320 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 	    unsigned long *nodes, unsigned long flags)
322 {
323 	int err;
324 	struct vm_area_struct *first, *vma, *prev;
325 
326 	first = find_vma(mm, start);
327 	if (!first)
328 		return ERR_PTR(-EFAULT);
329 	prev = NULL;
330 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
331 		if (!vma->vm_next && vma->vm_end < end)
332 			return ERR_PTR(-EFAULT);
333 		if (prev && prev->vm_end < vma->vm_start)
334 			return ERR_PTR(-EFAULT);
335 		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
336 			unsigned long endvma = vma->vm_end;
337 			if (endvma > end)
338 				endvma = end;
339 			if (vma->vm_start > start)
340 				start = vma->vm_start;
341 			err = check_pgd_range(vma->vm_mm,
342 					   start, endvma, nodes);
343 			if (err) {
344 				first = ERR_PTR(err);
345 				break;
346 			}
347 		}
348 		prev = vma;
349 	}
350 	return first;
351 }
352 
353 /* Apply policy to a single VMA */
354 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
355 {
356 	int err = 0;
357 	struct mempolicy *old = vma->vm_policy;
358 
359 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
360 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
361 		 vma->vm_ops, vma->vm_file,
362 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
363 
364 	if (vma->vm_ops && vma->vm_ops->set_policy)
365 		err = vma->vm_ops->set_policy(vma, new);
366 	if (!err) {
367 		mpol_get(new);
368 		vma->vm_policy = new;
369 		mpol_free(old);
370 	}
371 	return err;
372 }
373 
374 /* Step 2: apply policy to a range and do splits. */
375 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
376 		       unsigned long end, struct mempolicy *new)
377 {
378 	struct vm_area_struct *next;
379 	int err;
380 
381 	err = 0;
382 	for (; vma && vma->vm_start < end; vma = next) {
383 		next = vma->vm_next;
384 		if (vma->vm_start < start)
385 			err = split_vma(vma->vm_mm, vma, start, 1);
386 		if (!err && vma->vm_end > end)
387 			err = split_vma(vma->vm_mm, vma, end, 0);
388 		if (!err)
389 			err = policy_vma(vma, new);
390 		if (err)
391 			break;
392 	}
393 	return err;
394 }
395 
396 /* Change policy for a memory range */
397 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
398 			  unsigned long mode,
399 			  unsigned long __user *nmask, unsigned long maxnode,
400 			  unsigned flags)
401 {
402 	struct vm_area_struct *vma;
403 	struct mm_struct *mm = current->mm;
404 	struct mempolicy *new;
405 	unsigned long end;
406 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
407 	int err;
408 
409 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
410 		return -EINVAL;
411 	if (start & ~PAGE_MASK)
412 		return -EINVAL;
413 	if (mode == MPOL_DEFAULT)
414 		flags &= ~MPOL_MF_STRICT;
415 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
416 	end = start + len;
417 	if (end < start)
418 		return -EINVAL;
419 	if (end == start)
420 		return 0;
421 
422 	err = get_nodes(nodes, nmask, maxnode, mode);
423 	if (err)
424 		return err;
425 
426 	new = mpol_new(mode, nodes);
427 	if (IS_ERR(new))
428 		return PTR_ERR(new);
429 
430 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
431 			mode,nodes[0]);
432 
433 	down_write(&mm->mmap_sem);
434 	vma = check_range(mm, start, end, nodes, flags);
435 	err = PTR_ERR(vma);
436 	if (!IS_ERR(vma))
437 		err = mbind_range(vma, start, end, new);
438 	up_write(&mm->mmap_sem);
439 	mpol_free(new);
440 	return err;
441 }
442 
443 /* Set the process memory policy */
444 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
445 				   unsigned long maxnode)
446 {
447 	int err;
448 	struct mempolicy *new;
449 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
450 
451 	if (mode < 0 || mode > MPOL_MAX)
452 		return -EINVAL;
453 	err = get_nodes(nodes, nmask, maxnode, mode);
454 	if (err)
455 		return err;
456 	new = mpol_new(mode, nodes);
457 	if (IS_ERR(new))
458 		return PTR_ERR(new);
459 	mpol_free(current->mempolicy);
460 	current->mempolicy = new;
461 	if (new && new->policy == MPOL_INTERLEAVE)
462 		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
463 	return 0;
464 }
465 
466 /* Fill a bitmap of nodes for a policy */
467 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
468 {
469 	int i;
470 
471 	bitmap_zero(nodes, MAX_NUMNODES);
472 	switch (p->policy) {
473 	case MPOL_BIND:
474 		for (i = 0; p->v.zonelist->zones[i]; i++)
475 			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
476 		break;
477 	case MPOL_DEFAULT:
478 		break;
479 	case MPOL_INTERLEAVE:
480 		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
481 		break;
482 	case MPOL_PREFERRED:
483 		/* or use current node instead of online map? */
484 		if (p->v.preferred_node < 0)
485 			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
486 		else
487 			__set_bit(p->v.preferred_node, nodes);
488 		break;
489 	default:
490 		BUG();
491 	}
492 }
493 
494 static int lookup_node(struct mm_struct *mm, unsigned long addr)
495 {
496 	struct page *p;
497 	int err;
498 
499 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
500 	if (err >= 0) {
501 		err = page_to_nid(p);
502 		put_page(p);
503 	}
504 	return err;
505 }
506 
507 /* Copy a kernel node mask to user space */
508 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
509 			      void *nodes, unsigned nbytes)
510 {
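	/* The user-visible mask size is rounded up to whole 64-bit words. */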
511 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
512 
513 	if (copy > nbytes) {
514 		if (copy > PAGE_SIZE)
515 			return -EINVAL;
516 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
517 			return -EFAULT;
518 		copy = nbytes;
519 	}
520 	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
521 }
522 
523 /* Retrieve NUMA policy */
524 asmlinkage long sys_get_mempolicy(int __user *policy,
525 				  unsigned long __user *nmask,
526 				  unsigned long maxnode,
527 				  unsigned long addr, unsigned long flags)
528 {
529 	int err, pval;
530 	struct mm_struct *mm = current->mm;
531 	struct vm_area_struct *vma = NULL;
532 	struct mempolicy *pol = current->mempolicy;
533 
534 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
535 		return -EINVAL;
536 	if (nmask != NULL && maxnode < MAX_NUMNODES)
537 		return -EINVAL;
538 	if (flags & MPOL_F_ADDR) {
539 		down_read(&mm->mmap_sem);
540 		vma = find_vma_intersection(mm, addr, addr+1);
541 		if (!vma) {
542 			up_read(&mm->mmap_sem);
543 			return -EFAULT;
544 		}
545 		if (vma->vm_ops && vma->vm_ops->get_policy)
546 			pol = vma->vm_ops->get_policy(vma, addr);
547 		else
548 			pol = vma->vm_policy;
549 	} else if (addr)
550 		return -EINVAL;
551 
552 	if (!pol)
553 		pol = &default_policy;
554 
555 	if (flags & MPOL_F_NODE) {
556 		if (flags & MPOL_F_ADDR) {
557 			err = lookup_node(mm, addr);
558 			if (err < 0)
559 				goto out;
560 			pval = err;
561 		} else if (pol == current->mempolicy &&
562 				pol->policy == MPOL_INTERLEAVE) {
563 			pval = current->il_next;
564 		} else {
565 			err = -EINVAL;
566 			goto out;
567 		}
568 	} else
569 		pval = pol->policy;
570 
571 	if (vma) {
572 		up_read(&current->mm->mmap_sem);
573 		vma = NULL;
574 	}
575 
576 	if (policy && put_user(pval, policy))
577 		return -EFAULT;
578 
579 	err = 0;
580 	if (nmask) {
581 		DECLARE_BITMAP(nodes, MAX_NUMNODES);
582 		get_zonemask(pol, nodes);
583 		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
584 	}
585 
586  out:
587 	if (vma)
588 		up_read(&current->mm->mmap_sem);
589 	return err;
590 }
591 
592 #ifdef CONFIG_COMPAT
593 
594 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
595 				     compat_ulong_t __user *nmask,
596 				     compat_ulong_t maxnode,
597 				     compat_ulong_t addr, compat_ulong_t flags)
598 {
599 	long err;
600 	unsigned long __user *nm = NULL;
601 	unsigned long nr_bits, alloc_size;
602 	DECLARE_BITMAP(bm, MAX_NUMNODES);
603 
604 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
605 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
606 
607 	if (nmask)
608 		nm = compat_alloc_user_space(alloc_size);
609 
610 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
611 
612 	if (!err && nmask) {
613 		err = copy_from_user(bm, nm, alloc_size);
614 		/* ensure entire bitmap is zeroed */
615 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
616 		err |= compat_put_bitmap(nmask, bm, nr_bits);
617 	}
618 
619 	return err;
620 }
621 
622 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
623 				     compat_ulong_t maxnode)
624 {
625 	long err = 0;
626 	unsigned long __user *nm = NULL;
627 	unsigned long nr_bits, alloc_size;
628 	DECLARE_BITMAP(bm, MAX_NUMNODES);
629 
630 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
631 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
632 
633 	if (nmask) {
634 		err = compat_get_bitmap(bm, nmask, nr_bits);
635 		nm = compat_alloc_user_space(alloc_size);
636 		err |= copy_to_user(nm, bm, alloc_size);
637 	}
638 
639 	if (err)
640 		return -EFAULT;
641 
642 	return sys_set_mempolicy(mode, nm, nr_bits+1);
643 }
644 
645 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
646 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
647 			     compat_ulong_t maxnode, compat_ulong_t flags)
648 {
649 	long err = 0;
650 	unsigned long __user *nm = NULL;
651 	unsigned long nr_bits, alloc_size;
652 	DECLARE_BITMAP(bm, MAX_NUMNODES);
653 
654 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
655 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
656 
657 	if (nmask) {
658 		err = compat_get_bitmap(bm, nmask, nr_bits);
659 		nm = compat_alloc_user_space(alloc_size);
660 		err |= copy_to_user(nm, bm, alloc_size);
661 	}
662 
663 	if (err)
664 		return -EFAULT;
665 
666 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
667 }
668 
669 #endif
670 
671 /* Return effective policy for a VMA */
672 struct mempolicy *
673 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
674 {
675 	struct mempolicy *pol = task->mempolicy;
676 
677 	if (vma) {
678 		if (vma->vm_ops && vma->vm_ops->get_policy)
679 		        pol = vma->vm_ops->get_policy(vma, addr);
680 		else if (vma->vm_policy &&
681 				vma->vm_policy->policy != MPOL_DEFAULT)
682 			pol = vma->vm_policy;
683 	}
684 	if (!pol)
685 		pol = &default_policy;
686 	return pol;
687 }
688 
689 /* Return a zonelist representing a mempolicy */
690 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
691 {
692 	int nd;
693 
694 	switch (policy->policy) {
695 	case MPOL_PREFERRED:
696 		nd = policy->v.preferred_node;
697 		if (nd < 0)
698 			nd = numa_node_id();
699 		break;
700 	case MPOL_BIND:
701 		/* Lower zones don't get a policy applied */
702 		/* Careful: current->mems_allowed might have moved */
703 		if ((gfp & GFP_ZONEMASK) >= policy_zone)
704 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
705 				return policy->v.zonelist;
706 		/*FALL THROUGH*/
707 	case MPOL_INTERLEAVE: /* should not happen */
708 	case MPOL_DEFAULT:
709 		nd = numa_node_id();
710 		break;
711 	default:
712 		nd = 0;
713 		BUG();
714 	}
715 	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
716 }
717 
718 /* Do dynamic interleaving for a process */
719 static unsigned interleave_nodes(struct mempolicy *policy)
720 {
721 	unsigned nid, next;
722 	struct task_struct *me = current;
723 
724 	nid = me->il_next;
725 	BUG_ON(nid >= MAX_NUMNODES);
726 	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
727 	if (next >= MAX_NUMNODES)
728 		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
729 	me->il_next = next;
730 	return nid;
731 }
732 
733 /* Do static interleaving for a VMA with known offset. */
734 static unsigned offset_il_node(struct mempolicy *pol,
735 		struct vm_area_struct *vma, unsigned long off)
736 {
737 	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
738 	unsigned target = (unsigned)off % nnodes;
739 	int c;
740 	int nid = -1;
741 
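	/* Walk the allowed nodes until the target'th set bit is reached. */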
742 	c = 0;
743 	do {
744 		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
745 		c++;
746 	} while (c <= target);
747 	BUG_ON(nid >= MAX_NUMNODES);
748 	BUG_ON(!test_bit(nid, pol->v.nodes));
749 	return nid;
750 }
751 
752 /* Allocate a page under the interleave policy.
753    Own path because it needs to do special accounting. */
754 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
755 {
756 	struct zonelist *zl;
757 	struct page *page;
758 
759 	BUG_ON(!node_online(nid));
760 	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
761 	page = __alloc_pages(gfp, order, zl);
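	/* Count an interleave hit only if the page came from the target node's first zone. */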
762 	if (page && page_zone(page) == zl->zones[0]) {
763 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
764 		put_cpu();
765 	}
766 	return page;
767 }
768 
769 /**
770  * 	alloc_page_vma	- Allocate a page for a VMA.
771  *
772  * 	@gfp:
773  *      %GFP_USER    user allocation.
774  *      %GFP_KERNEL  kernel allocations,
775  *      %GFP_HIGHMEM highmem/user allocations,
776  *      %GFP_FS      allocation should not call back into a file system.
777  *      %GFP_ATOMIC  don't sleep.
778  *
779  * 	@vma:  Pointer to VMA or NULL if not available.
780  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
781  *
782  * 	This function allocates a page from the kernel page pool and applies
783  *	a NUMA policy associated with the VMA or the current process.
784  *	When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
785  *	mm_struct of the VMA to prevent it from going away. Should be used for
786  *	all allocations for pages that will be mapped into
787  * 	user space. Returns NULL when no page can be allocated.
788  *
789  *	Should be called with the mmap_sem of the vma held.
790  */
791 struct page *
792 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
793 {
794 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
795 
796 	cpuset_update_current_mems_allowed();
797 
798 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
799 		unsigned nid;
800 		if (vma) {
801 			unsigned long off;
802 			BUG_ON(addr >= vma->vm_end);
803 			BUG_ON(addr < vma->vm_start);
804 			off = vma->vm_pgoff;
805 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
806 			nid = offset_il_node(pol, vma, off);
807 		} else {
808 			/* fall back to process interleaving */
809 			nid = interleave_nodes(pol);
810 		}
811 		return alloc_page_interleave(gfp, 0, nid);
812 	}
813 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
814 }
815 
816 /**
817  * 	alloc_pages_current - Allocate pages.
818  *
819  *	@gfp:
820  *		%GFP_USER   user allocation,
821  *      	%GFP_KERNEL kernel allocation,
822  *      	%GFP_HIGHMEM highmem allocation,
823  *      	%GFP_FS     don't call back into a file system.
824  *      	%GFP_ATOMIC don't sleep.
825  *	@order: Power of two of allocation size in pages. 0 is a single page.
826  *
827  *	Allocate a page from the kernel page pool.  When not in
828  *	interrupt context, apply the current process NUMA policy.
829  *	Returns NULL when no page can be allocated.
830  *
831  *	Don't call cpuset_update_current_mems_allowed() unless
832  *	1) it's ok to take cpuset_sem (can WAIT), and
833  *	2) allocating for current task (not interrupt).
834  */
835 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
836 {
837 	struct mempolicy *pol = current->mempolicy;
838 
839 	if ((gfp & __GFP_WAIT) && !in_interrupt())
840 		cpuset_update_current_mems_allowed();
841 	if (!pol || in_interrupt())
842 		pol = &default_policy;
843 	if (pol->policy == MPOL_INTERLEAVE)
844 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
845 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
846 }
847 EXPORT_SYMBOL(alloc_pages_current);
848 
849 /* Slow path of a mempolicy copy */
850 struct mempolicy *__mpol_copy(struct mempolicy *old)
851 {
852 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
853 
854 	if (!new)
855 		return ERR_PTR(-ENOMEM);
856 	*new = *old;
857 	atomic_set(&new->refcnt, 1);
858 	if (new->policy == MPOL_BIND) {
859 		int sz = ksize(old->v.zonelist);
860 		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
861 		if (!new->v.zonelist) {
862 			kmem_cache_free(policy_cache, new);
863 			return ERR_PTR(-ENOMEM);
864 		}
865 		memcpy(new->v.zonelist, old->v.zonelist, sz);
866 	}
867 	return new;
868 }
869 
870 /* Slow path of a mempolicy comparison */
871 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
872 {
873 	if (!a || !b)
874 		return 0;
875 	if (a->policy != b->policy)
876 		return 0;
877 	switch (a->policy) {
878 	case MPOL_DEFAULT:
879 		return 1;
880 	case MPOL_INTERLEAVE:
881 		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
882 	case MPOL_PREFERRED:
883 		return a->v.preferred_node == b->v.preferred_node;
884 	case MPOL_BIND: {
885 		int i;
886 		for (i = 0; a->v.zonelist->zones[i]; i++)
887 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
888 				return 0;
889 		return b->v.zonelist->zones[i] == NULL;
890 	}
891 	default:
892 		BUG();
893 		return 0;
894 	}
895 }
896 
897 /* Slow path of a mpol destructor. */
898 void __mpol_free(struct mempolicy *p)
899 {
900 	if (!atomic_dec_and_test(&p->refcnt))
901 		return;
902 	if (p->policy == MPOL_BIND)
903 		kfree(p->v.zonelist);
904 	p->policy = MPOL_DEFAULT;
905 	kmem_cache_free(policy_cache, p);
906 }
907 
908 /*
909  * Hugetlb policy. Same as above, just works with node numbers instead of
910  * zonelists.
911  */
912 
913 /* Find first node suitable for an allocation */
914 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
915 {
916 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
917 
918 	switch (pol->policy) {
919 	case MPOL_DEFAULT:
920 		return numa_node_id();
921 	case MPOL_BIND:
922 		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
923 	case MPOL_INTERLEAVE:
924 		return interleave_nodes(pol);
925 	case MPOL_PREFERRED:
926 		return pol->v.preferred_node >= 0 ?
927 				pol->v.preferred_node : numa_node_id();
928 	}
929 	BUG();
930 	return 0;
931 }
932 
933 /* Find secondary valid nodes for an allocation */
934 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
935 {
936 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
937 
938 	switch (pol->policy) {
939 	case MPOL_PREFERRED:
940 	case MPOL_DEFAULT:
941 	case MPOL_INTERLEAVE:
942 		return 1;
943 	case MPOL_BIND: {
944 		struct zone **z;
945 		for (z = pol->v.zonelist->zones; *z; z++)
946 			if ((*z)->zone_pgdat->node_id == nid)
947 				return 1;
948 		return 0;
949 	}
950 	default:
951 		BUG();
952 		return 0;
953 	}
954 }
955 
956 /*
957  * Shared memory backing store policy support.
958  *
959  * Remember policies even when nobody has shared memory mapped.
960  * The policies are kept in a red-black tree linked from the inode.
961  * They are protected by the sp->lock spinlock, which should be held
962  * for any accesses to the tree.
963  */
964 
965 /* lookup first element intersecting start-end */
966 /* Caller holds sp->lock */
967 static struct sp_node *
968 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
969 {
970 	struct rb_node *n = sp->root.rb_node;
971 
972 	while (n) {
973 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
974 
975 		if (start >= p->end)
976 			n = n->rb_right;
977 		else if (end <= p->start)
978 			n = n->rb_left;
979 		else
980 			break;
981 	}
982 	if (!n)
983 		return NULL;
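	/* n intersects [start, end); walk back to the first intersecting entry. */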
984 	for (;;) {
985 		struct sp_node *w = NULL;
986 		struct rb_node *prev = rb_prev(n);
987 		if (!prev)
988 			break;
989 		w = rb_entry(prev, struct sp_node, nd);
990 		if (w->end <= start)
991 			break;
992 		n = prev;
993 	}
994 	return rb_entry(n, struct sp_node, nd);
995 }
996 
997 /* Insert a new shared policy into the list. */
998 /* Caller holds sp->lock */
999 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1000 {
1001 	struct rb_node **p = &sp->root.rb_node;
1002 	struct rb_node *parent = NULL;
1003 	struct sp_node *nd;
1004 
1005 	while (*p) {
1006 		parent = *p;
1007 		nd = rb_entry(parent, struct sp_node, nd);
1008 		if (new->start < nd->start)
1009 			p = &(*p)->rb_left;
1010 		else if (new->end > nd->end)
1011 			p = &(*p)->rb_right;
1012 		else
1013 			BUG();
1014 	}
1015 	rb_link_node(&new->nd, parent, p);
1016 	rb_insert_color(&new->nd, &sp->root);
1017 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1018 		 new->policy ? new->policy->policy : 0);
1019 }
1020 
1021 /* Find shared policy intersecting idx */
1022 struct mempolicy *
1023 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1024 {
1025 	struct mempolicy *pol = NULL;
1026 	struct sp_node *sn;
1027 
1028 	if (!sp->root.rb_node)
1029 		return NULL;
1030 	spin_lock(&sp->lock);
1031 	sn = sp_lookup(sp, idx, idx+1);
1032 	if (sn) {
1033 		mpol_get(sn->policy);
1034 		pol = sn->policy;
1035 	}
1036 	spin_unlock(&sp->lock);
1037 	return pol;
1038 }
1039 
1040 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1041 {
1042 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1043 	rb_erase(&n->nd, &sp->root);
1044 	mpol_free(n->policy);
1045 	kmem_cache_free(sn_cache, n);
1046 }
1047 
1048 struct sp_node *
1049 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1050 {
1051 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1052 
1053 	if (!n)
1054 		return NULL;
1055 	n->start = start;
1056 	n->end = end;
1057 	mpol_get(pol);
1058 	n->policy = pol;
1059 	return n;
1060 }
1061 
1062 /* Replace a policy range. */
1063 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1064 				 unsigned long end, struct sp_node *new)
1065 {
1066 	struct sp_node *n, *new2 = NULL;
1067 
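	/* new2 covers the tail of an old entry that spans the entire new range;
	   it has to be allocated with sp->lock dropped, hence the restart. */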
1068 restart:
1069 	spin_lock(&sp->lock);
1070 	n = sp_lookup(sp, start, end);
1071 	/* Take care of old policies in the same range. */
1072 	while (n && n->start < end) {
1073 		struct rb_node *next = rb_next(&n->nd);
1074 		if (n->start >= start) {
1075 			if (n->end <= end)
1076 				sp_delete(sp, n);
1077 			else
1078 				n->start = end;
1079 		} else {
1080 			/* Old policy spanning whole new range. */
1081 			if (n->end > end) {
1082 				if (!new2) {
1083 					spin_unlock(&sp->lock);
1084 					new2 = sp_alloc(end, n->end, n->policy);
1085 					if (!new2)
1086 						return -ENOMEM;
1087 					goto restart;
1088 				}
1089 				n->end = start;
1090 				sp_insert(sp, new2);
1091 				new2 = NULL;
1092 				break;
1093 			} else
1094 				n->end = start;
1095 		}
1096 		if (!next)
1097 			break;
1098 		n = rb_entry(next, struct sp_node, nd);
1099 	}
1100 	if (new)
1101 		sp_insert(sp, new);
1102 	spin_unlock(&sp->lock);
1103 	if (new2) {
1104 		mpol_free(new2->policy);
1105 		kmem_cache_free(sn_cache, new2);
1106 	}
1107 	return 0;
1108 }
1109 
1110 int mpol_set_shared_policy(struct shared_policy *info,
1111 			struct vm_area_struct *vma, struct mempolicy *npol)
1112 {
1113 	int err;
1114 	struct sp_node *new = NULL;
1115 	unsigned long sz = vma_pages(vma);
1116 
1117 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1118 		 vma->vm_pgoff,
1119 		 sz, npol? npol->policy : -1,
1120 		npol ? npol->v.nodes[0] : -1);
1121 
1122 	if (npol) {
1123 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1124 		if (!new)
1125 			return -ENOMEM;
1126 	}
1127 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1128 	if (err && new)
1129 		kmem_cache_free(sn_cache, new);
1130 	return err;
1131 }
1132 
1133 /* Free a backing policy store on inode delete. */
1134 void mpol_free_shared_policy(struct shared_policy *p)
1135 {
1136 	struct sp_node *n;
1137 	struct rb_node *next;
1138 
1139 	if (!p->root.rb_node)
1140 		return;
1141 	spin_lock(&p->lock);
1142 	next = rb_first(&p->root);
1143 	while (next) {
1144 		n = rb_entry(next, struct sp_node, nd);
1145 		next = rb_next(&n->nd);
1146 		rb_erase(&n->nd, &p->root);
1147 		mpol_free(n->policy);
1148 		kmem_cache_free(sn_cache, n);
1149 	}
1150 	spin_unlock(&p->lock);
1151 }
1152 
1153 /* assumes fs == KERNEL_DS */
1154 void __init numa_policy_init(void)
1155 {
1156 	policy_cache = kmem_cache_create("numa_policy",
1157 					 sizeof(struct mempolicy),
1158 					 0, SLAB_PANIC, NULL, NULL);
1159 
1160 	sn_cache = kmem_cache_create("shared_policy_node",
1161 				     sizeof(struct sp_node),
1162 				     0, SLAB_PANIC, NULL, NULL);
1163 
1164 	/* Set interleaving policy for system init. This way not all
1165 	   the data structures allocated at system boot end up in node zero. */
1166 
1167 	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1168 							MAX_NUMNODES) < 0)
1169 		printk("numa_policy_init: interleaving failed\n");
1170 }
1171 
1172 /* Reset policy of current process to default.
1173  * Assumes fs == KERNEL_DS */
1174 void numa_default_policy(void)
1175 {
1176 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1177 }
1178