xref: /linux/mm/mempolicy.c (revision 9ce7677cfd7cd871adb457c80bea3b581b839641)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process
20  *                counter is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind truly restricted
26  *                the allocation to the given set of memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local node (i.e. the node of the running CPU).
31  *                This is normally identical to default, but useful to
32  *                set in a VMA when you have a non default process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
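/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * described above are typically requested through the syscalls implemented
 * further down.  Assumes libnuma's <numaif.h> wrappers for set_mempolicy()
 * and mbind(); node numbers and sizes are made up for the example.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	// Interleave this process' future allocations over nodes 0 and 1.
 *	unsigned long nodes01 = 0x3;
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	// Bind one anonymous mapping to node 0 only; MPOL_MF_STRICT makes
 *	// mbind() fail if pages already present sit on other nodes.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 0x1;
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), MPOL_MF_STRICT);
 */
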
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <asm/tlbflush.h>
87 #include <asm/uaccess.h>
88 
89 static kmem_cache_t *policy_cache;
90 static kmem_cache_t *sn_cache;
91 
92 #define PDprintk(fmt...)
93 
94 /* Highest zone. A specific allocation for a zone below that is not
95    policied. */
96 static int policy_zone;
97 
98 struct mempolicy default_policy = {
99 	.refcnt = ATOMIC_INIT(1), /* never free it */
100 	.policy = MPOL_DEFAULT,
101 };
102 
103 /* Do sanity checking on a policy */
104 static int mpol_check_policy(int mode, nodemask_t *nodes)
105 {
106 	int empty = nodes_empty(*nodes);
107 
108 	switch (mode) {
109 	case MPOL_DEFAULT:
110 		if (!empty)
111 			return -EINVAL;
112 		break;
113 	case MPOL_BIND:
114 	case MPOL_INTERLEAVE:
115 		/* BIND and INTERLEAVE need at least one node.  (Preferred
116 		   only uses the first node, but allows more for now.) */
117 		if (empty)
118 			return -EINVAL;
119 		break;
120 	}
121 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
122 }

123 /* Generate a custom zonelist for the BIND policy. */
124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
125 {
126 	struct zonelist *zl;
127 	int num, max, nd;
128 
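	/* Worst case: every zone of every requested node, plus one slot for
	 * the NULL terminator; zones are added highest-first per node below. */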
129 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
130 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
131 	if (!zl)
132 		return NULL;
133 	num = 0;
134 	for_each_node_mask(nd, *nodes) {
135 		int k;
136 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 			if (!z->present_pages)
139 				continue;
140 			zl->zones[num++] = z;
141 			if (k > policy_zone)
142 				policy_zone = k;
143 		}
144 	}
145 	zl->zones[num] = NULL;
146 	return zl;
147 }
148 
149 /* Create a new policy */
150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
151 {
152 	struct mempolicy *policy;
153 
154 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155 	if (mode == MPOL_DEFAULT)
156 		return NULL;
157 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158 	if (!policy)
159 		return ERR_PTR(-ENOMEM);
160 	atomic_set(&policy->refcnt, 1);
161 	switch (mode) {
162 	case MPOL_INTERLEAVE:
163 		policy->v.nodes = *nodes;
164 		break;
165 	case MPOL_PREFERRED:
166 		policy->v.preferred_node = first_node(*nodes);
167 		if (policy->v.preferred_node >= MAX_NUMNODES)
168 			policy->v.preferred_node = -1;
169 		break;
170 	case MPOL_BIND:
171 		policy->v.zonelist = bind_zonelist(nodes);
172 		if (policy->v.zonelist == NULL) {
173 			kmem_cache_free(policy_cache, policy);
174 			return ERR_PTR(-ENOMEM);
175 		}
176 		break;
177 	}
178 	policy->policy = mode;
179 	return policy;
180 }
181 
182 /* Ensure all existing pages follow the policy. */
183 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 		unsigned long addr, unsigned long end, nodemask_t *nodes)
185 {
186 	pte_t *orig_pte;
187 	pte_t *pte;
188 	spinlock_t *ptl;
189 
190 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 	do {
192 		struct page *page;
193 		unsigned int nid;
194 
195 		if (!pte_present(*pte))
196 			continue;
197 		page = vm_normal_page(vma, addr, *pte);
198 		if (!page)
199 			continue;
200 		nid = page_to_nid(page);
201 		if (!node_isset(nid, *nodes))
202 			break;
203 	} while (pte++, addr += PAGE_SIZE, addr != end);
204 	pte_unmap_unlock(orig_pte, ptl);
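	/* addr != end means the walk stopped early on a page whose node is
	 * not in the requested nodemask. */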
205 	return addr != end;
206 }
207 
208 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
209 		unsigned long addr, unsigned long end, nodemask_t *nodes)
210 {
211 	pmd_t *pmd;
212 	unsigned long next;
213 
214 	pmd = pmd_offset(pud, addr);
215 	do {
216 		next = pmd_addr_end(addr, end);
217 		if (pmd_none_or_clear_bad(pmd))
218 			continue;
219 		if (check_pte_range(vma, pmd, addr, next, nodes))
220 			return -EIO;
221 	} while (pmd++, addr = next, addr != end);
222 	return 0;
223 }
224 
225 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
226 		unsigned long addr, unsigned long end, nodemask_t *nodes)
227 {
228 	pud_t *pud;
229 	unsigned long next;
230 
231 	pud = pud_offset(pgd, addr);
232 	do {
233 		next = pud_addr_end(addr, end);
234 		if (pud_none_or_clear_bad(pud))
235 			continue;
236 		if (check_pmd_range(vma, pud, addr, next, nodes))
237 			return -EIO;
238 	} while (pud++, addr = next, addr != end);
239 	return 0;
240 }
241 
242 static inline int check_pgd_range(struct vm_area_struct *vma,
243 		unsigned long addr, unsigned long end, nodemask_t *nodes)
244 {
245 	pgd_t *pgd;
246 	unsigned long next;
247 
248 	pgd = pgd_offset(vma->vm_mm, addr);
249 	do {
250 		next = pgd_addr_end(addr, end);
251 		if (pgd_none_or_clear_bad(pgd))
252 			continue;
253 		if (check_pud_range(vma, pgd, addr, next, nodes))
254 			return -EIO;
255 	} while (pgd++, addr = next, addr != end);
256 	return 0;
257 }
258 
259 /* Step 1: check the range is covered by contiguous VMAs; with MPOL_MF_STRICT also check that existing pages obey the nodemask */
260 static struct vm_area_struct *
261 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
262 	    nodemask_t *nodes, unsigned long flags)
263 {
264 	int err;
265 	struct vm_area_struct *first, *vma, *prev;
266 
267 	first = find_vma(mm, start);
268 	if (!first)
269 		return ERR_PTR(-EFAULT);
270 	prev = NULL;
271 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
272 		if (!vma->vm_next && vma->vm_end < end)
273 			return ERR_PTR(-EFAULT);
274 		if (prev && prev->vm_end < vma->vm_start)
275 			return ERR_PTR(-EFAULT);
276 		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
277 			unsigned long endvma = vma->vm_end;
278 			if (endvma > end)
279 				endvma = end;
280 			if (vma->vm_start > start)
281 				start = vma->vm_start;
282 			err = check_pgd_range(vma, start, endvma, nodes);
283 			if (err) {
284 				first = ERR_PTR(err);
285 				break;
286 			}
287 		}
288 		prev = vma;
289 	}
290 	return first;
291 }
292 
293 /* Apply policy to a single VMA */
294 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
295 {
296 	int err = 0;
297 	struct mempolicy *old = vma->vm_policy;
298 
299 	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
300 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
301 		 vma->vm_ops, vma->vm_file,
302 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
303 
304 	if (vma->vm_ops && vma->vm_ops->set_policy)
305 		err = vma->vm_ops->set_policy(vma, new);
306 	if (!err) {
307 		mpol_get(new);
308 		vma->vm_policy = new;
309 		mpol_free(old);
310 	}
311 	return err;
312 }
313 
314 /* Step 2: apply policy to a range and do splits. */
315 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
316 		       unsigned long end, struct mempolicy *new)
317 {
318 	struct vm_area_struct *next;
319 	int err;
320 
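	/* Split partially covered VMAs at start and end so the policy is
	 * applied exactly to [start, end), then set it on each piece. */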
321 	err = 0;
322 	for (; vma && vma->vm_start < end; vma = next) {
323 		next = vma->vm_next;
324 		if (vma->vm_start < start)
325 			err = split_vma(vma->vm_mm, vma, start, 1);
326 		if (!err && vma->vm_end > end)
327 			err = split_vma(vma->vm_mm, vma, end, 0);
328 		if (!err)
329 			err = policy_vma(vma, new);
330 		if (err)
331 			break;
332 	}
333 	return err;
334 }
335 
336 static int contextualize_policy(int mode, nodemask_t *nodes)
337 {
338 	if (!nodes)
339 		return 0;
340 
341 	/* Update current mems_allowed */
342 	cpuset_update_current_mems_allowed();
343 	/* Ignore nodes not set in current->mems_allowed */
344 	cpuset_restrict_to_mems_allowed(nodes->bits);
345 	return mpol_check_policy(mode, nodes);
346 }
347 
348 long do_mbind(unsigned long start, unsigned long len,
349 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
350 {
351 	struct vm_area_struct *vma;
352 	struct mm_struct *mm = current->mm;
353 	struct mempolicy *new;
354 	unsigned long end;
355 	int err;
356 
357 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
358 		return -EINVAL;
359 	if (start & ~PAGE_MASK)
360 		return -EINVAL;
361 	if (mode == MPOL_DEFAULT)
362 		flags &= ~MPOL_MF_STRICT;
363 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
364 	end = start + len;
365 	if (end < start)
366 		return -EINVAL;
367 	if (end == start)
368 		return 0;
369 	if (mpol_check_policy(mode, nmask))
370 		return -EINVAL;
371 	new = mpol_new(mode, nmask);
372 	if (IS_ERR(new))
373 		return PTR_ERR(new);
374 
375 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
376 			mode, nodes_addr(*nmask)[0]);
377 
378 	down_write(&mm->mmap_sem);
379 	vma = check_range(mm, start, end, nmask, flags);
380 	err = PTR_ERR(vma);
381 	if (!IS_ERR(vma))
382 		err = mbind_range(vma, start, end, new);
383 	up_write(&mm->mmap_sem);
384 	mpol_free(new);
385 	return err;
386 }
387 
388 /* Set the process memory policy */
389 long do_set_mempolicy(int mode, nodemask_t *nodes)
390 {
391 	struct mempolicy *new;
392 
393 	if (contextualize_policy(mode, nodes))
394 		return -EINVAL;
395 	new = mpol_new(mode, nodes);
396 	if (IS_ERR(new))
397 		return PTR_ERR(new);
398 	mpol_free(current->mempolicy);
399 	current->mempolicy = new;
400 	if (new && new->policy == MPOL_INTERLEAVE)
401 		current->il_next = first_node(new->v.nodes);
402 	return 0;
403 }
404 
405 /* Fill a zone bitmap for a policy */
406 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
407 {
408 	int i;
409 
410 	nodes_clear(*nodes);
411 	switch (p->policy) {
412 	case MPOL_BIND:
413 		for (i = 0; p->v.zonelist->zones[i]; i++)
414 			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
415 				*nodes);
416 		break;
417 	case MPOL_DEFAULT:
418 		break;
419 	case MPOL_INTERLEAVE:
420 		*nodes = p->v.nodes;
421 		break;
422 	case MPOL_PREFERRED:
423 		/* or use current node instead of online map? */
424 		if (p->v.preferred_node < 0)
425 			*nodes = node_online_map;
426 		else
427 			node_set(p->v.preferred_node, *nodes);
428 		break;
429 	default:
430 		BUG();
431 	}
432 }
433 
434 static int lookup_node(struct mm_struct *mm, unsigned long addr)
435 {
436 	struct page *p;
437 	int err;
438 
439 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
440 	if (err >= 0) {
441 		err = page_to_nid(p);
442 		put_page(p);
443 	}
444 	return err;
445 }
446 
447 /* Retrieve NUMA policy */
448 long do_get_mempolicy(int *policy, nodemask_t *nmask,
449 			unsigned long addr, unsigned long flags)
450 {
451 	int err;
452 	struct mm_struct *mm = current->mm;
453 	struct vm_area_struct *vma = NULL;
454 	struct mempolicy *pol = current->mempolicy;
455 
456 	cpuset_update_current_mems_allowed();
457 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
458 		return -EINVAL;
459 	if (flags & MPOL_F_ADDR) {
460 		down_read(&mm->mmap_sem);
461 		vma = find_vma_intersection(mm, addr, addr+1);
462 		if (!vma) {
463 			up_read(&mm->mmap_sem);
464 			return -EFAULT;
465 		}
466 		if (vma->vm_ops && vma->vm_ops->get_policy)
467 			pol = vma->vm_ops->get_policy(vma, addr);
468 		else
469 			pol = vma->vm_policy;
470 	} else if (addr)
471 		return -EINVAL;
472 
473 	if (!pol)
474 		pol = &default_policy;
475 
476 	if (flags & MPOL_F_NODE) {
477 		if (flags & MPOL_F_ADDR) {
478 			err = lookup_node(mm, addr);
479 			if (err < 0)
480 				goto out;
481 			*policy = err;
482 		} else if (pol == current->mempolicy &&
483 				pol->policy == MPOL_INTERLEAVE) {
484 			*policy = current->il_next;
485 		} else {
486 			err = -EINVAL;
487 			goto out;
488 		}
489 	} else
490 		*policy = pol->policy;
491 
492 	if (vma) {
493 		up_read(&current->mm->mmap_sem);
494 		vma = NULL;
495 	}
496 
497 	err = 0;
498 	if (nmask)
499 		get_zonemask(pol, nmask);
500 
501  out:
502 	if (vma)
503 		up_read(&current->mm->mmap_sem);
504 	return err;
505 }
506 
507 /*
508  * User space interface with variable sized bitmaps for nodelists.
509  */
510 
511 /* Copy a node mask from user space. */
512 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
513 		     unsigned long maxnode)
514 {
515 	unsigned long k;
516 	unsigned long nlongs;
517 	unsigned long endmask;
518 
519 	--maxnode;
520 	nodes_clear(*nodes);
521 	if (maxnode == 0 || !nmask)
522 		return 0;
523 
524 	nlongs = BITS_TO_LONGS(maxnode);
525 	if ((maxnode % BITS_PER_LONG) == 0)
526 		endmask = ~0UL;
527 	else
528 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
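	/*
	 * Worked example (64-bit long, illustrative): a userspace maxnode of
	 * 65 becomes 64 after the decrement above, giving nlongs = 1 and
	 * endmask = ~0UL, i.e. one full long of node bits is copied in.
	 */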
529 
530 	/* When the user specified more nodes than supported just check
531 	   that the unsupported part is all zero. */
532 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
533 		if (nlongs > PAGE_SIZE/sizeof(long))
534 			return -EINVAL;
535 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
536 			unsigned long t;
537 			if (get_user(t, nmask + k))
538 				return -EFAULT;
539 			if (k == nlongs - 1) {
540 				if (t & endmask)
541 					return -EINVAL;
542 			} else if (t)
543 				return -EINVAL;
544 		}
545 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
546 		endmask = ~0UL;
547 	}
548 
549 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
550 		return -EFAULT;
551 	nodes_addr(*nodes)[nlongs-1] &= endmask;
552 	return 0;
553 }
554 
555 /* Copy a kernel node mask to user space */
556 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
557 			      nodemask_t *nodes)
558 {
559 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
560 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
561 
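	/* If userspace asked for more bits than the kernel keeps, zero the
	 * tail of the user buffer and copy only what we have. */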
562 	if (copy > nbytes) {
563 		if (copy > PAGE_SIZE)
564 			return -EINVAL;
565 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
566 			return -EFAULT;
567 		copy = nbytes;
568 	}
569 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
570 }
571 
572 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
573 			unsigned long mode,
574 			unsigned long __user *nmask, unsigned long maxnode,
575 			unsigned flags)
576 {
577 	nodemask_t nodes;
578 	int err;
579 
580 	err = get_nodes(&nodes, nmask, maxnode);
581 	if (err)
582 		return err;
583 	return do_mbind(start, len, mode, &nodes, flags);
584 }
585 
586 /* Set the process memory policy */
587 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
588 		unsigned long maxnode)
589 {
590 	int err;
591 	nodemask_t nodes;
592 
593 	if (mode < 0 || mode > MPOL_MAX)
594 		return -EINVAL;
595 	err = get_nodes(&nodes, nmask, maxnode);
596 	if (err)
597 		return err;
598 	return do_set_mempolicy(mode, &nodes);
599 }
600 
601 /* Retrieve NUMA policy */
602 asmlinkage long sys_get_mempolicy(int __user *policy,
603 				unsigned long __user *nmask,
604 				unsigned long maxnode,
605 				unsigned long addr, unsigned long flags)
606 {
607 	int err, pval;
608 	nodemask_t nodes;
609 
610 	if (nmask != NULL && maxnode < MAX_NUMNODES)
611 		return -EINVAL;
612 
613 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
614 
615 	if (err)
616 		return err;
617 
618 	if (policy && put_user(pval, policy))
619 		return -EFAULT;
620 
621 	if (nmask)
622 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
623 
624 	return err;
625 }
626 
627 #ifdef CONFIG_COMPAT
628 
629 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
630 				     compat_ulong_t __user *nmask,
631 				     compat_ulong_t maxnode,
632 				     compat_ulong_t addr, compat_ulong_t flags)
633 {
634 	long err;
635 	unsigned long __user *nm = NULL;
636 	unsigned long nr_bits, alloc_size;
637 	DECLARE_BITMAP(bm, MAX_NUMNODES);
638 
639 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
640 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
641 
642 	if (nmask)
643 		nm = compat_alloc_user_space(alloc_size);
644 
645 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
646 
647 	if (!err && nmask) {
648 		err = copy_from_user(bm, nm, alloc_size);
649 		/* ensure entire bitmap is zeroed */
650 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
651 		err |= compat_put_bitmap(nmask, bm, nr_bits);
652 	}
653 
654 	return err;
655 }
656 
657 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
658 				     compat_ulong_t maxnode)
659 {
660 	long err = 0;
661 	unsigned long __user *nm = NULL;
662 	unsigned long nr_bits, alloc_size;
663 	DECLARE_BITMAP(bm, MAX_NUMNODES);
664 
665 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
666 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
667 
668 	if (nmask) {
669 		err = compat_get_bitmap(bm, nmask, nr_bits);
670 		nm = compat_alloc_user_space(alloc_size);
671 		err |= copy_to_user(nm, bm, alloc_size);
672 	}
673 
674 	if (err)
675 		return -EFAULT;
676 
677 	return sys_set_mempolicy(mode, nm, nr_bits+1);
678 }
679 
680 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
681 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
682 			     compat_ulong_t maxnode, compat_ulong_t flags)
683 {
684 	long err = 0;
685 	unsigned long __user *nm = NULL;
686 	unsigned long nr_bits, alloc_size;
687 	nodemask_t bm;
688 
689 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
690 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
691 
692 	if (nmask) {
693 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
694 		nm = compat_alloc_user_space(alloc_size);
695 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
696 	}
697 
698 	if (err)
699 		return -EFAULT;
700 
701 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
702 }
703 
704 #endif
705 
706 /* Return the effective policy for a VMA: the VMA's own policy if set, else the task policy, else the default policy */
707 struct mempolicy *
708 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
709 {
710 	struct mempolicy *pol = task->mempolicy;
711 
712 	if (vma) {
713 		if (vma->vm_ops && vma->vm_ops->get_policy)
714 			pol = vma->vm_ops->get_policy(vma, addr);
715 		else if (vma->vm_policy &&
716 				vma->vm_policy->policy != MPOL_DEFAULT)
717 			pol = vma->vm_policy;
718 	}
719 	if (!pol)
720 		pol = &default_policy;
721 	return pol;
722 }
723 
724 /* Return a zonelist representing a mempolicy */
725 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
726 {
727 	int nd;
728 
729 	switch (policy->policy) {
730 	case MPOL_PREFERRED:
731 		nd = policy->v.preferred_node;
732 		if (nd < 0)
733 			nd = numa_node_id();
734 		break;
735 	case MPOL_BIND:
736 		/* Lower zones don't get a policy applied */
737 		/* Careful: current->mems_allowed might have moved */
738 		if (gfp_zone(gfp) >= policy_zone)
739 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
740 				return policy->v.zonelist;
741 		/* FALL THROUGH */
742 	case MPOL_INTERLEAVE: /* should not happen */
743 	case MPOL_DEFAULT:
744 		nd = numa_node_id();
745 		break;
746 	default:
747 		nd = 0;
748 		BUG();
749 	}
750 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
751 }
752 
753 /* Do dynamic interleaving for a process */
754 static unsigned interleave_nodes(struct mempolicy *policy)
755 {
756 	unsigned nid, next;
757 	struct task_struct *me = current;
758 
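	/* Hand out the current node, then advance il_next round-robin,
	 * wrapping back to the first node in the mask. */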
759 	nid = me->il_next;
760 	next = next_node(nid, policy->v.nodes);
761 	if (next >= MAX_NUMNODES)
762 		next = first_node(policy->v.nodes);
763 	me->il_next = next;
764 	return nid;
765 }
766 
767 /* Do static interleaving for a VMA with known offset. */
768 static unsigned offset_il_node(struct mempolicy *pol,
769 		struct vm_area_struct *vma, unsigned long off)
770 {
771 	unsigned nnodes = nodes_weight(pol->v.nodes);
772 	unsigned target = (unsigned)off % nnodes;
773 	int c;
774 	int nid = -1;
775 
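	/* Walk to the target'th set node in the mask, e.g. for nodes {0,2,5}
	 * and off = 7: nnodes = 3, target = 1, so node 2 is returned. */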
776 	c = 0;
777 	do {
778 		nid = next_node(nid, pol->v.nodes);
779 		c++;
780 	} while (c <= target);
781 	return nid;
782 }
783 
784 /* Allocate a page in interleaved policy.
785    Has its own path because it needs to do special accounting. */
786 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
787 					unsigned nid)
788 {
789 	struct zonelist *zl;
790 	struct page *page;
791 
792 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
793 	page = __alloc_pages(gfp, order, zl);
794 	if (page && page_zone(page) == zl->zones[0]) {
795 		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
796 		put_cpu();
797 	}
798 	return page;
799 }
800 
801 /**
802  * 	alloc_page_vma	- Allocate a page for a VMA.
803  *
804  * 	@gfp:
805  *      %GFP_USER    user allocation,
806  *      %GFP_KERNEL  kernel allocation,
807  *      %GFP_HIGHMEM highmem/user allocation,
808  *      %GFP_FS      allocation should not call back into a file system,
809  *      %GFP_ATOMIC  don't sleep.
810  *
811  * 	@vma:  Pointer to VMA or NULL if not available.
812  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
813  *
814  * 	This function allocates a page from the kernel page pool and applies
815  *	a NUMA policy associated with the VMA or the current process.
816  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
817  *	mm_struct of the VMA to prevent it from going away. Should be used for
818  *	all allocations for pages that will be mapped into
819  * 	user space. Returns NULL when no page can be allocated.
820  *
821  *	Should be called with the mmap_sem of the vma held.
822  */
823 struct page *
824 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
825 {
826 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
827 
828 	cpuset_update_current_mems_allowed();
829 
830 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
831 		unsigned nid;
832 		if (vma) {
833 			unsigned long off;
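			/* Interleave by page index into the backing object
			 * (or into the mapping for anonymous memory). */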
834 			off = vma->vm_pgoff;
835 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
836 			nid = offset_il_node(pol, vma, off);
837 		} else {
838 			/* fall back to process interleaving */
839 			nid = interleave_nodes(pol);
840 		}
841 		return alloc_page_interleave(gfp, 0, nid);
842 	}
843 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
844 }
845 
846 /**
847  * 	alloc_pages_current - Allocate pages.
848  *
849  *	@gfp:
850  *		%GFP_USER   user allocation,
851  *      	%GFP_KERNEL kernel allocation,
852  *      	%GFP_HIGHMEM highmem allocation,
853  *      	%GFP_FS     don't call back into a file system.
854  *      	%GFP_ATOMIC don't sleep.
855  *	@order: Power of two of allocation size in pages. 0 is a single page.
856  *
857  *	Allocate a page from the kernel page pool.  When not in
858  *	interrupt context, apply the current process' NUMA policy.
859  *	Returns NULL when no page can be allocated.
860  *
861  *	Don't call cpuset_update_current_mems_allowed() unless
862  *	1) it's ok to take cpuset_sem (can WAIT), and
863  *	2) allocating for current task (not interrupt).
864  */
865 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
866 {
867 	struct mempolicy *pol = current->mempolicy;
868 
869 	if ((gfp & __GFP_WAIT) && !in_interrupt())
870 		cpuset_update_current_mems_allowed();
871 	if (!pol || in_interrupt())
872 		pol = &default_policy;
873 	if (pol->policy == MPOL_INTERLEAVE)
874 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
875 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
876 }
877 EXPORT_SYMBOL(alloc_pages_current);
878 
879 /* Slow path of a mempolicy copy */
880 struct mempolicy *__mpol_copy(struct mempolicy *old)
881 {
882 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
883 
884 	if (!new)
885 		return ERR_PTR(-ENOMEM);
886 	*new = *old;
887 	atomic_set(&new->refcnt, 1);
888 	if (new->policy == MPOL_BIND) {
889 		int sz = ksize(old->v.zonelist);
890 		new->v.zonelist = kmalloc(sz, GFP_KERNEL);
891 		if (!new->v.zonelist) {
892 			kmem_cache_free(policy_cache, new);
893 			return ERR_PTR(-ENOMEM);
894 		}
895 		memcpy(new->v.zonelist, old->v.zonelist, sz);
896 	}
897 	return new;
898 }
899 
900 /* Slow path of a mempolicy comparison */
901 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
902 {
903 	if (!a || !b)
904 		return 0;
905 	if (a->policy != b->policy)
906 		return 0;
907 	switch (a->policy) {
908 	case MPOL_DEFAULT:
909 		return 1;
910 	case MPOL_INTERLEAVE:
911 		return nodes_equal(a->v.nodes, b->v.nodes);
912 	case MPOL_PREFERRED:
913 		return a->v.preferred_node == b->v.preferred_node;
914 	case MPOL_BIND: {
915 		int i;
916 		for (i = 0; a->v.zonelist->zones[i]; i++)
917 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
918 				return 0;
919 		return b->v.zonelist->zones[i] == NULL;
920 	}
921 	default:
922 		BUG();
923 		return 0;
924 	}
925 }
926 
927 /* Slow path of a mpol destructor. */
928 void __mpol_free(struct mempolicy *p)
929 {
930 	if (!atomic_dec_and_test(&p->refcnt))
931 		return;
932 	if (p->policy == MPOL_BIND)
933 		kfree(p->v.zonelist);
934 	p->policy = MPOL_DEFAULT;
935 	kmem_cache_free(policy_cache, p);
936 }
937 
938 /*
939  * Hugetlb policy. Same as above, just works with node numbers instead of
940  * zonelists.
941  */
942 
943 /* Find first node suitable for an allocation */
944 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
945 {
946 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
947 
948 	switch (pol->policy) {
949 	case MPOL_DEFAULT:
950 		return numa_node_id();
951 	case MPOL_BIND:
952 		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
953 	case MPOL_INTERLEAVE:
954 		return interleave_nodes(pol);
955 	case MPOL_PREFERRED:
956 		return pol->v.preferred_node >= 0 ?
957 				pol->v.preferred_node : numa_node_id();
958 	}
959 	BUG();
960 	return 0;
961 }
962 
963 /* Return 1 if nid is an allowed (secondary) node for an allocation under the policy */
964 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
965 {
966 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
967 
968 	switch (pol->policy) {
969 	case MPOL_PREFERRED:
970 	case MPOL_DEFAULT:
971 	case MPOL_INTERLEAVE:
972 		return 1;
973 	case MPOL_BIND: {
974 		struct zone **z;
975 		for (z = pol->v.zonelist->zones; *z; z++)
976 			if ((*z)->zone_pgdat->node_id == nid)
977 				return 1;
978 		return 0;
979 	}
980 	default:
981 		BUG();
982 		return 0;
983 	}
984 }
985 
986 /*
987  * Shared memory backing store policy support.
988  *
989  * Remember policies even when nobody has shared memory mapped.
990  * The policies are kept in a Red-Black tree linked from the inode.
991  * They are protected by the sp->lock spinlock, which should be held
992  * for any accesses to the tree.
993  */
994 
995 /* lookup first element intersecting start-end */
996 /* Caller holds sp->lock */
997 static struct sp_node *
998 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
999 {
1000 	struct rb_node *n = sp->root.rb_node;
1001 
1002 	while (n) {
1003 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1004 
1005 		if (start >= p->end)
1006 			n = n->rb_right;
1007 		else if (end <= p->start)
1008 			n = n->rb_left;
1009 		else
1010 			break;
1011 	}
1012 	if (!n)
1013 		return NULL;
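	/* Walk back to the lowest-start node that still overlaps [start, end). */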
1014 	for (;;) {
1015 		struct sp_node *w = NULL;
1016 		struct rb_node *prev = rb_prev(n);
1017 		if (!prev)
1018 			break;
1019 		w = rb_entry(prev, struct sp_node, nd);
1020 		if (w->end <= start)
1021 			break;
1022 		n = prev;
1023 	}
1024 	return rb_entry(n, struct sp_node, nd);
1025 }
1026 
1027 /* Insert a new shared policy into the list. */
1028 /* Caller holds sp->lock */
1029 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1030 {
1031 	struct rb_node **p = &sp->root.rb_node;
1032 	struct rb_node *parent = NULL;
1033 	struct sp_node *nd;
1034 
1035 	while (*p) {
1036 		parent = *p;
1037 		nd = rb_entry(parent, struct sp_node, nd);
1038 		if (new->start < nd->start)
1039 			p = &(*p)->rb_left;
1040 		else if (new->end > nd->end)
1041 			p = &(*p)->rb_right;
1042 		else
1043 			BUG();
1044 	}
1045 	rb_link_node(&new->nd, parent, p);
1046 	rb_insert_color(&new->nd, &sp->root);
1047 	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1048 		 new->policy ? new->policy->policy : 0);
1049 }
1050 
1051 /* Find shared policy intersecting idx */
1052 struct mempolicy *
1053 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1054 {
1055 	struct mempolicy *pol = NULL;
1056 	struct sp_node *sn;
1057 
1058 	if (!sp->root.rb_node)
1059 		return NULL;
1060 	spin_lock(&sp->lock);
1061 	sn = sp_lookup(sp, idx, idx+1);
1062 	if (sn) {
1063 		mpol_get(sn->policy);
1064 		pol = sn->policy;
1065 	}
1066 	spin_unlock(&sp->lock);
1067 	return pol;
1068 }
1069 
1070 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1071 {
1072 	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1073 	rb_erase(&n->nd, &sp->root);
1074 	mpol_free(n->policy);
1075 	kmem_cache_free(sn_cache, n);
1076 }
1077 
1078 struct sp_node *
1079 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1080 {
1081 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1082 
1083 	if (!n)
1084 		return NULL;
1085 	n->start = start;
1086 	n->end = end;
1087 	mpol_get(pol);
1088 	n->policy = pol;
1089 	return n;
1090 }
1091 
1092 /* Replace a policy range. */
1093 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1094 				 unsigned long end, struct sp_node *new)
1095 {
1096 	struct sp_node *n, *new2 = NULL;
1097 
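	/* new2 is only needed when an existing range strictly contains
	 * [start, end) and must be split in two; it is allocated with
	 * sp->lock dropped and the lookup is then retried from the top. */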
1098 restart:
1099 	spin_lock(&sp->lock);
1100 	n = sp_lookup(sp, start, end);
1101 	/* Take care of old policies in the same range. */
1102 	while (n && n->start < end) {
1103 		struct rb_node *next = rb_next(&n->nd);
1104 		if (n->start >= start) {
1105 			if (n->end <= end)
1106 				sp_delete(sp, n);
1107 			else
1108 				n->start = end;
1109 		} else {
1110 			/* Old policy spanning whole new range. */
1111 			if (n->end > end) {
1112 				if (!new2) {
1113 					spin_unlock(&sp->lock);
1114 					new2 = sp_alloc(end, n->end, n->policy);
1115 					if (!new2)
1116 						return -ENOMEM;
1117 					goto restart;
1118 				}
1119 				n->end = start;
1120 				sp_insert(sp, new2);
1121 				new2 = NULL;
1122 				break;
1123 			} else
1124 				n->end = start;
1125 		}
1126 		if (!next)
1127 			break;
1128 		n = rb_entry(next, struct sp_node, nd);
1129 	}
1130 	if (new)
1131 		sp_insert(sp, new);
1132 	spin_unlock(&sp->lock);
1133 	if (new2) {
1134 		mpol_free(new2->policy);
1135 		kmem_cache_free(sn_cache, new2);
1136 	}
1137 	return 0;
1138 }
1139 
1140 int mpol_set_shared_policy(struct shared_policy *info,
1141 			struct vm_area_struct *vma, struct mempolicy *npol)
1142 {
1143 	int err;
1144 	struct sp_node *new = NULL;
1145 	unsigned long sz = vma_pages(vma);
1146 
1147 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1148 		 vma->vm_pgoff,
1149 		 sz, npol? npol->policy : -1,
1150 		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1151 
1152 	if (npol) {
1153 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1154 		if (!new)
1155 			return -ENOMEM;
1156 	}
1157 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1158 	if (err && new)
1159 		kmem_cache_free(sn_cache, new);
1160 	return err;
1161 }
1162 
1163 /* Free a backing policy store on inode delete. */
1164 void mpol_free_shared_policy(struct shared_policy *p)
1165 {
1166 	struct sp_node *n;
1167 	struct rb_node *next;
1168 
1169 	if (!p->root.rb_node)
1170 		return;
1171 	spin_lock(&p->lock);
1172 	next = rb_first(&p->root);
1173 	while (next) {
1174 		n = rb_entry(next, struct sp_node, nd);
1175 		next = rb_next(&n->nd);
1176 		rb_erase(&n->nd, &p->root);
1177 		mpol_free(n->policy);
1178 		kmem_cache_free(sn_cache, n);
1179 	}
1180 	spin_unlock(&p->lock);
1181 }
1182 
1183 /* assumes fs == KERNEL_DS */
1184 void __init numa_policy_init(void)
1185 {
1186 	policy_cache = kmem_cache_create("numa_policy",
1187 					 sizeof(struct mempolicy),
1188 					 0, SLAB_PANIC, NULL, NULL);
1189 
1190 	sn_cache = kmem_cache_create("shared_policy_node",
1191 				     sizeof(struct sp_node),
1192 				     0, SLAB_PANIC, NULL, NULL);
1193 
1194 	/* Set interleaving policy for system init. This way not all
1195 	   the data structures allocated at system boot time end up on node zero. */
1196 
1197 	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1198 		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
1199 }
1200 
1201 /* Reset policy of current process to default */
1202 void numa_default_policy(void)
1203 {
1204 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1205 }
1206 
1207 /* Migrate a policy to a different set of nodes */
1208 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1209 							const nodemask_t *new)
1210 {
1211 	nodemask_t tmp;
1212 
1213 	if (!pol)
1214 		return;
1215 
1216 	switch (pol->policy) {
1217 	case MPOL_DEFAULT:
1218 		break;
1219 	case MPOL_INTERLEAVE:
1220 		nodes_remap(tmp, pol->v.nodes, *old, *new);
1221 		pol->v.nodes = tmp;
1222 		current->il_next = node_remap(current->il_next, *old, *new);
1223 		break;
1224 	case MPOL_PREFERRED:
1225 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1226 								*old, *new);
1227 		break;
1228 	case MPOL_BIND: {
1229 		nodemask_t nodes;
1230 		struct zone **z;
1231 		struct zonelist *zonelist;
1232 
1233 		nodes_clear(nodes);
1234 		for (z = pol->v.zonelist->zones; *z; z++)
1235 			node_set((*z)->zone_pgdat->node_id, nodes);
1236 		nodes_remap(tmp, nodes, *old, *new);
1237 		nodes = tmp;
1238 
1239 		zonelist = bind_zonelist(&nodes);
1240 
1241 		/* If no mem, then zonelist is NULL and we keep old zonelist.
1242 		 * If that old zonelist has no remaining mems_allowed nodes,
1243 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1244 		 */
1245 
1246 		if (zonelist) {
1247 			/* Good - got mem - substitute new zonelist */
1248 			kfree(pol->v.zonelist);
1249 			pol->v.zonelist = zonelist;
1250 		}
1251 		break;
1252 	}
1253 	default:
1254 		BUG();
1255 		break;
1256 	}
1257 }
1258 
1259 /*
1260  * Someone moved this task to different nodes.  Fixup mempolicies.
1261  *
1262  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1263  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1264  */
1265 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1266 {
1267 	rebind_policy(current->mempolicy, old, new);
1268 }
1269