/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU General Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Supports four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * The same holds for GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
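
/*
 * Minimal user space usage sketch for the syscalls implemented below.
 * This assumes an arch that wires up __NR_mbind/__NR_set_mempolicy in
 * <sys/syscall.h> and that the MPOL_* constants come from
 * <linux/mempolicy.h>; addr and len are placeholders for an existing
 * mapping.  Note maxnode counts one past the highest mask bit, because
 * get_nodes() below does --maxnode before looking at the mask:
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/mempolicy.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	// interleave future allocations of this process over nodes 0,1
 *	syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &mask,
 *		8 * sizeof(mask) + 1);
 *
 *	// bind an existing mapping [addr, addr+len) to node 0 only
 *	unsigned long bindmask = 1UL << 0;
 *	syscall(__NR_mbind, addr, len, MPOL_BIND, &bindmask,
 *		8 * sizeof(bindmask) + 1, 0);
 */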

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger the OOM killer much faster and
   the kernel is not always graceful about that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
	DECLARE_BITMAP(online2, MAX_NUMNODES);

	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
	if (bitmap_empty(online2, MAX_NUMNODES))
		set_bit(0, online2);
	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
		return -EINVAL;
	return 0;
}

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
{
	int empty = bitmap_empty(nodes, MAX_NUMNODES);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred is deliberately not checked here: it only
		   uses the first bit, and an empty mask later maps to
		   the local node (-1). Bind and interleave need at
		   least one node. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_online(nodes);
}

/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	bitmap_zero(nodes, MAX_NUMNODES);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non-supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes[nlongs-1] &= endmask;
	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes);
	return mpol_check_policy(mode, nodes);
}
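
/*
 * Worked example: a caller passing maxnode = 9 claims bits 0..7 of the
 * user mask are meaningful.  After --maxnode we have maxnode = 8, so
 * nlongs = BITS_TO_LONGS(8) = 1 and endmask = (1UL << 8) - 1 = 0xff:
 * one long is copied and everything above bit 7 is masked off.
 */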

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for (nd = find_first_bit(nodes, MAX_NUMNODES);
	     nd < MAX_NUMNODES;
	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	BUG_ON(num >= max);
	zl->zones[num] = NULL;
	return zl;
}
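
/*
 * Example (sketch): for nodes = {0,2} on a box where only ZONE_DMA and
 * ZONE_NORMAL have present pages, the generated list is
 *
 *	node 0/NORMAL, node 0/DMA, node 2/NORMAL, node 2/DMA, NULL
 *
 * i.e. nodes in ascending order, and within each node the higher zones
 * first so that fallback walks downwards.
 */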

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages in a range follow the policy. */
static int
verify_pages(struct mm_struct *mm,
	     unsigned long addr, unsigned long end, unsigned long *nodes)
{
	while (addr < end) {
		struct page *p;
		pte_t *pte;
		pmd_t *pmd;
		pud_t *pud;
		pgd_t *pgd;
		pgd = pgd_offset(mm, addr);
		if (pgd_none(*pgd)) {
			unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
			/* Stop only if the next PGD boundary wrapped
			   around; otherwise skip the hole. */
			if (next <= addr)
				break;
			addr = next;
			continue;
		}
		pud = pud_offset(pgd, addr);
		if (pud_none(*pud)) {
			addr = (addr + PUD_SIZE) & PUD_MASK;
			continue;
		}
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr = (addr + PMD_SIZE) & PMD_MASK;
			continue;
		}
		p = NULL;
		pte = pte_offset_map(pmd, addr);
		if (pte_present(*pte))
			p = pte_page(*pte);
		pte_unmap(pte);
		if (p) {
			unsigned nid = page_to_nid(p);
			if (!test_bit(nid, nodes))
				return -EIO;
		}
		addr += PAGE_SIZE;
	}
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    unsigned long *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			err = verify_pages(vma->vm_mm,
					   vma->vm_start, vma->vm_end, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
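
/*
 * Example: when [start,end) starts and ends inside a single VMA
 *
 *	vma:	|------------------|
 *	range:	      |******|
 *
 * the first split_vma() cuts at start, the second at end, and
 * policy_vma() then installs the new policy on the middle piece only.
 */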

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	if (mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
	return 0;
}

/* Fill a node bitmap with the nodes a policy covers */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
	int i;

	bitmap_zero(nodes, MAX_NUMNODES);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
		else
			__set_bit(p->v.preferred_node, nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      void *nodes, unsigned nbytes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}
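
/*
 * Worked example (sketch): with MAX_NUMNODES = 64 the kernel mask is
 * nbytes = 8 bytes.  A caller passing maxnode = 129 asks for
 * copy = ALIGN(128, 64) / 8 = 16 bytes, so the 8 user bytes beyond the
 * kernel mask are cleared first and then the real 8 bytes are copied.
 */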

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	if (policy && put_user(pval, policy))
		return -EFAULT;

	err = 0;
	if (nmask) {
		DECLARE_BITMAP(nodes, MAX_NUMNODES);
		get_zonemask(pol, nodes);
		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
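
/*
 * Example (sketch): a 32-bit caller on a 64-bit kernel passes the mask
 * as 32-bit words.  compat_get_bitmap() repacks e.g. the two words
 * 0x00000001, 0x00000004 into the single long 0x0000000400000001
 * (bits 0 and 34 set) before the mask is bounced through
 * compat_alloc_user_space() and handed to the native syscall.
 */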

#endif

/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = current->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
	me->il_next = next;
	return nid;
}
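
/*
 * Example: with v.nodes = {0,2,3} and il_next = 2, successive calls
 * return 2, 3, 0, 2, ... -- find_next_bit() advances through the mask
 * and find_first_bit() wraps back to the lowest set bit.
 */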

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
	return nid;
}
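
/*
 * Example: for v.nodes = {1,3,5} (nnodes = 3) and off = 7,
 * target = 7 % 3 = 1, the loop stops at the second set bit, and the
 * page at that offset always lands on node 3.
 */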

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!node_online(nid));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * 	alloc_page_vma	- Allocate a page for a VMA.
 *
 * 	@gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * 	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * 	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL the caller must hold down_read on the mmap_sem
 *	of the VMA's mm_struct to prevent it from going away. Should be used
 *	for all allocations for pages that will be mapped into user space.
 *	Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
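
/*
 * Typical in-kernel use (minimal sketch, assuming a fault handler that
 * already holds the mm's mmap_sem):
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */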

/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER   user allocation,
 *      	%GFP_KERNEL kernel allocation,
 *      	%GFP_HIGHMEM highmem allocation,
 *      	%GFP_FS     don't call back into a file system.
 *      	%GFP_ATOMIC don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool. When not in
 *	interrupt context, apply the current process NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_current_mems_allowed() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of the mempolicy destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find the first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
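
/*
 * Example: with stored ranges [2,4) and [6,9), sp_lookup(sp, 3, 8)
 * first descends to some intersecting node and then walks rb_prev()
 * until the predecessor no longer overlaps, so the leftmost
 * intersecting node, [2,4), is returned.
 */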

/* Insert a new shared policy into the tree. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
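
/*
 * Example: replacing [3,5) inside one existing [0,8) node: the old node
 * is clipped to [0,3), a new2 node [5,8) carrying the old policy is
 * inserted to cover the tail, and the new [3,5) node goes in between.
 */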

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? npol->v.nodes[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
	p->root = RB_ROOT;
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
			      MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}