xref: /linux/mm/mempolicy.c (revision 3e1f064562fcff7bf3856bc1d00dfa84d4f121cc)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Simple NUMA memory policy for the Linux kernel.
31da177e4SLinus Torvalds  *
41da177e4SLinus Torvalds  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
58bccd85fSChristoph Lameter  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
61da177e4SLinus Torvalds  * Subject to the GNU Public License, version 2.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * NUMA policy allows the user to give hints in which node(s) memory should
91da177e4SLinus Torvalds  * be allocated.
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  * Support four policies per VMA and per process:
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The VMA policy has priority over the process policy for a page fault.
141da177e4SLinus Torvalds  *
151da177e4SLinus Torvalds  * interleave     Allocate memory interleaved over a set of nodes,
161da177e4SLinus Torvalds  *                with normal fallback if it fails.
171da177e4SLinus Torvalds  *                For VMA based allocations this interleaves based on the
181da177e4SLinus Torvalds  *                offset into the backing object or offset into the mapping
191da177e4SLinus Torvalds  *                for anonymous memory. For process policy a process counter
201da177e4SLinus Torvalds  *                is used.
218bccd85fSChristoph Lameter  *
221da177e4SLinus Torvalds  * bind           Only allocate memory on a specific set of nodes,
231da177e4SLinus Torvalds  *                no fallback.
248bccd85fSChristoph Lameter  *                FIXME: memory is allocated starting with the first node
258bccd85fSChristoph Lameter  *                to the last. It would be better if bind would truly restrict
268bccd85fSChristoph Lameter  *                the allocation to memory nodes instead
278bccd85fSChristoph Lameter  *
281da177e4SLinus Torvalds  * preferred       Try a specific node first before normal fallback.
291da177e4SLinus Torvalds  *                As a special case node -1 here means do the allocation
301da177e4SLinus Torvalds  *                on the local CPU. This is normally identical to default,
311da177e4SLinus Torvalds  *                but useful to set in a VMA when you have a non-default
321da177e4SLinus Torvalds  *                process policy.
338bccd85fSChristoph Lameter  *
341da177e4SLinus Torvalds  * default        Allocate on the local node first, or when on a VMA
351da177e4SLinus Torvalds  *                use the process policy. This is what Linux always did
361da177e4SLinus Torvalds  *		  in a NUMA aware kernel and still does by, ahem, default.
371da177e4SLinus Torvalds  *
381da177e4SLinus Torvalds  * The process policy is applied for most non-interrupt memory allocations
391da177e4SLinus Torvalds  * in that process' context. Interrupts ignore the policies and always
401da177e4SLinus Torvalds  * try to allocate on the local CPU. The VMA policy is only applied for memory
411da177e4SLinus Torvalds  * allocations for a VMA in the VM.
421da177e4SLinus Torvalds  *
431da177e4SLinus Torvalds  * Currently there are a few corner cases in swapping where the policy
441da177e4SLinus Torvalds  * is not applied, but the majority should be handled. When process policy
451da177e4SLinus Torvalds  * is used it is not remembered over swap outs/swap ins.
461da177e4SLinus Torvalds  *
471da177e4SLinus Torvalds  * Only the highest zone in the zone hierarchy gets policied. Allocations
481da177e4SLinus Torvalds  * requesting a lower zone just use default policy. This implies that
491da177e4SLinus Torvalds  * on systems with highmem, kernel lowmem allocations don't get policied.
501da177e4SLinus Torvalds  * Same with GFP_DMA allocations.
511da177e4SLinus Torvalds  *
521da177e4SLinus Torvalds  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
531da177e4SLinus Torvalds  * all users and remembered even when nobody has memory mapped.
541da177e4SLinus Torvalds  */
551da177e4SLinus Torvalds 
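/*
 * Illustrative userspace sketch (added to this annotated listing, not part
 * of the original source): one way the modes described above can be
 * requested through the set_mempolicy() and mbind() system calls that are
 * implemented later in this file.  Assumes libnuma's <numaif.h> for the
 * MPOL_* constants and syscall wrappers; mempolicy_example() is a made-up
 * name and error handling is omitted for brevity.
 */
#if 0	/* example only, never compiled */
#include <numaif.h>
#include <sys/mman.h>
#include <stddef.h>

static void mempolicy_example(void)
{
	unsigned long nodes = 0x3;		/* nodes 0 and 1 */
	unsigned long maxnode = 8 * sizeof(nodes);
	size_t len = 1 << 20;
	void *buf;

	/* Process policy: interleave new allocations across nodes 0 and 1. */
	set_mempolicy(MPOL_INTERLEAVE, &nodes, maxnode);

	/* VMA policy: bind one mapping to node 0, migrating existing pages. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	nodes = 0x1;				/* node 0 only */
	mbind(buf, len, MPOL_BIND, &nodes, maxnode, MPOL_MF_MOVE);

	/* Back to the default (local) policy for this process. */
	set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
#endif
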
561da177e4SLinus Torvalds /* Notebook:
571da177e4SLinus Torvalds    fix mmap readahead to honour policy and enable policy for any page cache
581da177e4SLinus Torvalds    object
591da177e4SLinus Torvalds    statistics for bigpages
601da177e4SLinus Torvalds    global policy for page cache? currently it uses process policy. Requires
611da177e4SLinus Torvalds    first item above.
621da177e4SLinus Torvalds    handle mremap for shared memory (currently ignored for the policy)
631da177e4SLinus Torvalds    grows down?
641da177e4SLinus Torvalds    make bind policy root only? It can trigger oom much faster and the
651da177e4SLinus Torvalds    kernel is not always graceful about that.
661da177e4SLinus Torvalds */
671da177e4SLinus Torvalds 
681da177e4SLinus Torvalds #include <linux/mempolicy.h>
691da177e4SLinus Torvalds #include <linux/mm.h>
701da177e4SLinus Torvalds #include <linux/highmem.h>
711da177e4SLinus Torvalds #include <linux/hugetlb.h>
721da177e4SLinus Torvalds #include <linux/kernel.h>
731da177e4SLinus Torvalds #include <linux/sched.h>
741da177e4SLinus Torvalds #include <linux/nodemask.h>
751da177e4SLinus Torvalds #include <linux/cpuset.h>
761da177e4SLinus Torvalds #include <linux/gfp.h>
771da177e4SLinus Torvalds #include <linux/slab.h>
781da177e4SLinus Torvalds #include <linux/string.h>
791da177e4SLinus Torvalds #include <linux/module.h>
80b488893aSPavel Emelyanov #include <linux/nsproxy.h>
811da177e4SLinus Torvalds #include <linux/interrupt.h>
821da177e4SLinus Torvalds #include <linux/init.h>
831da177e4SLinus Torvalds #include <linux/compat.h>
84dc9aa5b9SChristoph Lameter #include <linux/swap.h>
851a75a6c8SChristoph Lameter #include <linux/seq_file.h>
861a75a6c8SChristoph Lameter #include <linux/proc_fs.h>
87b20a3503SChristoph Lameter #include <linux/migrate.h>
8895a402c3SChristoph Lameter #include <linux/rmap.h>
8986c3a764SDavid Quigley #include <linux/security.h>
90dbcb0f19SAdrian Bunk #include <linux/syscalls.h>
91dc9aa5b9SChristoph Lameter 
921da177e4SLinus Torvalds #include <asm/tlbflush.h>
931da177e4SLinus Torvalds #include <asm/uaccess.h>
941da177e4SLinus Torvalds 
9538e35860SChristoph Lameter /* Internal flags */
96dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous VMAs */
9738e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
981a75a6c8SChristoph Lameter #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
99dc9aa5b9SChristoph Lameter 
100fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
101fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1021da177e4SLinus Torvalds 
1031da177e4SLinus Torvalds /* Highest zone. A specific allocation for a zone below that is not
1041da177e4SLinus Torvalds    policied. */
1056267276fSChristoph Lameter enum zone_type policy_zone = 0;
1061da177e4SLinus Torvalds 
107d42c6997SAndi Kleen struct mempolicy default_policy = {
1081da177e4SLinus Torvalds 	.refcnt = ATOMIC_INIT(1), /* never free it */
1091da177e4SLinus Torvalds 	.policy = MPOL_DEFAULT,
1101da177e4SLinus Torvalds };
1111da177e4SLinus Torvalds 
11237012946SDavid Rientjes static const struct mempolicy_operations {
11337012946SDavid Rientjes 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
11437012946SDavid Rientjes 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
11537012946SDavid Rientjes } mpol_ops[MPOL_MAX];
11637012946SDavid Rientjes 
11719770b32SMel Gorman /* Check that the nodemask contains at least one populated zone */
11837012946SDavid Rientjes static int is_valid_nodemask(const nodemask_t *nodemask)
1191da177e4SLinus Torvalds {
12019770b32SMel Gorman 	int nd, k;
1211da177e4SLinus Torvalds 
12219770b32SMel Gorman 	/* Check that there is something useful in this mask */
12319770b32SMel Gorman 	k = policy_zone;
12419770b32SMel Gorman 
12519770b32SMel Gorman 	for_each_node_mask(nd, *nodemask) {
12619770b32SMel Gorman 		struct zone *z;
12719770b32SMel Gorman 
12819770b32SMel Gorman 		for (k = 0; k <= policy_zone; k++) {
12919770b32SMel Gorman 			z = &NODE_DATA(nd)->node_zones[k];
130dd942ae3SAndi Kleen 			if (z->present_pages > 0)
13119770b32SMel Gorman 				return 1;
132dd942ae3SAndi Kleen 		}
133dd942ae3SAndi Kleen 	}
13419770b32SMel Gorman 
13519770b32SMel Gorman 	return 0;
1361da177e4SLinus Torvalds }
1371da177e4SLinus Torvalds 
138f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
139f5b087b5SDavid Rientjes {
1404c50bc01SDavid Rientjes 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
1414c50bc01SDavid Rientjes }
1424c50bc01SDavid Rientjes 
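/*
 * Worked example (added for illustration, not in the original source):
 * with MPOL_F_RELATIVE_NODES the user's mask is read as positions relative
 * to the allowed set.  For orig = {0,2} and rel = {4,5,6} (weight 3),
 * nodes_fold() leaves {0,2} unchanged (both are already below 3) and
 * nodes_onto() maps relative position 0 to node 4 and position 2 to node 6,
 * so *ret ends up as {4,6}.
 */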
1434c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
1444c50bc01SDavid Rientjes 				   const nodemask_t *rel)
1454c50bc01SDavid Rientjes {
1464c50bc01SDavid Rientjes 	nodemask_t tmp;
1474c50bc01SDavid Rientjes 	nodes_fold(tmp, *orig, nodes_weight(*rel));
1484c50bc01SDavid Rientjes 	nodes_onto(*ret, tmp, *rel);
149f5b087b5SDavid Rientjes }
150f5b087b5SDavid Rientjes 
15137012946SDavid Rientjes static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
15237012946SDavid Rientjes {
15337012946SDavid Rientjes 	if (nodes_empty(*nodes))
15437012946SDavid Rientjes 		return -EINVAL;
15537012946SDavid Rientjes 	pol->v.nodes = *nodes;
15637012946SDavid Rientjes 	return 0;
15737012946SDavid Rientjes }
15837012946SDavid Rientjes 
15937012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
16037012946SDavid Rientjes {
16137012946SDavid Rientjes 	if (!nodes)
16237012946SDavid Rientjes 		pol->v.preferred_node = -1;	/* local allocation */
16337012946SDavid Rientjes 	else if (nodes_empty(*nodes))
16437012946SDavid Rientjes 		return -EINVAL;			/*  no allowed nodes */
16537012946SDavid Rientjes 	else
16637012946SDavid Rientjes 		pol->v.preferred_node = first_node(*nodes);
16737012946SDavid Rientjes 	return 0;
16837012946SDavid Rientjes }
16937012946SDavid Rientjes 
17037012946SDavid Rientjes static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
17137012946SDavid Rientjes {
17237012946SDavid Rientjes 	if (!is_valid_nodemask(nodes))
17337012946SDavid Rientjes 		return -EINVAL;
17437012946SDavid Rientjes 	pol->v.nodes = *nodes;
17537012946SDavid Rientjes 	return 0;
17637012946SDavid Rientjes }
17737012946SDavid Rientjes 
1781da177e4SLinus Torvalds /* Create a new policy */
179028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
180028fec41SDavid Rientjes 				  nodemask_t *nodes)
1811da177e4SLinus Torvalds {
1821da177e4SLinus Torvalds 	struct mempolicy *policy;
183f5b087b5SDavid Rientjes 	nodemask_t cpuset_context_nmask;
18437012946SDavid Rientjes 	int ret;
1851da177e4SLinus Torvalds 
186028fec41SDavid Rientjes 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
187028fec41SDavid Rientjes 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
188140d5a49SPaul Mundt 
189*3e1f0645SDavid Rientjes 	if (mode == MPOL_DEFAULT) {
190*3e1f0645SDavid Rientjes 		if (nodes && !nodes_empty(*nodes))
19137012946SDavid Rientjes 			return ERR_PTR(-EINVAL);
192*3e1f0645SDavid Rientjes 		return NULL;
19337012946SDavid Rientjes 	}
194*3e1f0645SDavid Rientjes 	VM_BUG_ON(!nodes);
195*3e1f0645SDavid Rientjes 
196*3e1f0645SDavid Rientjes 	/*
197*3e1f0645SDavid Rientjes 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
198*3e1f0645SDavid Rientjes 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
199*3e1f0645SDavid Rientjes 	 * All other modes require a valid pointer to a non-empty nodemask.
200*3e1f0645SDavid Rientjes 	 */
201*3e1f0645SDavid Rientjes 	if (mode == MPOL_PREFERRED) {
202*3e1f0645SDavid Rientjes 		if (nodes_empty(*nodes)) {
203*3e1f0645SDavid Rientjes 			if (((flags & MPOL_F_STATIC_NODES) ||
204*3e1f0645SDavid Rientjes 			     (flags & MPOL_F_RELATIVE_NODES)))
205*3e1f0645SDavid Rientjes 				return ERR_PTR(-EINVAL);
206*3e1f0645SDavid Rientjes 			nodes = NULL;	/* flag local alloc */
207*3e1f0645SDavid Rientjes 		}
208*3e1f0645SDavid Rientjes 	} else if (nodes_empty(*nodes))
209*3e1f0645SDavid Rientjes 		return ERR_PTR(-EINVAL);
2101da177e4SLinus Torvalds 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2111da177e4SLinus Torvalds 	if (!policy)
2121da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2131da177e4SLinus Torvalds 	atomic_set(&policy->refcnt, 1);
21437012946SDavid Rientjes 	policy->policy = mode;
21537012946SDavid Rientjes 	policy->flags = flags;
216*3e1f0645SDavid Rientjes 
217*3e1f0645SDavid Rientjes 	if (nodes) {
218*3e1f0645SDavid Rientjes 		/*
219*3e1f0645SDavid Rientjes 		 * cpuset related setup doesn't apply to local allocation
220*3e1f0645SDavid Rientjes 		 */
221f5b087b5SDavid Rientjes 		cpuset_update_task_memory_state();
2224c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
2234c50bc01SDavid Rientjes 			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
2244c50bc01SDavid Rientjes 					       &cpuset_current_mems_allowed);
2254c50bc01SDavid Rientjes 		else
2264c50bc01SDavid Rientjes 			nodes_and(cpuset_context_nmask, *nodes,
2274c50bc01SDavid Rientjes 				  cpuset_current_mems_allowed);
228f5b087b5SDavid Rientjes 		if (mpol_store_user_nodemask(policy))
229f5b087b5SDavid Rientjes 			policy->w.user_nodemask = *nodes;
230f5b087b5SDavid Rientjes 		else
23137012946SDavid Rientjes 			policy->w.cpuset_mems_allowed =
23237012946SDavid Rientjes 						cpuset_mems_allowed(current);
2331da177e4SLinus Torvalds 	}
2341da177e4SLinus Torvalds 
23537012946SDavid Rientjes 	ret = mpol_ops[mode].create(policy,
236*3e1f0645SDavid Rientjes 				nodes ? &cpuset_context_nmask : NULL);
23737012946SDavid Rientjes 	if (ret < 0) {
23837012946SDavid Rientjes 		kmem_cache_free(policy_cache, policy);
23937012946SDavid Rientjes 		return ERR_PTR(ret);
24037012946SDavid Rientjes 	}
24137012946SDavid Rientjes 	return policy;
24237012946SDavid Rientjes }
24337012946SDavid Rientjes 
24437012946SDavid Rientjes static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
24537012946SDavid Rientjes {
24637012946SDavid Rientjes }
24737012946SDavid Rientjes 
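/*
 * Worked example (added for illustration): an MPOL_INTERLEAVE policy over
 * nodes {0,1}, created while cpuset_mems_allowed was {0,1}, whose cpuset is
 * later moved to {2,3}.  The default (no flags) remap below yields {2,3};
 * MPOL_F_RELATIVE_NODES also yields {2,3} here; MPOL_F_STATIC_NODES keeps
 * the user's {0,1}, whose intersection with the new set is empty.
 */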
24837012946SDavid Rientjes static void mpol_rebind_nodemask(struct mempolicy *pol,
24937012946SDavid Rientjes 				 const nodemask_t *nodes)
2501d0d2680SDavid Rientjes {
2511d0d2680SDavid Rientjes 	nodemask_t tmp;
2521d0d2680SDavid Rientjes 
25337012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES)
25437012946SDavid Rientjes 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
25537012946SDavid Rientjes 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
25637012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
2571d0d2680SDavid Rientjes 	else {
25837012946SDavid Rientjes 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
25937012946SDavid Rientjes 			    *nodes);
26037012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
2611d0d2680SDavid Rientjes 	}
26237012946SDavid Rientjes 
2631d0d2680SDavid Rientjes 	pol->v.nodes = tmp;
2641d0d2680SDavid Rientjes 	if (!node_isset(current->il_next, tmp)) {
2651d0d2680SDavid Rientjes 		current->il_next = next_node(current->il_next, tmp);
2661d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
2671d0d2680SDavid Rientjes 			current->il_next = first_node(tmp);
2681d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
2691d0d2680SDavid Rientjes 			current->il_next = numa_node_id();
2701d0d2680SDavid Rientjes 	}
27137012946SDavid Rientjes }
27237012946SDavid Rientjes 
27337012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
27437012946SDavid Rientjes 				  const nodemask_t *nodes)
27537012946SDavid Rientjes {
27637012946SDavid Rientjes 	nodemask_t tmp;
27737012946SDavid Rientjes 
27837012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES) {
2791d0d2680SDavid Rientjes 		int node = first_node(pol->w.user_nodemask);
2801d0d2680SDavid Rientjes 
28137012946SDavid Rientjes 		if (node_isset(node, *nodes))
2821d0d2680SDavid Rientjes 			pol->v.preferred_node = node;
2831d0d2680SDavid Rientjes 		else
2841d0d2680SDavid Rientjes 			pol->v.preferred_node = -1;
28537012946SDavid Rientjes 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
28637012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
2871d0d2680SDavid Rientjes 		pol->v.preferred_node = first_node(tmp);
288*3e1f0645SDavid Rientjes 	} else if (pol->v.preferred_node != -1) {
2891d0d2680SDavid Rientjes 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
29037012946SDavid Rientjes 						   pol->w.cpuset_mems_allowed,
29137012946SDavid Rientjes 						   *nodes);
29237012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
2931d0d2680SDavid Rientjes 	}
2941d0d2680SDavid Rientjes }
29537012946SDavid Rientjes 
29637012946SDavid Rientjes /* Migrate a policy to a different set of nodes */
29737012946SDavid Rientjes static void mpol_rebind_policy(struct mempolicy *pol,
29837012946SDavid Rientjes 			       const nodemask_t *newmask)
29937012946SDavid Rientjes {
30037012946SDavid Rientjes 	if (!pol)
30137012946SDavid Rientjes 		return;
30237012946SDavid Rientjes 	if (!mpol_store_user_nodemask(pol) &&
30337012946SDavid Rientjes 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
30437012946SDavid Rientjes 		return;
30537012946SDavid Rientjes 	mpol_ops[pol->policy].rebind(pol, newmask);
3061d0d2680SDavid Rientjes }
3071d0d2680SDavid Rientjes 
3081d0d2680SDavid Rientjes /*
3091d0d2680SDavid Rientjes  * Wrapper for mpol_rebind_policy() that just requires task
3101d0d2680SDavid Rientjes  * pointer, and updates task mempolicy.
3111d0d2680SDavid Rientjes  */
3121d0d2680SDavid Rientjes 
3131d0d2680SDavid Rientjes void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
3141d0d2680SDavid Rientjes {
3151d0d2680SDavid Rientjes 	mpol_rebind_policy(tsk->mempolicy, new);
3161d0d2680SDavid Rientjes }
3171d0d2680SDavid Rientjes 
3181d0d2680SDavid Rientjes /*
3191d0d2680SDavid Rientjes  * Rebind each vma in mm to new nodemask.
3201d0d2680SDavid Rientjes  *
3211d0d2680SDavid Rientjes  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
3221d0d2680SDavid Rientjes  */
3231d0d2680SDavid Rientjes 
3241d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
3251d0d2680SDavid Rientjes {
3261d0d2680SDavid Rientjes 	struct vm_area_struct *vma;
3271d0d2680SDavid Rientjes 
3281d0d2680SDavid Rientjes 	down_write(&mm->mmap_sem);
3291d0d2680SDavid Rientjes 	for (vma = mm->mmap; vma; vma = vma->vm_next)
3301d0d2680SDavid Rientjes 		mpol_rebind_policy(vma->vm_policy, new);
3311d0d2680SDavid Rientjes 	up_write(&mm->mmap_sem);
3321d0d2680SDavid Rientjes }
3331d0d2680SDavid Rientjes 
33437012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
33537012946SDavid Rientjes 	[MPOL_DEFAULT] = {
33637012946SDavid Rientjes 		.rebind = mpol_rebind_default,
33737012946SDavid Rientjes 	},
33837012946SDavid Rientjes 	[MPOL_INTERLEAVE] = {
33937012946SDavid Rientjes 		.create = mpol_new_interleave,
34037012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
34137012946SDavid Rientjes 	},
34237012946SDavid Rientjes 	[MPOL_PREFERRED] = {
34337012946SDavid Rientjes 		.create = mpol_new_preferred,
34437012946SDavid Rientjes 		.rebind = mpol_rebind_preferred,
34537012946SDavid Rientjes 	},
34637012946SDavid Rientjes 	[MPOL_BIND] = {
34737012946SDavid Rientjes 		.create = mpol_new_bind,
34837012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
34937012946SDavid Rientjes 	},
35037012946SDavid Rientjes };
35137012946SDavid Rientjes 
352397874dfSChristoph Lameter static void gather_stats(struct page *, void *, int pte_dirty);
353fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
354fc301289SChristoph Lameter 				unsigned long flags);
3551a75a6c8SChristoph Lameter 
35638e35860SChristoph Lameter /* Scan through pages, checking whether they match the given conditions. */
357b5810039SNick Piggin static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
358dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
359dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
36038e35860SChristoph Lameter 		void *private)
3611da177e4SLinus Torvalds {
36291612e0dSHugh Dickins 	pte_t *orig_pte;
36391612e0dSHugh Dickins 	pte_t *pte;
364705e87c0SHugh Dickins 	spinlock_t *ptl;
365941150a3SHugh Dickins 
366705e87c0SHugh Dickins 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
36791612e0dSHugh Dickins 	do {
3686aab341eSLinus Torvalds 		struct page *page;
36925ba77c1SAndy Whitcroft 		int nid;
37091612e0dSHugh Dickins 
37191612e0dSHugh Dickins 		if (!pte_present(*pte))
37291612e0dSHugh Dickins 			continue;
3736aab341eSLinus Torvalds 		page = vm_normal_page(vma, addr, *pte);
3746aab341eSLinus Torvalds 		if (!page)
37591612e0dSHugh Dickins 			continue;
376053837fcSNick Piggin 		/*
377053837fcSNick Piggin 		 * The check for PageReserved here is important to avoid
378053837fcSNick Piggin 		 * handling zero pages and other pages that may have been
379053837fcSNick Piggin 		 * marked special by the system.
380053837fcSNick Piggin 		 *
381053837fcSNick Piggin 		 * If PageReserved were not checked here then e.g.
382053837fcSNick Piggin 		 * the location of the zero page could have an influence
383053837fcSNick Piggin 		 * on MPOL_MF_STRICT, zero pages would be counted for
384053837fcSNick Piggin 		 * the per node stats, and there would be useless attempts
385053837fcSNick Piggin 		 * to put zero pages on the migration list.
386053837fcSNick Piggin 		 */
387f4598c8bSChristoph Lameter 		if (PageReserved(page))
388f4598c8bSChristoph Lameter 			continue;
3896aab341eSLinus Torvalds 		nid = page_to_nid(page);
39038e35860SChristoph Lameter 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
39138e35860SChristoph Lameter 			continue;
39238e35860SChristoph Lameter 
3931a75a6c8SChristoph Lameter 		if (flags & MPOL_MF_STATS)
394397874dfSChristoph Lameter 			gather_stats(page, private, pte_dirty(*pte));
395053837fcSNick Piggin 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
396fc301289SChristoph Lameter 			migrate_page_add(page, private, flags);
397dc9aa5b9SChristoph Lameter 		else
3981da177e4SLinus Torvalds 			break;
39991612e0dSHugh Dickins 	} while (pte++, addr += PAGE_SIZE, addr != end);
400705e87c0SHugh Dickins 	pte_unmap_unlock(orig_pte, ptl);
40191612e0dSHugh Dickins 	return addr != end;
40291612e0dSHugh Dickins }
40391612e0dSHugh Dickins 
404b5810039SNick Piggin static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
405dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
406dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
40738e35860SChristoph Lameter 		void *private)
40891612e0dSHugh Dickins {
40991612e0dSHugh Dickins 	pmd_t *pmd;
41091612e0dSHugh Dickins 	unsigned long next;
41191612e0dSHugh Dickins 
41291612e0dSHugh Dickins 	pmd = pmd_offset(pud, addr);
41391612e0dSHugh Dickins 	do {
41491612e0dSHugh Dickins 		next = pmd_addr_end(addr, end);
41591612e0dSHugh Dickins 		if (pmd_none_or_clear_bad(pmd))
41691612e0dSHugh Dickins 			continue;
417dc9aa5b9SChristoph Lameter 		if (check_pte_range(vma, pmd, addr, next, nodes,
41838e35860SChristoph Lameter 				    flags, private))
41991612e0dSHugh Dickins 			return -EIO;
42091612e0dSHugh Dickins 	} while (pmd++, addr = next, addr != end);
42191612e0dSHugh Dickins 	return 0;
42291612e0dSHugh Dickins }
42391612e0dSHugh Dickins 
424b5810039SNick Piggin static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
425dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
426dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
42738e35860SChristoph Lameter 		void *private)
42891612e0dSHugh Dickins {
42991612e0dSHugh Dickins 	pud_t *pud;
43091612e0dSHugh Dickins 	unsigned long next;
43191612e0dSHugh Dickins 
43291612e0dSHugh Dickins 	pud = pud_offset(pgd, addr);
43391612e0dSHugh Dickins 	do {
43491612e0dSHugh Dickins 		next = pud_addr_end(addr, end);
43591612e0dSHugh Dickins 		if (pud_none_or_clear_bad(pud))
43691612e0dSHugh Dickins 			continue;
437dc9aa5b9SChristoph Lameter 		if (check_pmd_range(vma, pud, addr, next, nodes,
43838e35860SChristoph Lameter 				    flags, private))
43991612e0dSHugh Dickins 			return -EIO;
44091612e0dSHugh Dickins 	} while (pud++, addr = next, addr != end);
44191612e0dSHugh Dickins 	return 0;
44291612e0dSHugh Dickins }
44391612e0dSHugh Dickins 
444b5810039SNick Piggin static inline int check_pgd_range(struct vm_area_struct *vma,
445dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
446dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
44738e35860SChristoph Lameter 		void *private)
44891612e0dSHugh Dickins {
44991612e0dSHugh Dickins 	pgd_t *pgd;
45091612e0dSHugh Dickins 	unsigned long next;
45191612e0dSHugh Dickins 
452b5810039SNick Piggin 	pgd = pgd_offset(vma->vm_mm, addr);
45391612e0dSHugh Dickins 	do {
45491612e0dSHugh Dickins 		next = pgd_addr_end(addr, end);
45591612e0dSHugh Dickins 		if (pgd_none_or_clear_bad(pgd))
45691612e0dSHugh Dickins 			continue;
457dc9aa5b9SChristoph Lameter 		if (check_pud_range(vma, pgd, addr, next, nodes,
45838e35860SChristoph Lameter 				    flags, private))
45991612e0dSHugh Dickins 			return -EIO;
46091612e0dSHugh Dickins 	} while (pgd++, addr = next, addr != end);
46191612e0dSHugh Dickins 	return 0;
4621da177e4SLinus Torvalds }
4631da177e4SLinus Torvalds 
464dc9aa5b9SChristoph Lameter /*
465dc9aa5b9SChristoph Lameter  * Check if all pages in a range are on a set of nodes.
466dc9aa5b9SChristoph Lameter  * If pagelist != NULL then isolate pages from the LRU and
467dc9aa5b9SChristoph Lameter  * put them on the pagelist.
468dc9aa5b9SChristoph Lameter  */
4691da177e4SLinus Torvalds static struct vm_area_struct *
4701da177e4SLinus Torvalds check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
47138e35860SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags, void *private)
4721da177e4SLinus Torvalds {
4731da177e4SLinus Torvalds 	int err;
4741da177e4SLinus Torvalds 	struct vm_area_struct *first, *vma, *prev;
4751da177e4SLinus Torvalds 
47690036ee5SChristoph Lameter 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
47790036ee5SChristoph Lameter 
478b20a3503SChristoph Lameter 		err = migrate_prep();
479b20a3503SChristoph Lameter 		if (err)
480b20a3503SChristoph Lameter 			return ERR_PTR(err);
48190036ee5SChristoph Lameter 	}
482053837fcSNick Piggin 
4831da177e4SLinus Torvalds 	first = find_vma(mm, start);
4841da177e4SLinus Torvalds 	if (!first)
4851da177e4SLinus Torvalds 		return ERR_PTR(-EFAULT);
4861da177e4SLinus Torvalds 	prev = NULL;
4871da177e4SLinus Torvalds 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
488dc9aa5b9SChristoph Lameter 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
4891da177e4SLinus Torvalds 			if (!vma->vm_next && vma->vm_end < end)
4901da177e4SLinus Torvalds 				return ERR_PTR(-EFAULT);
4911da177e4SLinus Torvalds 			if (prev && prev->vm_end < vma->vm_start)
4921da177e4SLinus Torvalds 				return ERR_PTR(-EFAULT);
493dc9aa5b9SChristoph Lameter 		}
494dc9aa5b9SChristoph Lameter 		if (!is_vm_hugetlb_page(vma) &&
495dc9aa5b9SChristoph Lameter 		    ((flags & MPOL_MF_STRICT) ||
496dc9aa5b9SChristoph Lameter 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
497dc9aa5b9SChristoph Lameter 				vma_migratable(vma)))) {
4985b952b3cSAndi Kleen 			unsigned long endvma = vma->vm_end;
499dc9aa5b9SChristoph Lameter 
5005b952b3cSAndi Kleen 			if (endvma > end)
5015b952b3cSAndi Kleen 				endvma = end;
5025b952b3cSAndi Kleen 			if (vma->vm_start > start)
5035b952b3cSAndi Kleen 				start = vma->vm_start;
504dc9aa5b9SChristoph Lameter 			err = check_pgd_range(vma, start, endvma, nodes,
50538e35860SChristoph Lameter 						flags, private);
5061da177e4SLinus Torvalds 			if (err) {
5071da177e4SLinus Torvalds 				first = ERR_PTR(err);
5081da177e4SLinus Torvalds 				break;
5091da177e4SLinus Torvalds 			}
5101da177e4SLinus Torvalds 		}
5111da177e4SLinus Torvalds 		prev = vma;
5121da177e4SLinus Torvalds 	}
5131da177e4SLinus Torvalds 	return first;
5141da177e4SLinus Torvalds }
5151da177e4SLinus Torvalds 
5161da177e4SLinus Torvalds /* Apply policy to a single VMA */
5171da177e4SLinus Torvalds static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
5181da177e4SLinus Torvalds {
5191da177e4SLinus Torvalds 	int err = 0;
5201da177e4SLinus Torvalds 	struct mempolicy *old = vma->vm_policy;
5211da177e4SLinus Torvalds 
522140d5a49SPaul Mundt 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
5231da177e4SLinus Torvalds 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
5241da177e4SLinus Torvalds 		 vma->vm_ops, vma->vm_file,
5251da177e4SLinus Torvalds 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
5261da177e4SLinus Torvalds 
5271da177e4SLinus Torvalds 	if (vma->vm_ops && vma->vm_ops->set_policy)
5281da177e4SLinus Torvalds 		err = vma->vm_ops->set_policy(vma, new);
5291da177e4SLinus Torvalds 	if (!err) {
5301da177e4SLinus Torvalds 		mpol_get(new);
5311da177e4SLinus Torvalds 		vma->vm_policy = new;
5321da177e4SLinus Torvalds 		mpol_free(old);
5331da177e4SLinus Torvalds 	}
5341da177e4SLinus Torvalds 	return err;
5351da177e4SLinus Torvalds }
5361da177e4SLinus Torvalds 
5371da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. */
5381da177e4SLinus Torvalds static int mbind_range(struct vm_area_struct *vma, unsigned long start,
5391da177e4SLinus Torvalds 		       unsigned long end, struct mempolicy *new)
5401da177e4SLinus Torvalds {
5411da177e4SLinus Torvalds 	struct vm_area_struct *next;
5421da177e4SLinus Torvalds 	int err;
5431da177e4SLinus Torvalds 
5441da177e4SLinus Torvalds 	err = 0;
5451da177e4SLinus Torvalds 	for (; vma && vma->vm_start < end; vma = next) {
5461da177e4SLinus Torvalds 		next = vma->vm_next;
5471da177e4SLinus Torvalds 		if (vma->vm_start < start)
5481da177e4SLinus Torvalds 			err = split_vma(vma->vm_mm, vma, start, 1);
5491da177e4SLinus Torvalds 		if (!err && vma->vm_end > end)
5501da177e4SLinus Torvalds 			err = split_vma(vma->vm_mm, vma, end, 0);
5511da177e4SLinus Torvalds 		if (!err)
5521da177e4SLinus Torvalds 			err = policy_vma(vma, new);
5531da177e4SLinus Torvalds 		if (err)
5541da177e4SLinus Torvalds 			break;
5551da177e4SLinus Torvalds 	}
5561da177e4SLinus Torvalds 	return err;
5571da177e4SLinus Torvalds }
5581da177e4SLinus Torvalds 
559c61afb18SPaul Jackson /*
560c61afb18SPaul Jackson  * Update task->flags PF_MEMPOLICY bit: set iff non-default
561c61afb18SPaul Jackson  * mempolicy.  Allows more rapid checking of this (combined perhaps
562c61afb18SPaul Jackson  * with other PF_* flag bits) on memory allocation hot code paths.
563c61afb18SPaul Jackson  *
564c61afb18SPaul Jackson  * If called from outside this file, the task 'p' should -only- be
565c61afb18SPaul Jackson  * a newly forked child not yet visible on the task list, because
566c61afb18SPaul Jackson  * manipulating the task flags of a visible task is not safe.
567c61afb18SPaul Jackson  *
568c61afb18SPaul Jackson  * The above limitation is why this routine has the funny name
569c61afb18SPaul Jackson  * mpol_fix_fork_child_flag().
570c61afb18SPaul Jackson  *
571c61afb18SPaul Jackson  * It is also safe to call this with a task pointer of current,
572c61afb18SPaul Jackson  * which the static wrapper mpol_set_task_struct_flag() does,
573c61afb18SPaul Jackson  * for use within this file.
574c61afb18SPaul Jackson  */
575c61afb18SPaul Jackson 
576c61afb18SPaul Jackson void mpol_fix_fork_child_flag(struct task_struct *p)
577c61afb18SPaul Jackson {
578c61afb18SPaul Jackson 	if (p->mempolicy)
579c61afb18SPaul Jackson 		p->flags |= PF_MEMPOLICY;
580c61afb18SPaul Jackson 	else
581c61afb18SPaul Jackson 		p->flags &= ~PF_MEMPOLICY;
582c61afb18SPaul Jackson }
583c61afb18SPaul Jackson 
584c61afb18SPaul Jackson static void mpol_set_task_struct_flag(void)
585c61afb18SPaul Jackson {
586c61afb18SPaul Jackson 	mpol_fix_fork_child_flag(current);
587c61afb18SPaul Jackson }
588c61afb18SPaul Jackson 
5891da177e4SLinus Torvalds /* Set the process memory policy */
590028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
591028fec41SDavid Rientjes 			     nodemask_t *nodes)
5921da177e4SLinus Torvalds {
5931da177e4SLinus Torvalds 	struct mempolicy *new;
5941da177e4SLinus Torvalds 
595028fec41SDavid Rientjes 	new = mpol_new(mode, flags, nodes);
5961da177e4SLinus Torvalds 	if (IS_ERR(new))
5971da177e4SLinus Torvalds 		return PTR_ERR(new);
5981da177e4SLinus Torvalds 	mpol_free(current->mempolicy);
5991da177e4SLinus Torvalds 	current->mempolicy = new;
600c61afb18SPaul Jackson 	mpol_set_task_struct_flag();
601f5b087b5SDavid Rientjes 	if (new && new->policy == MPOL_INTERLEAVE &&
602f5b087b5SDavid Rientjes 	    nodes_weight(new->v.nodes))
603dfcd3c0dSAndi Kleen 		current->il_next = first_node(new->v.nodes);
6041da177e4SLinus Torvalds 	return 0;
6051da177e4SLinus Torvalds }
6061da177e4SLinus Torvalds 
6071da177e4SLinus Torvalds /* Fill a zone bitmap for a policy */
608dfcd3c0dSAndi Kleen static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
6091da177e4SLinus Torvalds {
610dfcd3c0dSAndi Kleen 	nodes_clear(*nodes);
6111da177e4SLinus Torvalds 	switch (p->policy) {
6121da177e4SLinus Torvalds 	case MPOL_DEFAULT:
6131da177e4SLinus Torvalds 		break;
61419770b32SMel Gorman 	case MPOL_BIND:
61519770b32SMel Gorman 		/* Fall through */
6161da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
617dfcd3c0dSAndi Kleen 		*nodes = p->v.nodes;
6181da177e4SLinus Torvalds 		break;
6191da177e4SLinus Torvalds 	case MPOL_PREFERRED:
62056bbd65dSChristoph Lameter 		/* or use current node instead of memory_map? */
6211da177e4SLinus Torvalds 		if (p->v.preferred_node < 0)
62256bbd65dSChristoph Lameter 			*nodes = node_states[N_HIGH_MEMORY];
6231da177e4SLinus Torvalds 		else
624dfcd3c0dSAndi Kleen 			node_set(p->v.preferred_node, *nodes);
6251da177e4SLinus Torvalds 		break;
6261da177e4SLinus Torvalds 	default:
6271da177e4SLinus Torvalds 		BUG();
6281da177e4SLinus Torvalds 	}
6291da177e4SLinus Torvalds }
6301da177e4SLinus Torvalds 
6311da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr)
6321da177e4SLinus Torvalds {
6331da177e4SLinus Torvalds 	struct page *p;
6341da177e4SLinus Torvalds 	int err;
6351da177e4SLinus Torvalds 
6361da177e4SLinus Torvalds 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
6371da177e4SLinus Torvalds 	if (err >= 0) {
6381da177e4SLinus Torvalds 		err = page_to_nid(p);
6391da177e4SLinus Torvalds 		put_page(p);
6401da177e4SLinus Torvalds 	}
6411da177e4SLinus Torvalds 	return err;
6421da177e4SLinus Torvalds }
6431da177e4SLinus Torvalds 
6441da177e4SLinus Torvalds /* Retrieve NUMA policy */
645dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
6461da177e4SLinus Torvalds 			     unsigned long addr, unsigned long flags)
6471da177e4SLinus Torvalds {
6488bccd85fSChristoph Lameter 	int err;
6491da177e4SLinus Torvalds 	struct mm_struct *mm = current->mm;
6501da177e4SLinus Torvalds 	struct vm_area_struct *vma = NULL;
6511da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
6521da177e4SLinus Torvalds 
653cf2a473cSPaul Jackson 	cpuset_update_task_memory_state();
654754af6f5SLee Schermerhorn 	if (flags &
655754af6f5SLee Schermerhorn 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
6561da177e4SLinus Torvalds 		return -EINVAL;
657754af6f5SLee Schermerhorn 
658754af6f5SLee Schermerhorn 	if (flags & MPOL_F_MEMS_ALLOWED) {
659754af6f5SLee Schermerhorn 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
660754af6f5SLee Schermerhorn 			return -EINVAL;
661754af6f5SLee Schermerhorn 		*policy = 0;	/* just so it's initialized */
662754af6f5SLee Schermerhorn 		*nmask  = cpuset_current_mems_allowed;
663754af6f5SLee Schermerhorn 		return 0;
664754af6f5SLee Schermerhorn 	}
665754af6f5SLee Schermerhorn 
6661da177e4SLinus Torvalds 	if (flags & MPOL_F_ADDR) {
6671da177e4SLinus Torvalds 		down_read(&mm->mmap_sem);
6681da177e4SLinus Torvalds 		vma = find_vma_intersection(mm, addr, addr+1);
6691da177e4SLinus Torvalds 		if (!vma) {
6701da177e4SLinus Torvalds 			up_read(&mm->mmap_sem);
6711da177e4SLinus Torvalds 			return -EFAULT;
6721da177e4SLinus Torvalds 		}
6731da177e4SLinus Torvalds 		if (vma->vm_ops && vma->vm_ops->get_policy)
6741da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
6751da177e4SLinus Torvalds 		else
6761da177e4SLinus Torvalds 			pol = vma->vm_policy;
6771da177e4SLinus Torvalds 	} else if (addr)
6781da177e4SLinus Torvalds 		return -EINVAL;
6791da177e4SLinus Torvalds 
6801da177e4SLinus Torvalds 	if (!pol)
6811da177e4SLinus Torvalds 		pol = &default_policy;
6821da177e4SLinus Torvalds 
6831da177e4SLinus Torvalds 	if (flags & MPOL_F_NODE) {
6841da177e4SLinus Torvalds 		if (flags & MPOL_F_ADDR) {
6851da177e4SLinus Torvalds 			err = lookup_node(mm, addr);
6861da177e4SLinus Torvalds 			if (err < 0)
6871da177e4SLinus Torvalds 				goto out;
6888bccd85fSChristoph Lameter 			*policy = err;
6891da177e4SLinus Torvalds 		} else if (pol == current->mempolicy &&
6901da177e4SLinus Torvalds 				pol->policy == MPOL_INTERLEAVE) {
6918bccd85fSChristoph Lameter 			*policy = current->il_next;
6921da177e4SLinus Torvalds 		} else {
6931da177e4SLinus Torvalds 			err = -EINVAL;
6941da177e4SLinus Torvalds 			goto out;
6951da177e4SLinus Torvalds 		}
6961da177e4SLinus Torvalds 	} else
697028fec41SDavid Rientjes 		*policy = pol->policy | pol->flags;
6981da177e4SLinus Torvalds 
6991da177e4SLinus Torvalds 	if (vma) {
7001da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
7011da177e4SLinus Torvalds 		vma = NULL;
7021da177e4SLinus Torvalds 	}
7031da177e4SLinus Torvalds 
7041da177e4SLinus Torvalds 	err = 0;
7058bccd85fSChristoph Lameter 	if (nmask)
7068bccd85fSChristoph Lameter 		get_zonemask(pol, nmask);
7071da177e4SLinus Torvalds 
7081da177e4SLinus Torvalds  out:
7091da177e4SLinus Torvalds 	if (vma)
7101da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
7111da177e4SLinus Torvalds 	return err;
7121da177e4SLinus Torvalds }
7131da177e4SLinus Torvalds 
714b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
7158bccd85fSChristoph Lameter /*
7166ce3c4c0SChristoph Lameter  * page migration
7176ce3c4c0SChristoph Lameter  */
718fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
719fc301289SChristoph Lameter 				unsigned long flags)
7206ce3c4c0SChristoph Lameter {
7216ce3c4c0SChristoph Lameter 	/*
722fc301289SChristoph Lameter 	 * Avoid migrating a page that is shared with others.
7236ce3c4c0SChristoph Lameter 	 */
724b20a3503SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
725b20a3503SChristoph Lameter 		isolate_lru_page(page, pagelist);
7266ce3c4c0SChristoph Lameter }
7276ce3c4c0SChristoph Lameter 
728742755a1SChristoph Lameter static struct page *new_node_page(struct page *page, unsigned long node, int **x)
72995a402c3SChristoph Lameter {
730769848c0SMel Gorman 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
73195a402c3SChristoph Lameter }
73295a402c3SChristoph Lameter 
7336ce3c4c0SChristoph Lameter /*
7347e2ab150SChristoph Lameter  * Migrate pages from one node to a target node.
7357e2ab150SChristoph Lameter  * Returns error or the number of pages not migrated.
7367e2ab150SChristoph Lameter  */
737dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
738dbcb0f19SAdrian Bunk 			   int flags)
7397e2ab150SChristoph Lameter {
7407e2ab150SChristoph Lameter 	nodemask_t nmask;
7417e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
7427e2ab150SChristoph Lameter 	int err = 0;
7437e2ab150SChristoph Lameter 
7447e2ab150SChristoph Lameter 	nodes_clear(nmask);
7457e2ab150SChristoph Lameter 	node_set(source, nmask);
7467e2ab150SChristoph Lameter 
7477e2ab150SChristoph Lameter 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
7487e2ab150SChristoph Lameter 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
7497e2ab150SChristoph Lameter 
7507e2ab150SChristoph Lameter 	if (!list_empty(&pagelist))
75195a402c3SChristoph Lameter 		err = migrate_pages(&pagelist, new_node_page, dest);
75295a402c3SChristoph Lameter 
7537e2ab150SChristoph Lameter 	return err;
7547e2ab150SChristoph Lameter }
7557e2ab150SChristoph Lameter 
7567e2ab150SChristoph Lameter /*
7577e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
7587e2ab150SChristoph Lameter  * layout as much as possible.
75939743889SChristoph Lameter  *
76039743889SChristoph Lameter  * Returns the number of pages that could not be moved.
76139743889SChristoph Lameter  */
76239743889SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
76339743889SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
76439743889SChristoph Lameter {
76539743889SChristoph Lameter 	LIST_HEAD(pagelist);
7667e2ab150SChristoph Lameter 	int busy = 0;
7677e2ab150SChristoph Lameter 	int err = 0;
7687e2ab150SChristoph Lameter 	nodemask_t tmp;
76939743889SChristoph Lameter 
77039743889SChristoph Lameter 	down_read(&mm->mmap_sem);
771d4984711SChristoph Lameter 
7727b2259b3SChristoph Lameter 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
7737b2259b3SChristoph Lameter 	if (err)
7747b2259b3SChristoph Lameter 		goto out;
7757b2259b3SChristoph Lameter 
7767e2ab150SChristoph Lameter /*
7777e2ab150SChristoph Lameter  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
7787e2ab150SChristoph Lameter  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
7797e2ab150SChristoph Lameter  * bit in 'tmp', and return that <source, dest> pair for migration.
7807e2ab150SChristoph Lameter  * The pair of nodemasks 'to' and 'from' define the map.
7817e2ab150SChristoph Lameter  *
7827e2ab150SChristoph Lameter  * If no pair of bits is found that way, fallback to picking some
7837e2ab150SChristoph Lameter  * pair of 'source' and 'dest' bits that are not the same.  If the
7847e2ab150SChristoph Lameter  * 'source' and 'dest' bits are the same, this represents a node
7857e2ab150SChristoph Lameter  * that will be migrating to itself, so no pages need move.
7867e2ab150SChristoph Lameter  *
7877e2ab150SChristoph Lameter  * If no bits are left in 'tmp', or if all remaining bits left
7887e2ab150SChristoph Lameter  * in 'tmp' correspond to the same bit in 'to', return false
7897e2ab150SChristoph Lameter  * (nothing left to migrate).
7907e2ab150SChristoph Lameter  *
7917e2ab150SChristoph Lameter  * This lets us pick a pair of nodes to migrate between, such that
7927e2ab150SChristoph Lameter  * if possible the dest node is not already occupied by some other
7937e2ab150SChristoph Lameter  * source node, minimizing the risk of overloading the memory on a
7947e2ab150SChristoph Lameter  * node that would happen if we migrated incoming memory to a node
7957e2ab150SChristoph Lameter  * before migrating outgoing memory source that same node.
7967e2ab150SChristoph Lameter  *
7977e2ab150SChristoph Lameter  * A single scan of tmp is sufficient.  As we go, we remember the
7987e2ab150SChristoph Lameter  * most recent <s, d> pair that moved (s != d).  If we find a pair
7997e2ab150SChristoph Lameter  * that not only moved, but what's better, moved to an empty slot
8007e2ab150SChristoph Lameter  * (d is not set in tmp), then we break out then, with that pair.
8017e2ab150SChristoph Lameter  * Otherwise when we finish scanning tmp, we at least have the
8027e2ab150SChristoph Lameter  * most recent <s, d> pair that moved.  If we get all the way through
8037e2ab150SChristoph Lameter  * the scan of tmp without finding any node that moved, much less
8047e2ab150SChristoph Lameter  * moved to an empty node, then there is nothing left worth migrating.
8057e2ab150SChristoph Lameter  */
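/*
 * Worked example (added for illustration): from_nodes = {0,1} and
 * to_nodes = {1,2}, so node_remap() gives 0 -> 1 and 1 -> 2.  Scanning
 * tmp = {0,1}: pair <0,1> moves, but its destination 1 is still a pending
 * source, so keep looking; pair <1,2> moves to a node outside tmp, so
 * node 1 is migrated to node 2 first, and node 0 then moves to the
 * now-drained node 1 on the following pass.
 */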
8067e2ab150SChristoph Lameter 
8077e2ab150SChristoph Lameter 	tmp = *from_nodes;
8087e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
8097e2ab150SChristoph Lameter 		int s,d;
8107e2ab150SChristoph Lameter 		int source = -1;
8117e2ab150SChristoph Lameter 		int dest = 0;
8127e2ab150SChristoph Lameter 
8137e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
8147e2ab150SChristoph Lameter 			d = node_remap(s, *from_nodes, *to_nodes);
8157e2ab150SChristoph Lameter 			if (s == d)
8167e2ab150SChristoph Lameter 				continue;
8177e2ab150SChristoph Lameter 
8187e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
8197e2ab150SChristoph Lameter 			dest = d;
8207e2ab150SChristoph Lameter 
8217e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
8227e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
8237e2ab150SChristoph Lameter 				break;
8247e2ab150SChristoph Lameter 		}
8257e2ab150SChristoph Lameter 		if (source == -1)
8267e2ab150SChristoph Lameter 			break;
8277e2ab150SChristoph Lameter 
8287e2ab150SChristoph Lameter 		node_clear(source, tmp);
8297e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
8307e2ab150SChristoph Lameter 		if (err > 0)
8317e2ab150SChristoph Lameter 			busy += err;
8327e2ab150SChristoph Lameter 		if (err < 0)
8337e2ab150SChristoph Lameter 			break;
83439743889SChristoph Lameter 	}
8357b2259b3SChristoph Lameter out:
83639743889SChristoph Lameter 	up_read(&mm->mmap_sem);
8377e2ab150SChristoph Lameter 	if (err < 0)
8387e2ab150SChristoph Lameter 		return err;
8397e2ab150SChristoph Lameter 	return busy;
840b20a3503SChristoph Lameter 
84139743889SChristoph Lameter }
84239743889SChristoph Lameter 
8433ad33b24SLee Schermerhorn /*
8443ad33b24SLee Schermerhorn  * Allocate a new page for page migration based on vma policy.
8453ad33b24SLee Schermerhorn  * Start by assuming that the page is mapped by the vma pointed to by
8463ad33b24SLee Schermerhorn  * @private; if not, search forward from there.  N.B., this assumes that the
8473ad33b24SLee Schermerhorn  * list of pages handed to migrate_pages()--which is how we get here--
8483ad33b24SLee Schermerhorn  * is in virtual address order.
8493ad33b24SLee Schermerhorn  */
850742755a1SChristoph Lameter static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
85195a402c3SChristoph Lameter {
85295a402c3SChristoph Lameter 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
8533ad33b24SLee Schermerhorn 	unsigned long uninitialized_var(address);
85495a402c3SChristoph Lameter 
8553ad33b24SLee Schermerhorn 	while (vma) {
8563ad33b24SLee Schermerhorn 		address = page_address_in_vma(page, vma);
8573ad33b24SLee Schermerhorn 		if (address != -EFAULT)
8583ad33b24SLee Schermerhorn 			break;
8593ad33b24SLee Schermerhorn 		vma = vma->vm_next;
8603ad33b24SLee Schermerhorn 	}
8613ad33b24SLee Schermerhorn 
8623ad33b24SLee Schermerhorn 	/*
8633ad33b24SLee Schermerhorn 	 * if !vma, alloc_page_vma() will use task or system default policy
8643ad33b24SLee Schermerhorn 	 */
8653ad33b24SLee Schermerhorn 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
86695a402c3SChristoph Lameter }
867b20a3503SChristoph Lameter #else
868b20a3503SChristoph Lameter 
869b20a3503SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
870b20a3503SChristoph Lameter 				unsigned long flags)
871b20a3503SChristoph Lameter {
872b20a3503SChristoph Lameter }
873b20a3503SChristoph Lameter 
874b20a3503SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
875b20a3503SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
876b20a3503SChristoph Lameter {
877b20a3503SChristoph Lameter 	return -ENOSYS;
878b20a3503SChristoph Lameter }
87995a402c3SChristoph Lameter 
88069939749SKeith Owens static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
88195a402c3SChristoph Lameter {
88295a402c3SChristoph Lameter 	return NULL;
88395a402c3SChristoph Lameter }
884b20a3503SChristoph Lameter #endif
885b20a3503SChristoph Lameter 
886dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
887028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
888028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
8896ce3c4c0SChristoph Lameter {
8906ce3c4c0SChristoph Lameter 	struct vm_area_struct *vma;
8916ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
8926ce3c4c0SChristoph Lameter 	struct mempolicy *new;
8936ce3c4c0SChristoph Lameter 	unsigned long end;
8946ce3c4c0SChristoph Lameter 	int err;
8956ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
8966ce3c4c0SChristoph Lameter 
897a3b51e01SDavid Rientjes 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
8986ce3c4c0SChristoph Lameter 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
8996ce3c4c0SChristoph Lameter 		return -EINVAL;
90074c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
9016ce3c4c0SChristoph Lameter 		return -EPERM;
9026ce3c4c0SChristoph Lameter 
9036ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
9046ce3c4c0SChristoph Lameter 		return -EINVAL;
9056ce3c4c0SChristoph Lameter 
9066ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
9076ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
9086ce3c4c0SChristoph Lameter 
9096ce3c4c0SChristoph Lameter 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
9106ce3c4c0SChristoph Lameter 	end = start + len;
9116ce3c4c0SChristoph Lameter 
9126ce3c4c0SChristoph Lameter 	if (end < start)
9136ce3c4c0SChristoph Lameter 		return -EINVAL;
9146ce3c4c0SChristoph Lameter 	if (end == start)
9156ce3c4c0SChristoph Lameter 		return 0;
9166ce3c4c0SChristoph Lameter 
917028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
9186ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
9196ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
9206ce3c4c0SChristoph Lameter 
9216ce3c4c0SChristoph Lameter 	/*
9226ce3c4c0SChristoph Lameter 	 * If we are using the default policy then operation
9236ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all
9246ce3c4c0SChristoph Lameter 	 */
9256ce3c4c0SChristoph Lameter 	if (!new)
9266ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
9276ce3c4c0SChristoph Lameter 
928028fec41SDavid Rientjes 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
929028fec41SDavid Rientjes 		 start, start + len, mode, mode_flags,
930028fec41SDavid Rientjes 		 nmask ? nodes_addr(*nmask)[0] : -1);
9316ce3c4c0SChristoph Lameter 
9326ce3c4c0SChristoph Lameter 	down_write(&mm->mmap_sem);
9336ce3c4c0SChristoph Lameter 	vma = check_range(mm, start, end, nmask,
9346ce3c4c0SChristoph Lameter 			  flags | MPOL_MF_INVERT, &pagelist);
9356ce3c4c0SChristoph Lameter 
9366ce3c4c0SChristoph Lameter 	err = PTR_ERR(vma);
9376ce3c4c0SChristoph Lameter 	if (!IS_ERR(vma)) {
9386ce3c4c0SChristoph Lameter 		int nr_failed = 0;
9396ce3c4c0SChristoph Lameter 
9406ce3c4c0SChristoph Lameter 		err = mbind_range(vma, start, end, new);
9417e2ab150SChristoph Lameter 
9426ce3c4c0SChristoph Lameter 		if (!list_empty(&pagelist))
94395a402c3SChristoph Lameter 			nr_failed = migrate_pages(&pagelist, new_vma_page,
94495a402c3SChristoph Lameter 						(unsigned long)vma);
9456ce3c4c0SChristoph Lameter 
9466ce3c4c0SChristoph Lameter 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
9476ce3c4c0SChristoph Lameter 			err = -EIO;
9486ce3c4c0SChristoph Lameter 	}
949b20a3503SChristoph Lameter 
9506ce3c4c0SChristoph Lameter 	up_write(&mm->mmap_sem);
9516ce3c4c0SChristoph Lameter 	mpol_free(new);
9526ce3c4c0SChristoph Lameter 	return err;
9536ce3c4c0SChristoph Lameter }
9546ce3c4c0SChristoph Lameter 
95539743889SChristoph Lameter /*
9568bccd85fSChristoph Lameter  * User space interface with variable sized bitmaps for nodelists.
9578bccd85fSChristoph Lameter  */
9588bccd85fSChristoph Lameter 
9598bccd85fSChristoph Lameter /* Copy a node mask from user space. */
96039743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
9618bccd85fSChristoph Lameter 		     unsigned long maxnode)
9628bccd85fSChristoph Lameter {
9638bccd85fSChristoph Lameter 	unsigned long k;
9648bccd85fSChristoph Lameter 	unsigned long nlongs;
9658bccd85fSChristoph Lameter 	unsigned long endmask;
9668bccd85fSChristoph Lameter 
9678bccd85fSChristoph Lameter 	--maxnode;
9688bccd85fSChristoph Lameter 	nodes_clear(*nodes);
9698bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
9708bccd85fSChristoph Lameter 		return 0;
971a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
972636f13c1SChris Wright 		return -EINVAL;
9738bccd85fSChristoph Lameter 
9748bccd85fSChristoph Lameter 	nlongs = BITS_TO_LONGS(maxnode);
9758bccd85fSChristoph Lameter 	if ((maxnode % BITS_PER_LONG) == 0)
9768bccd85fSChristoph Lameter 		endmask = ~0UL;
9778bccd85fSChristoph Lameter 	else
9788bccd85fSChristoph Lameter 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
9798bccd85fSChristoph Lameter 
9808bccd85fSChristoph Lameter 	/* When the user specifies more nodes than supported just check
9818bccd85fSChristoph Lameter 	   that the unsupported part is all zero. */
9828bccd85fSChristoph Lameter 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
9838bccd85fSChristoph Lameter 		if (nlongs > PAGE_SIZE/sizeof(long))
9848bccd85fSChristoph Lameter 			return -EINVAL;
9858bccd85fSChristoph Lameter 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
9868bccd85fSChristoph Lameter 			unsigned long t;
9878bccd85fSChristoph Lameter 			if (get_user(t, nmask + k))
9888bccd85fSChristoph Lameter 				return -EFAULT;
9898bccd85fSChristoph Lameter 			if (k == nlongs - 1) {
9908bccd85fSChristoph Lameter 				if (t & endmask)
9918bccd85fSChristoph Lameter 					return -EINVAL;
9928bccd85fSChristoph Lameter 			} else if (t)
9938bccd85fSChristoph Lameter 				return -EINVAL;
9948bccd85fSChristoph Lameter 		}
9958bccd85fSChristoph Lameter 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
9968bccd85fSChristoph Lameter 		endmask = ~0UL;
9978bccd85fSChristoph Lameter 	}
9988bccd85fSChristoph Lameter 
9998bccd85fSChristoph Lameter 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
10008bccd85fSChristoph Lameter 		return -EFAULT;
10018bccd85fSChristoph Lameter 	nodes_addr(*nodes)[nlongs-1] &= endmask;
10028bccd85fSChristoph Lameter 	return 0;
10038bccd85fSChristoph Lameter }
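
/*
 * Worked example (added for illustration): a caller passing maxnode = 3 to
 * get_nodes() above has bits 0 and 1 of its first long honoured; after the
 * --maxnode, nlongs = 1 and endmask = 0x3, so bit 2 and above of that long
 * are silently masked off rather than rejected.
 */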
10048bccd85fSChristoph Lameter 
10058bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
10068bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
10078bccd85fSChristoph Lameter 			      nodemask_t *nodes)
10088bccd85fSChristoph Lameter {
10098bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
10108bccd85fSChristoph Lameter 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
10118bccd85fSChristoph Lameter 
10128bccd85fSChristoph Lameter 	if (copy > nbytes) {
10138bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
10148bccd85fSChristoph Lameter 			return -EINVAL;
10158bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
10168bccd85fSChristoph Lameter 			return -EFAULT;
10178bccd85fSChristoph Lameter 		copy = nbytes;
10188bccd85fSChristoph Lameter 	}
10198bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
10208bccd85fSChristoph Lameter }
10218bccd85fSChristoph Lameter 
10228bccd85fSChristoph Lameter asmlinkage long sys_mbind(unsigned long start, unsigned long len,
10238bccd85fSChristoph Lameter 			unsigned long mode,
10248bccd85fSChristoph Lameter 			unsigned long __user *nmask, unsigned long maxnode,
10258bccd85fSChristoph Lameter 			unsigned flags)
10268bccd85fSChristoph Lameter {
10278bccd85fSChristoph Lameter 	nodemask_t nodes;
10288bccd85fSChristoph Lameter 	int err;
1029028fec41SDavid Rientjes 	unsigned short mode_flags;
10308bccd85fSChristoph Lameter 
1031028fec41SDavid Rientjes 	mode_flags = mode & MPOL_MODE_FLAGS;
1032028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1033a3b51e01SDavid Rientjes 	if (mode >= MPOL_MAX)
1034a3b51e01SDavid Rientjes 		return -EINVAL;
10354c50bc01SDavid Rientjes 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
10364c50bc01SDavid Rientjes 	    (mode_flags & MPOL_F_RELATIVE_NODES))
10374c50bc01SDavid Rientjes 		return -EINVAL;
10388bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10398bccd85fSChristoph Lameter 	if (err)
10408bccd85fSChristoph Lameter 		return err;
1041028fec41SDavid Rientjes 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
10428bccd85fSChristoph Lameter }
10438bccd85fSChristoph Lameter 
10448bccd85fSChristoph Lameter /* Set the process memory policy */
10458bccd85fSChristoph Lameter asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
10468bccd85fSChristoph Lameter 		unsigned long maxnode)
10478bccd85fSChristoph Lameter {
10488bccd85fSChristoph Lameter 	int err;
10498bccd85fSChristoph Lameter 	nodemask_t nodes;
1050028fec41SDavid Rientjes 	unsigned short flags;
10518bccd85fSChristoph Lameter 
1052028fec41SDavid Rientjes 	flags = mode & MPOL_MODE_FLAGS;
1053028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1054028fec41SDavid Rientjes 	if ((unsigned int)mode >= MPOL_MAX)
10558bccd85fSChristoph Lameter 		return -EINVAL;
10564c50bc01SDavid Rientjes 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
10574c50bc01SDavid Rientjes 		return -EINVAL;
10588bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10598bccd85fSChristoph Lameter 	if (err)
10608bccd85fSChristoph Lameter 		return err;
1061028fec41SDavid Rientjes 	return do_set_mempolicy(mode, flags, &nodes);
10628bccd85fSChristoph Lameter }
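
/*
 * Illustrative user-space counterpart (same assumptions as the mbind
 * sketch above): interleaving all of the task's future allocations
 * over nodes 0 and 1 could look like
 *
 *	unsigned long nodemask = 0x3;
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8);
 */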
10638bccd85fSChristoph Lameter 
106439743889SChristoph Lameter asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
106539743889SChristoph Lameter 		const unsigned long __user *old_nodes,
106639743889SChristoph Lameter 		const unsigned long __user *new_nodes)
106739743889SChristoph Lameter {
106839743889SChristoph Lameter 	struct mm_struct *mm;
106939743889SChristoph Lameter 	struct task_struct *task;
107039743889SChristoph Lameter 	nodemask_t old;
107139743889SChristoph Lameter 	nodemask_t new;
107239743889SChristoph Lameter 	nodemask_t task_nodes;
107339743889SChristoph Lameter 	int err;
107439743889SChristoph Lameter 
107539743889SChristoph Lameter 	err = get_nodes(&old, old_nodes, maxnode);
107639743889SChristoph Lameter 	if (err)
107739743889SChristoph Lameter 		return err;
107839743889SChristoph Lameter 
107939743889SChristoph Lameter 	err = get_nodes(&new, new_nodes, maxnode);
108039743889SChristoph Lameter 	if (err)
108139743889SChristoph Lameter 		return err;
108239743889SChristoph Lameter 
108339743889SChristoph Lameter 	/* Find the mm_struct */
108439743889SChristoph Lameter 	read_lock(&tasklist_lock);
1085228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
108639743889SChristoph Lameter 	if (!task) {
108739743889SChristoph Lameter 		read_unlock(&tasklist_lock);
108839743889SChristoph Lameter 		return -ESRCH;
108939743889SChristoph Lameter 	}
109039743889SChristoph Lameter 	mm = get_task_mm(task);
109139743889SChristoph Lameter 	read_unlock(&tasklist_lock);
109239743889SChristoph Lameter 
109339743889SChristoph Lameter 	if (!mm)
109439743889SChristoph Lameter 		return -EINVAL;
109539743889SChristoph Lameter 
109639743889SChristoph Lameter 	/*
109739743889SChristoph Lameter 	 * Check if this process has the right to modify the specified
109839743889SChristoph Lameter 	 * process. The right exists if the process has administrative
10997f927fccSAlexey Dobriyan 	 * capabilities, superuser privileges or the same
110039743889SChristoph Lameter 	 * userid as the target process.
110139743889SChristoph Lameter 	 */
110239743889SChristoph Lameter 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
110339743889SChristoph Lameter 	    (current->uid != task->suid) && (current->uid != task->uid) &&
110474c00241SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
110539743889SChristoph Lameter 		err = -EPERM;
110639743889SChristoph Lameter 		goto out;
110739743889SChristoph Lameter 	}
110839743889SChristoph Lameter 
110939743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
111039743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
111174c00241SChristoph Lameter 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
111239743889SChristoph Lameter 		err = -EPERM;
111339743889SChristoph Lameter 		goto out;
111439743889SChristoph Lameter 	}
111539743889SChristoph Lameter 
111637b07e41SLee Schermerhorn 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
11173b42d28bSChristoph Lameter 		err = -EINVAL;
11183b42d28bSChristoph Lameter 		goto out;
11193b42d28bSChristoph Lameter 	}
11203b42d28bSChristoph Lameter 
112186c3a764SDavid Quigley 	err = security_task_movememory(task);
112286c3a764SDavid Quigley 	if (err)
112386c3a764SDavid Quigley 		goto out;
112486c3a764SDavid Quigley 
1125511030bcSChristoph Lameter 	err = do_migrate_pages(mm, &old, &new,
112674c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
112739743889SChristoph Lameter out:
112839743889SChristoph Lameter 	mmput(mm);
112939743889SChristoph Lameter 	return err;
113039743889SChristoph Lameter }
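
/*
 * Illustrative sketch of the user-space side (the migrate_pages(2)
 * wrapper in libnuma's <numaif.h>), moving a target task's pages from
 * node 0 to node 1, subject to the permission and cpuset checks above:
 *
 *	unsigned long old = 1UL << 0, new = 1UL << 1;
 *
 *	migrate_pages(pid, sizeof(old) * 8, &old, &new);
 */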
113139743889SChristoph Lameter 
113239743889SChristoph Lameter 
11338bccd85fSChristoph Lameter /* Retrieve NUMA policy */
11348bccd85fSChristoph Lameter asmlinkage long sys_get_mempolicy(int __user *policy,
11358bccd85fSChristoph Lameter 				unsigned long __user *nmask,
11368bccd85fSChristoph Lameter 				unsigned long maxnode,
11378bccd85fSChristoph Lameter 				unsigned long addr, unsigned long flags)
11388bccd85fSChristoph Lameter {
1139dbcb0f19SAdrian Bunk 	int err;
1140dbcb0f19SAdrian Bunk 	int uninitialized_var(pval);
11418bccd85fSChristoph Lameter 	nodemask_t nodes;
11428bccd85fSChristoph Lameter 
11438bccd85fSChristoph Lameter 	if (nmask != NULL && maxnode < MAX_NUMNODES)
11448bccd85fSChristoph Lameter 		return -EINVAL;
11458bccd85fSChristoph Lameter 
11468bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
11478bccd85fSChristoph Lameter 
11488bccd85fSChristoph Lameter 	if (err)
11498bccd85fSChristoph Lameter 		return err;
11508bccd85fSChristoph Lameter 
11518bccd85fSChristoph Lameter 	if (policy && put_user(pval, policy))
11528bccd85fSChristoph Lameter 		return -EFAULT;
11538bccd85fSChristoph Lameter 
11548bccd85fSChristoph Lameter 	if (nmask)
11558bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
11568bccd85fSChristoph Lameter 
11578bccd85fSChristoph Lameter 	return err;
11588bccd85fSChristoph Lameter }
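
/*
 * Illustrative user-space sketch: querying the policy in effect at a
 * mapped address addr (assuming MAX_NUMNODES fits within one unsigned
 * long on this configuration, so sizeof(nodemask) * 8 bits are enough
 * to pass the maxnode check above):
 *
 *	int mode;
 *	unsigned long nodemask;
 *
 *	get_mempolicy(&mode, &nodemask, sizeof(nodemask) * 8,
 *		      addr, MPOL_F_ADDR);
 */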
11598bccd85fSChristoph Lameter 
11601da177e4SLinus Torvalds #ifdef CONFIG_COMPAT
11611da177e4SLinus Torvalds 
11621da177e4SLinus Torvalds asmlinkage long compat_sys_get_mempolicy(int __user *policy,
11631da177e4SLinus Torvalds 				     compat_ulong_t __user *nmask,
11641da177e4SLinus Torvalds 				     compat_ulong_t maxnode,
11651da177e4SLinus Torvalds 				     compat_ulong_t addr, compat_ulong_t flags)
11661da177e4SLinus Torvalds {
11671da177e4SLinus Torvalds 	long err;
11681da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
11691da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
11701da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
11711da177e4SLinus Torvalds 
11721da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
11731da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
11741da177e4SLinus Torvalds 
11751da177e4SLinus Torvalds 	if (nmask)
11761da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
11771da177e4SLinus Torvalds 
11781da177e4SLinus Torvalds 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
11791da177e4SLinus Torvalds 
11801da177e4SLinus Torvalds 	if (!err && nmask) {
11811da177e4SLinus Torvalds 		err = copy_from_user(bm, nm, alloc_size);
11821da177e4SLinus Torvalds 		/* ensure entire bitmap is zeroed */
11831da177e4SLinus Torvalds 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
11841da177e4SLinus Torvalds 		err |= compat_put_bitmap(nmask, bm, nr_bits);
11851da177e4SLinus Torvalds 	}
11861da177e4SLinus Torvalds 
11871da177e4SLinus Torvalds 	return err;
11881da177e4SLinus Torvalds }
11891da177e4SLinus Torvalds 
11901da177e4SLinus Torvalds asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
11911da177e4SLinus Torvalds 				     compat_ulong_t maxnode)
11921da177e4SLinus Torvalds {
11931da177e4SLinus Torvalds 	long err = 0;
11941da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
11951da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
11961da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
11971da177e4SLinus Torvalds 
11981da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
11991da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12001da177e4SLinus Torvalds 
12011da177e4SLinus Torvalds 	if (nmask) {
12021da177e4SLinus Torvalds 		err = compat_get_bitmap(bm, nmask, nr_bits);
12031da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
12041da177e4SLinus Torvalds 		err |= copy_to_user(nm, bm, alloc_size);
12051da177e4SLinus Torvalds 	}
12061da177e4SLinus Torvalds 
12071da177e4SLinus Torvalds 	if (err)
12081da177e4SLinus Torvalds 		return -EFAULT;
12091da177e4SLinus Torvalds 
12101da177e4SLinus Torvalds 	return sys_set_mempolicy(mode, nm, nr_bits+1);
12111da177e4SLinus Torvalds }
12121da177e4SLinus Torvalds 
12131da177e4SLinus Torvalds asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
12141da177e4SLinus Torvalds 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
12151da177e4SLinus Torvalds 			     compat_ulong_t maxnode, compat_ulong_t flags)
12161da177e4SLinus Torvalds {
12171da177e4SLinus Torvalds 	long err = 0;
12181da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
12191da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
1220dfcd3c0dSAndi Kleen 	nodemask_t bm;
12211da177e4SLinus Torvalds 
12221da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
12231da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12241da177e4SLinus Torvalds 
12251da177e4SLinus Torvalds 	if (nmask) {
1226dfcd3c0dSAndi Kleen 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
12271da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
1228dfcd3c0dSAndi Kleen 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
12291da177e4SLinus Torvalds 	}
12301da177e4SLinus Torvalds 
12311da177e4SLinus Torvalds 	if (err)
12321da177e4SLinus Torvalds 		return -EFAULT;
12331da177e4SLinus Torvalds 
12341da177e4SLinus Torvalds 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
12351da177e4SLinus Torvalds }
12361da177e4SLinus Torvalds 
12371da177e4SLinus Torvalds #endif
12381da177e4SLinus Torvalds 
1239480eccf9SLee Schermerhorn /*
1240480eccf9SLee Schermerhorn  * get_vma_policy(@task, @vma, @addr)
1241480eccf9SLee Schermerhorn  * @task - task for fallback if vma policy == default
1242480eccf9SLee Schermerhorn  * @vma   - virtual memory area whose policy is sought
1243480eccf9SLee Schermerhorn  * @addr  - address in @vma for shared policy lookup
1244480eccf9SLee Schermerhorn  *
1245480eccf9SLee Schermerhorn  * Returns effective policy for a VMA at specified address.
1246480eccf9SLee Schermerhorn  * Falls back to @task or system default policy, as necessary.
1247480eccf9SLee Schermerhorn  * Returned policy has extra reference count if shared, vma,
1248480eccf9SLee Schermerhorn  * or some other task's policy [show_numa_maps() can pass
1249480eccf9SLee Schermerhorn  * @task != current].  It is the caller's responsibility to
1250480eccf9SLee Schermerhorn  * free the reference in these cases.
1251480eccf9SLee Schermerhorn  */
125248fce342SChristoph Lameter static struct mempolicy * get_vma_policy(struct task_struct *task,
125348fce342SChristoph Lameter 		struct vm_area_struct *vma, unsigned long addr)
12541da177e4SLinus Torvalds {
12556e21c8f1SChristoph Lameter 	struct mempolicy *pol = task->mempolicy;
1256480eccf9SLee Schermerhorn 	int shared_pol = 0;
12571da177e4SLinus Torvalds 
12581da177e4SLinus Torvalds 	if (vma) {
1259480eccf9SLee Schermerhorn 		if (vma->vm_ops && vma->vm_ops->get_policy) {
12601da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
1261480eccf9SLee Schermerhorn 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1262480eccf9SLee Schermerhorn 		} else if (vma->vm_policy &&
12631da177e4SLinus Torvalds 				vma->vm_policy->policy != MPOL_DEFAULT)
12641da177e4SLinus Torvalds 			pol = vma->vm_policy;
12651da177e4SLinus Torvalds 	}
12661da177e4SLinus Torvalds 	if (!pol)
12671da177e4SLinus Torvalds 		pol = &default_policy;
1268480eccf9SLee Schermerhorn 	else if (!shared_pol && pol != current->mempolicy)
1269480eccf9SLee Schermerhorn 		mpol_get(pol);	/* vma or other task's policy */
12701da177e4SLinus Torvalds 	return pol;
12711da177e4SLinus Torvalds }
12721da177e4SLinus Torvalds 
127319770b32SMel Gorman /* Return a nodemask representing a mempolicy */
127419770b32SMel Gorman static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
127519770b32SMel Gorman {
127619770b32SMel Gorman 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
127719770b32SMel Gorman 	if (unlikely(policy->policy == MPOL_BIND) &&
127819770b32SMel Gorman 			gfp_zone(gfp) >= policy_zone &&
127919770b32SMel Gorman 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
128019770b32SMel Gorman 		return &policy->v.nodes;
128119770b32SMel Gorman 
128219770b32SMel Gorman 	return NULL;
128319770b32SMel Gorman }
128419770b32SMel Gorman 
12851da177e4SLinus Torvalds /* Return a zonelist representing a mempolicy */
1286dd0fc66fSAl Viro static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
12871da177e4SLinus Torvalds {
12881da177e4SLinus Torvalds 	int nd;
12891da177e4SLinus Torvalds 
12901da177e4SLinus Torvalds 	switch (policy->policy) {
12911da177e4SLinus Torvalds 	case MPOL_PREFERRED:
12921da177e4SLinus Torvalds 		nd = policy->v.preferred_node;
12931da177e4SLinus Torvalds 		if (nd < 0)
12941da177e4SLinus Torvalds 			nd = numa_node_id();
12951da177e4SLinus Torvalds 		break;
12961da177e4SLinus Torvalds 	case MPOL_BIND:
129719770b32SMel Gorman 		/*
129819770b32SMel Gorman 		 * Normally, MPOL_BIND allocations are node-local within the
129919770b32SMel Gorman 		 * allowed nodemask. However, if __GFP_THISNODE is set and the
130019770b32SMel Gorman 		 * current node is not part of the mask, we use the zonelist
130119770b32SMel Gorman 		 * for the first node in the mask instead.
130219770b32SMel Gorman 		 */
130319770b32SMel Gorman 		nd = numa_node_id();
130419770b32SMel Gorman 		if (unlikely(gfp & __GFP_THISNODE) &&
130519770b32SMel Gorman 				unlikely(!node_isset(nd, policy->v.nodes)))
130619770b32SMel Gorman 			nd = first_node(policy->v.nodes);
130719770b32SMel Gorman 		break;
13081da177e4SLinus Torvalds 	case MPOL_INTERLEAVE: /* should not happen */
13091da177e4SLinus Torvalds 	case MPOL_DEFAULT:
13101da177e4SLinus Torvalds 		nd = numa_node_id();
13111da177e4SLinus Torvalds 		break;
13121da177e4SLinus Torvalds 	default:
13131da177e4SLinus Torvalds 		nd = 0;
13141da177e4SLinus Torvalds 		BUG();
13151da177e4SLinus Torvalds 	}
13160e88460dSMel Gorman 	return node_zonelist(nd, gfp);
13171da177e4SLinus Torvalds }
13181da177e4SLinus Torvalds 
13191da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
13201da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy)
13211da177e4SLinus Torvalds {
13221da177e4SLinus Torvalds 	unsigned nid, next;
13231da177e4SLinus Torvalds 	struct task_struct *me = current;
13241da177e4SLinus Torvalds 
13251da177e4SLinus Torvalds 	nid = me->il_next;
1326dfcd3c0dSAndi Kleen 	next = next_node(nid, policy->v.nodes);
13271da177e4SLinus Torvalds 	if (next >= MAX_NUMNODES)
1328dfcd3c0dSAndi Kleen 		next = first_node(policy->v.nodes);
1329f5b087b5SDavid Rientjes 	if (next < MAX_NUMNODES)
13301da177e4SLinus Torvalds 		me->il_next = next;
13311da177e4SLinus Torvalds 	return nid;
13321da177e4SLinus Torvalds }
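
/*
 * Worked example (illustrative): with policy nodes {0, 2, 3} and
 * il_next == 2, this returns 2 and advances il_next to 3; the next
 * call returns 3 and wraps il_next back to 0 via first_node().
 */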
13331da177e4SLinus Torvalds 
1334dc85da15SChristoph Lameter /*
1335dc85da15SChristoph Lameter  * Depending on the memory policy provide a node from which to allocate the
1336dc85da15SChristoph Lameter  * next slab entry.
1337dc85da15SChristoph Lameter  */
1338dc85da15SChristoph Lameter unsigned slab_node(struct mempolicy *policy)
1339dc85da15SChristoph Lameter {
1340a3b51e01SDavid Rientjes 	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1341765c4507SChristoph Lameter 
1342765c4507SChristoph Lameter 	switch (pol) {
1343dc85da15SChristoph Lameter 	case MPOL_INTERLEAVE:
1344dc85da15SChristoph Lameter 		return interleave_nodes(policy);
1345dc85da15SChristoph Lameter 
1346dd1a239fSMel Gorman 	case MPOL_BIND: {
1347dc85da15SChristoph Lameter 		/*
1348dc85da15SChristoph Lameter 		 * Follow bind policy behavior and start allocation at the
1349dc85da15SChristoph Lameter 		 * first node.
1350dc85da15SChristoph Lameter 		 */
135119770b32SMel Gorman 		struct zonelist *zonelist;
135219770b32SMel Gorman 		struct zone *zone;
135319770b32SMel Gorman 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
135419770b32SMel Gorman 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
135519770b32SMel Gorman 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
135619770b32SMel Gorman 							&policy->v.nodes,
135719770b32SMel Gorman 							&zone);
135819770b32SMel Gorman 		return zone->node;
1359dd1a239fSMel Gorman 	}
1360dc85da15SChristoph Lameter 
1361dc85da15SChristoph Lameter 	case MPOL_PREFERRED:
1362dc85da15SChristoph Lameter 		if (policy->v.preferred_node >= 0)
1363dc85da15SChristoph Lameter 			return policy->v.preferred_node;
1364dc85da15SChristoph Lameter 		/* Fall through */
1365dc85da15SChristoph Lameter 
1366dc85da15SChristoph Lameter 	default:
1367dc85da15SChristoph Lameter 		return numa_node_id();
1368dc85da15SChristoph Lameter 	}
1369dc85da15SChristoph Lameter }
1370dc85da15SChristoph Lameter 
13711da177e4SLinus Torvalds /* Do static interleaving for a VMA with known offset. */
13721da177e4SLinus Torvalds static unsigned offset_il_node(struct mempolicy *pol,
13731da177e4SLinus Torvalds 		struct vm_area_struct *vma, unsigned long off)
13741da177e4SLinus Torvalds {
1375dfcd3c0dSAndi Kleen 	unsigned nnodes = nodes_weight(pol->v.nodes);
1376f5b087b5SDavid Rientjes 	unsigned target;
13771da177e4SLinus Torvalds 	int c;
13781da177e4SLinus Torvalds 	int nid = -1;
13791da177e4SLinus Torvalds 
1380f5b087b5SDavid Rientjes 	if (!nnodes)
1381f5b087b5SDavid Rientjes 		return numa_node_id();
1382f5b087b5SDavid Rientjes 	target = (unsigned int)off % nnodes;
13831da177e4SLinus Torvalds 	c = 0;
13841da177e4SLinus Torvalds 	do {
1385dfcd3c0dSAndi Kleen 		nid = next_node(nid, pol->v.nodes);
13861da177e4SLinus Torvalds 		c++;
13871da177e4SLinus Torvalds 	} while (c <= target);
13881da177e4SLinus Torvalds 	return nid;
13891da177e4SLinus Torvalds }
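
/*
 * Worked example (illustrative): for policy nodes {1, 3, 5} and
 * off == 7, nnodes == 3 and target == 7 % 3 == 1, so the loop steps
 * past the first set bit (node 1) and returns node 3.
 */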
13901da177e4SLinus Torvalds 
13915da7ca86SChristoph Lameter /* Determine a node number for interleave */
13925da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol,
13935da7ca86SChristoph Lameter 		 struct vm_area_struct *vma, unsigned long addr, int shift)
13945da7ca86SChristoph Lameter {
13955da7ca86SChristoph Lameter 	if (vma) {
13965da7ca86SChristoph Lameter 		unsigned long off;
13975da7ca86SChristoph Lameter 
13983b98b087SNishanth Aravamudan 		/*
13993b98b087SNishanth Aravamudan 		 * for small pages, there is no difference between
14003b98b087SNishanth Aravamudan 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
14013b98b087SNishanth Aravamudan 		 * for huge pages, since vm_pgoff is in units of small
14023b98b087SNishanth Aravamudan 		 * pages, we need to shift off the always 0 bits to get
14033b98b087SNishanth Aravamudan 		 * a useful offset.
14043b98b087SNishanth Aravamudan 		 */
14053b98b087SNishanth Aravamudan 		BUG_ON(shift < PAGE_SHIFT);
14063b98b087SNishanth Aravamudan 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
14075da7ca86SChristoph Lameter 		off += (addr - vma->vm_start) >> shift;
14085da7ca86SChristoph Lameter 		return offset_il_node(pol, vma, off);
14095da7ca86SChristoph Lameter 	} else
14105da7ca86SChristoph Lameter 		return interleave_nodes(pol);
14115da7ca86SChristoph Lameter }
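
/*
 * Worked example of the offset arithmetic (illustrative, assuming
 * PAGE_SHIFT == 12 and 2MB huge pages, i.e. shift == 21): a VMA with
 * vm_pgoff == 512 contributes 512 >> 9 == 1, an address 4MB past
 * vm_start contributes 4MB >> 21 == 2, so offset_il_node() sees
 * off == 3.
 */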
14125da7ca86SChristoph Lameter 
141300ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
1414480eccf9SLee Schermerhorn /*
1415480eccf9SLee Schermerhorn  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1416480eccf9SLee Schermerhorn  * @vma = virtual memory area whose policy is sought
1417480eccf9SLee Schermerhorn  * @addr = address in @vma for shared policy lookup and interleave policy
1418480eccf9SLee Schermerhorn  * @gfp_flags = for requested zone
141919770b32SMel Gorman  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
142019770b32SMel Gorman  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1421480eccf9SLee Schermerhorn  *
1422480eccf9SLee Schermerhorn  * Returns a zonelist suitable for a huge page allocation.
142319770b32SMel Gorman  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
142419770b32SMel Gorman  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1425480eccf9SLee Schermerhorn  * If it is also a policy for which get_vma_policy() returns an extra
142619770b32SMel Gorman  * reference, we must hold that reference until after the allocation.
1427480eccf9SLee Schermerhorn  * In that case, return policy via @mpol so hugetlb allocation can drop
1428480eccf9SLee Schermerhorn  * the reference. For non-'BIND referenced policies, we can/do drop the
1429480eccf9SLee Schermerhorn  * reference here, so the caller doesn't need to know about the special case
1430480eccf9SLee Schermerhorn  * for default and current task policy.
1431480eccf9SLee Schermerhorn  */
1432396faf03SMel Gorman struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
143319770b32SMel Gorman 				gfp_t gfp_flags, struct mempolicy **mpol,
143419770b32SMel Gorman 				nodemask_t **nodemask)
14355da7ca86SChristoph Lameter {
14365da7ca86SChristoph Lameter 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1437480eccf9SLee Schermerhorn 	struct zonelist *zl;
14385da7ca86SChristoph Lameter 
1439480eccf9SLee Schermerhorn 	*mpol = NULL;		/* probably no unref needed */
144019770b32SMel Gorman 	*nodemask = NULL;	/* assume !MPOL_BIND */
144119770b32SMel Gorman 	if (pol->policy == MPOL_BIND) {
144219770b32SMel Gorman 		*nodemask = &pol->v.nodes;
144319770b32SMel Gorman 	} else if (pol->policy == MPOL_INTERLEAVE) {
14445da7ca86SChristoph Lameter 		unsigned nid;
14455da7ca86SChristoph Lameter 
14465da7ca86SChristoph Lameter 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
144769682d85SLee Schermerhorn 		if (unlikely(pol != &default_policy &&
144869682d85SLee Schermerhorn 				pol != current->mempolicy))
1449480eccf9SLee Schermerhorn 			__mpol_free(pol);	/* finished with pol */
14500e88460dSMel Gorman 		return node_zonelist(nid, gfp_flags);
14515da7ca86SChristoph Lameter 	}
1452480eccf9SLee Schermerhorn 
1453480eccf9SLee Schermerhorn 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1454480eccf9SLee Schermerhorn 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1455480eccf9SLee Schermerhorn 		if (pol->policy != MPOL_BIND)
1456480eccf9SLee Schermerhorn 			__mpol_free(pol);	/* finished with pol */
1457480eccf9SLee Schermerhorn 		else
1458480eccf9SLee Schermerhorn 			*mpol = pol;	/* unref needed after allocation */
1459480eccf9SLee Schermerhorn 	}
1460480eccf9SLee Schermerhorn 	return zl;
14615da7ca86SChristoph Lameter }
146200ac59adSChen, Kenneth W #endif
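
/*
 * Illustrative sketch of the hugetlb caller pattern this interface is
 * meant for (names are approximate, not the exact mm/hugetlb.c code):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, htlb_alloc_mask, &mpol, &nodemask);
 *	... walk zl, filtering zones through nodemask when it is set ...
 *	mpol_free(mpol);	(drops the reference returned for 'BIND)
 */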
14635da7ca86SChristoph Lameter 
14641da177e4SLinus Torvalds /* Allocate a page under interleave policy.
14651da177e4SLinus Torvalds    Own path because it needs to do special accounting. */
1466662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1467662f3a0bSAndi Kleen 					unsigned nid)
14681da177e4SLinus Torvalds {
14691da177e4SLinus Torvalds 	struct zonelist *zl;
14701da177e4SLinus Torvalds 	struct page *page;
14711da177e4SLinus Torvalds 
14720e88460dSMel Gorman 	zl = node_zonelist(nid, gfp);
14731da177e4SLinus Torvalds 	page = __alloc_pages(gfp, order, zl);
1474dd1a239fSMel Gorman 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1475ca889e6cSChristoph Lameter 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
14761da177e4SLinus Torvalds 	return page;
14771da177e4SLinus Torvalds }
14781da177e4SLinus Torvalds 
14791da177e4SLinus Torvalds /**
14801da177e4SLinus Torvalds  * 	alloc_page_vma	- Allocate a page for a VMA.
14811da177e4SLinus Torvalds  *
14821da177e4SLinus Torvalds  * 	@gfp:
14831da177e4SLinus Torvalds  *      %GFP_USER    user allocation.
14841da177e4SLinus Torvalds  *      %GFP_KERNEL  kernel allocations,
14851da177e4SLinus Torvalds  *      %GFP_HIGHMEM highmem/user allocations,
14861da177e4SLinus Torvalds  *      %GFP_FS      allocation should not call back into a file system.
14871da177e4SLinus Torvalds  *      %GFP_ATOMIC  don't sleep.
14881da177e4SLinus Torvalds  *
14891da177e4SLinus Torvalds  * 	@vma:  Pointer to VMA or NULL if not available.
14901da177e4SLinus Torvalds  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
14911da177e4SLinus Torvalds  *
14921da177e4SLinus Torvalds  * 	This function allocates a page from the kernel page pool and applies
14931da177e4SLinus Torvalds  *	a NUMA policy associated with the VMA or the current process.
14941da177e4SLinus Torvalds  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
14951da177e4SLinus Torvalds  *	mm_struct of the VMA to prevent it from going away. Should be used for
14961da177e4SLinus Torvalds  *	all allocations for pages that will be mapped into
14971da177e4SLinus Torvalds  * 	user space. Returns NULL when no page can be allocated.
14981da177e4SLinus Torvalds  *
14991da177e4SLinus Torvalds  *	Should be called with the mmap_sem of the vma held.
15001da177e4SLinus Torvalds  */
15011da177e4SLinus Torvalds struct page *
1502dd0fc66fSAl Viro alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
15031da177e4SLinus Torvalds {
15046e21c8f1SChristoph Lameter 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1505480eccf9SLee Schermerhorn 	struct zonelist *zl;
15061da177e4SLinus Torvalds 
1507cf2a473cSPaul Jackson 	cpuset_update_task_memory_state();
15081da177e4SLinus Torvalds 
15091da177e4SLinus Torvalds 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
15101da177e4SLinus Torvalds 		unsigned nid;
15115da7ca86SChristoph Lameter 
15125da7ca86SChristoph Lameter 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
151369682d85SLee Schermerhorn 		if (unlikely(pol != &default_policy &&
151469682d85SLee Schermerhorn 				pol != current->mempolicy))
151569682d85SLee Schermerhorn 			__mpol_free(pol);	/* finished with pol */
15161da177e4SLinus Torvalds 		return alloc_page_interleave(gfp, 0, nid);
15171da177e4SLinus Torvalds 	}
1518480eccf9SLee Schermerhorn 	zl = zonelist_policy(gfp, pol);
1519480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy) {
1520480eccf9SLee Schermerhorn 		/*
1521480eccf9SLee Schermerhorn 		 * slow path: ref counted policy -- shared or vma
1522480eccf9SLee Schermerhorn 		 */
152319770b32SMel Gorman 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
152419770b32SMel Gorman 						zl, nodemask_policy(gfp, pol));
1525480eccf9SLee Schermerhorn 		__mpol_free(pol);
1526480eccf9SLee Schermerhorn 		return page;
1527480eccf9SLee Schermerhorn 	}
1528480eccf9SLee Schermerhorn 	/*
1529480eccf9SLee Schermerhorn 	 * fast path:  default or task policy
1530480eccf9SLee Schermerhorn 	 */
153119770b32SMel Gorman 	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
15321da177e4SLinus Torvalds }
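
/*
 * Illustrative use from a page-fault path (the gfp value is only an
 * example):
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		goto oom;
 */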
15331da177e4SLinus Torvalds 
15341da177e4SLinus Torvalds /**
15351da177e4SLinus Torvalds  * 	alloc_pages_current - Allocate pages.
15361da177e4SLinus Torvalds  *
15371da177e4SLinus Torvalds  *	@gfp:
15381da177e4SLinus Torvalds  *		%GFP_USER   user allocation,
15391da177e4SLinus Torvalds  *      	%GFP_KERNEL kernel allocation,
15401da177e4SLinus Torvalds  *      	%GFP_HIGHMEM highmem allocation,
15411da177e4SLinus Torvalds  *      	%GFP_FS     don't call back into a file system.
15421da177e4SLinus Torvalds  *      	%GFP_ATOMIC don't sleep.
15431da177e4SLinus Torvalds  *	@order: Power of two of allocation size in pages. 0 is a single page.
15441da177e4SLinus Torvalds  *
15451da177e4SLinus Torvalds  *	Allocate a page from the kernel page pool. When not in
15461da177e4SLinus Torvalds  *	interrupt context, apply the current process' NUMA policy.
15471da177e4SLinus Torvalds  *	Returns NULL when no page can be allocated.
15481da177e4SLinus Torvalds  *
1549cf2a473cSPaul Jackson  *	Don't call cpuset_update_task_memory_state() unless
15501da177e4SLinus Torvalds  *	1) it's ok to take cpuset_sem (can WAIT), and
15511da177e4SLinus Torvalds  *	2) allocating for current task (not interrupt).
15521da177e4SLinus Torvalds  */
1553dd0fc66fSAl Viro struct page *alloc_pages_current(gfp_t gfp, unsigned order)
15541da177e4SLinus Torvalds {
15551da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
15561da177e4SLinus Torvalds 
15571da177e4SLinus Torvalds 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1558cf2a473cSPaul Jackson 		cpuset_update_task_memory_state();
15599b819d20SChristoph Lameter 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
15601da177e4SLinus Torvalds 		pol = &default_policy;
15611da177e4SLinus Torvalds 	if (pol->policy == MPOL_INTERLEAVE)
15621da177e4SLinus Torvalds 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
156319770b32SMel Gorman 	return __alloc_pages_nodemask(gfp, order,
156419770b32SMel Gorman 			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
15651da177e4SLinus Torvalds }
15661da177e4SLinus Torvalds EXPORT_SYMBOL(alloc_pages_current);
15671da177e4SLinus Torvalds 
15684225399aSPaul Jackson /*
15694225399aSPaul Jackson  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
15704225399aSPaul Jackson  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
15714225399aSPaul Jackson  * with the mems_allowed returned by cpuset_mems_allowed().  This
15724225399aSPaul Jackson  * keeps mempolicies cpuset relative after its cpuset moves.  See
15734225399aSPaul Jackson  * further kernel/cpuset.c update_nodemask().
15744225399aSPaul Jackson  */
15754225399aSPaul Jackson 
15761da177e4SLinus Torvalds /* Slow path of a mempolicy copy */
15771da177e4SLinus Torvalds struct mempolicy *__mpol_copy(struct mempolicy *old)
15781da177e4SLinus Torvalds {
15791da177e4SLinus Torvalds 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
15801da177e4SLinus Torvalds 
15811da177e4SLinus Torvalds 	if (!new)
15821da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
15834225399aSPaul Jackson 	if (current_cpuset_is_being_rebound()) {
15844225399aSPaul Jackson 		nodemask_t mems = cpuset_mems_allowed(current);
15854225399aSPaul Jackson 		mpol_rebind_policy(old, &mems);
15864225399aSPaul Jackson 	}
15871da177e4SLinus Torvalds 	*new = *old;
15881da177e4SLinus Torvalds 	atomic_set(&new->refcnt, 1);
15891da177e4SLinus Torvalds 	return new;
15901da177e4SLinus Torvalds }
15911da177e4SLinus Torvalds 
1592f5b087b5SDavid Rientjes static int mpol_match_intent(const struct mempolicy *a,
1593f5b087b5SDavid Rientjes 			     const struct mempolicy *b)
1594f5b087b5SDavid Rientjes {
1595f5b087b5SDavid Rientjes 	if (a->flags != b->flags)
1596f5b087b5SDavid Rientjes 		return 0;
1597f5b087b5SDavid Rientjes 	if (!mpol_store_user_nodemask(a))
1598f5b087b5SDavid Rientjes 		return 1;
1599f5b087b5SDavid Rientjes 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1600f5b087b5SDavid Rientjes }
1601f5b087b5SDavid Rientjes 
16021da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
16031da177e4SLinus Torvalds int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
16041da177e4SLinus Torvalds {
16051da177e4SLinus Torvalds 	if (!a || !b)
16061da177e4SLinus Torvalds 		return 0;
16071da177e4SLinus Torvalds 	if (a->policy != b->policy)
16081da177e4SLinus Torvalds 		return 0;
1609f5b087b5SDavid Rientjes 	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1610f5b087b5SDavid Rientjes 		return 0;
16111da177e4SLinus Torvalds 	switch (a->policy) {
16121da177e4SLinus Torvalds 	case MPOL_DEFAULT:
16131da177e4SLinus Torvalds 		return 1;
161419770b32SMel Gorman 	case MPOL_BIND:
161519770b32SMel Gorman 		/* Fall through */
16161da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
1617dfcd3c0dSAndi Kleen 		return nodes_equal(a->v.nodes, b->v.nodes);
16181da177e4SLinus Torvalds 	case MPOL_PREFERRED:
16191da177e4SLinus Torvalds 		return a->v.preferred_node == b->v.preferred_node;
16201da177e4SLinus Torvalds 	default:
16211da177e4SLinus Torvalds 		BUG();
16221da177e4SLinus Torvalds 		return 0;
16231da177e4SLinus Torvalds 	}
16241da177e4SLinus Torvalds }
16251da177e4SLinus Torvalds 
16261da177e4SLinus Torvalds /* Slow path of a mpol destructor. */
16271da177e4SLinus Torvalds void __mpol_free(struct mempolicy *p)
16281da177e4SLinus Torvalds {
16291da177e4SLinus Torvalds 	if (!atomic_dec_and_test(&p->refcnt))
16301da177e4SLinus Torvalds 		return;
16311da177e4SLinus Torvalds 	p->policy = MPOL_DEFAULT;
16321da177e4SLinus Torvalds 	kmem_cache_free(policy_cache, p);
16331da177e4SLinus Torvalds }
16341da177e4SLinus Torvalds 
16351da177e4SLinus Torvalds /*
16361da177e4SLinus Torvalds  * Shared memory backing store policy support.
16371da177e4SLinus Torvalds  *
16381da177e4SLinus Torvalds  * Remember policies even when nobody has shared memory mapped.
16391da177e4SLinus Torvalds  * The policies are kept in Red-Black tree linked from the inode.
16401da177e4SLinus Torvalds  * They are protected by the sp->lock spinlock, which should be held
16411da177e4SLinus Torvalds  * for any accesses to the tree.
16421da177e4SLinus Torvalds  */
16431da177e4SLinus Torvalds 
16441da177e4SLinus Torvalds /* lookup first element intersecting start-end */
16451da177e4SLinus Torvalds /* Caller holds sp->lock */
16461da177e4SLinus Torvalds static struct sp_node *
16471da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
16481da177e4SLinus Torvalds {
16491da177e4SLinus Torvalds 	struct rb_node *n = sp->root.rb_node;
16501da177e4SLinus Torvalds 
16511da177e4SLinus Torvalds 	while (n) {
16521da177e4SLinus Torvalds 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
16531da177e4SLinus Torvalds 
16541da177e4SLinus Torvalds 		if (start >= p->end)
16551da177e4SLinus Torvalds 			n = n->rb_right;
16561da177e4SLinus Torvalds 		else if (end <= p->start)
16571da177e4SLinus Torvalds 			n = n->rb_left;
16581da177e4SLinus Torvalds 		else
16591da177e4SLinus Torvalds 			break;
16601da177e4SLinus Torvalds 	}
16611da177e4SLinus Torvalds 	if (!n)
16621da177e4SLinus Torvalds 		return NULL;
16631da177e4SLinus Torvalds 	for (;;) {
16641da177e4SLinus Torvalds 		struct sp_node *w = NULL;
16651da177e4SLinus Torvalds 		struct rb_node *prev = rb_prev(n);
16661da177e4SLinus Torvalds 		if (!prev)
16671da177e4SLinus Torvalds 			break;
16681da177e4SLinus Torvalds 		w = rb_entry(prev, struct sp_node, nd);
16691da177e4SLinus Torvalds 		if (w->end <= start)
16701da177e4SLinus Torvalds 			break;
16711da177e4SLinus Torvalds 		n = prev;
16721da177e4SLinus Torvalds 	}
16731da177e4SLinus Torvalds 	return rb_entry(n, struct sp_node, nd);
16741da177e4SLinus Torvalds }
16751da177e4SLinus Torvalds 
16761da177e4SLinus Torvalds /* Insert a new shared policy into the list. */
16771da177e4SLinus Torvalds /* Caller holds sp->lock */
16781da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
16791da177e4SLinus Torvalds {
16801da177e4SLinus Torvalds 	struct rb_node **p = &sp->root.rb_node;
16811da177e4SLinus Torvalds 	struct rb_node *parent = NULL;
16821da177e4SLinus Torvalds 	struct sp_node *nd;
16831da177e4SLinus Torvalds 
16841da177e4SLinus Torvalds 	while (*p) {
16851da177e4SLinus Torvalds 		parent = *p;
16861da177e4SLinus Torvalds 		nd = rb_entry(parent, struct sp_node, nd);
16871da177e4SLinus Torvalds 		if (new->start < nd->start)
16881da177e4SLinus Torvalds 			p = &(*p)->rb_left;
16891da177e4SLinus Torvalds 		else if (new->end > nd->end)
16901da177e4SLinus Torvalds 			p = &(*p)->rb_right;
16911da177e4SLinus Torvalds 		else
16921da177e4SLinus Torvalds 			BUG();
16931da177e4SLinus Torvalds 	}
16941da177e4SLinus Torvalds 	rb_link_node(&new->nd, parent, p);
16951da177e4SLinus Torvalds 	rb_insert_color(&new->nd, &sp->root);
1696140d5a49SPaul Mundt 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
16971da177e4SLinus Torvalds 		 new->policy ? new->policy->policy : 0);
16981da177e4SLinus Torvalds }
16991da177e4SLinus Torvalds 
17001da177e4SLinus Torvalds /* Find shared policy intersecting idx */
17011da177e4SLinus Torvalds struct mempolicy *
17021da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
17031da177e4SLinus Torvalds {
17041da177e4SLinus Torvalds 	struct mempolicy *pol = NULL;
17051da177e4SLinus Torvalds 	struct sp_node *sn;
17061da177e4SLinus Torvalds 
17071da177e4SLinus Torvalds 	if (!sp->root.rb_node)
17081da177e4SLinus Torvalds 		return NULL;
17091da177e4SLinus Torvalds 	spin_lock(&sp->lock);
17101da177e4SLinus Torvalds 	sn = sp_lookup(sp, idx, idx+1);
17111da177e4SLinus Torvalds 	if (sn) {
17121da177e4SLinus Torvalds 		mpol_get(sn->policy);
17131da177e4SLinus Torvalds 		pol = sn->policy;
17141da177e4SLinus Torvalds 	}
17151da177e4SLinus Torvalds 	spin_unlock(&sp->lock);
17161da177e4SLinus Torvalds 	return pol;
17171da177e4SLinus Torvalds }
17181da177e4SLinus Torvalds 
17191da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
17201da177e4SLinus Torvalds {
1721140d5a49SPaul Mundt 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
17221da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
17231da177e4SLinus Torvalds 	mpol_free(n->policy);
17241da177e4SLinus Torvalds 	kmem_cache_free(sn_cache, n);
17251da177e4SLinus Torvalds }
17261da177e4SLinus Torvalds 
1727dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1728dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
17291da177e4SLinus Torvalds {
17301da177e4SLinus Torvalds 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
17311da177e4SLinus Torvalds 
17321da177e4SLinus Torvalds 	if (!n)
17331da177e4SLinus Torvalds 		return NULL;
17341da177e4SLinus Torvalds 	n->start = start;
17351da177e4SLinus Torvalds 	n->end = end;
17361da177e4SLinus Torvalds 	mpol_get(pol);
17371da177e4SLinus Torvalds 	n->policy = pol;
17381da177e4SLinus Torvalds 	return n;
17391da177e4SLinus Torvalds }
17401da177e4SLinus Torvalds 
17411da177e4SLinus Torvalds /* Replace a policy range. */
17421da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
17431da177e4SLinus Torvalds 				 unsigned long end, struct sp_node *new)
17441da177e4SLinus Torvalds {
17451da177e4SLinus Torvalds 	struct sp_node *n, *new2 = NULL;
17461da177e4SLinus Torvalds 
17471da177e4SLinus Torvalds restart:
17481da177e4SLinus Torvalds 	spin_lock(&sp->lock);
17491da177e4SLinus Torvalds 	n = sp_lookup(sp, start, end);
17501da177e4SLinus Torvalds 	/* Take care of old policies in the same range. */
17511da177e4SLinus Torvalds 	while (n && n->start < end) {
17521da177e4SLinus Torvalds 		struct rb_node *next = rb_next(&n->nd);
17531da177e4SLinus Torvalds 		if (n->start >= start) {
17541da177e4SLinus Torvalds 			if (n->end <= end)
17551da177e4SLinus Torvalds 				sp_delete(sp, n);
17561da177e4SLinus Torvalds 			else
17571da177e4SLinus Torvalds 				n->start = end;
17581da177e4SLinus Torvalds 		} else {
17591da177e4SLinus Torvalds 			/* Old policy spanning whole new range. */
17601da177e4SLinus Torvalds 			if (n->end > end) {
17611da177e4SLinus Torvalds 				if (!new2) {
17621da177e4SLinus Torvalds 					spin_unlock(&sp->lock);
17631da177e4SLinus Torvalds 					new2 = sp_alloc(end, n->end, n->policy);
17641da177e4SLinus Torvalds 					if (!new2)
17651da177e4SLinus Torvalds 						return -ENOMEM;
17661da177e4SLinus Torvalds 					goto restart;
17671da177e4SLinus Torvalds 				}
17681da177e4SLinus Torvalds 				n->end = start;
17691da177e4SLinus Torvalds 				sp_insert(sp, new2);
17701da177e4SLinus Torvalds 				new2 = NULL;
17711da177e4SLinus Torvalds 				break;
17721da177e4SLinus Torvalds 			} else
17731da177e4SLinus Torvalds 				n->end = start;
17741da177e4SLinus Torvalds 		}
17751da177e4SLinus Torvalds 		if (!next)
17761da177e4SLinus Torvalds 			break;
17771da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
17781da177e4SLinus Torvalds 	}
17791da177e4SLinus Torvalds 	if (new)
17801da177e4SLinus Torvalds 		sp_insert(sp, new);
17811da177e4SLinus Torvalds 	spin_unlock(&sp->lock);
17821da177e4SLinus Torvalds 	if (new2) {
17831da177e4SLinus Torvalds 		mpol_free(new2->policy);
17841da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, new2);
17851da177e4SLinus Torvalds 	}
17861da177e4SLinus Torvalds 	return 0;
17871da177e4SLinus Torvalds }
17881da177e4SLinus Torvalds 
1789a3b51e01SDavid Rientjes void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1790028fec41SDavid Rientjes 			unsigned short flags, nodemask_t *policy_nodes)
17917339ff83SRobin Holt {
17927339ff83SRobin Holt 	info->root = RB_ROOT;
17937339ff83SRobin Holt 	spin_lock_init(&info->lock);
17947339ff83SRobin Holt 
17957339ff83SRobin Holt 	if (policy != MPOL_DEFAULT) {
17967339ff83SRobin Holt 		struct mempolicy *newpol;
17977339ff83SRobin Holt 
17987339ff83SRobin Holt 		/* Falls back to MPOL_DEFAULT on any error */
1799028fec41SDavid Rientjes 		newpol = mpol_new(policy, flags, policy_nodes);
18007339ff83SRobin Holt 		if (!IS_ERR(newpol)) {
18017339ff83SRobin Holt 			/* Create pseudo-vma that contains just the policy */
18027339ff83SRobin Holt 			struct vm_area_struct pvma;
18037339ff83SRobin Holt 
18047339ff83SRobin Holt 			memset(&pvma, 0, sizeof(struct vm_area_struct));
18057339ff83SRobin Holt 			/* Policy covers entire file */
18067339ff83SRobin Holt 			pvma.vm_end = TASK_SIZE;
18077339ff83SRobin Holt 			mpol_set_shared_policy(info, &pvma, newpol);
18087339ff83SRobin Holt 			mpol_free(newpol);
18097339ff83SRobin Holt 		}
18107339ff83SRobin Holt 	}
18117339ff83SRobin Holt }
18127339ff83SRobin Holt 
18131da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info,
18141da177e4SLinus Torvalds 			struct vm_area_struct *vma, struct mempolicy *npol)
18151da177e4SLinus Torvalds {
18161da177e4SLinus Torvalds 	int err;
18171da177e4SLinus Torvalds 	struct sp_node *new = NULL;
18181da177e4SLinus Torvalds 	unsigned long sz = vma_pages(vma);
18191da177e4SLinus Torvalds 
1820028fec41SDavid Rientjes 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
18211da177e4SLinus Torvalds 		 vma->vm_pgoff,
18221da177e4SLinus Torvalds 		 sz, npol ? npol->policy : -1,
1823028fec41SDavid Rientjes 		 npol ? npol->flags : -1,
1824dfcd3c0dSAndi Kleen 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
18251da177e4SLinus Torvalds 
18261da177e4SLinus Torvalds 	if (npol) {
18271da177e4SLinus Torvalds 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
18281da177e4SLinus Torvalds 		if (!new)
18291da177e4SLinus Torvalds 			return -ENOMEM;
18301da177e4SLinus Torvalds 	}
18311da177e4SLinus Torvalds 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
18321da177e4SLinus Torvalds 	if (err && new)
18331da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, new);
18341da177e4SLinus Torvalds 	return err;
18351da177e4SLinus Torvalds }
18361da177e4SLinus Torvalds 
18371da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
18381da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
18391da177e4SLinus Torvalds {
18401da177e4SLinus Torvalds 	struct sp_node *n;
18411da177e4SLinus Torvalds 	struct rb_node *next;
18421da177e4SLinus Torvalds 
18431da177e4SLinus Torvalds 	if (!p->root.rb_node)
18441da177e4SLinus Torvalds 		return;
18451da177e4SLinus Torvalds 	spin_lock(&p->lock);
18461da177e4SLinus Torvalds 	next = rb_first(&p->root);
18471da177e4SLinus Torvalds 	while (next) {
18481da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
18491da177e4SLinus Torvalds 		next = rb_next(&n->nd);
185090c5029eSAndi Kleen 		rb_erase(&n->nd, &p->root);
18511da177e4SLinus Torvalds 		mpol_free(n->policy);
18521da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, n);
18531da177e4SLinus Torvalds 	}
18541da177e4SLinus Torvalds 	spin_unlock(&p->lock);
18551da177e4SLinus Torvalds }
18561da177e4SLinus Torvalds 
18571da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */
18581da177e4SLinus Torvalds void __init numa_policy_init(void)
18591da177e4SLinus Torvalds {
1860b71636e2SPaul Mundt 	nodemask_t interleave_nodes;
1861b71636e2SPaul Mundt 	unsigned long largest = 0;
1862b71636e2SPaul Mundt 	int nid, prefer = 0;
1863b71636e2SPaul Mundt 
18641da177e4SLinus Torvalds 	policy_cache = kmem_cache_create("numa_policy",
18651da177e4SLinus Torvalds 					 sizeof(struct mempolicy),
186620c2df83SPaul Mundt 					 0, SLAB_PANIC, NULL);
18671da177e4SLinus Torvalds 
18681da177e4SLinus Torvalds 	sn_cache = kmem_cache_create("shared_policy_node",
18691da177e4SLinus Torvalds 				     sizeof(struct sp_node),
187020c2df83SPaul Mundt 				     0, SLAB_PANIC, NULL);
18711da177e4SLinus Torvalds 
1872b71636e2SPaul Mundt 	/*
1873b71636e2SPaul Mundt 	 * Set interleaving policy for system init. Interleaving is only
1874b71636e2SPaul Mundt 	 * enabled across suitably sized nodes (default is >= 16MB), or
1875b71636e2SPaul Mundt 	 * fall back to the largest node if they're all smaller.
1876b71636e2SPaul Mundt 	 */
1877b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
187856bbd65dSChristoph Lameter 	for_each_node_state(nid, N_HIGH_MEMORY) {
1879b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
18801da177e4SLinus Torvalds 
1881b71636e2SPaul Mundt 		/* Preserve the largest node */
1882b71636e2SPaul Mundt 		if (largest < total_pages) {
1883b71636e2SPaul Mundt 			largest = total_pages;
1884b71636e2SPaul Mundt 			prefer = nid;
1885b71636e2SPaul Mundt 		}
1886b71636e2SPaul Mundt 
1887b71636e2SPaul Mundt 		/* Interleave this node? */
1888b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1889b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
1890b71636e2SPaul Mundt 	}
1891b71636e2SPaul Mundt 
1892b71636e2SPaul Mundt 	/* All too small, use the largest */
1893b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
1894b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
1895b71636e2SPaul Mundt 
1896028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
18971da177e4SLinus Torvalds 		printk("numa_policy_init: interleaving failed\n");
18981da177e4SLinus Torvalds }
18991da177e4SLinus Torvalds 
19008bccd85fSChristoph Lameter /* Reset policy of current process to default */
19011da177e4SLinus Torvalds void numa_default_policy(void)
19021da177e4SLinus Torvalds {
1903028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
19041da177e4SLinus Torvalds }
190568860ec1SPaul Jackson 
19064225399aSPaul Jackson /*
19071a75a6c8SChristoph Lameter  * Display pages allocated per node and memory policy via /proc.
19081a75a6c8SChristoph Lameter  */
190915ad7cdcSHelge Deller static const char * const policy_types[] =
191015ad7cdcSHelge Deller 	{ "default", "prefer", "bind", "interleave" };
19111a75a6c8SChristoph Lameter 
19121a75a6c8SChristoph Lameter /*
19131a75a6c8SChristoph Lameter  * Convert a mempolicy into a string.
19141a75a6c8SChristoph Lameter  * Returns the number of characters in buffer (if positive)
19151a75a6c8SChristoph Lameter  * or an error (negative)
19161a75a6c8SChristoph Lameter  */
19171a75a6c8SChristoph Lameter static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
19181a75a6c8SChristoph Lameter {
19191a75a6c8SChristoph Lameter 	char *p = buffer;
19201a75a6c8SChristoph Lameter 	int l;
19211a75a6c8SChristoph Lameter 	nodemask_t nodes;
1922a3b51e01SDavid Rientjes 	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1923f5b087b5SDavid Rientjes 	unsigned short flags = pol ? pol->flags : 0;
19241a75a6c8SChristoph Lameter 
19251a75a6c8SChristoph Lameter 	switch (mode) {
19261a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
19271a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19281a75a6c8SChristoph Lameter 		break;
19291a75a6c8SChristoph Lameter 
19301a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
19311a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19321a75a6c8SChristoph Lameter 		node_set(pol->v.preferred_node, nodes);
19331a75a6c8SChristoph Lameter 		break;
19341a75a6c8SChristoph Lameter 
19351a75a6c8SChristoph Lameter 	case MPOL_BIND:
193619770b32SMel Gorman 		/* Fall through */
19371a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
19381a75a6c8SChristoph Lameter 		nodes = pol->v.nodes;
19391a75a6c8SChristoph Lameter 		break;
19401a75a6c8SChristoph Lameter 
19411a75a6c8SChristoph Lameter 	default:
19421a75a6c8SChristoph Lameter 		BUG();
19431a75a6c8SChristoph Lameter 		return -EFAULT;
19441a75a6c8SChristoph Lameter 	}
19451a75a6c8SChristoph Lameter 
19461a75a6c8SChristoph Lameter 	l = strlen(policy_types[mode]);
19471a75a6c8SChristoph Lameter  	if (buffer + maxlen < p + l + 1)
19481a75a6c8SChristoph Lameter  		return -ENOSPC;
19491a75a6c8SChristoph Lameter 
19501a75a6c8SChristoph Lameter 	strcpy(p, policy_types[mode]);
19511a75a6c8SChristoph Lameter 	p += l;
19521a75a6c8SChristoph Lameter 
1953f5b087b5SDavid Rientjes 	if (flags) {
1954f5b087b5SDavid Rientjes 		int need_bar = 0;
1955f5b087b5SDavid Rientjes 
1956f5b087b5SDavid Rientjes 		if (buffer + maxlen < p + 2)
1957f5b087b5SDavid Rientjes 			return -ENOSPC;
1958f5b087b5SDavid Rientjes 		*p++ = '=';
1959f5b087b5SDavid Rientjes 
1960f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
1961f5b087b5SDavid Rientjes 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
19624c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
19634c50bc01SDavid Rientjes 			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1964f5b087b5SDavid Rientjes 	}
1965f5b087b5SDavid Rientjes 
19661a75a6c8SChristoph Lameter 	if (!nodes_empty(nodes)) {
19671a75a6c8SChristoph Lameter 		if (buffer + maxlen < p + 2)
19681a75a6c8SChristoph Lameter 			return -ENOSPC;
19691a75a6c8SChristoph Lameter 		*p++ = '=';
19701a75a6c8SChristoph Lameter 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
19711a75a6c8SChristoph Lameter 	}
19721a75a6c8SChristoph Lameter 	return p - buffer;
19731a75a6c8SChristoph Lameter }
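
/*
 * Example strings produced above (illustrative values): "default",
 * "prefer=1", "interleave=0-3", and with a mode flag set something
 * like "bind=static=0-1": the mode name, then "=<flags>", then
 * "=<nodelist>".
 */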
19741a75a6c8SChristoph Lameter 
19751a75a6c8SChristoph Lameter struct numa_maps {
19761a75a6c8SChristoph Lameter 	unsigned long pages;
19771a75a6c8SChristoph Lameter 	unsigned long anon;
1978397874dfSChristoph Lameter 	unsigned long active;
1979397874dfSChristoph Lameter 	unsigned long writeback;
19801a75a6c8SChristoph Lameter 	unsigned long mapcount_max;
1981397874dfSChristoph Lameter 	unsigned long dirty;
1982397874dfSChristoph Lameter 	unsigned long swapcache;
19831a75a6c8SChristoph Lameter 	unsigned long node[MAX_NUMNODES];
19841a75a6c8SChristoph Lameter };
19851a75a6c8SChristoph Lameter 
1986397874dfSChristoph Lameter static void gather_stats(struct page *page, void *private, int pte_dirty)
19871a75a6c8SChristoph Lameter {
19881a75a6c8SChristoph Lameter 	struct numa_maps *md = private;
19891a75a6c8SChristoph Lameter 	int count = page_mapcount(page);
19901a75a6c8SChristoph Lameter 
19911a75a6c8SChristoph Lameter 	md->pages++;
1992397874dfSChristoph Lameter 	if (pte_dirty || PageDirty(page))
1993397874dfSChristoph Lameter 		md->dirty++;
1994397874dfSChristoph Lameter 
1995397874dfSChristoph Lameter 	if (PageSwapCache(page))
1996397874dfSChristoph Lameter 		md->swapcache++;
1997397874dfSChristoph Lameter 
1998397874dfSChristoph Lameter 	if (PageActive(page))
1999397874dfSChristoph Lameter 		md->active++;
2000397874dfSChristoph Lameter 
2001397874dfSChristoph Lameter 	if (PageWriteback(page))
2002397874dfSChristoph Lameter 		md->writeback++;
20031a75a6c8SChristoph Lameter 
20041a75a6c8SChristoph Lameter 	if (PageAnon(page))
20051a75a6c8SChristoph Lameter 		md->anon++;
20061a75a6c8SChristoph Lameter 
2007397874dfSChristoph Lameter 	if (count > md->mapcount_max)
2008397874dfSChristoph Lameter 		md->mapcount_max = count;
2009397874dfSChristoph Lameter 
20101a75a6c8SChristoph Lameter 	md->node[page_to_nid(page)]++;
20111a75a6c8SChristoph Lameter }
20121a75a6c8SChristoph Lameter 
20137f709ed0SAndrew Morton #ifdef CONFIG_HUGETLB_PAGE
2014397874dfSChristoph Lameter static void check_huge_range(struct vm_area_struct *vma,
2015397874dfSChristoph Lameter 		unsigned long start, unsigned long end,
2016397874dfSChristoph Lameter 		struct numa_maps *md)
2017397874dfSChristoph Lameter {
2018397874dfSChristoph Lameter 	unsigned long addr;
2019397874dfSChristoph Lameter 	struct page *page;
2020397874dfSChristoph Lameter 
2021397874dfSChristoph Lameter 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2022397874dfSChristoph Lameter 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2023397874dfSChristoph Lameter 		pte_t pte;
2024397874dfSChristoph Lameter 
2025397874dfSChristoph Lameter 		if (!ptep)
2026397874dfSChristoph Lameter 			continue;
2027397874dfSChristoph Lameter 
2028397874dfSChristoph Lameter 		pte = *ptep;
2029397874dfSChristoph Lameter 		if (pte_none(pte))
2030397874dfSChristoph Lameter 			continue;
2031397874dfSChristoph Lameter 
2032397874dfSChristoph Lameter 		page = pte_page(pte);
2033397874dfSChristoph Lameter 		if (!page)
2034397874dfSChristoph Lameter 			continue;
2035397874dfSChristoph Lameter 
2036397874dfSChristoph Lameter 		gather_stats(page, md, pte_dirty(*ptep));
2037397874dfSChristoph Lameter 	}
2038397874dfSChristoph Lameter }
20397f709ed0SAndrew Morton #else
20407f709ed0SAndrew Morton static inline void check_huge_range(struct vm_area_struct *vma,
20417f709ed0SAndrew Morton 		unsigned long start, unsigned long end,
20427f709ed0SAndrew Morton 		struct numa_maps *md)
20437f709ed0SAndrew Morton {
20447f709ed0SAndrew Morton }
20457f709ed0SAndrew Morton #endif
2046397874dfSChristoph Lameter 
20471a75a6c8SChristoph Lameter int show_numa_map(struct seq_file *m, void *v)
20481a75a6c8SChristoph Lameter {
204999f89551SEric W. Biederman 	struct proc_maps_private *priv = m->private;
20501a75a6c8SChristoph Lameter 	struct vm_area_struct *vma = v;
20511a75a6c8SChristoph Lameter 	struct numa_maps *md;
2052397874dfSChristoph Lameter 	struct file *file = vma->vm_file;
2053397874dfSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
2054480eccf9SLee Schermerhorn 	struct mempolicy *pol;
20551a75a6c8SChristoph Lameter 	int n;
20561a75a6c8SChristoph Lameter 	char buffer[50];
20571a75a6c8SChristoph Lameter 
2058397874dfSChristoph Lameter 	if (!mm)
20591a75a6c8SChristoph Lameter 		return 0;
20601a75a6c8SChristoph Lameter 
20611a75a6c8SChristoph Lameter 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
20621a75a6c8SChristoph Lameter 	if (!md)
20631a75a6c8SChristoph Lameter 		return 0;
20641a75a6c8SChristoph Lameter 
2065480eccf9SLee Schermerhorn 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2066480eccf9SLee Schermerhorn 	mpol_to_str(buffer, sizeof(buffer), pol);
2067480eccf9SLee Schermerhorn 	/*
2068480eccf9SLee Schermerhorn 	 * unref shared or other task's mempolicy
2069480eccf9SLee Schermerhorn 	 */
2070480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy)
2071480eccf9SLee Schermerhorn 		__mpol_free(pol);
20721a75a6c8SChristoph Lameter 
2073397874dfSChristoph Lameter 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2074397874dfSChristoph Lameter 
2075397874dfSChristoph Lameter 	if (file) {
2076397874dfSChristoph Lameter 		seq_printf(m, " file=");
2077c32c2f63SJan Blunck 		seq_path(m, &file->f_path, "\n\t= ");
2078397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2079397874dfSChristoph Lameter 		seq_printf(m, " heap");
2080397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->start_stack &&
2081397874dfSChristoph Lameter 			vma->vm_end >= mm->start_stack) {
2082397874dfSChristoph Lameter 		seq_printf(m, " stack");
2083397874dfSChristoph Lameter 	}
2084397874dfSChristoph Lameter 
2085397874dfSChristoph Lameter 	if (is_vm_hugetlb_page(vma)) {
2086397874dfSChristoph Lameter 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2087397874dfSChristoph Lameter 		seq_printf(m, " huge");
2088397874dfSChristoph Lameter 	} else {
2089397874dfSChristoph Lameter 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
209056bbd65dSChristoph Lameter 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2091397874dfSChristoph Lameter 	}
2092397874dfSChristoph Lameter 
2093397874dfSChristoph Lameter 	if (!md->pages)
2094397874dfSChristoph Lameter 		goto out;
20951a75a6c8SChristoph Lameter 
20961a75a6c8SChristoph Lameter 	if (md->anon)
20971a75a6c8SChristoph Lameter 		seq_printf(m," anon=%lu",md->anon);
20981a75a6c8SChristoph Lameter 
2099397874dfSChristoph Lameter 	if (md->dirty)
2100397874dfSChristoph Lameter 		seq_printf(m," dirty=%lu",md->dirty);
2101397874dfSChristoph Lameter 
2102397874dfSChristoph Lameter 	if (md->pages != md->anon && md->pages != md->dirty)
2103397874dfSChristoph Lameter 		seq_printf(m, " mapped=%lu", md->pages);
2104397874dfSChristoph Lameter 
2105397874dfSChristoph Lameter 	if (md->mapcount_max > 1)
2106397874dfSChristoph Lameter 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2107397874dfSChristoph Lameter 
2108397874dfSChristoph Lameter 	if (md->swapcache)
2109397874dfSChristoph Lameter 		seq_printf(m," swapcache=%lu", md->swapcache);
2110397874dfSChristoph Lameter 
2111397874dfSChristoph Lameter 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2112397874dfSChristoph Lameter 		seq_printf(m," active=%lu", md->active);
2113397874dfSChristoph Lameter 
2114397874dfSChristoph Lameter 	if (md->writeback)
2115397874dfSChristoph Lameter 		seq_printf(m," writeback=%lu", md->writeback);
2116397874dfSChristoph Lameter 
211756bbd65dSChristoph Lameter 	for_each_node_state(n, N_HIGH_MEMORY)
21181a75a6c8SChristoph Lameter 		if (md->node[n])
21191a75a6c8SChristoph Lameter 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2120397874dfSChristoph Lameter out:
21211a75a6c8SChristoph Lameter 	seq_putc(m, '\n');
21221a75a6c8SChristoph Lameter 	kfree(md);
21231a75a6c8SChristoph Lameter 
21241a75a6c8SChristoph Lameter 	if (m->count < m->size)
212599f89551SEric W. Biederman 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
21261a75a6c8SChristoph Lameter 	return 0;
21271a75a6c8SChristoph Lameter }
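
/*
 * Example of a line this emits in /proc/<pid>/numa_maps (illustrative
 * values only):
 *
 *	7f2a40000000 interleave=0-1 anon=1024 dirty=1024 N0=512 N1=512
 */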
2128