xref: /linux/mm/mempolicy.c (revision f4e53d910b7dde2685b177f1e7c3e3e0b4a42f7b)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Simple NUMA memory policy for the Linux kernel.
31da177e4SLinus Torvalds  *
41da177e4SLinus Torvalds  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
58bccd85fSChristoph Lameter  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
61da177e4SLinus Torvalds  * Subject to the GNU Public License, version 2.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * NUMA policy allows the user to give hints in which node(s) memory should
91da177e4SLinus Torvalds  * be allocated.
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  * Support four policies per VMA and per process:
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The VMA policy has priority over the process policy for a page fault.
141da177e4SLinus Torvalds  *
151da177e4SLinus Torvalds  * interleave     Allocate memory interleaved over a set of nodes,
161da177e4SLinus Torvalds  *                with normal fallback if it fails.
171da177e4SLinus Torvalds  *                For VMA based allocations this interleaves based on the
181da177e4SLinus Torvalds  *                offset into the backing object or offset into the mapping
191da177e4SLinus Torvalds  *                for anonymous memory. For process policy a per-process counter
201da177e4SLinus Torvalds  *                is used.
218bccd85fSChristoph Lameter  *
221da177e4SLinus Torvalds  * bind           Only allocate memory on a specific set of nodes,
231da177e4SLinus Torvalds  *                no fallback.
248bccd85fSChristoph Lameter  *                FIXME: memory is allocated starting with the first node
258bccd85fSChristoph Lameter  *                to the last. It would be better if bind truly restricted
268bccd85fSChristoph Lameter  *                the allocation to the specified memory nodes instead.
278bccd85fSChristoph Lameter  *
281da177e4SLinus Torvalds  * preferred       Try a specific node first before normal fallback.
291da177e4SLinus Torvalds  *                As a special case, node -1 here means do the allocation
301da177e4SLinus Torvalds  *                on the node of the local CPU. This is normally identical to
311da177e4SLinus Torvalds  *                default, but useful to set in a VMA when you have a non-default
321da177e4SLinus Torvalds  *                process policy.
338bccd85fSChristoph Lameter  *
341da177e4SLinus Torvalds  * default        Allocate on the local node first, or when on a VMA
351da177e4SLinus Torvalds  *                use the process policy. This is what Linux always did
361da177e4SLinus Torvalds  *		  in a NUMA aware kernel and still does by, ahem, default.
371da177e4SLinus Torvalds  *
381da177e4SLinus Torvalds  * The process policy is applied for most non-interrupt memory allocations
391da177e4SLinus Torvalds  * in that process' context. Interrupts ignore the policies and always
401da177e4SLinus Torvalds  * try to allocate on the local CPU. The VMA policy is only applied for memory
411da177e4SLinus Torvalds  * allocations for a VMA in the VM.
421da177e4SLinus Torvalds  *
431da177e4SLinus Torvalds  * Currently there are a few corner cases in swapping where the policy
441da177e4SLinus Torvalds  * is not applied, but the majority should be handled. When process policy
451da177e4SLinus Torvalds  * is used it is not remembered over swap outs/swap ins.
461da177e4SLinus Torvalds  *
471da177e4SLinus Torvalds  * Only the highest zone in the zone hierarchy gets policied. Allocations
481da177e4SLinus Torvalds  * requesting a lower zone just use default policy. This implies that
491da177e4SLinus Torvalds  * on systems with highmem, kernel lowmem allocations don't get policied.
501da177e4SLinus Torvalds  * Same with GFP_DMA allocations.
511da177e4SLinus Torvalds  *
521da177e4SLinus Torvalds  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
531da177e4SLinus Torvalds  * all users and remembered even when nobody has the memory mapped.
541da177e4SLinus Torvalds  */
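
/*
 * Illustrative user-space sketch (not kernel code): the modes above are
 * selected with the set_mempolicy()/mbind() system calls.  The snippet
 * assumes the <numaif.h> wrappers; the node numbers are made up.
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 * would interleave all further allocations of the calling task over nodes
 * 0 and 1; MPOL_BIND and MPOL_PREFERRED are requested the same way.
 */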
551da177e4SLinus Torvalds 
561da177e4SLinus Torvalds /* Notebook:
571da177e4SLinus Torvalds    fix mmap readahead to honour policy and enable policy for any page cache
581da177e4SLinus Torvalds    object
591da177e4SLinus Torvalds    statistics for bigpages
601da177e4SLinus Torvalds    global policy for page cache? currently it uses process policy. Requires
611da177e4SLinus Torvalds    first item above.
621da177e4SLinus Torvalds    handle mremap for shared memory (currently ignored for the policy)
631da177e4SLinus Torvalds    grows down?
641da177e4SLinus Torvalds    make bind policy root only? It can trigger oom much faster and the
651da177e4SLinus Torvalds    kernel does not always handle that gracefully.
661da177e4SLinus Torvalds */
671da177e4SLinus Torvalds 
681da177e4SLinus Torvalds #include <linux/mempolicy.h>
691da177e4SLinus Torvalds #include <linux/mm.h>
701da177e4SLinus Torvalds #include <linux/highmem.h>
711da177e4SLinus Torvalds #include <linux/hugetlb.h>
721da177e4SLinus Torvalds #include <linux/kernel.h>
731da177e4SLinus Torvalds #include <linux/sched.h>
741da177e4SLinus Torvalds #include <linux/nodemask.h>
751da177e4SLinus Torvalds #include <linux/cpuset.h>
761da177e4SLinus Torvalds #include <linux/gfp.h>
771da177e4SLinus Torvalds #include <linux/slab.h>
781da177e4SLinus Torvalds #include <linux/string.h>
791da177e4SLinus Torvalds #include <linux/module.h>
80b488893aSPavel Emelyanov #include <linux/nsproxy.h>
811da177e4SLinus Torvalds #include <linux/interrupt.h>
821da177e4SLinus Torvalds #include <linux/init.h>
831da177e4SLinus Torvalds #include <linux/compat.h>
84dc9aa5b9SChristoph Lameter #include <linux/swap.h>
851a75a6c8SChristoph Lameter #include <linux/seq_file.h>
861a75a6c8SChristoph Lameter #include <linux/proc_fs.h>
87b20a3503SChristoph Lameter #include <linux/migrate.h>
8895a402c3SChristoph Lameter #include <linux/rmap.h>
8986c3a764SDavid Quigley #include <linux/security.h>
90dbcb0f19SAdrian Bunk #include <linux/syscalls.h>
91dc9aa5b9SChristoph Lameter 
921da177e4SLinus Torvalds #include <asm/tlbflush.h>
931da177e4SLinus Torvalds #include <asm/uaccess.h>
941da177e4SLinus Torvalds 
9538e35860SChristoph Lameter /* Internal flags */
96dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
9738e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
981a75a6c8SChristoph Lameter #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
99dc9aa5b9SChristoph Lameter 
100fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
101fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1021da177e4SLinus Torvalds 
1031da177e4SLinus Torvalds /* Highest zone. A specific allocation for a zone below that is not
1041da177e4SLinus Torvalds    policied. */
1056267276fSChristoph Lameter enum zone_type policy_zone = 0;
1061da177e4SLinus Torvalds 
107d42c6997SAndi Kleen struct mempolicy default_policy = {
1081da177e4SLinus Torvalds 	.refcnt = ATOMIC_INIT(1), /* never free it */
1091da177e4SLinus Torvalds 	.policy = MPOL_DEFAULT,
1101da177e4SLinus Torvalds };
1111da177e4SLinus Torvalds 
11237012946SDavid Rientjes static const struct mempolicy_operations {
11337012946SDavid Rientjes 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
11437012946SDavid Rientjes 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
11537012946SDavid Rientjes } mpol_ops[MPOL_MAX];
11637012946SDavid Rientjes 
11719770b32SMel Gorman /* Check that the nodemask contains at least one populated zone */
11837012946SDavid Rientjes static int is_valid_nodemask(const nodemask_t *nodemask)
1191da177e4SLinus Torvalds {
12019770b32SMel Gorman 	int nd, k;
1211da177e4SLinus Torvalds 
12219770b32SMel Gorman 	/* Check that there is something useful in this mask */
12319770b32SMel Gorman 	k = policy_zone;
12419770b32SMel Gorman 
12519770b32SMel Gorman 	for_each_node_mask(nd, *nodemask) {
12619770b32SMel Gorman 		struct zone *z;
12719770b32SMel Gorman 
12819770b32SMel Gorman 		for (k = 0; k <= policy_zone; k++) {
12919770b32SMel Gorman 			z = &NODE_DATA(nd)->node_zones[k];
130dd942ae3SAndi Kleen 			if (z->present_pages > 0)
13119770b32SMel Gorman 				return 1;
132dd942ae3SAndi Kleen 		}
133dd942ae3SAndi Kleen 	}
13419770b32SMel Gorman 
13519770b32SMel Gorman 	return 0;
1361da177e4SLinus Torvalds }
1371da177e4SLinus Torvalds 
138f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
139f5b087b5SDavid Rientjes {
1404c50bc01SDavid Rientjes 	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
1414c50bc01SDavid Rientjes }
1424c50bc01SDavid Rientjes 
1434c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
1444c50bc01SDavid Rientjes 				   const nodemask_t *rel)
1454c50bc01SDavid Rientjes {
1464c50bc01SDavid Rientjes 	nodemask_t tmp;
1474c50bc01SDavid Rientjes 	nodes_fold(tmp, *orig, nodes_weight(*rel));
1484c50bc01SDavid Rientjes 	nodes_onto(*ret, tmp, *rel);
149f5b087b5SDavid Rientjes }
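
/*
 * Worked example (illustrative masks): with *rel = {4,5,6} (weight 3) and
 * *orig = {0,2}, nodes_fold() maps each original node modulo 3, giving
 * tmp = {0,2}, and nodes_onto() then maps those positions onto the set
 * bits of *rel, so *ret = {4,6}.  The user's mask is thus interpreted
 * relative to whatever nodes the cpuset currently allows.
 */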
150f5b087b5SDavid Rientjes 
15137012946SDavid Rientjes static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
15237012946SDavid Rientjes {
15337012946SDavid Rientjes 	if (nodes_empty(*nodes))
15437012946SDavid Rientjes 		return -EINVAL;
15537012946SDavid Rientjes 	pol->v.nodes = *nodes;
15637012946SDavid Rientjes 	return 0;
15737012946SDavid Rientjes }
15837012946SDavid Rientjes 
15937012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
16037012946SDavid Rientjes {
16137012946SDavid Rientjes 	if (!nodes)
16237012946SDavid Rientjes 		pol->v.preferred_node = -1;	/* local allocation */
16337012946SDavid Rientjes 	else if (nodes_empty(*nodes))
16437012946SDavid Rientjes 		return -EINVAL;			/*  no allowed nodes */
16537012946SDavid Rientjes 	else
16637012946SDavid Rientjes 		pol->v.preferred_node = first_node(*nodes);
16737012946SDavid Rientjes 	return 0;
16837012946SDavid Rientjes }
16937012946SDavid Rientjes 
17037012946SDavid Rientjes static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
17137012946SDavid Rientjes {
17237012946SDavid Rientjes 	if (!is_valid_nodemask(nodes))
17337012946SDavid Rientjes 		return -EINVAL;
17437012946SDavid Rientjes 	pol->v.nodes = *nodes;
17537012946SDavid Rientjes 	return 0;
17637012946SDavid Rientjes }
17737012946SDavid Rientjes 
1781da177e4SLinus Torvalds /* Create a new policy */
179028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
180028fec41SDavid Rientjes 				  nodemask_t *nodes)
1811da177e4SLinus Torvalds {
1821da177e4SLinus Torvalds 	struct mempolicy *policy;
183f5b087b5SDavid Rientjes 	nodemask_t cpuset_context_nmask;
18437012946SDavid Rientjes 	int ret;
1851da177e4SLinus Torvalds 
186028fec41SDavid Rientjes 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
187028fec41SDavid Rientjes 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
188140d5a49SPaul Mundt 
1893e1f0645SDavid Rientjes 	if (mode == MPOL_DEFAULT) {
1903e1f0645SDavid Rientjes 		if (nodes && !nodes_empty(*nodes))
19137012946SDavid Rientjes 			return ERR_PTR(-EINVAL);
1923e1f0645SDavid Rientjes 		return NULL;
19337012946SDavid Rientjes 	}
1943e1f0645SDavid Rientjes 	VM_BUG_ON(!nodes);
1953e1f0645SDavid Rientjes 
1963e1f0645SDavid Rientjes 	/*
1973e1f0645SDavid Rientjes 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
1983e1f0645SDavid Rientjes 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
1993e1f0645SDavid Rientjes 	 * All other modes require a valid pointer to a non-empty nodemask.
2003e1f0645SDavid Rientjes 	 */
2013e1f0645SDavid Rientjes 	if (mode == MPOL_PREFERRED) {
2023e1f0645SDavid Rientjes 		if (nodes_empty(*nodes)) {
2033e1f0645SDavid Rientjes 			if (((flags & MPOL_F_STATIC_NODES) ||
2043e1f0645SDavid Rientjes 			     (flags & MPOL_F_RELATIVE_NODES)))
2053e1f0645SDavid Rientjes 				return ERR_PTR(-EINVAL);
2063e1f0645SDavid Rientjes 			nodes = NULL;	/* flag local alloc */
2073e1f0645SDavid Rientjes 		}
2083e1f0645SDavid Rientjes 	} else if (nodes_empty(*nodes))
2093e1f0645SDavid Rientjes 		return ERR_PTR(-EINVAL);
2101da177e4SLinus Torvalds 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2111da177e4SLinus Torvalds 	if (!policy)
2121da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2131da177e4SLinus Torvalds 	atomic_set(&policy->refcnt, 1);
21437012946SDavid Rientjes 	policy->policy = mode;
21537012946SDavid Rientjes 	policy->flags = flags;
2163e1f0645SDavid Rientjes 
2173e1f0645SDavid Rientjes 	if (nodes) {
2183e1f0645SDavid Rientjes 		/*
2193e1f0645SDavid Rientjes 		 * cpuset related setup doesn't apply to local allocation
2203e1f0645SDavid Rientjes 		 */
221f5b087b5SDavid Rientjes 		cpuset_update_task_memory_state();
2224c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
2234c50bc01SDavid Rientjes 			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
2244c50bc01SDavid Rientjes 					       &cpuset_current_mems_allowed);
2254c50bc01SDavid Rientjes 		else
2264c50bc01SDavid Rientjes 			nodes_and(cpuset_context_nmask, *nodes,
2274c50bc01SDavid Rientjes 				  cpuset_current_mems_allowed);
228f5b087b5SDavid Rientjes 		if (mpol_store_user_nodemask(policy))
229f5b087b5SDavid Rientjes 			policy->w.user_nodemask = *nodes;
230f5b087b5SDavid Rientjes 		else
23137012946SDavid Rientjes 			policy->w.cpuset_mems_allowed =
23237012946SDavid Rientjes 						cpuset_mems_allowed(current);
2331da177e4SLinus Torvalds 	}
2341da177e4SLinus Torvalds 
23537012946SDavid Rientjes 	ret = mpol_ops[mode].create(policy,
2363e1f0645SDavid Rientjes 				nodes ? &cpuset_context_nmask : NULL);
23737012946SDavid Rientjes 	if (ret < 0) {
23837012946SDavid Rientjes 		kmem_cache_free(policy_cache, policy);
23937012946SDavid Rientjes 		return ERR_PTR(ret);
24037012946SDavid Rientjes 	}
24137012946SDavid Rientjes 	return policy;
24237012946SDavid Rientjes }
24337012946SDavid Rientjes 
24437012946SDavid Rientjes static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
24537012946SDavid Rientjes {
24637012946SDavid Rientjes }
24737012946SDavid Rientjes 
24837012946SDavid Rientjes static void mpol_rebind_nodemask(struct mempolicy *pol,
24937012946SDavid Rientjes 				 const nodemask_t *nodes)
2501d0d2680SDavid Rientjes {
2511d0d2680SDavid Rientjes 	nodemask_t tmp;
2521d0d2680SDavid Rientjes 
25337012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES)
25437012946SDavid Rientjes 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
25537012946SDavid Rientjes 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
25637012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
2571d0d2680SDavid Rientjes 	else {
25837012946SDavid Rientjes 		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
25937012946SDavid Rientjes 			    *nodes);
26037012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
2611d0d2680SDavid Rientjes 	}
26237012946SDavid Rientjes 
2631d0d2680SDavid Rientjes 	pol->v.nodes = tmp;
2641d0d2680SDavid Rientjes 	if (!node_isset(current->il_next, tmp)) {
2651d0d2680SDavid Rientjes 		current->il_next = next_node(current->il_next, tmp);
2661d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
2671d0d2680SDavid Rientjes 			current->il_next = first_node(tmp);
2681d0d2680SDavid Rientjes 		if (current->il_next >= MAX_NUMNODES)
2691d0d2680SDavid Rientjes 			current->il_next = numa_node_id();
2701d0d2680SDavid Rientjes 	}
27137012946SDavid Rientjes }
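
/*
 * Rebind illustration (hypothetical scenario): a task interleaving over
 * nodes {0,1} whose cpuset mems change from {0,1} to {2,3}.  With no mode
 * flags the nodes_remap() path above gives {2,3}; with MPOL_F_STATIC_NODES
 * the saved user_nodemask {0,1} is just intersected with the new mems,
 * here {0,1} & {2,3}; with MPOL_F_RELATIVE_NODES the saved mask is folded
 * onto the new mems, giving {2,3} again.
 */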
27237012946SDavid Rientjes 
27337012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
27437012946SDavid Rientjes 				  const nodemask_t *nodes)
27537012946SDavid Rientjes {
27637012946SDavid Rientjes 	nodemask_t tmp;
27737012946SDavid Rientjes 
27837012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES) {
2791d0d2680SDavid Rientjes 		int node = first_node(pol->w.user_nodemask);
2801d0d2680SDavid Rientjes 
28137012946SDavid Rientjes 		if (node_isset(node, *nodes))
2821d0d2680SDavid Rientjes 			pol->v.preferred_node = node;
2831d0d2680SDavid Rientjes 		else
2841d0d2680SDavid Rientjes 			pol->v.preferred_node = -1;
28537012946SDavid Rientjes 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
28637012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
2871d0d2680SDavid Rientjes 		pol->v.preferred_node = first_node(tmp);
2883e1f0645SDavid Rientjes 	} else if (pol->v.preferred_node != -1) {
2891d0d2680SDavid Rientjes 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
29037012946SDavid Rientjes 						   pol->w.cpuset_mems_allowed,
29137012946SDavid Rientjes 						   *nodes);
29237012946SDavid Rientjes 		pol->w.cpuset_mems_allowed = *nodes;
2931d0d2680SDavid Rientjes 	}
2941d0d2680SDavid Rientjes }
29537012946SDavid Rientjes 
29637012946SDavid Rientjes /* Migrate a policy to a different set of nodes */
29737012946SDavid Rientjes static void mpol_rebind_policy(struct mempolicy *pol,
29837012946SDavid Rientjes 			       const nodemask_t *newmask)
29937012946SDavid Rientjes {
30037012946SDavid Rientjes 	if (!pol)
30137012946SDavid Rientjes 		return;
30237012946SDavid Rientjes 	if (!mpol_store_user_nodemask(pol) &&
30337012946SDavid Rientjes 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
30437012946SDavid Rientjes 		return;
30537012946SDavid Rientjes 	mpol_ops[pol->policy].rebind(pol, newmask);
3061d0d2680SDavid Rientjes }
3071d0d2680SDavid Rientjes 
3081d0d2680SDavid Rientjes /*
3091d0d2680SDavid Rientjes  * Wrapper for mpol_rebind_policy() that just requires a task
3101d0d2680SDavid Rientjes  * pointer, and updates the task's mempolicy.
3111d0d2680SDavid Rientjes  */
3121d0d2680SDavid Rientjes 
3131d0d2680SDavid Rientjes void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
3141d0d2680SDavid Rientjes {
3151d0d2680SDavid Rientjes 	mpol_rebind_policy(tsk->mempolicy, new);
3161d0d2680SDavid Rientjes }
3171d0d2680SDavid Rientjes 
3181d0d2680SDavid Rientjes /*
3191d0d2680SDavid Rientjes  * Rebind each vma in mm to new nodemask.
3201d0d2680SDavid Rientjes  *
3211d0d2680SDavid Rientjes  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
3221d0d2680SDavid Rientjes  */
3231d0d2680SDavid Rientjes 
3241d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
3251d0d2680SDavid Rientjes {
3261d0d2680SDavid Rientjes 	struct vm_area_struct *vma;
3271d0d2680SDavid Rientjes 
3281d0d2680SDavid Rientjes 	down_write(&mm->mmap_sem);
3291d0d2680SDavid Rientjes 	for (vma = mm->mmap; vma; vma = vma->vm_next)
3301d0d2680SDavid Rientjes 		mpol_rebind_policy(vma->vm_policy, new);
3311d0d2680SDavid Rientjes 	up_write(&mm->mmap_sem);
3321d0d2680SDavid Rientjes }
3331d0d2680SDavid Rientjes 
33437012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
33537012946SDavid Rientjes 	[MPOL_DEFAULT] = {
33637012946SDavid Rientjes 		.rebind = mpol_rebind_default,
33737012946SDavid Rientjes 	},
33837012946SDavid Rientjes 	[MPOL_INTERLEAVE] = {
33937012946SDavid Rientjes 		.create = mpol_new_interleave,
34037012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
34137012946SDavid Rientjes 	},
34237012946SDavid Rientjes 	[MPOL_PREFERRED] = {
34337012946SDavid Rientjes 		.create = mpol_new_preferred,
34437012946SDavid Rientjes 		.rebind = mpol_rebind_preferred,
34537012946SDavid Rientjes 	},
34637012946SDavid Rientjes 	[MPOL_BIND] = {
34737012946SDavid Rientjes 		.create = mpol_new_bind,
34837012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
34937012946SDavid Rientjes 	},
35037012946SDavid Rientjes };
35137012946SDavid Rientjes 
352397874dfSChristoph Lameter static void gather_stats(struct page *, void *, int pte_dirty);
353fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
354fc301289SChristoph Lameter 				unsigned long flags);
3551a75a6c8SChristoph Lameter 
35638e35860SChristoph Lameter /* Scan through the ptes in a range, checking whether each page meets the given conditions. */
357b5810039SNick Piggin static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
358dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
359dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
36038e35860SChristoph Lameter 		void *private)
3611da177e4SLinus Torvalds {
36291612e0dSHugh Dickins 	pte_t *orig_pte;
36391612e0dSHugh Dickins 	pte_t *pte;
364705e87c0SHugh Dickins 	spinlock_t *ptl;
365941150a3SHugh Dickins 
366705e87c0SHugh Dickins 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
36791612e0dSHugh Dickins 	do {
3686aab341eSLinus Torvalds 		struct page *page;
36925ba77c1SAndy Whitcroft 		int nid;
37091612e0dSHugh Dickins 
37191612e0dSHugh Dickins 		if (!pte_present(*pte))
37291612e0dSHugh Dickins 			continue;
3736aab341eSLinus Torvalds 		page = vm_normal_page(vma, addr, *pte);
3746aab341eSLinus Torvalds 		if (!page)
37591612e0dSHugh Dickins 			continue;
376053837fcSNick Piggin 		/*
377053837fcSNick Piggin 		 * The check for PageReserved here is important to avoid
378053837fcSNick Piggin 		 * handling zero pages and other pages that may have been
379053837fcSNick Piggin 		 * marked special by the system.
380053837fcSNick Piggin 		 *
381053837fcSNick Piggin 		 * If PageReserved were not checked here then e.g.
382053837fcSNick Piggin 		 * the location of the zero page could have an influence
383053837fcSNick Piggin 		 * on MPOL_MF_STRICT, zero pages would be counted for
384053837fcSNick Piggin 		 * the per node stats, and there would be useless attempts
385053837fcSNick Piggin 		 * to put zero pages on the migration list.
386053837fcSNick Piggin 		 */
387f4598c8bSChristoph Lameter 		if (PageReserved(page))
388f4598c8bSChristoph Lameter 			continue;
3896aab341eSLinus Torvalds 		nid = page_to_nid(page);
39038e35860SChristoph Lameter 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
39138e35860SChristoph Lameter 			continue;
39238e35860SChristoph Lameter 
3931a75a6c8SChristoph Lameter 		if (flags & MPOL_MF_STATS)
394397874dfSChristoph Lameter 			gather_stats(page, private, pte_dirty(*pte));
395053837fcSNick Piggin 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
396fc301289SChristoph Lameter 			migrate_page_add(page, private, flags);
397dc9aa5b9SChristoph Lameter 		else
3981da177e4SLinus Torvalds 			break;
39991612e0dSHugh Dickins 	} while (pte++, addr += PAGE_SIZE, addr != end);
400705e87c0SHugh Dickins 	pte_unmap_unlock(orig_pte, ptl);
40191612e0dSHugh Dickins 	return addr != end;
40291612e0dSHugh Dickins }
40391612e0dSHugh Dickins 
404b5810039SNick Piggin static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
405dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
406dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
40738e35860SChristoph Lameter 		void *private)
40891612e0dSHugh Dickins {
40991612e0dSHugh Dickins 	pmd_t *pmd;
41091612e0dSHugh Dickins 	unsigned long next;
41191612e0dSHugh Dickins 
41291612e0dSHugh Dickins 	pmd = pmd_offset(pud, addr);
41391612e0dSHugh Dickins 	do {
41491612e0dSHugh Dickins 		next = pmd_addr_end(addr, end);
41591612e0dSHugh Dickins 		if (pmd_none_or_clear_bad(pmd))
41691612e0dSHugh Dickins 			continue;
417dc9aa5b9SChristoph Lameter 		if (check_pte_range(vma, pmd, addr, next, nodes,
41838e35860SChristoph Lameter 				    flags, private))
41991612e0dSHugh Dickins 			return -EIO;
42091612e0dSHugh Dickins 	} while (pmd++, addr = next, addr != end);
42191612e0dSHugh Dickins 	return 0;
42291612e0dSHugh Dickins }
42391612e0dSHugh Dickins 
424b5810039SNick Piggin static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
425dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
426dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
42738e35860SChristoph Lameter 		void *private)
42891612e0dSHugh Dickins {
42991612e0dSHugh Dickins 	pud_t *pud;
43091612e0dSHugh Dickins 	unsigned long next;
43191612e0dSHugh Dickins 
43291612e0dSHugh Dickins 	pud = pud_offset(pgd, addr);
43391612e0dSHugh Dickins 	do {
43491612e0dSHugh Dickins 		next = pud_addr_end(addr, end);
43591612e0dSHugh Dickins 		if (pud_none_or_clear_bad(pud))
43691612e0dSHugh Dickins 			continue;
437dc9aa5b9SChristoph Lameter 		if (check_pmd_range(vma, pud, addr, next, nodes,
43838e35860SChristoph Lameter 				    flags, private))
43991612e0dSHugh Dickins 			return -EIO;
44091612e0dSHugh Dickins 	} while (pud++, addr = next, addr != end);
44191612e0dSHugh Dickins 	return 0;
44291612e0dSHugh Dickins }
44391612e0dSHugh Dickins 
444b5810039SNick Piggin static inline int check_pgd_range(struct vm_area_struct *vma,
445dc9aa5b9SChristoph Lameter 		unsigned long addr, unsigned long end,
446dc9aa5b9SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags,
44738e35860SChristoph Lameter 		void *private)
44891612e0dSHugh Dickins {
44991612e0dSHugh Dickins 	pgd_t *pgd;
45091612e0dSHugh Dickins 	unsigned long next;
45191612e0dSHugh Dickins 
452b5810039SNick Piggin 	pgd = pgd_offset(vma->vm_mm, addr);
45391612e0dSHugh Dickins 	do {
45491612e0dSHugh Dickins 		next = pgd_addr_end(addr, end);
45591612e0dSHugh Dickins 		if (pgd_none_or_clear_bad(pgd))
45691612e0dSHugh Dickins 			continue;
457dc9aa5b9SChristoph Lameter 		if (check_pud_range(vma, pgd, addr, next, nodes,
45838e35860SChristoph Lameter 				    flags, private))
45991612e0dSHugh Dickins 			return -EIO;
46091612e0dSHugh Dickins 	} while (pgd++, addr = next, addr != end);
46191612e0dSHugh Dickins 	return 0;
4621da177e4SLinus Torvalds }
4631da177e4SLinus Torvalds 
464dc9aa5b9SChristoph Lameter /*
465dc9aa5b9SChristoph Lameter  * Check if all pages in a range are on a set of nodes.
466dc9aa5b9SChristoph Lameter  * If pagelist != NULL then isolate pages from the LRU and
467dc9aa5b9SChristoph Lameter  * put them on the pagelist.
468dc9aa5b9SChristoph Lameter  */
4691da177e4SLinus Torvalds static struct vm_area_struct *
4701da177e4SLinus Torvalds check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
47138e35860SChristoph Lameter 		const nodemask_t *nodes, unsigned long flags, void *private)
4721da177e4SLinus Torvalds {
4731da177e4SLinus Torvalds 	int err;
4741da177e4SLinus Torvalds 	struct vm_area_struct *first, *vma, *prev;
4751da177e4SLinus Torvalds 
47690036ee5SChristoph Lameter 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
47790036ee5SChristoph Lameter 
478b20a3503SChristoph Lameter 		err = migrate_prep();
479b20a3503SChristoph Lameter 		if (err)
480b20a3503SChristoph Lameter 			return ERR_PTR(err);
48190036ee5SChristoph Lameter 	}
482053837fcSNick Piggin 
4831da177e4SLinus Torvalds 	first = find_vma(mm, start);
4841da177e4SLinus Torvalds 	if (!first)
4851da177e4SLinus Torvalds 		return ERR_PTR(-EFAULT);
4861da177e4SLinus Torvalds 	prev = NULL;
4871da177e4SLinus Torvalds 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
488dc9aa5b9SChristoph Lameter 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
4891da177e4SLinus Torvalds 			if (!vma->vm_next && vma->vm_end < end)
4901da177e4SLinus Torvalds 				return ERR_PTR(-EFAULT);
4911da177e4SLinus Torvalds 			if (prev && prev->vm_end < vma->vm_start)
4921da177e4SLinus Torvalds 				return ERR_PTR(-EFAULT);
493dc9aa5b9SChristoph Lameter 		}
494dc9aa5b9SChristoph Lameter 		if (!is_vm_hugetlb_page(vma) &&
495dc9aa5b9SChristoph Lameter 		    ((flags & MPOL_MF_STRICT) ||
496dc9aa5b9SChristoph Lameter 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
497dc9aa5b9SChristoph Lameter 				vma_migratable(vma)))) {
4985b952b3cSAndi Kleen 			unsigned long endvma = vma->vm_end;
499dc9aa5b9SChristoph Lameter 
5005b952b3cSAndi Kleen 			if (endvma > end)
5015b952b3cSAndi Kleen 				endvma = end;
5025b952b3cSAndi Kleen 			if (vma->vm_start > start)
5035b952b3cSAndi Kleen 				start = vma->vm_start;
504dc9aa5b9SChristoph Lameter 			err = check_pgd_range(vma, start, endvma, nodes,
50538e35860SChristoph Lameter 						flags, private);
5061da177e4SLinus Torvalds 			if (err) {
5071da177e4SLinus Torvalds 				first = ERR_PTR(err);
5081da177e4SLinus Torvalds 				break;
5091da177e4SLinus Torvalds 			}
5101da177e4SLinus Torvalds 		}
5111da177e4SLinus Torvalds 		prev = vma;
5121da177e4SLinus Torvalds 	}
5131da177e4SLinus Torvalds 	return first;
5141da177e4SLinus Torvalds }
5151da177e4SLinus Torvalds 
5161da177e4SLinus Torvalds /* Apply policy to a single VMA */
5171da177e4SLinus Torvalds static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
5181da177e4SLinus Torvalds {
5191da177e4SLinus Torvalds 	int err = 0;
5201da177e4SLinus Torvalds 	struct mempolicy *old = vma->vm_policy;
5211da177e4SLinus Torvalds 
522140d5a49SPaul Mundt 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
5231da177e4SLinus Torvalds 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
5241da177e4SLinus Torvalds 		 vma->vm_ops, vma->vm_file,
5251da177e4SLinus Torvalds 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
5261da177e4SLinus Torvalds 
5271da177e4SLinus Torvalds 	if (vma->vm_ops && vma->vm_ops->set_policy)
5281da177e4SLinus Torvalds 		err = vma->vm_ops->set_policy(vma, new);
5291da177e4SLinus Torvalds 	if (!err) {
5301da177e4SLinus Torvalds 		mpol_get(new);
5311da177e4SLinus Torvalds 		vma->vm_policy = new;
532f0be3d32SLee Schermerhorn 		mpol_put(old);
5331da177e4SLinus Torvalds 	}
5341da177e4SLinus Torvalds 	return err;
5351da177e4SLinus Torvalds }
5361da177e4SLinus Torvalds 
5371da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. */
5381da177e4SLinus Torvalds static int mbind_range(struct vm_area_struct *vma, unsigned long start,
5391da177e4SLinus Torvalds 		       unsigned long end, struct mempolicy *new)
5401da177e4SLinus Torvalds {
5411da177e4SLinus Torvalds 	struct vm_area_struct *next;
5421da177e4SLinus Torvalds 	int err;
5431da177e4SLinus Torvalds 
5441da177e4SLinus Torvalds 	err = 0;
5451da177e4SLinus Torvalds 	for (; vma && vma->vm_start < end; vma = next) {
5461da177e4SLinus Torvalds 		next = vma->vm_next;
5471da177e4SLinus Torvalds 		if (vma->vm_start < start)
5481da177e4SLinus Torvalds 			err = split_vma(vma->vm_mm, vma, start, 1);
5491da177e4SLinus Torvalds 		if (!err && vma->vm_end > end)
5501da177e4SLinus Torvalds 			err = split_vma(vma->vm_mm, vma, end, 0);
5511da177e4SLinus Torvalds 		if (!err)
5521da177e4SLinus Torvalds 			err = policy_vma(vma, new);
5531da177e4SLinus Torvalds 		if (err)
5541da177e4SLinus Torvalds 			break;
5551da177e4SLinus Torvalds 	}
5561da177e4SLinus Torvalds 	return err;
5571da177e4SLinus Torvalds }
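
/*
 * Example (made-up addresses): if one VMA spans 0x1000-0x5000 and mbind()
 * is applied to 0x2000-0x4000, split_vma() is called at 0x2000 and 0x4000
 * so that only the middle VMA receives the new policy while the outer
 * pieces keep the old one.
 */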
5581da177e4SLinus Torvalds 
559c61afb18SPaul Jackson /*
560c61afb18SPaul Jackson  * Update task->flags PF_MEMPOLICY bit: set iff non-default
561c61afb18SPaul Jackson  * mempolicy.  Allows more rapid checking of this (combined perhaps
562c61afb18SPaul Jackson  * with other PF_* flag bits) on memory allocation hot code paths.
563c61afb18SPaul Jackson  *
564c61afb18SPaul Jackson  * If called from outside this file, the task 'p' should -only- be
565c61afb18SPaul Jackson  * a newly forked child not yet visible on the task list, because
566c61afb18SPaul Jackson  * manipulating the task flags of a visible task is not safe.
567c61afb18SPaul Jackson  *
568c61afb18SPaul Jackson  * The above limitation is why this routine has the funny name
569c61afb18SPaul Jackson  * mpol_fix_fork_child_flag().
570c61afb18SPaul Jackson  *
571c61afb18SPaul Jackson  * It is also safe to call this with a task pointer of current,
572c61afb18SPaul Jackson  * which the static wrapper mpol_set_task_struct_flag() does,
573c61afb18SPaul Jackson  * for use within this file.
574c61afb18SPaul Jackson  */
575c61afb18SPaul Jackson 
576c61afb18SPaul Jackson void mpol_fix_fork_child_flag(struct task_struct *p)
577c61afb18SPaul Jackson {
578c61afb18SPaul Jackson 	if (p->mempolicy)
579c61afb18SPaul Jackson 		p->flags |= PF_MEMPOLICY;
580c61afb18SPaul Jackson 	else
581c61afb18SPaul Jackson 		p->flags &= ~PF_MEMPOLICY;
582c61afb18SPaul Jackson }
583c61afb18SPaul Jackson 
584c61afb18SPaul Jackson static void mpol_set_task_struct_flag(void)
585c61afb18SPaul Jackson {
586c61afb18SPaul Jackson 	mpol_fix_fork_child_flag(current);
587c61afb18SPaul Jackson }
588c61afb18SPaul Jackson 
5891da177e4SLinus Torvalds /* Set the process memory policy */
590028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
591028fec41SDavid Rientjes 			     nodemask_t *nodes)
5921da177e4SLinus Torvalds {
5931da177e4SLinus Torvalds 	struct mempolicy *new;
594*f4e53d91SLee Schermerhorn 	struct mm_struct *mm = current->mm;
5951da177e4SLinus Torvalds 
596028fec41SDavid Rientjes 	new = mpol_new(mode, flags, nodes);
5971da177e4SLinus Torvalds 	if (IS_ERR(new))
5981da177e4SLinus Torvalds 		return PTR_ERR(new);
599*f4e53d91SLee Schermerhorn 
600*f4e53d91SLee Schermerhorn 	/*
601*f4e53d91SLee Schermerhorn 	 * prevent changing our mempolicy while show_numa_maps()
602*f4e53d91SLee Schermerhorn 	 * is using it.
603*f4e53d91SLee Schermerhorn 	 * Note:  do_set_mempolicy() can be called at init time
604*f4e53d91SLee Schermerhorn 	 * with no 'mm'.
605*f4e53d91SLee Schermerhorn 	 */
606*f4e53d91SLee Schermerhorn 	if (mm)
607*f4e53d91SLee Schermerhorn 		down_write(&mm->mmap_sem);
608f0be3d32SLee Schermerhorn 	mpol_put(current->mempolicy);
6091da177e4SLinus Torvalds 	current->mempolicy = new;
610c61afb18SPaul Jackson 	mpol_set_task_struct_flag();
611f5b087b5SDavid Rientjes 	if (new && new->policy == MPOL_INTERLEAVE &&
612f5b087b5SDavid Rientjes 	    nodes_weight(new->v.nodes))
613dfcd3c0dSAndi Kleen 		current->il_next = first_node(new->v.nodes);
614*f4e53d91SLee Schermerhorn 	if (mm)
615*f4e53d91SLee Schermerhorn 		up_write(&mm->mmap_sem);
616*f4e53d91SLee Schermerhorn 
6171da177e4SLinus Torvalds 	return 0;
6181da177e4SLinus Torvalds }
6191da177e4SLinus Torvalds 
6201da177e4SLinus Torvalds /* Fill a zone bitmap for a policy */
621dfcd3c0dSAndi Kleen static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
6221da177e4SLinus Torvalds {
623dfcd3c0dSAndi Kleen 	nodes_clear(*nodes);
6241da177e4SLinus Torvalds 	switch (p->policy) {
6251da177e4SLinus Torvalds 	case MPOL_DEFAULT:
6261da177e4SLinus Torvalds 		break;
62719770b32SMel Gorman 	case MPOL_BIND:
62819770b32SMel Gorman 		/* Fall through */
6291da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
630dfcd3c0dSAndi Kleen 		*nodes = p->v.nodes;
6311da177e4SLinus Torvalds 		break;
6321da177e4SLinus Torvalds 	case MPOL_PREFERRED:
63356bbd65dSChristoph Lameter 		/* or use current node instead of memory_map? */
6341da177e4SLinus Torvalds 		if (p->v.preferred_node < 0)
63556bbd65dSChristoph Lameter 			*nodes = node_states[N_HIGH_MEMORY];
6361da177e4SLinus Torvalds 		else
637dfcd3c0dSAndi Kleen 			node_set(p->v.preferred_node, *nodes);
6381da177e4SLinus Torvalds 		break;
6391da177e4SLinus Torvalds 	default:
6401da177e4SLinus Torvalds 		BUG();
6411da177e4SLinus Torvalds 	}
6421da177e4SLinus Torvalds }
6431da177e4SLinus Torvalds 
6441da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr)
6451da177e4SLinus Torvalds {
6461da177e4SLinus Torvalds 	struct page *p;
6471da177e4SLinus Torvalds 	int err;
6481da177e4SLinus Torvalds 
6491da177e4SLinus Torvalds 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
6501da177e4SLinus Torvalds 	if (err >= 0) {
6511da177e4SLinus Torvalds 		err = page_to_nid(p);
6521da177e4SLinus Torvalds 		put_page(p);
6531da177e4SLinus Torvalds 	}
6541da177e4SLinus Torvalds 	return err;
6551da177e4SLinus Torvalds }
6561da177e4SLinus Torvalds 
6571da177e4SLinus Torvalds /* Retrieve NUMA policy */
658dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
6591da177e4SLinus Torvalds 			     unsigned long addr, unsigned long flags)
6601da177e4SLinus Torvalds {
6618bccd85fSChristoph Lameter 	int err;
6621da177e4SLinus Torvalds 	struct mm_struct *mm = current->mm;
6631da177e4SLinus Torvalds 	struct vm_area_struct *vma = NULL;
6641da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
6651da177e4SLinus Torvalds 
666cf2a473cSPaul Jackson 	cpuset_update_task_memory_state();
667754af6f5SLee Schermerhorn 	if (flags &
668754af6f5SLee Schermerhorn 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
6691da177e4SLinus Torvalds 		return -EINVAL;
670754af6f5SLee Schermerhorn 
671754af6f5SLee Schermerhorn 	if (flags & MPOL_F_MEMS_ALLOWED) {
672754af6f5SLee Schermerhorn 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
673754af6f5SLee Schermerhorn 			return -EINVAL;
674754af6f5SLee Schermerhorn 		*policy = 0;	/* just so it's initialized */
675754af6f5SLee Schermerhorn 		*nmask  = cpuset_current_mems_allowed;
676754af6f5SLee Schermerhorn 		return 0;
677754af6f5SLee Schermerhorn 	}
678754af6f5SLee Schermerhorn 
6791da177e4SLinus Torvalds 	if (flags & MPOL_F_ADDR) {
6801da177e4SLinus Torvalds 		down_read(&mm->mmap_sem);
6811da177e4SLinus Torvalds 		vma = find_vma_intersection(mm, addr, addr+1);
6821da177e4SLinus Torvalds 		if (!vma) {
6831da177e4SLinus Torvalds 			up_read(&mm->mmap_sem);
6841da177e4SLinus Torvalds 			return -EFAULT;
6851da177e4SLinus Torvalds 		}
6861da177e4SLinus Torvalds 		if (vma->vm_ops && vma->vm_ops->get_policy)
6871da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
6881da177e4SLinus Torvalds 		else
6891da177e4SLinus Torvalds 			pol = vma->vm_policy;
6901da177e4SLinus Torvalds 	} else if (addr)
6911da177e4SLinus Torvalds 		return -EINVAL;
6921da177e4SLinus Torvalds 
6931da177e4SLinus Torvalds 	if (!pol)
6941da177e4SLinus Torvalds 		pol = &default_policy;
6951da177e4SLinus Torvalds 
6961da177e4SLinus Torvalds 	if (flags & MPOL_F_NODE) {
6971da177e4SLinus Torvalds 		if (flags & MPOL_F_ADDR) {
6981da177e4SLinus Torvalds 			err = lookup_node(mm, addr);
6991da177e4SLinus Torvalds 			if (err < 0)
7001da177e4SLinus Torvalds 				goto out;
7018bccd85fSChristoph Lameter 			*policy = err;
7021da177e4SLinus Torvalds 		} else if (pol == current->mempolicy &&
7031da177e4SLinus Torvalds 				pol->policy == MPOL_INTERLEAVE) {
7048bccd85fSChristoph Lameter 			*policy = current->il_next;
7051da177e4SLinus Torvalds 		} else {
7061da177e4SLinus Torvalds 			err = -EINVAL;
7071da177e4SLinus Torvalds 			goto out;
7081da177e4SLinus Torvalds 		}
7091da177e4SLinus Torvalds 	} else
710028fec41SDavid Rientjes 		*policy = pol->policy | pol->flags;
7111da177e4SLinus Torvalds 
7121da177e4SLinus Torvalds 	if (vma) {
7131da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
7141da177e4SLinus Torvalds 		vma = NULL;
7151da177e4SLinus Torvalds 	}
7161da177e4SLinus Torvalds 
7171da177e4SLinus Torvalds 	err = 0;
7188bccd85fSChristoph Lameter 	if (nmask)
7198bccd85fSChristoph Lameter 		get_zonemask(pol, nmask);
7201da177e4SLinus Torvalds 
7211da177e4SLinus Torvalds  out:
7221da177e4SLinus Torvalds 	if (vma)
7231da177e4SLinus Torvalds 		up_read(&current->mm->mmap_sem);
7241da177e4SLinus Torvalds 	return err;
7251da177e4SLinus Torvalds }
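
/*
 * Illustrative user-space use (assumes the <numaif.h> wrapper): to ask
 * which node currently backs a given page,
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * returns the node id of the page at addr via the lookup_node() path
 * above; 'addr' is a placeholder for any mapped address.
 */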
7261da177e4SLinus Torvalds 
727b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
7288bccd85fSChristoph Lameter /*
7296ce3c4c0SChristoph Lameter  * page migration
7306ce3c4c0SChristoph Lameter  */
731fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
732fc301289SChristoph Lameter 				unsigned long flags)
7336ce3c4c0SChristoph Lameter {
7346ce3c4c0SChristoph Lameter 	/*
735fc301289SChristoph Lameter 	 * Avoid migrating a page that is shared with others.
7366ce3c4c0SChristoph Lameter 	 */
737b20a3503SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
738b20a3503SChristoph Lameter 		isolate_lru_page(page, pagelist);
7396ce3c4c0SChristoph Lameter }
7406ce3c4c0SChristoph Lameter 
741742755a1SChristoph Lameter static struct page *new_node_page(struct page *page, unsigned long node, int **x)
74295a402c3SChristoph Lameter {
743769848c0SMel Gorman 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
74495a402c3SChristoph Lameter }
74595a402c3SChristoph Lameter 
7466ce3c4c0SChristoph Lameter /*
7477e2ab150SChristoph Lameter  * Migrate pages from one node to a target node.
7487e2ab150SChristoph Lameter  * Returns error or the number of pages not migrated.
7497e2ab150SChristoph Lameter  */
750dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
751dbcb0f19SAdrian Bunk 			   int flags)
7527e2ab150SChristoph Lameter {
7537e2ab150SChristoph Lameter 	nodemask_t nmask;
7547e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
7557e2ab150SChristoph Lameter 	int err = 0;
7567e2ab150SChristoph Lameter 
7577e2ab150SChristoph Lameter 	nodes_clear(nmask);
7587e2ab150SChristoph Lameter 	node_set(source, nmask);
7597e2ab150SChristoph Lameter 
7607e2ab150SChristoph Lameter 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
7617e2ab150SChristoph Lameter 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
7627e2ab150SChristoph Lameter 
7637e2ab150SChristoph Lameter 	if (!list_empty(&pagelist))
76495a402c3SChristoph Lameter 		err = migrate_pages(&pagelist, new_node_page, dest);
76595a402c3SChristoph Lameter 
7667e2ab150SChristoph Lameter 	return err;
7677e2ab150SChristoph Lameter }
7687e2ab150SChristoph Lameter 
7697e2ab150SChristoph Lameter /*
7707e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
7717e2ab150SChristoph Lameter  * layout as much as possible.
77239743889SChristoph Lameter  *
77339743889SChristoph Lameter  * Returns the number of pages that could not be moved.
77439743889SChristoph Lameter  */
77539743889SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
77639743889SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
77739743889SChristoph Lameter {
77839743889SChristoph Lameter 	LIST_HEAD(pagelist);
7797e2ab150SChristoph Lameter 	int busy = 0;
7807e2ab150SChristoph Lameter 	int err = 0;
7817e2ab150SChristoph Lameter 	nodemask_t tmp;
78239743889SChristoph Lameter 
78339743889SChristoph Lameter   	down_read(&mm->mmap_sem);
784d4984711SChristoph Lameter 
7857b2259b3SChristoph Lameter 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
7867b2259b3SChristoph Lameter 	if (err)
7877b2259b3SChristoph Lameter 		goto out;
7887b2259b3SChristoph Lameter 
7897e2ab150SChristoph Lameter /*
7907e2ab150SChristoph Lameter  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
7917e2ab150SChristoph Lameter  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
7927e2ab150SChristoph Lameter  * bit in 'tmp', and return that <source, dest> pair for migration.
7937e2ab150SChristoph Lameter  * The pair of nodemasks 'to' and 'from' define the map.
7947e2ab150SChristoph Lameter  *
7957e2ab150SChristoph Lameter  * If no pair of bits is found that way, fallback to picking some
7967e2ab150SChristoph Lameter  * pair of 'source' and 'dest' bits that are not the same.  If the
7977e2ab150SChristoph Lameter  * 'source' and 'dest' bits are the same, this represents a node
7987e2ab150SChristoph Lameter  * that will be migrating to itself, so no pages need move.
7997e2ab150SChristoph Lameter  *
8007e2ab150SChristoph Lameter  * If no bits are left in 'tmp', or if all remaining bits left
8017e2ab150SChristoph Lameter  * in 'tmp' correspond to the same bit in 'to', return false
8027e2ab150SChristoph Lameter  * (nothing left to migrate).
8037e2ab150SChristoph Lameter  *
8047e2ab150SChristoph Lameter  * This lets us pick a pair of nodes to migrate between, such that
8057e2ab150SChristoph Lameter  * if possible the dest node is not already occupied by some other
8067e2ab150SChristoph Lameter  * source node, minimizing the risk of overloading the memory on a
8077e2ab150SChristoph Lameter  * node that would happen if we migrated incoming memory to a node
8087e2ab150SChristoph Lameter  * before migrating outgoing memory source that same node.
8097e2ab150SChristoph Lameter  *
8107e2ab150SChristoph Lameter  * A single scan of tmp is sufficient.  As we go, we remember the
8117e2ab150SChristoph Lameter  * most recent <s, d> pair that moved (s != d).  If we find a pair
8127e2ab150SChristoph Lameter  * that not only moved, but what's better, moved to an empty slot
8137e2ab150SChristoph Lameter  * (d is not set in tmp), then we break out then, with that pair.
8147e2ab150SChristoph Lameter  * Otherwise when we finish scanning tmp, we at least have the
8157e2ab150SChristoph Lameter  * most recent <s, d> pair that moved.  If we get all the way through
8167e2ab150SChristoph Lameter  * the scan of tmp without finding any node that moved, much less
8177e2ab150SChristoph Lameter  * moved to an empty node, then there is nothing left worth migrating.
8187e2ab150SChristoph Lameter  */
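
/*
 * Worked example (hypothetical masks): from = {0,1}, to = {1,2}, so
 * tmp starts as {0,1}.  First pass: s=0 maps to d=1, but node 1 is still
 * a source, so keep scanning; s=1 maps to d=2, an empty slot, so migrate
 * 1 -> 2 first and clear node 1 from tmp.  Second pass: s=0 maps to d=1,
 * which is now free, so migrate 0 -> 1.  Ordering the moves this way
 * avoids piling two nodes' worth of memory onto node 1 at once.
 */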
8197e2ab150SChristoph Lameter 
8207e2ab150SChristoph Lameter 	tmp = *from_nodes;
8217e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
8227e2ab150SChristoph Lameter 		int s, d;
8237e2ab150SChristoph Lameter 		int source = -1;
8247e2ab150SChristoph Lameter 		int dest = 0;
8257e2ab150SChristoph Lameter 
8267e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
8277e2ab150SChristoph Lameter 			d = node_remap(s, *from_nodes, *to_nodes);
8287e2ab150SChristoph Lameter 			if (s == d)
8297e2ab150SChristoph Lameter 				continue;
8307e2ab150SChristoph Lameter 
8317e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
8327e2ab150SChristoph Lameter 			dest = d;
8337e2ab150SChristoph Lameter 
8347e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
8357e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
8367e2ab150SChristoph Lameter 				break;
8377e2ab150SChristoph Lameter 		}
8387e2ab150SChristoph Lameter 		if (source == -1)
8397e2ab150SChristoph Lameter 			break;
8407e2ab150SChristoph Lameter 
8417e2ab150SChristoph Lameter 		node_clear(source, tmp);
8427e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
8437e2ab150SChristoph Lameter 		if (err > 0)
8447e2ab150SChristoph Lameter 			busy += err;
8457e2ab150SChristoph Lameter 		if (err < 0)
8467e2ab150SChristoph Lameter 			break;
84739743889SChristoph Lameter 	}
8487b2259b3SChristoph Lameter out:
84939743889SChristoph Lameter 	up_read(&mm->mmap_sem);
8507e2ab150SChristoph Lameter 	if (err < 0)
8517e2ab150SChristoph Lameter 		return err;
8527e2ab150SChristoph Lameter 	return busy;
853b20a3503SChristoph Lameter 
85439743889SChristoph Lameter }
85539743889SChristoph Lameter 
8563ad33b24SLee Schermerhorn /*
8573ad33b24SLee Schermerhorn  * Allocate a new page for page migration based on vma policy.
8583ad33b24SLee Schermerhorn  * Start assuming that page is mapped by vma pointed to by @private.
8593ad33b24SLee Schermerhorn  * Search forward from there, if not.  N.B., this assumes that the
8603ad33b24SLee Schermerhorn  * list of pages handed to migrate_pages()--which is how we get here--
8613ad33b24SLee Schermerhorn  * is in virtual address order.
8623ad33b24SLee Schermerhorn  */
863742755a1SChristoph Lameter static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
86495a402c3SChristoph Lameter {
86595a402c3SChristoph Lameter 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
8663ad33b24SLee Schermerhorn 	unsigned long uninitialized_var(address);
86795a402c3SChristoph Lameter 
8683ad33b24SLee Schermerhorn 	while (vma) {
8693ad33b24SLee Schermerhorn 		address = page_address_in_vma(page, vma);
8703ad33b24SLee Schermerhorn 		if (address != -EFAULT)
8713ad33b24SLee Schermerhorn 			break;
8723ad33b24SLee Schermerhorn 		vma = vma->vm_next;
8733ad33b24SLee Schermerhorn 	}
8743ad33b24SLee Schermerhorn 
8753ad33b24SLee Schermerhorn 	/*
8763ad33b24SLee Schermerhorn 	 * if !vma, alloc_page_vma() will use task or system default policy
8773ad33b24SLee Schermerhorn 	 */
8783ad33b24SLee Schermerhorn 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
87995a402c3SChristoph Lameter }
880b20a3503SChristoph Lameter #else
881b20a3503SChristoph Lameter 
882b20a3503SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
883b20a3503SChristoph Lameter 				unsigned long flags)
884b20a3503SChristoph Lameter {
885b20a3503SChristoph Lameter }
886b20a3503SChristoph Lameter 
887b20a3503SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
888b20a3503SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
889b20a3503SChristoph Lameter {
890b20a3503SChristoph Lameter 	return -ENOSYS;
891b20a3503SChristoph Lameter }
89295a402c3SChristoph Lameter 
89369939749SKeith Owens static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
89495a402c3SChristoph Lameter {
89595a402c3SChristoph Lameter 	return NULL;
89695a402c3SChristoph Lameter }
897b20a3503SChristoph Lameter #endif
898b20a3503SChristoph Lameter 
899dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
900028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
901028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
9026ce3c4c0SChristoph Lameter {
9036ce3c4c0SChristoph Lameter 	struct vm_area_struct *vma;
9046ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
9056ce3c4c0SChristoph Lameter 	struct mempolicy *new;
9066ce3c4c0SChristoph Lameter 	unsigned long end;
9076ce3c4c0SChristoph Lameter 	int err;
9086ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
9096ce3c4c0SChristoph Lameter 
910a3b51e01SDavid Rientjes 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
9116ce3c4c0SChristoph Lameter 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
9126ce3c4c0SChristoph Lameter 		return -EINVAL;
91374c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
9146ce3c4c0SChristoph Lameter 		return -EPERM;
9156ce3c4c0SChristoph Lameter 
9166ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
9176ce3c4c0SChristoph Lameter 		return -EINVAL;
9186ce3c4c0SChristoph Lameter 
9196ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
9206ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
9216ce3c4c0SChristoph Lameter 
9226ce3c4c0SChristoph Lameter 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
9236ce3c4c0SChristoph Lameter 	end = start + len;
9246ce3c4c0SChristoph Lameter 
9256ce3c4c0SChristoph Lameter 	if (end < start)
9266ce3c4c0SChristoph Lameter 		return -EINVAL;
9276ce3c4c0SChristoph Lameter 	if (end == start)
9286ce3c4c0SChristoph Lameter 		return 0;
9296ce3c4c0SChristoph Lameter 
930028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
9316ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
9326ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
9336ce3c4c0SChristoph Lameter 
9346ce3c4c0SChristoph Lameter 	/*
9356ce3c4c0SChristoph Lameter 	 * If we are using the default policy then operation
9366ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all
9376ce3c4c0SChristoph Lameter 	 */
9386ce3c4c0SChristoph Lameter 	if (!new)
9396ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
9406ce3c4c0SChristoph Lameter 
941028fec41SDavid Rientjes 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
942028fec41SDavid Rientjes 		 start, start + len, mode, mode_flags,
943028fec41SDavid Rientjes 		 nmask ? nodes_addr(*nmask)[0] : -1);
9446ce3c4c0SChristoph Lameter 
9456ce3c4c0SChristoph Lameter 	down_write(&mm->mmap_sem);
9466ce3c4c0SChristoph Lameter 	vma = check_range(mm, start, end, nmask,
9476ce3c4c0SChristoph Lameter 			  flags | MPOL_MF_INVERT, &pagelist);
9486ce3c4c0SChristoph Lameter 
9496ce3c4c0SChristoph Lameter 	err = PTR_ERR(vma);
9506ce3c4c0SChristoph Lameter 	if (!IS_ERR(vma)) {
9516ce3c4c0SChristoph Lameter 		int nr_failed = 0;
9526ce3c4c0SChristoph Lameter 
9536ce3c4c0SChristoph Lameter 		err = mbind_range(vma, start, end, new);
9547e2ab150SChristoph Lameter 
9556ce3c4c0SChristoph Lameter 		if (!list_empty(&pagelist))
95695a402c3SChristoph Lameter 			nr_failed = migrate_pages(&pagelist, new_vma_page,
95795a402c3SChristoph Lameter 						(unsigned long)vma);
9586ce3c4c0SChristoph Lameter 
9596ce3c4c0SChristoph Lameter 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
9606ce3c4c0SChristoph Lameter 			err = -EIO;
9616ce3c4c0SChristoph Lameter 	}
962b20a3503SChristoph Lameter 
9636ce3c4c0SChristoph Lameter 	up_write(&mm->mmap_sem);
964f0be3d32SLee Schermerhorn 	mpol_put(new);
9656ce3c4c0SChristoph Lameter 	return err;
9666ce3c4c0SChristoph Lameter }
9676ce3c4c0SChristoph Lameter 
96839743889SChristoph Lameter /*
9698bccd85fSChristoph Lameter  * User space interface with variable sized bitmaps for nodelists.
9708bccd85fSChristoph Lameter  */
9718bccd85fSChristoph Lameter 
9728bccd85fSChristoph Lameter /* Copy a node mask from user space. */
97339743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
9748bccd85fSChristoph Lameter 		     unsigned long maxnode)
9758bccd85fSChristoph Lameter {
9768bccd85fSChristoph Lameter 	unsigned long k;
9778bccd85fSChristoph Lameter 	unsigned long nlongs;
9788bccd85fSChristoph Lameter 	unsigned long endmask;
9798bccd85fSChristoph Lameter 
9808bccd85fSChristoph Lameter 	--maxnode;
9818bccd85fSChristoph Lameter 	nodes_clear(*nodes);
9828bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
9838bccd85fSChristoph Lameter 		return 0;
984a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
985636f13c1SChris Wright 		return -EINVAL;
9868bccd85fSChristoph Lameter 
9878bccd85fSChristoph Lameter 	nlongs = BITS_TO_LONGS(maxnode);
9888bccd85fSChristoph Lameter 	if ((maxnode % BITS_PER_LONG) == 0)
9898bccd85fSChristoph Lameter 		endmask = ~0UL;
9908bccd85fSChristoph Lameter 	else
9918bccd85fSChristoph Lameter 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
9928bccd85fSChristoph Lameter 
9938bccd85fSChristoph Lameter 	/* When the user specified more nodes than supported, just check
9948bccd85fSChristoph Lameter 	   that the unsupported part is all zero. */
9958bccd85fSChristoph Lameter 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
9968bccd85fSChristoph Lameter 		if (nlongs > PAGE_SIZE/sizeof(long))
9978bccd85fSChristoph Lameter 			return -EINVAL;
9988bccd85fSChristoph Lameter 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
9998bccd85fSChristoph Lameter 			unsigned long t;
10008bccd85fSChristoph Lameter 			if (get_user(t, nmask + k))
10018bccd85fSChristoph Lameter 				return -EFAULT;
10028bccd85fSChristoph Lameter 			if (k == nlongs - 1) {
10038bccd85fSChristoph Lameter 				if (t & endmask)
10048bccd85fSChristoph Lameter 					return -EINVAL;
10058bccd85fSChristoph Lameter 			} else if (t)
10068bccd85fSChristoph Lameter 				return -EINVAL;
10078bccd85fSChristoph Lameter 		}
10088bccd85fSChristoph Lameter 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
10098bccd85fSChristoph Lameter 		endmask = ~0UL;
10108bccd85fSChristoph Lameter 	}
10118bccd85fSChristoph Lameter 
10128bccd85fSChristoph Lameter 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
10138bccd85fSChristoph Lameter 		return -EFAULT;
10148bccd85fSChristoph Lameter 	nodes_addr(*nodes)[nlongs-1] &= endmask;
10158bccd85fSChristoph Lameter 	return 0;
10168bccd85fSChristoph Lameter }
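
/*
 * Worked example (hypothetical call): a caller passing maxnode == 5 leaves
 * 4 significant bits after the --maxnode above, so nlongs == 1 and
 * endmask == 0xf; only nodes 0-3 survive the final masking of the last
 * long, and the excess-bits check applies only when the caller described
 * more longs than BITS_TO_LONGS(MAX_NUMNODES).
 */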
10178bccd85fSChristoph Lameter 
10188bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
10198bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
10208bccd85fSChristoph Lameter 			      nodemask_t *nodes)
10218bccd85fSChristoph Lameter {
10228bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
10238bccd85fSChristoph Lameter 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
10248bccd85fSChristoph Lameter 
10258bccd85fSChristoph Lameter 	if (copy > nbytes) {
10268bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
10278bccd85fSChristoph Lameter 			return -EINVAL;
10288bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
10298bccd85fSChristoph Lameter 			return -EFAULT;
10308bccd85fSChristoph Lameter 		copy = nbytes;
10318bccd85fSChristoph Lameter 	}
10328bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
10338bccd85fSChristoph Lameter }
10348bccd85fSChristoph Lameter 
10358bccd85fSChristoph Lameter asmlinkage long sys_mbind(unsigned long start, unsigned long len,
10368bccd85fSChristoph Lameter 			unsigned long mode,
10378bccd85fSChristoph Lameter 			unsigned long __user *nmask, unsigned long maxnode,
10388bccd85fSChristoph Lameter 			unsigned flags)
10398bccd85fSChristoph Lameter {
10408bccd85fSChristoph Lameter 	nodemask_t nodes;
10418bccd85fSChristoph Lameter 	int err;
1042028fec41SDavid Rientjes 	unsigned short mode_flags;
10438bccd85fSChristoph Lameter 
1044028fec41SDavid Rientjes 	mode_flags = mode & MPOL_MODE_FLAGS;
1045028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1046a3b51e01SDavid Rientjes 	if (mode >= MPOL_MAX)
1047a3b51e01SDavid Rientjes 		return -EINVAL;
10484c50bc01SDavid Rientjes 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
10494c50bc01SDavid Rientjes 	    (mode_flags & MPOL_F_RELATIVE_NODES))
10504c50bc01SDavid Rientjes 		return -EINVAL;
10518bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10528bccd85fSChristoph Lameter 	if (err)
10538bccd85fSChristoph Lameter 		return err;
1054028fec41SDavid Rientjes 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
10558bccd85fSChristoph Lameter }
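/*
 * Illustrative use: user space may OR optional mode flags into the mode
 * argument, e.g. mbind(addr, len, MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
 * &mask, maxnode, MPOL_MF_MOVE).  The flags are split off above before
 * the mode value itself is range-checked; asking for both
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES at once is rejected
 * with -EINVAL.
 */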
10568bccd85fSChristoph Lameter 
10578bccd85fSChristoph Lameter /* Set the process memory policy */
10588bccd85fSChristoph Lameter asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
10598bccd85fSChristoph Lameter 		unsigned long maxnode)
10608bccd85fSChristoph Lameter {
10618bccd85fSChristoph Lameter 	int err;
10628bccd85fSChristoph Lameter 	nodemask_t nodes;
1063028fec41SDavid Rientjes 	unsigned short flags;
10648bccd85fSChristoph Lameter 
1065028fec41SDavid Rientjes 	flags = mode & MPOL_MODE_FLAGS;
1066028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1067028fec41SDavid Rientjes 	if ((unsigned int)mode >= MPOL_MAX)
10688bccd85fSChristoph Lameter 		return -EINVAL;
10694c50bc01SDavid Rientjes 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
10704c50bc01SDavid Rientjes 		return -EINVAL;
10718bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10728bccd85fSChristoph Lameter 	if (err)
10738bccd85fSChristoph Lameter 		return err;
1074028fec41SDavid Rientjes 	return do_set_mempolicy(mode, flags, &nodes);
10758bccd85fSChristoph Lameter }
10768bccd85fSChristoph Lameter 
107739743889SChristoph Lameter asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
107839743889SChristoph Lameter 		const unsigned long __user *old_nodes,
107939743889SChristoph Lameter 		const unsigned long __user *new_nodes)
108039743889SChristoph Lameter {
108139743889SChristoph Lameter 	struct mm_struct *mm;
108239743889SChristoph Lameter 	struct task_struct *task;
108339743889SChristoph Lameter 	nodemask_t old;
108439743889SChristoph Lameter 	nodemask_t new;
108539743889SChristoph Lameter 	nodemask_t task_nodes;
108639743889SChristoph Lameter 	int err;
108739743889SChristoph Lameter 
108839743889SChristoph Lameter 	err = get_nodes(&old, old_nodes, maxnode);
108939743889SChristoph Lameter 	if (err)
109039743889SChristoph Lameter 		return err;
109139743889SChristoph Lameter 
109239743889SChristoph Lameter 	err = get_nodes(&new, new_nodes, maxnode);
109339743889SChristoph Lameter 	if (err)
109439743889SChristoph Lameter 		return err;
109539743889SChristoph Lameter 
109639743889SChristoph Lameter 	/* Find the mm_struct */
109739743889SChristoph Lameter 	read_lock(&tasklist_lock);
1098228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
109939743889SChristoph Lameter 	if (!task) {
110039743889SChristoph Lameter 		read_unlock(&tasklist_lock);
110139743889SChristoph Lameter 		return -ESRCH;
110239743889SChristoph Lameter 	}
110339743889SChristoph Lameter 	mm = get_task_mm(task);
110439743889SChristoph Lameter 	read_unlock(&tasklist_lock);
110539743889SChristoph Lameter 
110639743889SChristoph Lameter 	if (!mm)
110739743889SChristoph Lameter 		return -EINVAL;
110839743889SChristoph Lameter 
110939743889SChristoph Lameter 	/*
111039743889SChristoph Lameter 	 * Check if this process has the right to modify the specified
111139743889SChristoph Lameter 	 * process. The right exists if the process has administrative
11127f927fccSAlexey Dobriyan 	 * capabilities, superuser privileges or the same
111339743889SChristoph Lameter 	 * userid as the target process.
111439743889SChristoph Lameter 	 */
111539743889SChristoph Lameter 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
111639743889SChristoph Lameter 	    (current->uid != task->suid) && (current->uid != task->uid) &&
111774c00241SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
111839743889SChristoph Lameter 		err = -EPERM;
111939743889SChristoph Lameter 		goto out;
112039743889SChristoph Lameter 	}
112139743889SChristoph Lameter 
112239743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
112339743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
112474c00241SChristoph Lameter 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
112539743889SChristoph Lameter 		err = -EPERM;
112639743889SChristoph Lameter 		goto out;
112739743889SChristoph Lameter 	}
112839743889SChristoph Lameter 
112937b07e41SLee Schermerhorn 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
11303b42d28bSChristoph Lameter 		err = -EINVAL;
11313b42d28bSChristoph Lameter 		goto out;
11323b42d28bSChristoph Lameter 	}
11333b42d28bSChristoph Lameter 
113486c3a764SDavid Quigley 	err = security_task_movememory(task);
113586c3a764SDavid Quigley 	if (err)
113686c3a764SDavid Quigley 		goto out;
113786c3a764SDavid Quigley 
1138511030bcSChristoph Lameter 	err = do_migrate_pages(mm, &old, &new,
113974c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
114039743889SChristoph Lameter out:
114139743889SChristoph Lameter 	mmput(mm);
114239743889SChristoph Lameter 	return err;
114339743889SChristoph Lameter }
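/*
 * Example scenario (illustrative): an unprivileged task may migrate its
 * own pages (pid == 0) between nodes of its cpuset, but moving another
 * user's task, targeting nodes outside that task's cpuset, or using
 * MPOL_MF_MOVE_ALL (which also moves shared pages) each require
 * CAP_SYS_NICE, as checked above.
 */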
114439743889SChristoph Lameter 
114539743889SChristoph Lameter 
11468bccd85fSChristoph Lameter /* Retrieve NUMA policy */
11478bccd85fSChristoph Lameter asmlinkage long sys_get_mempolicy(int __user *policy,
11488bccd85fSChristoph Lameter 				unsigned long __user *nmask,
11498bccd85fSChristoph Lameter 				unsigned long maxnode,
11508bccd85fSChristoph Lameter 				unsigned long addr, unsigned long flags)
11518bccd85fSChristoph Lameter {
1152dbcb0f19SAdrian Bunk 	int err;
1153dbcb0f19SAdrian Bunk 	int uninitialized_var(pval);
11548bccd85fSChristoph Lameter 	nodemask_t nodes;
11558bccd85fSChristoph Lameter 
11568bccd85fSChristoph Lameter 	if (nmask != NULL && maxnode < MAX_NUMNODES)
11578bccd85fSChristoph Lameter 		return -EINVAL;
11588bccd85fSChristoph Lameter 
11598bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
11608bccd85fSChristoph Lameter 
11618bccd85fSChristoph Lameter 	if (err)
11628bccd85fSChristoph Lameter 		return err;
11638bccd85fSChristoph Lameter 
11648bccd85fSChristoph Lameter 	if (policy && put_user(pval, policy))
11658bccd85fSChristoph Lameter 		return -EFAULT;
11668bccd85fSChristoph Lameter 
11678bccd85fSChristoph Lameter 	if (nmask)
11688bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
11698bccd85fSChristoph Lameter 
11708bccd85fSChristoph Lameter 	return err;
11718bccd85fSChristoph Lameter }
11728bccd85fSChristoph Lameter 
11731da177e4SLinus Torvalds #ifdef CONFIG_COMPAT
11741da177e4SLinus Torvalds 
11751da177e4SLinus Torvalds asmlinkage long compat_sys_get_mempolicy(int __user *policy,
11761da177e4SLinus Torvalds 				     compat_ulong_t __user *nmask,
11771da177e4SLinus Torvalds 				     compat_ulong_t maxnode,
11781da177e4SLinus Torvalds 				     compat_ulong_t addr, compat_ulong_t flags)
11791da177e4SLinus Torvalds {
11801da177e4SLinus Torvalds 	long err;
11811da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
11821da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
11831da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
11841da177e4SLinus Torvalds 
11851da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
11861da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
11871da177e4SLinus Torvalds 
11881da177e4SLinus Torvalds 	if (nmask)
11891da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
11901da177e4SLinus Torvalds 
11911da177e4SLinus Torvalds 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
11921da177e4SLinus Torvalds 
11931da177e4SLinus Torvalds 	if (!err && nmask) {
11941da177e4SLinus Torvalds 		err = copy_from_user(bm, nm, alloc_size);
11951da177e4SLinus Torvalds 		/* ensure entire bitmap is zeroed */
11961da177e4SLinus Torvalds 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
11971da177e4SLinus Torvalds 		err |= compat_put_bitmap(nmask, bm, nr_bits);
11981da177e4SLinus Torvalds 	}
11991da177e4SLinus Torvalds 
12001da177e4SLinus Torvalds 	return err;
12011da177e4SLinus Torvalds }
12021da177e4SLinus Torvalds 
12031da177e4SLinus Torvalds asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
12041da177e4SLinus Torvalds 				     compat_ulong_t maxnode)
12051da177e4SLinus Torvalds {
12061da177e4SLinus Torvalds 	long err = 0;
12071da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
12081da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
12091da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
12101da177e4SLinus Torvalds 
12111da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
12121da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12131da177e4SLinus Torvalds 
12141da177e4SLinus Torvalds 	if (nmask) {
12151da177e4SLinus Torvalds 		err = compat_get_bitmap(bm, nmask, nr_bits);
12161da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
12171da177e4SLinus Torvalds 		err |= copy_to_user(nm, bm, alloc_size);
12181da177e4SLinus Torvalds 	}
12191da177e4SLinus Torvalds 
12201da177e4SLinus Torvalds 	if (err)
12211da177e4SLinus Torvalds 		return -EFAULT;
12221da177e4SLinus Torvalds 
12231da177e4SLinus Torvalds 	return sys_set_mempolicy(mode, nm, nr_bits+1);
12241da177e4SLinus Torvalds }
12251da177e4SLinus Torvalds 
12261da177e4SLinus Torvalds asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
12271da177e4SLinus Torvalds 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
12281da177e4SLinus Torvalds 			     compat_ulong_t maxnode, compat_ulong_t flags)
12291da177e4SLinus Torvalds {
12301da177e4SLinus Torvalds 	long err = 0;
12311da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
12321da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
1233dfcd3c0dSAndi Kleen 	nodemask_t bm;
12341da177e4SLinus Torvalds 
12351da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
12361da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12371da177e4SLinus Torvalds 
12381da177e4SLinus Torvalds 	if (nmask) {
1239dfcd3c0dSAndi Kleen 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
12401da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
1241dfcd3c0dSAndi Kleen 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
12421da177e4SLinus Torvalds 	}
12431da177e4SLinus Torvalds 
12441da177e4SLinus Torvalds 	if (err)
12451da177e4SLinus Torvalds 		return -EFAULT;
12461da177e4SLinus Torvalds 
12471da177e4SLinus Torvalds 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
12481da177e4SLinus Torvalds }
12491da177e4SLinus Torvalds 
12501da177e4SLinus Torvalds #endif
12511da177e4SLinus Torvalds 
1252480eccf9SLee Schermerhorn /*
1253480eccf9SLee Schermerhorn  * get_vma_policy(@task, @vma, @addr)
1254480eccf9SLee Schermerhorn  * @task - task for fallback if vma policy == default
1255480eccf9SLee Schermerhorn  * @vma   - virtual memory area whose policy is sought
1256480eccf9SLee Schermerhorn  * @addr  - address in @vma for shared policy lookup
1257480eccf9SLee Schermerhorn  *
1258480eccf9SLee Schermerhorn  * Returns effective policy for a VMA at specified address.
1259480eccf9SLee Schermerhorn  * Falls back to @task or system default policy, as necessary.
1260480eccf9SLee Schermerhorn  * The returned policy holds an extra reference if it is a shared,
1261480eccf9SLee Schermerhorn  * vma, or some other task's policy [show_numa_maps() can pass
1262480eccf9SLee Schermerhorn  * @task != current].  It is the caller's responsibility to
1263480eccf9SLee Schermerhorn  * drop that reference in these cases.
1264480eccf9SLee Schermerhorn  */
126548fce342SChristoph Lameter static struct mempolicy * get_vma_policy(struct task_struct *task,
126648fce342SChristoph Lameter 		struct vm_area_struct *vma, unsigned long addr)
12671da177e4SLinus Torvalds {
12686e21c8f1SChristoph Lameter 	struct mempolicy *pol = task->mempolicy;
1269480eccf9SLee Schermerhorn 	int shared_pol = 0;
12701da177e4SLinus Torvalds 
12711da177e4SLinus Torvalds 	if (vma) {
1272480eccf9SLee Schermerhorn 		if (vma->vm_ops && vma->vm_ops->get_policy) {
12731da177e4SLinus Torvalds 			pol = vma->vm_ops->get_policy(vma, addr);
1274480eccf9SLee Schermerhorn 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1275480eccf9SLee Schermerhorn 		} else if (vma->vm_policy &&
12761da177e4SLinus Torvalds 				vma->vm_policy->policy != MPOL_DEFAULT)
12771da177e4SLinus Torvalds 			pol = vma->vm_policy;
12781da177e4SLinus Torvalds 	}
12791da177e4SLinus Torvalds 	if (!pol)
12801da177e4SLinus Torvalds 		pol = &default_policy;
1281480eccf9SLee Schermerhorn 	else if (!shared_pol && pol != current->mempolicy)
1282480eccf9SLee Schermerhorn 		mpol_get(pol);	/* vma or other task's policy */
12831da177e4SLinus Torvalds 	return pol;
12841da177e4SLinus Torvalds }
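/*
 * Illustrative precedence: for a shmem mapping the object's
 * vm_ops->get_policy() result is used first, otherwise a non-default
 * vma->vm_policy installed by mbind(), otherwise the task policy from
 * set_mempolicy(), and finally the system-wide default_policy.
 */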
12851da177e4SLinus Torvalds 
128619770b32SMel Gorman /* Return a nodemask representing a mempolicy */
128719770b32SMel Gorman static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
128819770b32SMel Gorman {
128919770b32SMel Gorman 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
129019770b32SMel Gorman 	if (unlikely(policy->policy == MPOL_BIND) &&
129119770b32SMel Gorman 			gfp_zone(gfp) >= policy_zone &&
129219770b32SMel Gorman 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
129319770b32SMel Gorman 		return &policy->v.nodes;
129419770b32SMel Gorman 
129519770b32SMel Gorman 	return NULL;
129619770b32SMel Gorman }
129719770b32SMel Gorman 
12981da177e4SLinus Torvalds /* Return a zonelist representing a mempolicy */
1299dd0fc66fSAl Viro static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
13001da177e4SLinus Torvalds {
13011da177e4SLinus Torvalds 	int nd;
13021da177e4SLinus Torvalds 
13031da177e4SLinus Torvalds 	switch (policy->policy) {
13041da177e4SLinus Torvalds 	case MPOL_PREFERRED:
13051da177e4SLinus Torvalds 		nd = policy->v.preferred_node;
13061da177e4SLinus Torvalds 		if (nd < 0)
13071da177e4SLinus Torvalds 			nd = numa_node_id();
13081da177e4SLinus Torvalds 		break;
13091da177e4SLinus Torvalds 	case MPOL_BIND:
131019770b32SMel Gorman 		/*
131119770b32SMel Gorman 		 * Normally, MPOL_BIND allocations are node-local within the
131219770b32SMel Gorman 		 * allowed nodemask. However, if __GFP_THISNODE is set and the
131319770b32SMel Gorman 		 * current node is not part of the mask, we use the zonelist
131419770b32SMel Gorman 		 * for the first node in the mask instead.
131519770b32SMel Gorman 		 */
131619770b32SMel Gorman 		nd = numa_node_id();
131719770b32SMel Gorman 		if (unlikely(gfp & __GFP_THISNODE) &&
131819770b32SMel Gorman 				unlikely(!node_isset(nd, policy->v.nodes)))
131919770b32SMel Gorman 			nd = first_node(policy->v.nodes);
132019770b32SMel Gorman 		break;
13211da177e4SLinus Torvalds 	case MPOL_INTERLEAVE: /* should not happen */
13221da177e4SLinus Torvalds 	case MPOL_DEFAULT:
13231da177e4SLinus Torvalds 		nd = numa_node_id();
13241da177e4SLinus Torvalds 		break;
13251da177e4SLinus Torvalds 	default:
13261da177e4SLinus Torvalds 		nd = 0;
13271da177e4SLinus Torvalds 		BUG();
13281da177e4SLinus Torvalds 	}
13290e88460dSMel Gorman 	return node_zonelist(nd, gfp);
13301da177e4SLinus Torvalds }
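/*
 * Worked example (illustrative): an MPOL_BIND policy over nodes {0,1}
 * evaluated on node 2 with __GFP_THISNODE set returns node 0's zonelist
 * (the first node of the mask); without __GFP_THISNODE the local node 2
 * zonelist is returned and the bind mask is applied via the nodemask.
 */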
13311da177e4SLinus Torvalds 
13321da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
13331da177e4SLinus Torvalds static unsigned interleave_nodes(struct mempolicy *policy)
13341da177e4SLinus Torvalds {
13351da177e4SLinus Torvalds 	unsigned nid, next;
13361da177e4SLinus Torvalds 	struct task_struct *me = current;
13371da177e4SLinus Torvalds 
13381da177e4SLinus Torvalds 	nid = me->il_next;
1339dfcd3c0dSAndi Kleen 	next = next_node(nid, policy->v.nodes);
13401da177e4SLinus Torvalds 	if (next >= MAX_NUMNODES)
1341dfcd3c0dSAndi Kleen 		next = first_node(policy->v.nodes);
1342f5b087b5SDavid Rientjes 	if (next < MAX_NUMNODES)
13431da177e4SLinus Torvalds 		me->il_next = next;
13441da177e4SLinus Torvalds 	return nid;
13451da177e4SLinus Torvalds }
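/*
 * Worked example (illustrative): with policy nodes {0,2,3} and
 * il_next == 3, this returns 3 and wraps il_next back to 0 via
 * first_node(); the next call then returns 0 and advances to 2.
 */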
13461da177e4SLinus Torvalds 
1347dc85da15SChristoph Lameter /*
1348dc85da15SChristoph Lameter  * Depending on the memory policy provide a node from which to allocate the
1349dc85da15SChristoph Lameter  * next slab entry.
1350dc85da15SChristoph Lameter  */
1351dc85da15SChristoph Lameter unsigned slab_node(struct mempolicy *policy)
1352dc85da15SChristoph Lameter {
1353a3b51e01SDavid Rientjes 	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1354765c4507SChristoph Lameter 
1355765c4507SChristoph Lameter 	switch (pol) {
1356dc85da15SChristoph Lameter 	case MPOL_INTERLEAVE:
1357dc85da15SChristoph Lameter 		return interleave_nodes(policy);
1358dc85da15SChristoph Lameter 
1359dd1a239fSMel Gorman 	case MPOL_BIND: {
1360dc85da15SChristoph Lameter 		/*
1361dc85da15SChristoph Lameter 		 * Follow bind policy behavior and start allocation at the
1362dc85da15SChristoph Lameter 		 * first node.
1363dc85da15SChristoph Lameter 		 */
136419770b32SMel Gorman 		struct zonelist *zonelist;
136519770b32SMel Gorman 		struct zone *zone;
136619770b32SMel Gorman 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
136719770b32SMel Gorman 		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
136819770b32SMel Gorman 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
136919770b32SMel Gorman 							&policy->v.nodes,
137019770b32SMel Gorman 							&zone);
137119770b32SMel Gorman 		return zone->node;
1372dd1a239fSMel Gorman 	}
1373dc85da15SChristoph Lameter 
1374dc85da15SChristoph Lameter 	case MPOL_PREFERRED:
1375dc85da15SChristoph Lameter 		if (policy->v.preferred_node >= 0)
1376dc85da15SChristoph Lameter 			return policy->v.preferred_node;
1377dc85da15SChristoph Lameter 		/* Fall through */
1378dc85da15SChristoph Lameter 
1379dc85da15SChristoph Lameter 	default:
1380dc85da15SChristoph Lameter 		return numa_node_id();
1381dc85da15SChristoph Lameter 	}
1382dc85da15SChristoph Lameter }
1383dc85da15SChristoph Lameter 
13841da177e4SLinus Torvalds /* Do static interleaving for a VMA with known offset. */
13851da177e4SLinus Torvalds static unsigned offset_il_node(struct mempolicy *pol,
13861da177e4SLinus Torvalds 		struct vm_area_struct *vma, unsigned long off)
13871da177e4SLinus Torvalds {
1388dfcd3c0dSAndi Kleen 	unsigned nnodes = nodes_weight(pol->v.nodes);
1389f5b087b5SDavid Rientjes 	unsigned target;
13901da177e4SLinus Torvalds 	int c;
13911da177e4SLinus Torvalds 	int nid = -1;
13921da177e4SLinus Torvalds 
1393f5b087b5SDavid Rientjes 	if (!nnodes)
1394f5b087b5SDavid Rientjes 		return numa_node_id();
1395f5b087b5SDavid Rientjes 	target = (unsigned int)off % nnodes;
13961da177e4SLinus Torvalds 	c = 0;
13971da177e4SLinus Torvalds 	do {
1398dfcd3c0dSAndi Kleen 		nid = next_node(nid, pol->v.nodes);
13991da177e4SLinus Torvalds 		c++;
14001da177e4SLinus Torvalds 	} while (c <= target);
14011da177e4SLinus Torvalds 	return nid;
14021da177e4SLinus Torvalds }
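/*
 * Worked example (illustrative): for an interleave mask of {1,3,5} and
 * off == 7, nnodes == 3 and target == 7 % 3 == 1, so the walk stops at
 * the second node of the mask and node 3 is returned.
 */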
14031da177e4SLinus Torvalds 
14045da7ca86SChristoph Lameter /* Determine a node number for interleave */
14055da7ca86SChristoph Lameter static inline unsigned interleave_nid(struct mempolicy *pol,
14065da7ca86SChristoph Lameter 		 struct vm_area_struct *vma, unsigned long addr, int shift)
14075da7ca86SChristoph Lameter {
14085da7ca86SChristoph Lameter 	if (vma) {
14095da7ca86SChristoph Lameter 		unsigned long off;
14105da7ca86SChristoph Lameter 
14113b98b087SNishanth Aravamudan 		/*
14123b98b087SNishanth Aravamudan 		 * for small pages, there is no difference between
14133b98b087SNishanth Aravamudan 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
14143b98b087SNishanth Aravamudan 		 * for huge pages, since vm_pgoff is in units of small
14153b98b087SNishanth Aravamudan 		 * pages, we need to shift off the always 0 bits to get
14163b98b087SNishanth Aravamudan 		 * a useful offset.
14173b98b087SNishanth Aravamudan 		 */
14183b98b087SNishanth Aravamudan 		BUG_ON(shift < PAGE_SHIFT);
14193b98b087SNishanth Aravamudan 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
14205da7ca86SChristoph Lameter 		off += (addr - vma->vm_start) >> shift;
14215da7ca86SChristoph Lameter 		return offset_il_node(pol, vma, off);
14225da7ca86SChristoph Lameter 	} else
14235da7ca86SChristoph Lameter 		return interleave_nodes(pol);
14245da7ca86SChristoph Lameter }
14255da7ca86SChristoph Lameter 
142600ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
1427480eccf9SLee Schermerhorn /*
1428480eccf9SLee Schermerhorn  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1429480eccf9SLee Schermerhorn  * @vma = virtual memory area whose policy is sought
1430480eccf9SLee Schermerhorn  * @addr = address in @vma for shared policy lookup and interleave policy
1431480eccf9SLee Schermerhorn  * @gfp_flags = for requested zone
143219770b32SMel Gorman  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
143319770b32SMel Gorman  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1434480eccf9SLee Schermerhorn  *
1435480eccf9SLee Schermerhorn  * Returns a zonelist suitable for a huge page allocation.
143619770b32SMel Gorman  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
143719770b32SMel Gorman  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1438480eccf9SLee Schermerhorn  * If it is also a policy for which get_vma_policy() returns an extra
143919770b32SMel Gorman  * reference, we must hold that reference until after the allocation.
1440480eccf9SLee Schermerhorn  * In that case, return policy via @mpol so hugetlb allocation can drop
1441480eccf9SLee Schermerhorn  * the reference. For non-'BIND referenced policies, we can/do drop the
1442480eccf9SLee Schermerhorn  * reference here, so the caller doesn't need to know about the special case
1443480eccf9SLee Schermerhorn  * for default and current task policy.
1444480eccf9SLee Schermerhorn  */
1445396faf03SMel Gorman struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
144619770b32SMel Gorman 				gfp_t gfp_flags, struct mempolicy **mpol,
144719770b32SMel Gorman 				nodemask_t **nodemask)
14485da7ca86SChristoph Lameter {
14495da7ca86SChristoph Lameter 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1450480eccf9SLee Schermerhorn 	struct zonelist *zl;
14515da7ca86SChristoph Lameter 
1452480eccf9SLee Schermerhorn 	*mpol = NULL;		/* probably no unref needed */
145319770b32SMel Gorman 	*nodemask = NULL;	/* assume !MPOL_BIND */
145419770b32SMel Gorman 	if (pol->policy == MPOL_BIND) {
145519770b32SMel Gorman 			*nodemask = &pol->v.nodes;
145619770b32SMel Gorman 	} else if (pol->policy == MPOL_INTERLEAVE) {
14575da7ca86SChristoph Lameter 		unsigned nid;
14585da7ca86SChristoph Lameter 
14595da7ca86SChristoph Lameter 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
146069682d85SLee Schermerhorn 		if (unlikely(pol != &default_policy &&
146169682d85SLee Schermerhorn 				pol != current->mempolicy))
1462f0be3d32SLee Schermerhorn 			__mpol_put(pol);	/* finished with pol */
14630e88460dSMel Gorman 		return node_zonelist(nid, gfp_flags);
14645da7ca86SChristoph Lameter 	}
1465480eccf9SLee Schermerhorn 
1466480eccf9SLee Schermerhorn 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1467480eccf9SLee Schermerhorn 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1468480eccf9SLee Schermerhorn 		if (pol->policy != MPOL_BIND)
1469f0be3d32SLee Schermerhorn 			__mpol_put(pol);	/* finished with pol */
1470480eccf9SLee Schermerhorn 		else
1471480eccf9SLee Schermerhorn 			*mpol = pol;	/* unref needed after allocation */
1472480eccf9SLee Schermerhorn 	}
1473480eccf9SLee Schermerhorn 	return zl;
14745da7ca86SChristoph Lameter }
147500ac59adSChen, Kenneth W #endif
14765da7ca86SChristoph Lameter 
14771da177e4SLinus Torvalds /* Allocate a page in interleaved policy.
14781da177e4SLinus Torvalds    Own path because it needs to do special accounting. */
1479662f3a0bSAndi Kleen static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1480662f3a0bSAndi Kleen 					unsigned nid)
14811da177e4SLinus Torvalds {
14821da177e4SLinus Torvalds 	struct zonelist *zl;
14831da177e4SLinus Torvalds 	struct page *page;
14841da177e4SLinus Torvalds 
14850e88460dSMel Gorman 	zl = node_zonelist(nid, gfp);
14861da177e4SLinus Torvalds 	page = __alloc_pages(gfp, order, zl);
1487dd1a239fSMel Gorman 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1488ca889e6cSChristoph Lameter 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
14891da177e4SLinus Torvalds 	return page;
14901da177e4SLinus Torvalds }
14911da177e4SLinus Torvalds 
14921da177e4SLinus Torvalds /**
14931da177e4SLinus Torvalds  * 	alloc_page_vma	- Allocate a page for a VMA.
14941da177e4SLinus Torvalds  *
14951da177e4SLinus Torvalds  * 	@gfp:
14961da177e4SLinus Torvalds  *      %GFP_USER    user allocation.
14971da177e4SLinus Torvalds  *      %GFP_KERNEL  kernel allocations,
14981da177e4SLinus Torvalds  *      %GFP_HIGHMEM highmem/user allocations,
14991da177e4SLinus Torvalds  *      %GFP_FS      allocation should not call back into a file system.
15001da177e4SLinus Torvalds  *      %GFP_ATOMIC  don't sleep.
15011da177e4SLinus Torvalds  *
15021da177e4SLinus Torvalds  * 	@vma:  Pointer to VMA or NULL if not available.
15031da177e4SLinus Torvalds  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
15041da177e4SLinus Torvalds  *
15051da177e4SLinus Torvalds  * 	This function allocates a page from the kernel page pool and applies
15061da177e4SLinus Torvalds  *	a NUMA policy associated with the VMA or the current process.
15071da177e4SLinus Torvalds  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
15081da177e4SLinus Torvalds  *	mm_struct of the VMA to prevent it from going away. Should be used for
15091da177e4SLinus Torvalds  *	all allocations for pages that will be mapped into
15101da177e4SLinus Torvalds  * 	user space. Returns NULL when no page can be allocated.
15111da177e4SLinus Torvalds  *
15121da177e4SLinus Torvalds  *	Should be called with the mmap_sem of the vma held.
15131da177e4SLinus Torvalds  */
15141da177e4SLinus Torvalds struct page *
1515dd0fc66fSAl Viro alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
15161da177e4SLinus Torvalds {
15176e21c8f1SChristoph Lameter 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1518480eccf9SLee Schermerhorn 	struct zonelist *zl;
15191da177e4SLinus Torvalds 
1520cf2a473cSPaul Jackson 	cpuset_update_task_memory_state();
15211da177e4SLinus Torvalds 
15221da177e4SLinus Torvalds 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
15231da177e4SLinus Torvalds 		unsigned nid;
15245da7ca86SChristoph Lameter 
15255da7ca86SChristoph Lameter 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
152669682d85SLee Schermerhorn 		if (unlikely(pol != &default_policy &&
152769682d85SLee Schermerhorn 				pol != current->mempolicy))
1528f0be3d32SLee Schermerhorn 			__mpol_put(pol);	/* finished with pol */
15291da177e4SLinus Torvalds 		return alloc_page_interleave(gfp, 0, nid);
15301da177e4SLinus Torvalds 	}
1531480eccf9SLee Schermerhorn 	zl = zonelist_policy(gfp, pol);
1532480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy) {
1533480eccf9SLee Schermerhorn 		/*
1534480eccf9SLee Schermerhorn 		 * slow path: ref counted policy -- shared or vma
1535480eccf9SLee Schermerhorn 		 */
153619770b32SMel Gorman 		struct page *page =  __alloc_pages_nodemask(gfp, 0,
153719770b32SMel Gorman 						zl, nodemask_policy(gfp, pol));
1538f0be3d32SLee Schermerhorn 		__mpol_put(pol);
1539480eccf9SLee Schermerhorn 		return page;
1540480eccf9SLee Schermerhorn 	}
1541480eccf9SLee Schermerhorn 	/*
1542480eccf9SLee Schermerhorn 	 * fast path:  default or task policy
1543480eccf9SLee Schermerhorn 	 */
154419770b32SMel Gorman 	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
15451da177e4SLinus Torvalds }
15461da177e4SLinus Torvalds 
15471da177e4SLinus Torvalds /**
15481da177e4SLinus Torvalds  * 	alloc_pages_current - Allocate pages.
15491da177e4SLinus Torvalds  *
15501da177e4SLinus Torvalds  *	@gfp:
15511da177e4SLinus Torvalds  *		%GFP_USER   user allocation,
15521da177e4SLinus Torvalds  *      	%GFP_KERNEL kernel allocation,
15531da177e4SLinus Torvalds  *      	%GFP_HIGHMEM highmem allocation,
15541da177e4SLinus Torvalds  *      	%GFP_FS     don't call back into a file system.
15551da177e4SLinus Torvalds  *      	%GFP_ATOMIC don't sleep.
15561da177e4SLinus Torvalds  *	@order: Power of two of allocation size in pages. 0 is a single page.
15571da177e4SLinus Torvalds  *
15581da177e4SLinus Torvalds  *	Allocate a page from the kernel page pool and, when not in
15591da177e4SLinus Torvalds  *	interrupt context, apply the current process NUMA policy.
15601da177e4SLinus Torvalds  *	Returns NULL when no page can be allocated.
15611da177e4SLinus Torvalds  *
1562cf2a473cSPaul Jackson  *	Don't call cpuset_update_task_memory_state() unless
15631da177e4SLinus Torvalds  *	1) it's ok to take cpuset_sem (can WAIT), and
15641da177e4SLinus Torvalds  *	2) allocating for current task (not interrupt).
15651da177e4SLinus Torvalds  */
1566dd0fc66fSAl Viro struct page *alloc_pages_current(gfp_t gfp, unsigned order)
15671da177e4SLinus Torvalds {
15681da177e4SLinus Torvalds 	struct mempolicy *pol = current->mempolicy;
15691da177e4SLinus Torvalds 
15701da177e4SLinus Torvalds 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1571cf2a473cSPaul Jackson 		cpuset_update_task_memory_state();
15729b819d20SChristoph Lameter 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
15731da177e4SLinus Torvalds 		pol = &default_policy;
15741da177e4SLinus Torvalds 	if (pol->policy == MPOL_INTERLEAVE)
15751da177e4SLinus Torvalds 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
157619770b32SMel Gorman 	return __alloc_pages_nodemask(gfp, order,
157719770b32SMel Gorman 			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
15781da177e4SLinus Torvalds }
15791da177e4SLinus Torvalds EXPORT_SYMBOL(alloc_pages_current);
15801da177e4SLinus Torvalds 
15814225399aSPaul Jackson /*
1582846a16bfSLee Schermerhorn  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
15834225399aSPaul Jackson  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
15844225399aSPaul Jackson  * with the mems_allowed returned by cpuset_mems_allowed().  This
15854225399aSPaul Jackson  * keeps mempolicies cpuset relative after its cpuset moves.  See
15864225399aSPaul Jackson  * further kernel/cpuset.c update_nodemask().
15874225399aSPaul Jackson  */
15884225399aSPaul Jackson 
1589846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */
1590846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old)
15911da177e4SLinus Torvalds {
15921da177e4SLinus Torvalds 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
15931da177e4SLinus Torvalds 
15941da177e4SLinus Torvalds 	if (!new)
15951da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
15964225399aSPaul Jackson 	if (current_cpuset_is_being_rebound()) {
15974225399aSPaul Jackson 		nodemask_t mems = cpuset_mems_allowed(current);
15984225399aSPaul Jackson 		mpol_rebind_policy(old, &mems);
15994225399aSPaul Jackson 	}
16001da177e4SLinus Torvalds 	*new = *old;
16011da177e4SLinus Torvalds 	atomic_set(&new->refcnt, 1);
16021da177e4SLinus Torvalds 	return new;
16031da177e4SLinus Torvalds }
16041da177e4SLinus Torvalds 
1605f5b087b5SDavid Rientjes static int mpol_match_intent(const struct mempolicy *a,
1606f5b087b5SDavid Rientjes 			     const struct mempolicy *b)
1607f5b087b5SDavid Rientjes {
1608f5b087b5SDavid Rientjes 	if (a->flags != b->flags)
1609f5b087b5SDavid Rientjes 		return 0;
1610f5b087b5SDavid Rientjes 	if (!mpol_store_user_nodemask(a))
1611f5b087b5SDavid Rientjes 		return 1;
1612f5b087b5SDavid Rientjes 	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1613f5b087b5SDavid Rientjes }
1614f5b087b5SDavid Rientjes 
16151da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
16161da177e4SLinus Torvalds int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
16171da177e4SLinus Torvalds {
16181da177e4SLinus Torvalds 	if (!a || !b)
16191da177e4SLinus Torvalds 		return 0;
16201da177e4SLinus Torvalds 	if (a->policy != b->policy)
16211da177e4SLinus Torvalds 		return 0;
1622f5b087b5SDavid Rientjes 	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1623f5b087b5SDavid Rientjes 		return 0;
16241da177e4SLinus Torvalds 	switch (a->policy) {
16251da177e4SLinus Torvalds 	case MPOL_DEFAULT:
16261da177e4SLinus Torvalds 		return 1;
162719770b32SMel Gorman 	case MPOL_BIND:
162819770b32SMel Gorman 		/* Fall through */
16291da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
1630dfcd3c0dSAndi Kleen 		return nodes_equal(a->v.nodes, b->v.nodes);
16311da177e4SLinus Torvalds 	case MPOL_PREFERRED:
16321da177e4SLinus Torvalds 		return a->v.preferred_node == b->v.preferred_node;
16331da177e4SLinus Torvalds 	default:
16341da177e4SLinus Torvalds 		BUG();
16351da177e4SLinus Torvalds 		return 0;
16361da177e4SLinus Torvalds 	}
16371da177e4SLinus Torvalds }
16381da177e4SLinus Torvalds 
16391da177e4SLinus Torvalds /* Slow path of a mpol destructor. */
1640f0be3d32SLee Schermerhorn void __mpol_put(struct mempolicy *p)
16411da177e4SLinus Torvalds {
16421da177e4SLinus Torvalds 	if (!atomic_dec_and_test(&p->refcnt))
16431da177e4SLinus Torvalds 		return;
16441da177e4SLinus Torvalds 	p->policy = MPOL_DEFAULT;
16451da177e4SLinus Torvalds 	kmem_cache_free(policy_cache, p);
16461da177e4SLinus Torvalds }
16471da177e4SLinus Torvalds 
16481da177e4SLinus Torvalds /*
16491da177e4SLinus Torvalds  * Shared memory backing store policy support.
16501da177e4SLinus Torvalds  *
16511da177e4SLinus Torvalds  * Remember policies even when nobody has shared memory mapped.
16521da177e4SLinus Torvalds  * The policies are kept in Red-Black tree linked from the inode.
16531da177e4SLinus Torvalds  * They are protected by the sp->lock spinlock, which should be held
16541da177e4SLinus Torvalds  * for any accesses to the tree.
16551da177e4SLinus Torvalds  */
16561da177e4SLinus Torvalds 
16571da177e4SLinus Torvalds /* lookup first element intersecting start-end */
16581da177e4SLinus Torvalds /* Caller holds sp->lock */
16591da177e4SLinus Torvalds static struct sp_node *
16601da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
16611da177e4SLinus Torvalds {
16621da177e4SLinus Torvalds 	struct rb_node *n = sp->root.rb_node;
16631da177e4SLinus Torvalds 
16641da177e4SLinus Torvalds 	while (n) {
16651da177e4SLinus Torvalds 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
16661da177e4SLinus Torvalds 
16671da177e4SLinus Torvalds 		if (start >= p->end)
16681da177e4SLinus Torvalds 			n = n->rb_right;
16691da177e4SLinus Torvalds 		else if (end <= p->start)
16701da177e4SLinus Torvalds 			n = n->rb_left;
16711da177e4SLinus Torvalds 		else
16721da177e4SLinus Torvalds 			break;
16731da177e4SLinus Torvalds 	}
16741da177e4SLinus Torvalds 	if (!n)
16751da177e4SLinus Torvalds 		return NULL;
16761da177e4SLinus Torvalds 	for (;;) {
16771da177e4SLinus Torvalds 		struct sp_node *w = NULL;
16781da177e4SLinus Torvalds 		struct rb_node *prev = rb_prev(n);
16791da177e4SLinus Torvalds 		if (!prev)
16801da177e4SLinus Torvalds 			break;
16811da177e4SLinus Torvalds 		w = rb_entry(prev, struct sp_node, nd);
16821da177e4SLinus Torvalds 		if (w->end <= start)
16831da177e4SLinus Torvalds 			break;
16841da177e4SLinus Torvalds 		n = prev;
16851da177e4SLinus Torvalds 	}
16861da177e4SLinus Torvalds 	return rb_entry(n, struct sp_node, nd);
16871da177e4SLinus Torvalds }
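/*
 * Illustrative lookup: with stored ranges [0,4) and [4,10), a call of
 * sp_lookup(sp, 2, 6) may first land on the [4,10) node during the
 * descent, but the backward walk above steps to [0,4), the lowest-start
 * node that still intersects the requested range.
 */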
16881da177e4SLinus Torvalds 
16891da177e4SLinus Torvalds /* Insert a new shared policy into the list. */
16901da177e4SLinus Torvalds /* Caller holds sp->lock */
16911da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
16921da177e4SLinus Torvalds {
16931da177e4SLinus Torvalds 	struct rb_node **p = &sp->root.rb_node;
16941da177e4SLinus Torvalds 	struct rb_node *parent = NULL;
16951da177e4SLinus Torvalds 	struct sp_node *nd;
16961da177e4SLinus Torvalds 
16971da177e4SLinus Torvalds 	while (*p) {
16981da177e4SLinus Torvalds 		parent = *p;
16991da177e4SLinus Torvalds 		nd = rb_entry(parent, struct sp_node, nd);
17001da177e4SLinus Torvalds 		if (new->start < nd->start)
17011da177e4SLinus Torvalds 			p = &(*p)->rb_left;
17021da177e4SLinus Torvalds 		else if (new->end > nd->end)
17031da177e4SLinus Torvalds 			p = &(*p)->rb_right;
17041da177e4SLinus Torvalds 		else
17051da177e4SLinus Torvalds 			BUG();
17061da177e4SLinus Torvalds 	}
17071da177e4SLinus Torvalds 	rb_link_node(&new->nd, parent, p);
17081da177e4SLinus Torvalds 	rb_insert_color(&new->nd, &sp->root);
1709140d5a49SPaul Mundt 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
17101da177e4SLinus Torvalds 		 new->policy ? new->policy->policy : 0);
17111da177e4SLinus Torvalds }
17121da177e4SLinus Torvalds 
17131da177e4SLinus Torvalds /* Find shared policy intersecting idx */
17141da177e4SLinus Torvalds struct mempolicy *
17151da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
17161da177e4SLinus Torvalds {
17171da177e4SLinus Torvalds 	struct mempolicy *pol = NULL;
17181da177e4SLinus Torvalds 	struct sp_node *sn;
17191da177e4SLinus Torvalds 
17201da177e4SLinus Torvalds 	if (!sp->root.rb_node)
17211da177e4SLinus Torvalds 		return NULL;
17221da177e4SLinus Torvalds 	spin_lock(&sp->lock);
17231da177e4SLinus Torvalds 	sn = sp_lookup(sp, idx, idx+1);
17241da177e4SLinus Torvalds 	if (sn) {
17251da177e4SLinus Torvalds 		mpol_get(sn->policy);
17261da177e4SLinus Torvalds 		pol = sn->policy;
17271da177e4SLinus Torvalds 	}
17281da177e4SLinus Torvalds 	spin_unlock(&sp->lock);
17291da177e4SLinus Torvalds 	return pol;
17301da177e4SLinus Torvalds }
17311da177e4SLinus Torvalds 
17321da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
17331da177e4SLinus Torvalds {
1734140d5a49SPaul Mundt 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
17351da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
1736f0be3d32SLee Schermerhorn 	mpol_put(n->policy);
17371da177e4SLinus Torvalds 	kmem_cache_free(sn_cache, n);
17381da177e4SLinus Torvalds }
17391da177e4SLinus Torvalds 
1740dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1741dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
17421da177e4SLinus Torvalds {
17431da177e4SLinus Torvalds 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
17441da177e4SLinus Torvalds 
17451da177e4SLinus Torvalds 	if (!n)
17461da177e4SLinus Torvalds 		return NULL;
17471da177e4SLinus Torvalds 	n->start = start;
17481da177e4SLinus Torvalds 	n->end = end;
17491da177e4SLinus Torvalds 	mpol_get(pol);
17501da177e4SLinus Torvalds 	n->policy = pol;
17511da177e4SLinus Torvalds 	return n;
17521da177e4SLinus Torvalds }
17531da177e4SLinus Torvalds 
17541da177e4SLinus Torvalds /* Replace a policy range. */
17551da177e4SLinus Torvalds static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
17561da177e4SLinus Torvalds 				 unsigned long end, struct sp_node *new)
17571da177e4SLinus Torvalds {
17581da177e4SLinus Torvalds 	struct sp_node *n, *new2 = NULL;
17591da177e4SLinus Torvalds 
17601da177e4SLinus Torvalds restart:
17611da177e4SLinus Torvalds 	spin_lock(&sp->lock);
17621da177e4SLinus Torvalds 	n = sp_lookup(sp, start, end);
17631da177e4SLinus Torvalds 	/* Take care of old policies in the same range. */
17641da177e4SLinus Torvalds 	while (n && n->start < end) {
17651da177e4SLinus Torvalds 		struct rb_node *next = rb_next(&n->nd);
17661da177e4SLinus Torvalds 		if (n->start >= start) {
17671da177e4SLinus Torvalds 			if (n->end <= end)
17681da177e4SLinus Torvalds 				sp_delete(sp, n);
17691da177e4SLinus Torvalds 			else
17701da177e4SLinus Torvalds 				n->start = end;
17711da177e4SLinus Torvalds 		} else {
17721da177e4SLinus Torvalds 			/* Old policy spanning whole new range. */
17731da177e4SLinus Torvalds 			if (n->end > end) {
17741da177e4SLinus Torvalds 				if (!new2) {
17751da177e4SLinus Torvalds 					spin_unlock(&sp->lock);
17761da177e4SLinus Torvalds 					new2 = sp_alloc(end, n->end, n->policy);
17771da177e4SLinus Torvalds 					if (!new2)
17781da177e4SLinus Torvalds 						return -ENOMEM;
17791da177e4SLinus Torvalds 					goto restart;
17801da177e4SLinus Torvalds 				}
17811da177e4SLinus Torvalds 				n->end = start;
17821da177e4SLinus Torvalds 				sp_insert(sp, new2);
17831da177e4SLinus Torvalds 				new2 = NULL;
17841da177e4SLinus Torvalds 				break;
17851da177e4SLinus Torvalds 			} else
17861da177e4SLinus Torvalds 				n->end = start;
17871da177e4SLinus Torvalds 		}
17881da177e4SLinus Torvalds 		if (!next)
17891da177e4SLinus Torvalds 			break;
17901da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
17911da177e4SLinus Torvalds 	}
17921da177e4SLinus Torvalds 	if (new)
17931da177e4SLinus Torvalds 		sp_insert(sp, new);
17941da177e4SLinus Torvalds 	spin_unlock(&sp->lock);
17951da177e4SLinus Torvalds 	if (new2) {
1796f0be3d32SLee Schermerhorn 		mpol_put(new2->policy);
17971da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, new2);
17981da177e4SLinus Torvalds 	}
17991da177e4SLinus Torvalds 	return 0;
18001da177e4SLinus Torvalds }
18011da177e4SLinus Torvalds 
1802a3b51e01SDavid Rientjes void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1803028fec41SDavid Rientjes 			unsigned short flags, nodemask_t *policy_nodes)
18047339ff83SRobin Holt {
18057339ff83SRobin Holt 	info->root = RB_ROOT;
18067339ff83SRobin Holt 	spin_lock_init(&info->lock);
18077339ff83SRobin Holt 
18087339ff83SRobin Holt 	if (policy != MPOL_DEFAULT) {
18097339ff83SRobin Holt 		struct mempolicy *newpol;
18107339ff83SRobin Holt 
18117339ff83SRobin Holt 		/* Falls back to MPOL_DEFAULT on any error */
1812028fec41SDavid Rientjes 		newpol = mpol_new(policy, flags, policy_nodes);
18137339ff83SRobin Holt 		if (!IS_ERR(newpol)) {
18147339ff83SRobin Holt 			/* Create pseudo-vma that contains just the policy */
18157339ff83SRobin Holt 			struct vm_area_struct pvma;
18167339ff83SRobin Holt 
18177339ff83SRobin Holt 			memset(&pvma, 0, sizeof(struct vm_area_struct));
18187339ff83SRobin Holt 			/* Policy covers entire file */
18197339ff83SRobin Holt 			pvma.vm_end = TASK_SIZE;
18207339ff83SRobin Holt 			mpol_set_shared_policy(info, &pvma, newpol);
1821f0be3d32SLee Schermerhorn 			mpol_put(newpol);
18227339ff83SRobin Holt 		}
18237339ff83SRobin Holt 	}
18247339ff83SRobin Holt }
18257339ff83SRobin Holt 
18261da177e4SLinus Torvalds int mpol_set_shared_policy(struct shared_policy *info,
18271da177e4SLinus Torvalds 			struct vm_area_struct *vma, struct mempolicy *npol)
18281da177e4SLinus Torvalds {
18291da177e4SLinus Torvalds 	int err;
18301da177e4SLinus Torvalds 	struct sp_node *new = NULL;
18311da177e4SLinus Torvalds 	unsigned long sz = vma_pages(vma);
18321da177e4SLinus Torvalds 
1833028fec41SDavid Rientjes 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
18341da177e4SLinus Torvalds 		 vma->vm_pgoff,
18351da177e4SLinus Torvalds 		 sz, npol ? npol->policy : -1,
1836028fec41SDavid Rientjes 		 npol ? npol->flags : -1,
1837dfcd3c0dSAndi Kleen 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
18381da177e4SLinus Torvalds 
18391da177e4SLinus Torvalds 	if (npol) {
18401da177e4SLinus Torvalds 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
18411da177e4SLinus Torvalds 		if (!new)
18421da177e4SLinus Torvalds 			return -ENOMEM;
18431da177e4SLinus Torvalds 	}
18441da177e4SLinus Torvalds 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
18451da177e4SLinus Torvalds 	if (err && new)
18461da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, new);
18471da177e4SLinus Torvalds 	return err;
18481da177e4SLinus Torvalds }
18491da177e4SLinus Torvalds 
18501da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
18511da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p)
18521da177e4SLinus Torvalds {
18531da177e4SLinus Torvalds 	struct sp_node *n;
18541da177e4SLinus Torvalds 	struct rb_node *next;
18551da177e4SLinus Torvalds 
18561da177e4SLinus Torvalds 	if (!p->root.rb_node)
18571da177e4SLinus Torvalds 		return;
18581da177e4SLinus Torvalds 	spin_lock(&p->lock);
18591da177e4SLinus Torvalds 	next = rb_first(&p->root);
18601da177e4SLinus Torvalds 	while (next) {
18611da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
18621da177e4SLinus Torvalds 		next = rb_next(&n->nd);
186390c5029eSAndi Kleen 		rb_erase(&n->nd, &p->root);
1864f0be3d32SLee Schermerhorn 		mpol_put(n->policy);
18651da177e4SLinus Torvalds 		kmem_cache_free(sn_cache, n);
18661da177e4SLinus Torvalds 	}
18671da177e4SLinus Torvalds 	spin_unlock(&p->lock);
18681da177e4SLinus Torvalds }
18691da177e4SLinus Torvalds 
18701da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */
18711da177e4SLinus Torvalds void __init numa_policy_init(void)
18721da177e4SLinus Torvalds {
1873b71636e2SPaul Mundt 	nodemask_t interleave_nodes;
1874b71636e2SPaul Mundt 	unsigned long largest = 0;
1875b71636e2SPaul Mundt 	int nid, prefer = 0;
1876b71636e2SPaul Mundt 
18771da177e4SLinus Torvalds 	policy_cache = kmem_cache_create("numa_policy",
18781da177e4SLinus Torvalds 					 sizeof(struct mempolicy),
187920c2df83SPaul Mundt 					 0, SLAB_PANIC, NULL);
18801da177e4SLinus Torvalds 
18811da177e4SLinus Torvalds 	sn_cache = kmem_cache_create("shared_policy_node",
18821da177e4SLinus Torvalds 				     sizeof(struct sp_node),
188320c2df83SPaul Mundt 				     0, SLAB_PANIC, NULL);
18841da177e4SLinus Torvalds 
1885b71636e2SPaul Mundt 	/*
1886b71636e2SPaul Mundt 	 * Set interleaving policy for system init. Interleaving is only
1887b71636e2SPaul Mundt 	 * enabled across suitably sized nodes (default is >= 16MB); if
1888b71636e2SPaul Mundt 	 * they are all smaller, fall back to the largest node.
1889b71636e2SPaul Mundt 	 */
1890b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
189156bbd65dSChristoph Lameter 	for_each_node_state(nid, N_HIGH_MEMORY) {
1892b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
18931da177e4SLinus Torvalds 
1894b71636e2SPaul Mundt 		/* Preserve the largest node */
1895b71636e2SPaul Mundt 		if (largest < total_pages) {
1896b71636e2SPaul Mundt 			largest = total_pages;
1897b71636e2SPaul Mundt 			prefer = nid;
1898b71636e2SPaul Mundt 		}
1899b71636e2SPaul Mundt 
1900b71636e2SPaul Mundt 		/* Interleave this node? */
1901b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1902b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
1903b71636e2SPaul Mundt 	}
1904b71636e2SPaul Mundt 
1905b71636e2SPaul Mundt 	/* All too small, use the largest */
1906b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
1907b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
1908b71636e2SPaul Mundt 
1909028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
19101da177e4SLinus Torvalds 		printk("numa_policy_init: interleaving failed\n");
19111da177e4SLinus Torvalds }
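/*
 * Worked example (illustrative): with 4 KB pages the 16 MB threshold
 * above means a node needs at least 4096 present pages to join the
 * boot-time interleave set; if no node qualifies, only the largest
 * node is used.
 */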
19121da177e4SLinus Torvalds 
19138bccd85fSChristoph Lameter /* Reset policy of current process to default */
19141da177e4SLinus Torvalds void numa_default_policy(void)
19151da177e4SLinus Torvalds {
1916028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
19171da177e4SLinus Torvalds }
191868860ec1SPaul Jackson 
19194225399aSPaul Jackson /*
19201a75a6c8SChristoph Lameter  * Display pages allocated per node and memory policy via /proc.
19211a75a6c8SChristoph Lameter  */
192215ad7cdcSHelge Deller static const char * const policy_types[] =
192315ad7cdcSHelge Deller 	{ "default", "prefer", "bind", "interleave" };
19241a75a6c8SChristoph Lameter 
19251a75a6c8SChristoph Lameter /*
19261a75a6c8SChristoph Lameter  * Convert a mempolicy into a string.
19271a75a6c8SChristoph Lameter  * Returns the number of characters in buffer (if positive)
19281a75a6c8SChristoph Lameter  * or an error (negative)
19291a75a6c8SChristoph Lameter  */
19301a75a6c8SChristoph Lameter static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
19311a75a6c8SChristoph Lameter {
19321a75a6c8SChristoph Lameter 	char *p = buffer;
19331a75a6c8SChristoph Lameter 	int l;
19341a75a6c8SChristoph Lameter 	nodemask_t nodes;
1935a3b51e01SDavid Rientjes 	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1936f5b087b5SDavid Rientjes 	unsigned short flags = pol ? pol->flags : 0;
19371a75a6c8SChristoph Lameter 
19381a75a6c8SChristoph Lameter 	switch (mode) {
19391a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
19401a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19411a75a6c8SChristoph Lameter 		break;
19421a75a6c8SChristoph Lameter 
19431a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
19441a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19451a75a6c8SChristoph Lameter 		node_set(pol->v.preferred_node, nodes);
19461a75a6c8SChristoph Lameter 		break;
19471a75a6c8SChristoph Lameter 
19481a75a6c8SChristoph Lameter 	case MPOL_BIND:
194919770b32SMel Gorman 		/* Fall through */
19501a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
19511a75a6c8SChristoph Lameter 		nodes = pol->v.nodes;
19521a75a6c8SChristoph Lameter 		break;
19531a75a6c8SChristoph Lameter 
19541a75a6c8SChristoph Lameter 	default:
19551a75a6c8SChristoph Lameter 		BUG();
19561a75a6c8SChristoph Lameter 		return -EFAULT;
19571a75a6c8SChristoph Lameter 	}
19581a75a6c8SChristoph Lameter 
19591a75a6c8SChristoph Lameter 	l = strlen(policy_types[mode]);
19601a75a6c8SChristoph Lameter  	if (buffer + maxlen < p + l + 1)
19611a75a6c8SChristoph Lameter  		return -ENOSPC;
19621a75a6c8SChristoph Lameter 
19631a75a6c8SChristoph Lameter 	strcpy(p, policy_types[mode]);
19641a75a6c8SChristoph Lameter 	p += l;
19651a75a6c8SChristoph Lameter 
1966f5b087b5SDavid Rientjes 	if (flags) {
1967f5b087b5SDavid Rientjes 		int need_bar = 0;
1968f5b087b5SDavid Rientjes 
1969f5b087b5SDavid Rientjes 		if (buffer + maxlen < p + 2)
1970f5b087b5SDavid Rientjes 			return -ENOSPC;
1971f5b087b5SDavid Rientjes 		*p++ = '=';
1972f5b087b5SDavid Rientjes 
1973f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
1974f5b087b5SDavid Rientjes 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
19754c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
19764c50bc01SDavid Rientjes 			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1977f5b087b5SDavid Rientjes 	}
1978f5b087b5SDavid Rientjes 
19791a75a6c8SChristoph Lameter 	if (!nodes_empty(nodes)) {
19801a75a6c8SChristoph Lameter 		if (buffer + maxlen < p + 2)
19811a75a6c8SChristoph Lameter 			return -ENOSPC;
19821a75a6c8SChristoph Lameter 		*p++ = '=';
19831a75a6c8SChristoph Lameter 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
19841a75a6c8SChristoph Lameter 	}
19851a75a6c8SChristoph Lameter 	return p - buffer;
19861a75a6c8SChristoph Lameter }
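/*
 * Example output (illustrative): an MPOL_INTERLEAVE policy created with
 * MPOL_F_STATIC_NODES over nodes 0-3 is rendered as
 * "interleave=static=0-3"; a plain MPOL_PREFERRED policy for node 1
 * becomes "prefer=1".
 */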
19871a75a6c8SChristoph Lameter 
19881a75a6c8SChristoph Lameter struct numa_maps {
19891a75a6c8SChristoph Lameter 	unsigned long pages;
19901a75a6c8SChristoph Lameter 	unsigned long anon;
1991397874dfSChristoph Lameter 	unsigned long active;
1992397874dfSChristoph Lameter 	unsigned long writeback;
19931a75a6c8SChristoph Lameter 	unsigned long mapcount_max;
1994397874dfSChristoph Lameter 	unsigned long dirty;
1995397874dfSChristoph Lameter 	unsigned long swapcache;
19961a75a6c8SChristoph Lameter 	unsigned long node[MAX_NUMNODES];
19971a75a6c8SChristoph Lameter };
19981a75a6c8SChristoph Lameter 
1999397874dfSChristoph Lameter static void gather_stats(struct page *page, void *private, int pte_dirty)
20001a75a6c8SChristoph Lameter {
20011a75a6c8SChristoph Lameter 	struct numa_maps *md = private;
20021a75a6c8SChristoph Lameter 	int count = page_mapcount(page);
20031a75a6c8SChristoph Lameter 
20041a75a6c8SChristoph Lameter 	md->pages++;
2005397874dfSChristoph Lameter 	if (pte_dirty || PageDirty(page))
2006397874dfSChristoph Lameter 		md->dirty++;
2007397874dfSChristoph Lameter 
2008397874dfSChristoph Lameter 	if (PageSwapCache(page))
2009397874dfSChristoph Lameter 		md->swapcache++;
2010397874dfSChristoph Lameter 
2011397874dfSChristoph Lameter 	if (PageActive(page))
2012397874dfSChristoph Lameter 		md->active++;
2013397874dfSChristoph Lameter 
2014397874dfSChristoph Lameter 	if (PageWriteback(page))
2015397874dfSChristoph Lameter 		md->writeback++;
20161a75a6c8SChristoph Lameter 
20171a75a6c8SChristoph Lameter 	if (PageAnon(page))
20181a75a6c8SChristoph Lameter 		md->anon++;
20191a75a6c8SChristoph Lameter 
2020397874dfSChristoph Lameter 	if (count > md->mapcount_max)
2021397874dfSChristoph Lameter 		md->mapcount_max = count;
2022397874dfSChristoph Lameter 
20231a75a6c8SChristoph Lameter 	md->node[page_to_nid(page)]++;
20241a75a6c8SChristoph Lameter }
20251a75a6c8SChristoph Lameter 
20267f709ed0SAndrew Morton #ifdef CONFIG_HUGETLB_PAGE
2027397874dfSChristoph Lameter static void check_huge_range(struct vm_area_struct *vma,
2028397874dfSChristoph Lameter 		unsigned long start, unsigned long end,
2029397874dfSChristoph Lameter 		struct numa_maps *md)
2030397874dfSChristoph Lameter {
2031397874dfSChristoph Lameter 	unsigned long addr;
2032397874dfSChristoph Lameter 	struct page *page;
2033397874dfSChristoph Lameter 
2034397874dfSChristoph Lameter 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2035397874dfSChristoph Lameter 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2036397874dfSChristoph Lameter 		pte_t pte;
2037397874dfSChristoph Lameter 
2038397874dfSChristoph Lameter 		if (!ptep)
2039397874dfSChristoph Lameter 			continue;
2040397874dfSChristoph Lameter 
2041397874dfSChristoph Lameter 		pte = *ptep;
2042397874dfSChristoph Lameter 		if (pte_none(pte))
2043397874dfSChristoph Lameter 			continue;
2044397874dfSChristoph Lameter 
2045397874dfSChristoph Lameter 		page = pte_page(pte);
2046397874dfSChristoph Lameter 		if (!page)
2047397874dfSChristoph Lameter 			continue;
2048397874dfSChristoph Lameter 
2049397874dfSChristoph Lameter 		gather_stats(page, md, pte_dirty(*ptep));
2050397874dfSChristoph Lameter 	}
2051397874dfSChristoph Lameter }
20527f709ed0SAndrew Morton #else
20537f709ed0SAndrew Morton static inline void check_huge_range(struct vm_area_struct *vma,
20547f709ed0SAndrew Morton 		unsigned long start, unsigned long end,
20557f709ed0SAndrew Morton 		struct numa_maps *md)
20567f709ed0SAndrew Morton {
20577f709ed0SAndrew Morton }
20587f709ed0SAndrew Morton #endif
2059397874dfSChristoph Lameter 
20601a75a6c8SChristoph Lameter int show_numa_map(struct seq_file *m, void *v)
20611a75a6c8SChristoph Lameter {
206299f89551SEric W. Biederman 	struct proc_maps_private *priv = m->private;
20631a75a6c8SChristoph Lameter 	struct vm_area_struct *vma = v;
20641a75a6c8SChristoph Lameter 	struct numa_maps *md;
2065397874dfSChristoph Lameter 	struct file *file = vma->vm_file;
2066397874dfSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
2067480eccf9SLee Schermerhorn 	struct mempolicy *pol;
20681a75a6c8SChristoph Lameter 	int n;
20691a75a6c8SChristoph Lameter 	char buffer[50];
20701a75a6c8SChristoph Lameter 
2071397874dfSChristoph Lameter 	if (!mm)
20721a75a6c8SChristoph Lameter 		return 0;
20731a75a6c8SChristoph Lameter 
20741a75a6c8SChristoph Lameter 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
20751a75a6c8SChristoph Lameter 	if (!md)
20761a75a6c8SChristoph Lameter 		return 0;
20771a75a6c8SChristoph Lameter 
2078480eccf9SLee Schermerhorn 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2079480eccf9SLee Schermerhorn 	mpol_to_str(buffer, sizeof(buffer), pol);
2080480eccf9SLee Schermerhorn 	/*
2081480eccf9SLee Schermerhorn 	 * unref shared or other task's mempolicy
2082480eccf9SLee Schermerhorn 	 */
2083480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy)
2084f0be3d32SLee Schermerhorn 		__mpol_put(pol);
20851a75a6c8SChristoph Lameter 
2086397874dfSChristoph Lameter 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2087397874dfSChristoph Lameter 
2088397874dfSChristoph Lameter 	if (file) {
2089397874dfSChristoph Lameter 		seq_printf(m, " file=");
2090c32c2f63SJan Blunck 		seq_path(m, &file->f_path, "\n\t= ");
2091397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2092397874dfSChristoph Lameter 		seq_printf(m, " heap");
2093397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->start_stack &&
2094397874dfSChristoph Lameter 			vma->vm_end >= mm->start_stack) {
2095397874dfSChristoph Lameter 		seq_printf(m, " stack");
2096397874dfSChristoph Lameter 	}
2097397874dfSChristoph Lameter 
2098397874dfSChristoph Lameter 	if (is_vm_hugetlb_page(vma)) {
2099397874dfSChristoph Lameter 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2100397874dfSChristoph Lameter 		seq_printf(m, " huge");
2101397874dfSChristoph Lameter 	} else {
2102397874dfSChristoph Lameter 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
210356bbd65dSChristoph Lameter 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2104397874dfSChristoph Lameter 	}
2105397874dfSChristoph Lameter 
2106397874dfSChristoph Lameter 	if (!md->pages)
2107397874dfSChristoph Lameter 		goto out;
21081a75a6c8SChristoph Lameter 
21091a75a6c8SChristoph Lameter 	if (md->anon)
21101a75a6c8SChristoph Lameter 		seq_printf(m," anon=%lu",md->anon);
21111a75a6c8SChristoph Lameter 
2112397874dfSChristoph Lameter 	if (md->dirty)
2113397874dfSChristoph Lameter 		seq_printf(m," dirty=%lu",md->dirty);
2114397874dfSChristoph Lameter 
2115397874dfSChristoph Lameter 	if (md->pages != md->anon && md->pages != md->dirty)
2116397874dfSChristoph Lameter 		seq_printf(m, " mapped=%lu", md->pages);
2117397874dfSChristoph Lameter 
2118397874dfSChristoph Lameter 	if (md->mapcount_max > 1)
2119397874dfSChristoph Lameter 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2120397874dfSChristoph Lameter 
2121397874dfSChristoph Lameter 	if (md->swapcache)
2122397874dfSChristoph Lameter 		seq_printf(m," swapcache=%lu", md->swapcache);
2123397874dfSChristoph Lameter 
2124397874dfSChristoph Lameter 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2125397874dfSChristoph Lameter 		seq_printf(m," active=%lu", md->active);
2126397874dfSChristoph Lameter 
2127397874dfSChristoph Lameter 	if (md->writeback)
2128397874dfSChristoph Lameter 		seq_printf(m," writeback=%lu", md->writeback);
2129397874dfSChristoph Lameter 
213056bbd65dSChristoph Lameter 	for_each_node_state(n, N_HIGH_MEMORY)
21311a75a6c8SChristoph Lameter 		if (md->node[n])
21321a75a6c8SChristoph Lameter 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2133397874dfSChristoph Lameter out:
21341a75a6c8SChristoph Lameter 	seq_putc(m, '\n');
21351a75a6c8SChristoph Lameter 	kfree(md);
21361a75a6c8SChristoph Lameter 
21371a75a6c8SChristoph Lameter 	if (m->count < m->size)
213899f89551SEric W. Biederman 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
21391a75a6c8SChristoph Lameter 	return 0;
21401a75a6c8SChristoph Lameter }
2141