xref: /linux/mm/mempolicy.c (revision 100c85421b52e41269ada88f7d71a6b8a06c7a11)
146aeb7e6SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * Simple NUMA memory policy for the Linux kernel.
41da177e4SLinus Torvalds  *
51da177e4SLinus Torvalds  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
68bccd85fSChristoph Lameter  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * NUMA policy allows the user to give hints about which node(s) memory should
91da177e4SLinus Torvalds  * be allocated on.
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  * Support multiple policies per VMA and per process:
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The VMA policy has priority over the process policy for a page fault.
141da177e4SLinus Torvalds  *
151da177e4SLinus Torvalds  * interleave     Allocate memory interleaved over a set of nodes,
161da177e4SLinus Torvalds  *                with normal fallback if it fails.
171da177e4SLinus Torvalds  *                For VMA based allocations this interleaves based on the
181da177e4SLinus Torvalds  *                offset into the backing object or offset into the mapping
191da177e4SLinus Torvalds  *                for anonymous memory. For process policy a process counter
201da177e4SLinus Torvalds  *                is used.
218bccd85fSChristoph Lameter  *
22fa3bea4eSGregory Price  * weighted interleave
23fa3bea4eSGregory Price  *                Allocate memory interleaved over a set of nodes based on
24fa3bea4eSGregory Price  *                a set of weights (per-node), with normal fallback if it
25fa3bea4eSGregory Price  *                fails.  Otherwise operates the same as interleave.
26fa3bea4eSGregory Price  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27fa3bea4eSGregory Price  *                on node 0 for every 1 page allocated on node 1.
28fa3bea4eSGregory Price  *
291da177e4SLinus Torvalds  * bind           Only allocate memory on a specific set of nodes,
301da177e4SLinus Torvalds  *                no fallback.
318bccd85fSChristoph Lameter  *                FIXME: memory is allocated starting with the first node
328bccd85fSChristoph Lameter  *                to the last. It would be better if bind would truly restrict
338bccd85fSChristoph Lameter  *                the allocation to memory nodes instead
348bccd85fSChristoph Lameter  *
351da177e4SLinus Torvalds  * preferred      Try a specific node first before normal fallback.
3600ef2d2fSDavid Rientjes  *                As a special case NUMA_NO_NODE here means do the allocation
371da177e4SLinus Torvalds  *                on the local CPU. This is normally identical to default,
381da177e4SLinus Torvalds  *                but useful to set in a VMA when you have a non default
391da177e4SLinus Torvalds  *                but useful to set in a VMA when you have a non-default
408bccd85fSChristoph Lameter  *
41b27abaccSDave Hansen  * preferred many Try a set of nodes first before normal fallback. This is
42b27abaccSDave Hansen  *                similar to preferred without the special case.
43b27abaccSDave Hansen  *
441da177e4SLinus Torvalds  * default        Allocate on the local node first, or when on a VMA
451da177e4SLinus Torvalds  *                use the process policy. This is what Linux always did
461da177e4SLinus Torvalds  *		  in a NUMA aware kernel and still does by, ahem, default.
471da177e4SLinus Torvalds  *
481da177e4SLinus Torvalds  * The process policy is applied for most non-interrupt memory allocations
491da177e4SLinus Torvalds  * in that process' context. Interrupts ignore the policies and always
501da177e4SLinus Torvalds  * try to allocate on the local CPU. The VMA policy is only applied for memory
511da177e4SLinus Torvalds  * allocations for a VMA in the VM.
521da177e4SLinus Torvalds  *
531da177e4SLinus Torvalds  * Currently there are a few corner cases in swapping where the policy
541da177e4SLinus Torvalds  * is not applied, but the majority should be handled. When process policy
551da177e4SLinus Torvalds  * is used it is not remembered over swap outs/swap ins.
561da177e4SLinus Torvalds  *
571da177e4SLinus Torvalds  * Only the highest zone in the zone hierarchy gets policied. Allocations
581da177e4SLinus Torvalds  * requesting a lower zone just use default policy. This implies that
591da177e4SLinus Torvalds  * on systems with highmem, kernel lowmem allocations don't get policied.
601da177e4SLinus Torvalds  * Same with GFP_DMA allocations.
611da177e4SLinus Torvalds  *
62c36f6e6dSHugh Dickins  * For shmem/tmpfs shared memory the policy is shared between
631da177e4SLinus Torvalds  * all users and remembered even when nobody has memory mapped.
641da177e4SLinus Torvalds  */
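/*
 * Illustrative userspace sketch of the policies above (editorial example,
 * not part of this file; assumes libnuma's <numaif.h> wrappers and a
 * machine with at least nodes 0 and 1):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes01 = 0x3;	// nodemask covering nodes 0 and 1
 *	unsigned long node0   = 0x1;	// nodemask covering node 0 only
 *
 *	// Process policy: interleave future allocations across nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	// VMA policy: bind one mapping to node 0; this takes priority over
 *	// the process policy for faults in that range, as described above.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */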
651da177e4SLinus Torvalds 
661da177e4SLinus Torvalds /* Notebook:
671da177e4SLinus Torvalds    fix mmap readahead to honour policy and enable policy for any page cache
681da177e4SLinus Torvalds    object
691da177e4SLinus Torvalds    statistics for bigpages
701da177e4SLinus Torvalds    global policy for page cache? currently it uses process policy. Requires
711da177e4SLinus Torvalds    first item above.
721da177e4SLinus Torvalds    handle mremap for shared memory (currently ignored for the policy)
731da177e4SLinus Torvalds    grows down?
741da177e4SLinus Torvalds    make bind policy root only? It can trigger oom much faster and the
751da177e4SLinus Torvalds    kernel is not always graceful about that.
761da177e4SLinus Torvalds */
771da177e4SLinus Torvalds 
78b1de0d13SMitchel Humpherys #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79b1de0d13SMitchel Humpherys 
801da177e4SLinus Torvalds #include <linux/mempolicy.h>
81a520110eSChristoph Hellwig #include <linux/pagewalk.h>
821da177e4SLinus Torvalds #include <linux/highmem.h>
831da177e4SLinus Torvalds #include <linux/hugetlb.h>
841da177e4SLinus Torvalds #include <linux/kernel.h>
851da177e4SLinus Torvalds #include <linux/sched.h>
866e84f315SIngo Molnar #include <linux/sched/mm.h>
876a3827d7SIngo Molnar #include <linux/sched/numa_balancing.h>
88f719ff9bSIngo Molnar #include <linux/sched/task.h>
891da177e4SLinus Torvalds #include <linux/nodemask.h>
901da177e4SLinus Torvalds #include <linux/cpuset.h>
911da177e4SLinus Torvalds #include <linux/slab.h>
921da177e4SLinus Torvalds #include <linux/string.h>
93b95f1b31SPaul Gortmaker #include <linux/export.h>
94b488893aSPavel Emelyanov #include <linux/nsproxy.h>
951da177e4SLinus Torvalds #include <linux/interrupt.h>
961da177e4SLinus Torvalds #include <linux/init.h>
971da177e4SLinus Torvalds #include <linux/compat.h>
9831367466SOtto Ebeling #include <linux/ptrace.h>
99dc9aa5b9SChristoph Lameter #include <linux/swap.h>
1001a75a6c8SChristoph Lameter #include <linux/seq_file.h>
1011a75a6c8SChristoph Lameter #include <linux/proc_fs.h>
102b20a3503SChristoph Lameter #include <linux/migrate.h>
10362b61f61SHugh Dickins #include <linux/ksm.h>
10495a402c3SChristoph Lameter #include <linux/rmap.h>
10586c3a764SDavid Quigley #include <linux/security.h>
106dbcb0f19SAdrian Bunk #include <linux/syscalls.h>
107095f1fc4SLee Schermerhorn #include <linux/ctype.h>
1086d9c285aSKOSAKI Motohiro #include <linux/mm_inline.h>
109b24f53a0SLee Schermerhorn #include <linux/mmu_notifier.h>
110b1de0d13SMitchel Humpherys #include <linux/printk.h>
111c8633798SNaoya Horiguchi #include <linux/swapops.h>
112dc9aa5b9SChristoph Lameter 
1131da177e4SLinus Torvalds #include <asm/tlbflush.h>
1144a18419fSNadav Amit #include <asm/tlb.h>
1157c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
1161da177e4SLinus Torvalds 
11762695a84SNick Piggin #include "internal.h"
11862695a84SNick Piggin 
11938e35860SChristoph Lameter /* Internal flags */
120dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
12138e35860SChristoph Lameter #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
1221cb5d11aSHugh Dickins #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
123dc9aa5b9SChristoph Lameter 
124fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
125fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1261da177e4SLinus Torvalds 
1271da177e4SLinus Torvalds /* Highest zone. A specific allocation for a zone below that is not
1281da177e4SLinus Torvalds    policied. */
1296267276fSChristoph Lameter enum zone_type policy_zone = 0;
1301da177e4SLinus Torvalds 
131bea904d5SLee Schermerhorn /*
132bea904d5SLee Schermerhorn  * run-time system-wide default policy => local allocation
133bea904d5SLee Schermerhorn  */
134e754d79dSH Hartley Sweeten static struct mempolicy default_policy = {
1351da177e4SLinus Torvalds 	.refcnt = ATOMIC_INIT(1), /* never free it */
1367858d7bcSFeng Tang 	.mode = MPOL_LOCAL,
1371da177e4SLinus Torvalds };
1381da177e4SLinus Torvalds 
1395606e387SMel Gorman static struct mempolicy preferred_node_policy[MAX_NUMNODES];
1405606e387SMel Gorman 
141dce41f5aSRakie Kim /*
142dce41f5aSRakie Kim  * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
143dce41f5aSRakie Kim  * that the system-default value should be used. A NULL iw_table also means
144dce41f5aSRakie Kim  * that system-default values should be used. Until the system-default table
145dce41f5aSRakie Kim  * is implemented, the system-default is always 1.
146dce41f5aSRakie Kim  *
147dce41f5aSRakie Kim  * iw_table is RCU protected
148dce41f5aSRakie Kim  */
149dce41f5aSRakie Kim static u8 __rcu *iw_table;
150dce41f5aSRakie Kim static DEFINE_MUTEX(iw_table_lock);
151dce41f5aSRakie Kim 
152dce41f5aSRakie Kim static u8 get_il_weight(int node)
153dce41f5aSRakie Kim {
154dce41f5aSRakie Kim 	u8 *table;
155dce41f5aSRakie Kim 	u8 weight;
156dce41f5aSRakie Kim 
157dce41f5aSRakie Kim 	rcu_read_lock();
158dce41f5aSRakie Kim 	table = rcu_dereference(iw_table);
159dce41f5aSRakie Kim 	/* if no iw_table, use system default */
160dce41f5aSRakie Kim 	weight = table ? table[node] : 1;
161dce41f5aSRakie Kim 	/* if value in iw_table is 0, use system default */
162dce41f5aSRakie Kim 	weight = weight ? weight : 1;
163dce41f5aSRakie Kim 	rcu_read_unlock();
164dce41f5aSRakie Kim 	return weight;
165dce41f5aSRakie Kim }
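/*
 * Editorial example (illustrative; assumes the per-node sysfs knobs for
 * this table live under /sys/kernel/mm/mempolicy/weighted_interleave/):
 *
 *	# echo 3 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	# echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
 *
 * get_il_weight(0) then returns 3 and get_il_weight(1) returns 1, so a
 * weighted-interleave task places roughly three pages on node 0 for every
 * one on node 1.  A node whose entry is still 0, or any node while
 * iw_table is NULL, falls back to the default weight of 1.
 */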
166dce41f5aSRakie Kim 
167b2ca916cSDan Williams /**
168b1f099b1SYury Norov  * numa_nearest_node - Find nearest node by state
169f6e92f40SKrzysztof Kozlowski  * @node: Node id to start the search
170b1f099b1SYury Norov  * @state: State to filter the search
171b2ca916cSDan Williams  *
172b1f099b1SYury Norov  * Look up the closest node by distance if @node is not in @state.
173dad5b023SRandy Dunlap  *
174b1f099b1SYury Norov  * Return: @node if it is in @state, otherwise the closest node by distance
175b2ca916cSDan Williams  */
176b1f099b1SYury Norov int numa_nearest_node(int node, unsigned int state)
177b2ca916cSDan Williams {
1784fcbe96eSDan Williams 	int min_dist = INT_MAX, dist, n, min_node;
179b2ca916cSDan Williams 
180b1f099b1SYury Norov 	if (state >= NR_NODE_STATES)
181b1f099b1SYury Norov 		return -EINVAL;
182b1f099b1SYury Norov 
183b1f099b1SYury Norov 	if (node == NUMA_NO_NODE || node_state(node, state))
1844fcbe96eSDan Williams 		return node;
185b2ca916cSDan Williams 
186b2ca916cSDan Williams 	min_node = node;
187b1f099b1SYury Norov 	for_each_node_state(n, state) {
188b2ca916cSDan Williams 		dist = node_distance(node, n);
189b2ca916cSDan Williams 		if (dist < min_dist) {
190b2ca916cSDan Williams 			min_dist = dist;
191b2ca916cSDan Williams 			min_node = n;
192b2ca916cSDan Williams 		}
193b2ca916cSDan Williams 	}
194b2ca916cSDan Williams 
195b2ca916cSDan Williams 	return min_node;
196b2ca916cSDan Williams }
197b1f099b1SYury Norov EXPORT_SYMBOL_GPL(numa_nearest_node);
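/*
 * Editorial example (illustrative): a caller that wants memory near a
 * possibly memoryless node can do
 *
 *	int nid = numa_nearest_node(dev_to_node(dev), N_MEMORY);
 *	struct page *page = alloc_pages_node(nid, GFP_KERNEL, 0);
 *
 * which yields dev_to_node(dev) itself when that node has memory, and
 * otherwise the node with the smallest node_distance() that does.
 */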
198b2ca916cSDan Williams 
19974d2c3a0SOleg Nesterov struct mempolicy *get_task_policy(struct task_struct *p)
2005606e387SMel Gorman {
2015606e387SMel Gorman 	struct mempolicy *pol = p->mempolicy;
202f15ca78eSOleg Nesterov 	int node;
2035606e387SMel Gorman 
204f15ca78eSOleg Nesterov 	if (pol)
205f15ca78eSOleg Nesterov 		return pol;
2065606e387SMel Gorman 
207f15ca78eSOleg Nesterov 	node = numa_node_id();
2081da6f0e1SJianguo Wu 	if (node != NUMA_NO_NODE) {
2091da6f0e1SJianguo Wu 		pol = &preferred_node_policy[node];
210f15ca78eSOleg Nesterov 		/* preferred_node_policy is not initialised early in boot */
211f15ca78eSOleg Nesterov 		if (pol->mode)
212f15ca78eSOleg Nesterov 			return pol;
2131da6f0e1SJianguo Wu 	}
2145606e387SMel Gorman 
215f15ca78eSOleg Nesterov 	return &default_policy;
2165606e387SMel Gorman }
2175606e387SMel Gorman 
21837012946SDavid Rientjes static const struct mempolicy_operations {
21937012946SDavid Rientjes 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
220213980c0SVlastimil Babka 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
22137012946SDavid Rientjes } mpol_ops[MPOL_MAX];
22237012946SDavid Rientjes 
223f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
224f5b087b5SDavid Rientjes {
2256d556294SBob Liu 	return pol->flags & MPOL_MODE_FLAGS;
2264c50bc01SDavid Rientjes }
2274c50bc01SDavid Rientjes 
2284c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
2294c50bc01SDavid Rientjes 				   const nodemask_t *rel)
2304c50bc01SDavid Rientjes {
2314c50bc01SDavid Rientjes 	nodemask_t tmp;
2324c50bc01SDavid Rientjes 	nodes_fold(tmp, *orig, nodes_weight(*rel));
2334c50bc01SDavid Rientjes 	nodes_onto(*ret, tmp, *rel);
234f5b087b5SDavid Rientjes }
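/*
 * Worked example (editorial): with MPOL_F_RELATIVE_NODES, a user nodemask
 * of {0,2} inside a cpuset allowing nodes {4,5,6} is folded onto 3 bits
 * (still {0,2}) and then mapped onto the allowed set, giving {4,6}: the
 * user mask indexes into the cpuset's nodes rather than naming physical
 * node ids.
 */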
235f5b087b5SDavid Rientjes 
236be897d48SFeng Tang static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
23737012946SDavid Rientjes {
23837012946SDavid Rientjes 	if (nodes_empty(*nodes))
23937012946SDavid Rientjes 		return -EINVAL;
240269fbe72SBen Widawsky 	pol->nodes = *nodes;
24137012946SDavid Rientjes 	return 0;
24237012946SDavid Rientjes }
24337012946SDavid Rientjes 
24437012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
24537012946SDavid Rientjes {
2467858d7bcSFeng Tang 	if (nodes_empty(*nodes))
2477858d7bcSFeng Tang 		return -EINVAL;
248269fbe72SBen Widawsky 
249269fbe72SBen Widawsky 	nodes_clear(pol->nodes);
250269fbe72SBen Widawsky 	node_set(first_node(*nodes), pol->nodes);
25137012946SDavid Rientjes 	return 0;
25237012946SDavid Rientjes }
25337012946SDavid Rientjes 
25458568d2aSMiao Xie /*
25558568d2aSMiao Xie  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
25658568d2aSMiao Xie  * any, for the new policy.  mpol_new() has already validated the nodes
2577858d7bcSFeng Tang  * parameter with respect to the policy mode and flags.
25858568d2aSMiao Xie  *
25958568d2aSMiao Xie  * Must be called holding task's alloc_lock to protect task's mems_allowed
260c1e8d7c6SMichel Lespinasse  * and mempolicy.  May also be called holding the mmap_lock for write.
26158568d2aSMiao Xie  */
2624bfc4495SKAMEZAWA Hiroyuki static int mpol_set_nodemask(struct mempolicy *pol,
2634bfc4495SKAMEZAWA Hiroyuki 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
26458568d2aSMiao Xie {
26558568d2aSMiao Xie 	int ret;
26658568d2aSMiao Xie 
2677858d7bcSFeng Tang 	/*
2687858d7bcSFeng Tang 	 * Default (pol==NULL) and local memory policies are not
2697858d7bcSFeng Tang 	 * subject to any remapping. They also do not need any special
2707858d7bcSFeng Tang 	 * constructor.
2717858d7bcSFeng Tang 	 */
2727858d7bcSFeng Tang 	if (!pol || pol->mode == MPOL_LOCAL)
27358568d2aSMiao Xie 		return 0;
2747858d7bcSFeng Tang 
27501f13bd6SLai Jiangshan 	/* Check N_MEMORY */
2764bfc4495SKAMEZAWA Hiroyuki 	nodes_and(nsc->mask1,
27701f13bd6SLai Jiangshan 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
27858568d2aSMiao Xie 
27958568d2aSMiao Xie 	VM_BUG_ON(!nodes);
2807858d7bcSFeng Tang 
28158568d2aSMiao Xie 	if (pol->flags & MPOL_F_RELATIVE_NODES)
2824bfc4495SKAMEZAWA Hiroyuki 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
28358568d2aSMiao Xie 	else
2844bfc4495SKAMEZAWA Hiroyuki 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
2854bfc4495SKAMEZAWA Hiroyuki 
28658568d2aSMiao Xie 	if (mpol_store_user_nodemask(pol))
28758568d2aSMiao Xie 		pol->w.user_nodemask = *nodes;
28858568d2aSMiao Xie 	else
2897858d7bcSFeng Tang 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
29058568d2aSMiao Xie 
2914bfc4495SKAMEZAWA Hiroyuki 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
29258568d2aSMiao Xie 	return ret;
29358568d2aSMiao Xie }
29458568d2aSMiao Xie 
29558568d2aSMiao Xie /*
29658568d2aSMiao Xie  * This function just creates a new policy, does some checks and simple
29758568d2aSMiao Xie  * initialization. You must invoke mpol_set_nodemask() to set nodes.
29858568d2aSMiao Xie  */
299028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
300028fec41SDavid Rientjes 				  nodemask_t *nodes)
3011da177e4SLinus Torvalds {
3021da177e4SLinus Torvalds 	struct mempolicy *policy;
3031da177e4SLinus Torvalds 
3043e1f0645SDavid Rientjes 	if (mode == MPOL_DEFAULT) {
3053e1f0645SDavid Rientjes 		if (nodes && !nodes_empty(*nodes))
30637012946SDavid Rientjes 			return ERR_PTR(-EINVAL);
307d3a71033SLee Schermerhorn 		return NULL;
30837012946SDavid Rientjes 	}
3093e1f0645SDavid Rientjes 	VM_BUG_ON(!nodes);
3103e1f0645SDavid Rientjes 
3113e1f0645SDavid Rientjes 	/*
3123e1f0645SDavid Rientjes 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
3133e1f0645SDavid Rientjes 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
3143e1f0645SDavid Rientjes 	 * All other modes require a valid pointer to a non-empty nodemask.
3153e1f0645SDavid Rientjes 	 */
3163e1f0645SDavid Rientjes 	if (mode == MPOL_PREFERRED) {
3173e1f0645SDavid Rientjes 		if (nodes_empty(*nodes)) {
3183e1f0645SDavid Rientjes 			if (((flags & MPOL_F_STATIC_NODES) ||
3193e1f0645SDavid Rientjes 			     (flags & MPOL_F_RELATIVE_NODES)))
3203e1f0645SDavid Rientjes 				return ERR_PTR(-EINVAL);
3217858d7bcSFeng Tang 
3227858d7bcSFeng Tang 			mode = MPOL_LOCAL;
3233e1f0645SDavid Rientjes 		}
324479e2802SPeter Zijlstra 	} else if (mode == MPOL_LOCAL) {
3258d303e44SPiotr Kwapulinski 		if (!nodes_empty(*nodes) ||
3268d303e44SPiotr Kwapulinski 		    (flags & MPOL_F_STATIC_NODES) ||
3278d303e44SPiotr Kwapulinski 		    (flags & MPOL_F_RELATIVE_NODES))
328479e2802SPeter Zijlstra 			return ERR_PTR(-EINVAL);
3293e1f0645SDavid Rientjes 	} else if (nodes_empty(*nodes))
3303e1f0645SDavid Rientjes 		return ERR_PTR(-EINVAL);
331c36f6e6dSHugh Dickins 
3321da177e4SLinus Torvalds 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3331da177e4SLinus Torvalds 	if (!policy)
3341da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
3351da177e4SLinus Torvalds 	atomic_set(&policy->refcnt, 1);
33645c4745aSLee Schermerhorn 	policy->mode = mode;
33737012946SDavid Rientjes 	policy->flags = flags;
338c6018b4bSAneesh Kumar K.V 	policy->home_node = NUMA_NO_NODE;
3393e1f0645SDavid Rientjes 
34037012946SDavid Rientjes 	return policy;
34137012946SDavid Rientjes }
34237012946SDavid Rientjes 
34352cd3b07SLee Schermerhorn /* Slow path of a mpol destructor. */
344c36f6e6dSHugh Dickins void __mpol_put(struct mempolicy *pol)
34552cd3b07SLee Schermerhorn {
346c36f6e6dSHugh Dickins 	if (!atomic_dec_and_test(&pol->refcnt))
34752cd3b07SLee Schermerhorn 		return;
348c36f6e6dSHugh Dickins 	kmem_cache_free(policy_cache, pol);
34952cd3b07SLee Schermerhorn }
35052cd3b07SLee Schermerhorn 
351213980c0SVlastimil Babka static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
35237012946SDavid Rientjes {
35337012946SDavid Rientjes }
35437012946SDavid Rientjes 
355213980c0SVlastimil Babka static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
3561d0d2680SDavid Rientjes {
3571d0d2680SDavid Rientjes 	nodemask_t tmp;
3581d0d2680SDavid Rientjes 
35937012946SDavid Rientjes 	if (pol->flags & MPOL_F_STATIC_NODES)
36037012946SDavid Rientjes 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
36137012946SDavid Rientjes 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
36237012946SDavid Rientjes 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
3631d0d2680SDavid Rientjes 	else {
364269fbe72SBen Widawsky 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
365213980c0SVlastimil Babka 								*nodes);
36629b190faSzhong jiang 		pol->w.cpuset_mems_allowed = *nodes;
3671d0d2680SDavid Rientjes 	}
36837012946SDavid Rientjes 
369708c1bbcSMiao Xie 	if (nodes_empty(tmp))
370708c1bbcSMiao Xie 		tmp = *nodes;
371708c1bbcSMiao Xie 
372269fbe72SBen Widawsky 	pol->nodes = tmp;
37337012946SDavid Rientjes }
37437012946SDavid Rientjes 
37537012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
376213980c0SVlastimil Babka 						const nodemask_t *nodes)
37737012946SDavid Rientjes {
37837012946SDavid Rientjes 	pol->w.cpuset_mems_allowed = *nodes;
3791d0d2680SDavid Rientjes }
38037012946SDavid Rientjes 
381708c1bbcSMiao Xie /*
382708c1bbcSMiao Xie  * mpol_rebind_policy - Migrate a policy to a different set of nodes
383708c1bbcSMiao Xie  *
384c1e8d7c6SMichel Lespinasse  * Per-vma policies are protected by mmap_lock. Allocations using per-task
385213980c0SVlastimil Babka  * policies are protected by task->mems_allowed_seq to prevent a premature
386213980c0SVlastimil Babka  * OOM/allocation failure due to parallel nodemask modification.
387708c1bbcSMiao Xie  */
388213980c0SVlastimil Babka static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
38937012946SDavid Rientjes {
390018160adSWang Cheng 	if (!pol || pol->mode == MPOL_LOCAL)
39137012946SDavid Rientjes 		return;
3927858d7bcSFeng Tang 	if (!mpol_store_user_nodemask(pol) &&
39337012946SDavid Rientjes 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
39437012946SDavid Rientjes 		return;
395708c1bbcSMiao Xie 
396213980c0SVlastimil Babka 	mpol_ops[pol->mode].rebind(pol, newmask);
3971d0d2680SDavid Rientjes }
3981d0d2680SDavid Rientjes 
3991d0d2680SDavid Rientjes /*
4001d0d2680SDavid Rientjes  * Wrapper for mpol_rebind_policy() that just requires a task
4011d0d2680SDavid Rientjes  * pointer, and updates the task's mempolicy.
40258568d2aSMiao Xie  *
40358568d2aSMiao Xie  * Called with task's alloc_lock held.
4041d0d2680SDavid Rientjes  */
405213980c0SVlastimil Babka void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
4061d0d2680SDavid Rientjes {
407213980c0SVlastimil Babka 	mpol_rebind_policy(tsk->mempolicy, new);
4081d0d2680SDavid Rientjes }
4091d0d2680SDavid Rientjes 
4101d0d2680SDavid Rientjes /*
4111d0d2680SDavid Rientjes  * Rebind each vma in mm to new nodemask.
4121d0d2680SDavid Rientjes  *
413c1e8d7c6SMichel Lespinasse  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
4141d0d2680SDavid Rientjes  */
4151d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
4161d0d2680SDavid Rientjes {
4171d0d2680SDavid Rientjes 	struct vm_area_struct *vma;
41866850be5SLiam R. Howlett 	VMA_ITERATOR(vmi, mm, 0);
4191d0d2680SDavid Rientjes 
420d8ed45c5SMichel Lespinasse 	mmap_write_lock(mm);
4216c21e066SJann Horn 	for_each_vma(vmi, vma) {
4226c21e066SJann Horn 		vma_start_write(vma);
423213980c0SVlastimil Babka 		mpol_rebind_policy(vma->vm_policy, new);
4246c21e066SJann Horn 	}
425d8ed45c5SMichel Lespinasse 	mmap_write_unlock(mm);
4261d0d2680SDavid Rientjes }
4271d0d2680SDavid Rientjes 
42837012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
42937012946SDavid Rientjes 	[MPOL_DEFAULT] = {
43037012946SDavid Rientjes 		.rebind = mpol_rebind_default,
43137012946SDavid Rientjes 	},
43237012946SDavid Rientjes 	[MPOL_INTERLEAVE] = {
433be897d48SFeng Tang 		.create = mpol_new_nodemask,
43437012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
43537012946SDavid Rientjes 	},
43637012946SDavid Rientjes 	[MPOL_PREFERRED] = {
43737012946SDavid Rientjes 		.create = mpol_new_preferred,
43837012946SDavid Rientjes 		.rebind = mpol_rebind_preferred,
43937012946SDavid Rientjes 	},
44037012946SDavid Rientjes 	[MPOL_BIND] = {
441be897d48SFeng Tang 		.create = mpol_new_nodemask,
44237012946SDavid Rientjes 		.rebind = mpol_rebind_nodemask,
44337012946SDavid Rientjes 	},
4447858d7bcSFeng Tang 	[MPOL_LOCAL] = {
4457858d7bcSFeng Tang 		.rebind = mpol_rebind_default,
4467858d7bcSFeng Tang 	},
447b27abaccSDave Hansen 	[MPOL_PREFERRED_MANY] = {
448be897d48SFeng Tang 		.create = mpol_new_nodemask,
449b27abaccSDave Hansen 		.rebind = mpol_rebind_preferred,
450b27abaccSDave Hansen 	},
451fa3bea4eSGregory Price 	[MPOL_WEIGHTED_INTERLEAVE] = {
452fa3bea4eSGregory Price 		.create = mpol_new_nodemask,
453fa3bea4eSGregory Price 		.rebind = mpol_rebind_nodemask,
454fa3bea4eSGregory Price 	},
45537012946SDavid Rientjes };
45637012946SDavid Rientjes 
4571cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
458fc301289SChristoph Lameter 				unsigned long flags);
45972e315f7SHugh Dickins static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
46072e315f7SHugh Dickins 				pgoff_t ilx, int *nid);
4611a75a6c8SChristoph Lameter 
4621cb5d11aSHugh Dickins static bool strictly_unmovable(unsigned long flags)
4631cb5d11aSHugh Dickins {
4641cb5d11aSHugh Dickins 	/*
4651cb5d11aSHugh Dickins 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
4661cb5d11aSHugh Dickins 	 * if any misplaced page is found.
4671cb5d11aSHugh Dickins 	 */
4681cb5d11aSHugh Dickins 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
4691cb5d11aSHugh Dickins 			 MPOL_MF_STRICT;
4701cb5d11aSHugh Dickins }
4711cb5d11aSHugh Dickins 
47288c91dc5SHugh Dickins struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
47388c91dc5SHugh Dickins 	struct mempolicy *pol;
47488c91dc5SHugh Dickins 	pgoff_t ilx;
47588c91dc5SHugh Dickins };
476dc9aa5b9SChristoph Lameter 
4776f4576e3SNaoya Horiguchi struct queue_pages {
4786f4576e3SNaoya Horiguchi 	struct list_head *pagelist;
4796f4576e3SNaoya Horiguchi 	unsigned long flags;
4806f4576e3SNaoya Horiguchi 	nodemask_t *nmask;
481f18da660SLi Xinhai 	unsigned long start;
482f18da660SLi Xinhai 	unsigned long end;
483f18da660SLi Xinhai 	struct vm_area_struct *first;
4841cb5d11aSHugh Dickins 	struct folio *large;		/* note last large folio encountered */
4851cb5d11aSHugh Dickins 	long nr_failed;			/* could not be isolated at this time */
4866f4576e3SNaoya Horiguchi };
4876f4576e3SNaoya Horiguchi 
48898094945SNaoya Horiguchi /*
489d451b89dSVishal Moola (Oracle)  * Check if the folio's nid is in qp->nmask.
49088aaa2a1SNaoya Horiguchi  *
49188aaa2a1SNaoya Horiguchi  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
49288aaa2a1SNaoya Horiguchi  * in the invert of qp->nmask.
49388aaa2a1SNaoya Horiguchi  */
494d451b89dSVishal Moola (Oracle) static inline bool queue_folio_required(struct folio *folio,
49588aaa2a1SNaoya Horiguchi 					struct queue_pages *qp)
49688aaa2a1SNaoya Horiguchi {
497d451b89dSVishal Moola (Oracle) 	int nid = folio_nid(folio);
49888aaa2a1SNaoya Horiguchi 	unsigned long flags = qp->flags;
49988aaa2a1SNaoya Horiguchi 
50088aaa2a1SNaoya Horiguchi 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
50188aaa2a1SNaoya Horiguchi }
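/*
 * For example, with nmask = {1} and no MPOL_MF_INVERT, a folio on node 1
 * returns true (queue it) while a folio on node 0 returns false; with
 * MPOL_MF_INVERT set the results are reversed, so the walk queues folios
 * that lie *outside* the nodemask instead.
 */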
50288aaa2a1SNaoya Horiguchi 
5031cb5d11aSHugh Dickins static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
504c8633798SNaoya Horiguchi {
505de1f5055SVishal Moola (Oracle) 	struct folio *folio;
506c8633798SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
507c8633798SNaoya Horiguchi 
508c8633798SNaoya Horiguchi 	if (unlikely(is_pmd_migration_entry(*pmd))) {
5091cb5d11aSHugh Dickins 		qp->nr_failed++;
5101cb5d11aSHugh Dickins 		return;
511c8633798SNaoya Horiguchi 	}
512de1f5055SVishal Moola (Oracle) 	folio = pfn_folio(pmd_pfn(*pmd));
513de1f5055SVishal Moola (Oracle) 	if (is_huge_zero_page(&folio->page)) {
514e5947d23SYang Shi 		walk->action = ACTION_CONTINUE;
5151cb5d11aSHugh Dickins 		return;
516c8633798SNaoya Horiguchi 	}
517d451b89dSVishal Moola (Oracle) 	if (!queue_folio_required(folio, qp))
5181cb5d11aSHugh Dickins 		return;
5191cb5d11aSHugh Dickins 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
5201cb5d11aSHugh Dickins 	    !vma_migratable(walk->vma) ||
5211cb5d11aSHugh Dickins 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
5221cb5d11aSHugh Dickins 		qp->nr_failed++;
523c8633798SNaoya Horiguchi }
524c8633798SNaoya Horiguchi 
52588aaa2a1SNaoya Horiguchi /*
5261cb5d11aSHugh Dickins  * Scan through folios, checking if they satisfy the required conditions,
5271cb5d11aSHugh Dickins  * moving them from LRU to local pagelist for migration if they do (or not).
528d8835445SYang Shi  *
5291cb5d11aSHugh Dickins  * queue_folios_pte_range() has two possible return values:
5301cb5d11aSHugh Dickins  * 0 - continue walking to scan for more, even if an existing folio on the
5311cb5d11aSHugh Dickins  *     wrong node could not be isolated and queued for migration.
5321cb5d11aSHugh Dickins  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
5331cb5d11aSHugh Dickins  *        and an existing folio was on a node that does not follow the policy.
53498094945SNaoya Horiguchi  */
5353dae02bbSVishal Moola (Oracle) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
5366f4576e3SNaoya Horiguchi 			unsigned long end, struct mm_walk *walk)
5371da177e4SLinus Torvalds {
5386f4576e3SNaoya Horiguchi 	struct vm_area_struct *vma = walk->vma;
5393dae02bbSVishal Moola (Oracle) 	struct folio *folio;
5406f4576e3SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
5416f4576e3SNaoya Horiguchi 	unsigned long flags = qp->flags;
5423f088420SShijie Luo 	pte_t *pte, *mapped_pte;
543c33c7948SRyan Roberts 	pte_t ptent;
544705e87c0SHugh Dickins 	spinlock_t *ptl;
545941150a3SHugh Dickins 
546c8633798SNaoya Horiguchi 	ptl = pmd_trans_huge_lock(pmd, vma);
5471cb5d11aSHugh Dickins 	if (ptl) {
5481cb5d11aSHugh Dickins 		queue_folios_pmd(pmd, walk);
5491cb5d11aSHugh Dickins 		spin_unlock(ptl);
5501cb5d11aSHugh Dickins 		goto out;
5511cb5d11aSHugh Dickins 	}
55291612e0dSHugh Dickins 
5533f088420SShijie Luo 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
5547780d040SHugh Dickins 	if (!pte) {
5557780d040SHugh Dickins 		walk->action = ACTION_AGAIN;
5567780d040SHugh Dickins 		return 0;
5577780d040SHugh Dickins 	}
5586f4576e3SNaoya Horiguchi 	for (; addr != end; pte++, addr += PAGE_SIZE) {
559c33c7948SRyan Roberts 		ptent = ptep_get(pte);
5601cb5d11aSHugh Dickins 		if (pte_none(ptent))
56191612e0dSHugh Dickins 			continue;
5621cb5d11aSHugh Dickins 		if (!pte_present(ptent)) {
5631cb5d11aSHugh Dickins 			if (is_migration_entry(pte_to_swp_entry(ptent)))
5641cb5d11aSHugh Dickins 				qp->nr_failed++;
5651cb5d11aSHugh Dickins 			continue;
5661cb5d11aSHugh Dickins 		}
567c33c7948SRyan Roberts 		folio = vm_normal_folio(vma, addr, ptent);
5683dae02bbSVishal Moola (Oracle) 		if (!folio || folio_is_zone_device(folio))
56991612e0dSHugh Dickins 			continue;
570053837fcSNick Piggin 		/*
5713dae02bbSVishal Moola (Oracle) 		 * vm_normal_folio() filters out zero pages, but there might
5723dae02bbSVishal Moola (Oracle) 		 * still be reserved folios to skip, perhaps in a VDSO.
573053837fcSNick Piggin 		 */
5743dae02bbSVishal Moola (Oracle) 		if (folio_test_reserved(folio))
575f4598c8bSChristoph Lameter 			continue;
576d451b89dSVishal Moola (Oracle) 		if (!queue_folio_required(folio, qp))
57738e35860SChristoph Lameter 			continue;
5781cb5d11aSHugh Dickins 		if (folio_test_large(folio)) {
57924526268SYang Shi 			/*
5801cb5d11aSHugh Dickins 			 * A large folio can only be isolated from LRU once,
5811cb5d11aSHugh Dickins 			 * but may be mapped by many PTEs (and Copy-On-Write may
5821cb5d11aSHugh Dickins 			 * intersperse PTEs of other, order 0, folios).  This is
5831cb5d11aSHugh Dickins 			 * a common case, so don't mistake it for failure (but
5841cb5d11aSHugh Dickins 			 * there can be other cases of multi-mapped pages which
5851cb5d11aSHugh Dickins 			 * this quick check does not help to filter out - and a
5861cb5d11aSHugh Dickins 			 * search of the pagelist might grow to be prohibitive).
5871cb5d11aSHugh Dickins 			 *
5881cb5d11aSHugh Dickins 			 * migrate_pages(&pagelist) returns nr_failed folios, so
5891cb5d11aSHugh Dickins 			 * check "large" now so that queue_pages_range() returns
5901cb5d11aSHugh Dickins 			 * a comparable nr_failed folios.  This does imply that
5911cb5d11aSHugh Dickins 			 * if folio could not be isolated for some racy reason
5921cb5d11aSHugh Dickins 			 * at its first PTE, later PTEs will not give it another
5931cb5d11aSHugh Dickins 			 * chance of isolation; but keeps the accounting simple.
59424526268SYang Shi 			 */
5951cb5d11aSHugh Dickins 			if (folio == qp->large)
5961cb5d11aSHugh Dickins 				continue;
5971cb5d11aSHugh Dickins 			qp->large = folio;
5981cb5d11aSHugh Dickins 		}
5991cb5d11aSHugh Dickins 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
6001cb5d11aSHugh Dickins 		    !vma_migratable(vma) ||
6011cb5d11aSHugh Dickins 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
6021cb5d11aSHugh Dickins 			qp->nr_failed++;
6031cb5d11aSHugh Dickins 			if (strictly_unmovable(flags))
604a7f40cfeSYang Shi 				break;
6056f4576e3SNaoya Horiguchi 		}
6061cb5d11aSHugh Dickins 	}
6073f088420SShijie Luo 	pte_unmap_unlock(mapped_pte, ptl);
6086f4576e3SNaoya Horiguchi 	cond_resched();
6091cb5d11aSHugh Dickins out:
6101cb5d11aSHugh Dickins 	if (qp->nr_failed && strictly_unmovable(flags))
6111cb5d11aSHugh Dickins 		return -EIO;
6121cb5d11aSHugh Dickins 	return 0;
61391612e0dSHugh Dickins }
61491612e0dSHugh Dickins 
6150a2c1e81SVishal Moola (Oracle) static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
6166f4576e3SNaoya Horiguchi 			       unsigned long addr, unsigned long end,
6176f4576e3SNaoya Horiguchi 			       struct mm_walk *walk)
618e2d8cf40SNaoya Horiguchi {
619e2d8cf40SNaoya Horiguchi #ifdef CONFIG_HUGETLB_PAGE
6206f4576e3SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
6211cb5d11aSHugh Dickins 	unsigned long flags = qp->flags;
6220a2c1e81SVishal Moola (Oracle) 	struct folio *folio;
623cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
624d4c54919SNaoya Horiguchi 	pte_t entry;
625e2d8cf40SNaoya Horiguchi 
6266f4576e3SNaoya Horiguchi 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
6276f4576e3SNaoya Horiguchi 	entry = huge_ptep_get(pte);
6281cb5d11aSHugh Dickins 	if (!pte_present(entry)) {
6291cb5d11aSHugh Dickins 		if (unlikely(is_hugetlb_entry_migration(entry)))
6301cb5d11aSHugh Dickins 			qp->nr_failed++;
631d4c54919SNaoya Horiguchi 		goto unlock;
6321cb5d11aSHugh Dickins 	}
6330a2c1e81SVishal Moola (Oracle) 	folio = pfn_folio(pte_pfn(entry));
634d451b89dSVishal Moola (Oracle) 	if (!queue_folio_required(folio, qp))
635e2d8cf40SNaoya Horiguchi 		goto unlock;
6361cb5d11aSHugh Dickins 	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
6371cb5d11aSHugh Dickins 	    !vma_migratable(walk->vma)) {
6381cb5d11aSHugh Dickins 		qp->nr_failed++;
639dcf17635SLi Xinhai 		goto unlock;
640dcf17635SLi Xinhai 	}
641dcf17635SLi Xinhai 	/*
6421cb5d11aSHugh Dickins 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
6431cb5d11aSHugh Dickins 	 * Choosing not to migrate a shared folio is not counted as a failure.
6440a2c1e81SVishal Moola (Oracle) 	 *
6450a2c1e81SVishal Moola (Oracle) 	 * To check if the folio is shared, ideally we want to make sure
6460a2c1e81SVishal Moola (Oracle) 	 * every page is mapped to the same process. Doing that is very
6471cb5d11aSHugh Dickins 	 * expensive, so check the estimated sharers of the folio instead.
6480a2c1e81SVishal Moola (Oracle) 	 */
6491cb5d11aSHugh Dickins 	if ((flags & MPOL_MF_MOVE_ALL) ||
6501cb5d11aSHugh Dickins 	    (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
6511cb5d11aSHugh Dickins 		if (!isolate_hugetlb(folio, qp->pagelist))
6521cb5d11aSHugh Dickins 			qp->nr_failed++;
653e2d8cf40SNaoya Horiguchi unlock:
654cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
6551cb5d11aSHugh Dickins 	if (qp->nr_failed && strictly_unmovable(flags))
6561cb5d11aSHugh Dickins 		return -EIO;
657e2d8cf40SNaoya Horiguchi #endif
6581cb5d11aSHugh Dickins 	return 0;
6591da177e4SLinus Torvalds }
6601da177e4SLinus Torvalds 
6615877231fSAneesh Kumar K.V #ifdef CONFIG_NUMA_BALANCING
662b24f53a0SLee Schermerhorn /*
6634b10e7d5SMel Gorman  * This is used to mark a range of virtual addresses to be inaccessible.
6644b10e7d5SMel Gorman  * These are later cleared by a NUMA hinting fault. Depending on these
6654b10e7d5SMel Gorman  * faults, pages may be migrated for better NUMA placement.
6664b10e7d5SMel Gorman  *
6674b10e7d5SMel Gorman  * This is assuming that NUMA faults are handled using PROT_NONE. If
6684b10e7d5SMel Gorman  * an architecture makes a different choice, it will need further
6694b10e7d5SMel Gorman  * changes to the core.
670b24f53a0SLee Schermerhorn  */
6714b10e7d5SMel Gorman unsigned long change_prot_numa(struct vm_area_struct *vma,
6724b10e7d5SMel Gorman 			unsigned long addr, unsigned long end)
673b24f53a0SLee Schermerhorn {
6744a18419fSNadav Amit 	struct mmu_gather tlb;
675a79390f5SPeter Xu 	long nr_updated;
676b24f53a0SLee Schermerhorn 
6774a18419fSNadav Amit 	tlb_gather_mmu(&tlb, vma->vm_mm);
6784a18419fSNadav Amit 
6791ef488edSDavid Hildenbrand 	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
680d1751118SPeter Xu 	if (nr_updated > 0)
68103c5a6e1SMel Gorman 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
682b24f53a0SLee Schermerhorn 
6834a18419fSNadav Amit 	tlb_finish_mmu(&tlb);
6844a18419fSNadav Amit 
6854b10e7d5SMel Gorman 	return nr_updated;
686b24f53a0SLee Schermerhorn }
6875877231fSAneesh Kumar K.V #endif /* CONFIG_NUMA_BALANCING */
688b24f53a0SLee Schermerhorn 
6896f4576e3SNaoya Horiguchi static int queue_pages_test_walk(unsigned long start, unsigned long end,
6906f4576e3SNaoya Horiguchi 				struct mm_walk *walk)
6911da177e4SLinus Torvalds {
69266850be5SLiam R. Howlett 	struct vm_area_struct *next, *vma = walk->vma;
6936f4576e3SNaoya Horiguchi 	struct queue_pages *qp = walk->private;
6946f4576e3SNaoya Horiguchi 	unsigned long flags = qp->flags;
695dc9aa5b9SChristoph Lameter 
696a18b3ac2SLi Xinhai 	/* range check first */
697ce33135cSMiaohe Lin 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
698f18da660SLi Xinhai 
699f18da660SLi Xinhai 	if (!qp->first) {
700f18da660SLi Xinhai 		qp->first = vma;
701f18da660SLi Xinhai 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
702f18da660SLi Xinhai 			(qp->start < vma->vm_start))
703f18da660SLi Xinhai 			/* hole at head side of range */
704a18b3ac2SLi Xinhai 			return -EFAULT;
705a18b3ac2SLi Xinhai 	}
70666850be5SLiam R. Howlett 	next = find_vma(vma->vm_mm, vma->vm_end);
707f18da660SLi Xinhai 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
708f18da660SLi Xinhai 		((vma->vm_end < qp->end) &&
70966850be5SLiam R. Howlett 		(!next || vma->vm_end < next->vm_start)))
710f18da660SLi Xinhai 		/* hole at middle or tail of range */
711f18da660SLi Xinhai 		return -EFAULT;
712a18b3ac2SLi Xinhai 
713a7f40cfeSYang Shi 	/*
714a7f40cfeSYang Shi 	 * Need check MPOL_MF_STRICT to return -EIO if possible
715a7f40cfeSYang Shi 	 * regardless of vma_migratable
716a7f40cfeSYang Shi 	 */
717a7f40cfeSYang Shi 	if (!vma_migratable(vma) &&
718a7f40cfeSYang Shi 	    !(flags & MPOL_MF_STRICT))
71948684a65SNaoya Horiguchi 		return 1;
72048684a65SNaoya Horiguchi 
7211cb5d11aSHugh Dickins 	/*
7221cb5d11aSHugh Dickins 	 * Check page nodes, and queue pages to move, in the current vma.
7231cb5d11aSHugh Dickins 	 * But if no moving, and no strict checking, the scan can be skipped.
7241cb5d11aSHugh Dickins 	 */
7251cb5d11aSHugh Dickins 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
7266f4576e3SNaoya Horiguchi 		return 0;
7276f4576e3SNaoya Horiguchi 	return 1;
7286f4576e3SNaoya Horiguchi }
729b24f53a0SLee Schermerhorn 
7307b86ac33SChristoph Hellwig static const struct mm_walk_ops queue_pages_walk_ops = {
7310a2c1e81SVishal Moola (Oracle) 	.hugetlb_entry		= queue_folios_hugetlb,
7323dae02bbSVishal Moola (Oracle) 	.pmd_entry		= queue_folios_pte_range,
7337b86ac33SChristoph Hellwig 	.test_walk		= queue_pages_test_walk,
73449b06385SSuren Baghdasaryan 	.walk_lock		= PGWALK_RDLOCK,
73549b06385SSuren Baghdasaryan };
73649b06385SSuren Baghdasaryan 
73749b06385SSuren Baghdasaryan static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
73849b06385SSuren Baghdasaryan 	.hugetlb_entry		= queue_folios_hugetlb,
73949b06385SSuren Baghdasaryan 	.pmd_entry		= queue_folios_pte_range,
74049b06385SSuren Baghdasaryan 	.test_walk		= queue_pages_test_walk,
74149b06385SSuren Baghdasaryan 	.walk_lock		= PGWALK_WRLOCK,
7427b86ac33SChristoph Hellwig };
7437b86ac33SChristoph Hellwig 
7446f4576e3SNaoya Horiguchi /*
7456f4576e3SNaoya Horiguchi  * Walk through page tables and collect pages to be migrated.
7466f4576e3SNaoya Horiguchi  *
7471cb5d11aSHugh Dickins  * If pages found in a given range are not on the required set of @nodes,
7481cb5d11aSHugh Dickins  * and migration is allowed, they are isolated and queued to @pagelist.
749d8835445SYang Shi  *
7501cb5d11aSHugh Dickins  * queue_pages_range() may return:
7511cb5d11aSHugh Dickins  * 0 - all pages already on the right node, or successfully queued for moving
7521cb5d11aSHugh Dickins  *     (or neither strict checking nor moving requested: only range checking).
7531cb5d11aSHugh Dickins  * >0 - this number of misplaced folios could not be queued for moving
7541cb5d11aSHugh Dickins  *      (a hugetlbfs page or a transparent huge page being counted as 1).
7551cb5d11aSHugh Dickins  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
7561cb5d11aSHugh Dickins  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
7576f4576e3SNaoya Horiguchi  */
7581cb5d11aSHugh Dickins static long
7596f4576e3SNaoya Horiguchi queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
7606f4576e3SNaoya Horiguchi 		nodemask_t *nodes, unsigned long flags,
7611cb5d11aSHugh Dickins 		struct list_head *pagelist)
7626f4576e3SNaoya Horiguchi {
763f18da660SLi Xinhai 	int err;
7646f4576e3SNaoya Horiguchi 	struct queue_pages qp = {
7656f4576e3SNaoya Horiguchi 		.pagelist = pagelist,
7666f4576e3SNaoya Horiguchi 		.flags = flags,
7676f4576e3SNaoya Horiguchi 		.nmask = nodes,
768f18da660SLi Xinhai 		.start = start,
769f18da660SLi Xinhai 		.end = end,
770f18da660SLi Xinhai 		.first = NULL,
7716f4576e3SNaoya Horiguchi 	};
7721cb5d11aSHugh Dickins 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
77349b06385SSuren Baghdasaryan 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
7746f4576e3SNaoya Horiguchi 
77549b06385SSuren Baghdasaryan 	err = walk_page_range(mm, start, end, ops, &qp);
776f18da660SLi Xinhai 
777f18da660SLi Xinhai 	if (!qp.first)
778f18da660SLi Xinhai 		/* whole range in hole */
779f18da660SLi Xinhai 		err = -EFAULT;
780f18da660SLi Xinhai 
7811cb5d11aSHugh Dickins 	return err ? : qp.nr_failed;
7821da177e4SLinus Torvalds }
7831da177e4SLinus Torvalds 
784869833f2SKOSAKI Motohiro /*
785869833f2SKOSAKI Motohiro  * Apply policy to a single VMA
786c1e8d7c6SMichel Lespinasse  * This must be called with the mmap_lock held for writing.
787869833f2SKOSAKI Motohiro  */
788869833f2SKOSAKI Motohiro static int vma_replace_policy(struct vm_area_struct *vma,
789869833f2SKOSAKI Motohiro 				struct mempolicy *pol)
7908d34694cSKOSAKI Motohiro {
791869833f2SKOSAKI Motohiro 	int err;
792869833f2SKOSAKI Motohiro 	struct mempolicy *old;
793869833f2SKOSAKI Motohiro 	struct mempolicy *new;
7948d34694cSKOSAKI Motohiro 
7956c21e066SJann Horn 	vma_assert_write_locked(vma);
7966c21e066SJann Horn 
797869833f2SKOSAKI Motohiro 	new = mpol_dup(pol);
798869833f2SKOSAKI Motohiro 	if (IS_ERR(new))
799869833f2SKOSAKI Motohiro 		return PTR_ERR(new);
800869833f2SKOSAKI Motohiro 
801869833f2SKOSAKI Motohiro 	if (vma->vm_ops && vma->vm_ops->set_policy) {
8028d34694cSKOSAKI Motohiro 		err = vma->vm_ops->set_policy(vma, new);
803869833f2SKOSAKI Motohiro 		if (err)
804869833f2SKOSAKI Motohiro 			goto err_out;
8058d34694cSKOSAKI Motohiro 	}
806869833f2SKOSAKI Motohiro 
807869833f2SKOSAKI Motohiro 	old = vma->vm_policy;
808c1e8d7c6SMichel Lespinasse 	vma->vm_policy = new; /* protected by mmap_lock */
809869833f2SKOSAKI Motohiro 	mpol_put(old);
810869833f2SKOSAKI Motohiro 
811869833f2SKOSAKI Motohiro 	return 0;
812869833f2SKOSAKI Motohiro  err_out:
813869833f2SKOSAKI Motohiro 	mpol_put(new);
8148d34694cSKOSAKI Motohiro 	return err;
8158d34694cSKOSAKI Motohiro }
8168d34694cSKOSAKI Motohiro 
817f4e9e0e6SLiam R. Howlett /* Split or merge the VMA (if required) and apply the new policy */
818f4e9e0e6SLiam R. Howlett static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
819f4e9e0e6SLiam R. Howlett 		struct vm_area_struct **prev, unsigned long start,
8209d8cebd4SKOSAKI Motohiro 		unsigned long end, struct mempolicy *new_pol)
8211da177e4SLinus Torvalds {
822f4e9e0e6SLiam R. Howlett 	unsigned long vmstart, vmend;
8231da177e4SLinus Torvalds 
824f4e9e0e6SLiam R. Howlett 	vmend = min(end, vma->vm_end);
825f4e9e0e6SLiam R. Howlett 	if (start > vma->vm_start) {
826f4e9e0e6SLiam R. Howlett 		*prev = vma;
827f4e9e0e6SLiam R. Howlett 		vmstart = start;
828f4e9e0e6SLiam R. Howlett 	} else {
829f4e9e0e6SLiam R. Howlett 		vmstart = vma->vm_start;
830f4e9e0e6SLiam R. Howlett 	}
8319d8cebd4SKOSAKI Motohiro 
832c36f6e6dSHugh Dickins 	if (mpol_equal(vma->vm_policy, new_pol)) {
83300ca0f2eSLorenzo Stoakes 		*prev = vma;
834f4e9e0e6SLiam R. Howlett 		return 0;
83500ca0f2eSLorenzo Stoakes 	}
836e26a5114SKOSAKI Motohiro 
83794d7d923SLorenzo Stoakes 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
83894d7d923SLorenzo Stoakes 	if (IS_ERR(vma))
83994d7d923SLorenzo Stoakes 		return PTR_ERR(vma);
840f4e9e0e6SLiam R. Howlett 
841f4e9e0e6SLiam R. Howlett 	*prev = vma;
842f4e9e0e6SLiam R. Howlett 	return vma_replace_policy(vma, new_pol);
843f4e9e0e6SLiam R. Howlett }
844f4e9e0e6SLiam R. Howlett 
8451da177e4SLinus Torvalds /* Set the process memory policy */
846028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
847028fec41SDavid Rientjes 			     nodemask_t *nodes)
8481da177e4SLinus Torvalds {
84958568d2aSMiao Xie 	struct mempolicy *new, *old;
8504bfc4495SKAMEZAWA Hiroyuki 	NODEMASK_SCRATCH(scratch);
85158568d2aSMiao Xie 	int ret;
8521da177e4SLinus Torvalds 
8534bfc4495SKAMEZAWA Hiroyuki 	if (!scratch)
8544bfc4495SKAMEZAWA Hiroyuki 		return -ENOMEM;
855f4e53d91SLee Schermerhorn 
8564bfc4495SKAMEZAWA Hiroyuki 	new = mpol_new(mode, flags, nodes);
8574bfc4495SKAMEZAWA Hiroyuki 	if (IS_ERR(new)) {
8584bfc4495SKAMEZAWA Hiroyuki 		ret = PTR_ERR(new);
8594bfc4495SKAMEZAWA Hiroyuki 		goto out;
8604bfc4495SKAMEZAWA Hiroyuki 	}
8612c7c3a7dSOleg Nesterov 
86212c1dc8eSAbel Wu 	task_lock(current);
8634bfc4495SKAMEZAWA Hiroyuki 	ret = mpol_set_nodemask(new, nodes, scratch);
86458568d2aSMiao Xie 	if (ret) {
86512c1dc8eSAbel Wu 		task_unlock(current);
86658568d2aSMiao Xie 		mpol_put(new);
8674bfc4495SKAMEZAWA Hiroyuki 		goto out;
86858568d2aSMiao Xie 	}
86912c1dc8eSAbel Wu 
87058568d2aSMiao Xie 	old = current->mempolicy;
8711da177e4SLinus Torvalds 	current->mempolicy = new;
872fa3bea4eSGregory Price 	if (new && (new->mode == MPOL_INTERLEAVE ||
873fa3bea4eSGregory Price 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
87445816682SVlastimil Babka 		current->il_prev = MAX_NUMNODES-1;
875fa3bea4eSGregory Price 		current->il_weight = 0;
876fa3bea4eSGregory Price 	}
87758568d2aSMiao Xie 	task_unlock(current);
87858568d2aSMiao Xie 	mpol_put(old);
8794bfc4495SKAMEZAWA Hiroyuki 	ret = 0;
8804bfc4495SKAMEZAWA Hiroyuki out:
8814bfc4495SKAMEZAWA Hiroyuki 	NODEMASK_SCRATCH_FREE(scratch);
8824bfc4495SKAMEZAWA Hiroyuki 	return ret;
8831da177e4SLinus Torvalds }
8841da177e4SLinus Torvalds 
885bea904d5SLee Schermerhorn /*
886bea904d5SLee Schermerhorn  * Return nodemask for policy for get_mempolicy() query
88758568d2aSMiao Xie  *
88858568d2aSMiao Xie  * Called with task's alloc_lock held
889bea904d5SLee Schermerhorn  */
890c36f6e6dSHugh Dickins static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
8911da177e4SLinus Torvalds {
892dfcd3c0dSAndi Kleen 	nodes_clear(*nodes);
893c36f6e6dSHugh Dickins 	if (pol == &default_policy)
894bea904d5SLee Schermerhorn 		return;
895bea904d5SLee Schermerhorn 
896c36f6e6dSHugh Dickins 	switch (pol->mode) {
89719770b32SMel Gorman 	case MPOL_BIND:
8981da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
899269fbe72SBen Widawsky 	case MPOL_PREFERRED:
900b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
901fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
902c36f6e6dSHugh Dickins 		*nodes = pol->nodes;
9031da177e4SLinus Torvalds 		break;
9047858d7bcSFeng Tang 	case MPOL_LOCAL:
9057858d7bcSFeng Tang 		/* return empty node mask for local allocation */
9067858d7bcSFeng Tang 		break;
9071da177e4SLinus Torvalds 	default:
9081da177e4SLinus Torvalds 		BUG();
9091da177e4SLinus Torvalds 	}
9101da177e4SLinus Torvalds }
9111da177e4SLinus Torvalds 
9123b9aadf7SAndrea Arcangeli static int lookup_node(struct mm_struct *mm, unsigned long addr)
9131da177e4SLinus Torvalds {
914ba841078SPeter Xu 	struct page *p = NULL;
915f728b9c4SJohn Hubbard 	int ret;
9161da177e4SLinus Torvalds 
917f728b9c4SJohn Hubbard 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
918f728b9c4SJohn Hubbard 	if (ret > 0) {
919f728b9c4SJohn Hubbard 		ret = page_to_nid(p);
9201da177e4SLinus Torvalds 		put_page(p);
9211da177e4SLinus Torvalds 	}
922f728b9c4SJohn Hubbard 	return ret;
9231da177e4SLinus Torvalds }
9241da177e4SLinus Torvalds 
9251da177e4SLinus Torvalds /* Retrieve NUMA policy */
926dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
9271da177e4SLinus Torvalds 			     unsigned long addr, unsigned long flags)
9281da177e4SLinus Torvalds {
9298bccd85fSChristoph Lameter 	int err;
9301da177e4SLinus Torvalds 	struct mm_struct *mm = current->mm;
9311da177e4SLinus Torvalds 	struct vm_area_struct *vma = NULL;
9323b9aadf7SAndrea Arcangeli 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
9331da177e4SLinus Torvalds 
934754af6f5SLee Schermerhorn 	if (flags &
935754af6f5SLee Schermerhorn 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
9361da177e4SLinus Torvalds 		return -EINVAL;
937754af6f5SLee Schermerhorn 
938754af6f5SLee Schermerhorn 	if (flags & MPOL_F_MEMS_ALLOWED) {
939754af6f5SLee Schermerhorn 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
940754af6f5SLee Schermerhorn 			return -EINVAL;
941754af6f5SLee Schermerhorn 		*policy = 0;	/* just so it's initialized */
94258568d2aSMiao Xie 		task_lock(current);
943754af6f5SLee Schermerhorn 		*nmask  = cpuset_current_mems_allowed;
94458568d2aSMiao Xie 		task_unlock(current);
945754af6f5SLee Schermerhorn 		return 0;
946754af6f5SLee Schermerhorn 	}
947754af6f5SLee Schermerhorn 
9481da177e4SLinus Torvalds 	if (flags & MPOL_F_ADDR) {
949ddc1a5cbSHugh Dickins 		pgoff_t ilx;		/* ignored here */
950bea904d5SLee Schermerhorn 		/*
951bea904d5SLee Schermerhorn 		 * Do NOT fall back to task policy if the
952bea904d5SLee Schermerhorn 		 * vma/shared policy at addr is NULL.  We
953bea904d5SLee Schermerhorn 		 * want to return MPOL_DEFAULT in this case.
954bea904d5SLee Schermerhorn 		 */
955d8ed45c5SMichel Lespinasse 		mmap_read_lock(mm);
95633e3575cSLiam Howlett 		vma = vma_lookup(mm, addr);
9571da177e4SLinus Torvalds 		if (!vma) {
958d8ed45c5SMichel Lespinasse 			mmap_read_unlock(mm);
9591da177e4SLinus Torvalds 			return -EFAULT;
9601da177e4SLinus Torvalds 		}
961ddc1a5cbSHugh Dickins 		pol = __get_vma_policy(vma, addr, &ilx);
9621da177e4SLinus Torvalds 	} else if (addr)
9631da177e4SLinus Torvalds 		return -EINVAL;
9641da177e4SLinus Torvalds 
9651da177e4SLinus Torvalds 	if (!pol)
966bea904d5SLee Schermerhorn 		pol = &default_policy;	/* indicates default behavior */
9671da177e4SLinus Torvalds 
9681da177e4SLinus Torvalds 	if (flags & MPOL_F_NODE) {
9691da177e4SLinus Torvalds 		if (flags & MPOL_F_ADDR) {
9703b9aadf7SAndrea Arcangeli 			/*
971f728b9c4SJohn Hubbard 			 * Take a refcount on the mpol, because we are about to
972f728b9c4SJohn Hubbard 			 * drop the mmap_lock, after which only "pol" remains
973f728b9c4SJohn Hubbard 			 * valid, "vma" is stale.
9743b9aadf7SAndrea Arcangeli 			 */
9753b9aadf7SAndrea Arcangeli 			pol_refcount = pol;
9763b9aadf7SAndrea Arcangeli 			vma = NULL;
9773b9aadf7SAndrea Arcangeli 			mpol_get(pol);
978f728b9c4SJohn Hubbard 			mmap_read_unlock(mm);
9793b9aadf7SAndrea Arcangeli 			err = lookup_node(mm, addr);
9801da177e4SLinus Torvalds 			if (err < 0)
9811da177e4SLinus Torvalds 				goto out;
9828bccd85fSChristoph Lameter 			*policy = err;
9831da177e4SLinus Torvalds 		} else if (pol == current->mempolicy &&
98445c4745aSLee Schermerhorn 				pol->mode == MPOL_INTERLEAVE) {
985269fbe72SBen Widawsky 			*policy = next_node_in(current->il_prev, pol->nodes);
986fa3bea4eSGregory Price 		} else if (pol == current->mempolicy &&
987fa3bea4eSGregory Price 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
988fa3bea4eSGregory Price 			if (current->il_weight)
989fa3bea4eSGregory Price 				*policy = current->il_prev;
990fa3bea4eSGregory Price 			else
991fa3bea4eSGregory Price 				*policy = next_node_in(current->il_prev,
992fa3bea4eSGregory Price 						       pol->nodes);
9931da177e4SLinus Torvalds 		} else {
9941da177e4SLinus Torvalds 			err = -EINVAL;
9951da177e4SLinus Torvalds 			goto out;
9961da177e4SLinus Torvalds 		}
997bea904d5SLee Schermerhorn 	} else {
998bea904d5SLee Schermerhorn 		*policy = pol == &default_policy ? MPOL_DEFAULT :
999bea904d5SLee Schermerhorn 						pol->mode;
1000d79df630SDavid Rientjes 		/*
1001d79df630SDavid Rientjes 		 * Internal mempolicy flags must be masked off before exposing
1002d79df630SDavid Rientjes 		 * the policy to userspace.
1003d79df630SDavid Rientjes 		 */
1004d79df630SDavid Rientjes 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1005bea904d5SLee Schermerhorn 	}
10061da177e4SLinus Torvalds 
10071da177e4SLinus Torvalds 	err = 0;
100858568d2aSMiao Xie 	if (nmask) {
1009c6b6ef8bSLee Schermerhorn 		if (mpol_store_user_nodemask(pol)) {
1010c6b6ef8bSLee Schermerhorn 			*nmask = pol->w.user_nodemask;
1011c6b6ef8bSLee Schermerhorn 		} else {
101258568d2aSMiao Xie 			task_lock(current);
1013bea904d5SLee Schermerhorn 			get_policy_nodemask(pol, nmask);
101458568d2aSMiao Xie 			task_unlock(current);
101558568d2aSMiao Xie 		}
1016c6b6ef8bSLee Schermerhorn 	}
10171da177e4SLinus Torvalds 
10181da177e4SLinus Torvalds  out:
101952cd3b07SLee Schermerhorn 	mpol_cond_put(pol);
10201da177e4SLinus Torvalds 	if (vma)
1021d8ed45c5SMichel Lespinasse 		mmap_read_unlock(mm);
10223b9aadf7SAndrea Arcangeli 	if (pol_refcount)
10233b9aadf7SAndrea Arcangeli 		mpol_put(pol_refcount);
10241da177e4SLinus Torvalds 	return err;
10251da177e4SLinus Torvalds }
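/*
 * Editorial example (illustrative; assumes libnuma's <numaif.h> and some
 * mapped address "addr"): asking which node currently backs a page, which
 * exercises the MPOL_F_NODE | MPOL_F_ADDR path above:
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */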
10261da177e4SLinus Torvalds 
1027b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION
10281cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1029fc301289SChristoph Lameter 				unsigned long flags)
10306ce3c4c0SChristoph Lameter {
10316ce3c4c0SChristoph Lameter 	/*
10321cb5d11aSHugh Dickins 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
10331cb5d11aSHugh Dickins 	 * Choosing not to migrate a shared folio is not counted as a failure.
10344a64981dSVishal Moola (Oracle) 	 *
10354a64981dSVishal Moola (Oracle) 	 * To check if the folio is shared, ideally we want to make sure
10364a64981dSVishal Moola (Oracle) 	 * every page is mapped to the same process. Doing that is very
10371cb5d11aSHugh Dickins 	 * expensive, so check the estimated sharers of the folio instead.
10386ce3c4c0SChristoph Lameter 	 */
10394a64981dSVishal Moola (Oracle) 	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1040be2d5756SBaolin Wang 		if (folio_isolate_lru(folio)) {
10414a64981dSVishal Moola (Oracle) 			list_add_tail(&folio->lru, foliolist);
10424a64981dSVishal Moola (Oracle) 			node_stat_mod_folio(folio,
10434a64981dSVishal Moola (Oracle) 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
10444a64981dSVishal Moola (Oracle) 				folio_nr_pages(folio));
10451cb5d11aSHugh Dickins 		} else {
1046a53190a4SYang Shi 			/*
10474a64981dSVishal Moola (Oracle) 			 * A non-movable folio may reach here.  And, there may be
10484a64981dSVishal Moola (Oracle) 			 * temporarily off-LRU folios or non-LRU movable folios.
10494a64981dSVishal Moola (Oracle) 			 * Treat them as unmovable folios since they can't be
10501cb5d11aSHugh Dickins 			 * isolated, so they can't be moved at the moment.
1051a53190a4SYang Shi 			 */
10521cb5d11aSHugh Dickins 			return false;
105362695a84SNick Piggin 		}
105462695a84SNick Piggin 	}
10551cb5d11aSHugh Dickins 	return true;
10566ce3c4c0SChristoph Lameter }
10576ce3c4c0SChristoph Lameter 
10586ce3c4c0SChristoph Lameter /*
10597e2ab150SChristoph Lameter  * Migrate pages from one node to a target node.
10607e2ab150SChristoph Lameter  * Returns error or the number of pages not migrated.
10617e2ab150SChristoph Lameter  */
10621cb5d11aSHugh Dickins static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1063dbcb0f19SAdrian Bunk 			    int flags)
10647e2ab150SChristoph Lameter {
10657e2ab150SChristoph Lameter 	nodemask_t nmask;
106666850be5SLiam R. Howlett 	struct vm_area_struct *vma;
10677e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
10681cb5d11aSHugh Dickins 	long nr_failed;
10691cb5d11aSHugh Dickins 	long err = 0;
1070a0976311SJoonsoo Kim 	struct migration_target_control mtc = {
1071a0976311SJoonsoo Kim 		.nid = dest,
1072a0976311SJoonsoo Kim 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1073a0976311SJoonsoo Kim 	};
10747e2ab150SChristoph Lameter 
10757e2ab150SChristoph Lameter 	nodes_clear(nmask);
10767e2ab150SChristoph Lameter 	node_set(source, nmask);
10777e2ab150SChristoph Lameter 
107808270807SMinchan Kim 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
107972e315f7SHugh Dickins 
108072e315f7SHugh Dickins 	mmap_read_lock(mm);
10811cb5d11aSHugh Dickins 	vma = find_vma(mm, 0);
10821cb5d11aSHugh Dickins 
10831cb5d11aSHugh Dickins 	/*
10841cb5d11aSHugh Dickins 	 * This does not migrate the range, but isolates all pages that
10851cb5d11aSHugh Dickins 	 * need migration.  Between passing in the full user address
10861cb5d11aSHugh Dickins 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
10871cb5d11aSHugh Dickins 	 * but passes back the count of pages which could not be isolated.
10881cb5d11aSHugh Dickins 	 */
10891cb5d11aSHugh Dickins 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
10901cb5d11aSHugh Dickins 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
109172e315f7SHugh Dickins 	mmap_read_unlock(mm);
10927e2ab150SChristoph Lameter 
1093cf608ac1SMinchan Kim 	if (!list_empty(&pagelist)) {
1094a0976311SJoonsoo Kim 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
10955ac95884SYang Shi 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1096cf608ac1SMinchan Kim 		if (err)
1097e2d8cf40SNaoya Horiguchi 			putback_movable_pages(&pagelist);
1098cf608ac1SMinchan Kim 	}
109995a402c3SChristoph Lameter 
11001cb5d11aSHugh Dickins 	if (err >= 0)
11011cb5d11aSHugh Dickins 		err += nr_failed;
11027e2ab150SChristoph Lameter 	return err;
11037e2ab150SChristoph Lameter }
11047e2ab150SChristoph Lameter 
11057e2ab150SChristoph Lameter /*
11067e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
11077e2ab150SChristoph Lameter  * layout as much as possible.
110839743889SChristoph Lameter  *
110939743889SChristoph Lameter  * Returns the number of pages that could not be moved.
111039743889SChristoph Lameter  */
11110ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
11120ce72d4fSAndrew Morton 		     const nodemask_t *to, int flags)
111339743889SChristoph Lameter {
11141cb5d11aSHugh Dickins 	long nr_failed = 0;
11151cb5d11aSHugh Dickins 	long err = 0;
11167e2ab150SChristoph Lameter 	nodemask_t tmp;
111739743889SChristoph Lameter 
1118361a2a22SMinchan Kim 	lru_cache_disable();
11190aedadf9SChristoph Lameter 
11207e2ab150SChristoph Lameter 	/*
11217e2ab150SChristoph Lameter 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
11227e2ab150SChristoph Lameter 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
11237e2ab150SChristoph Lameter 	 * bit in 'tmp', and return that <source, dest> pair for migration.
11247e2ab150SChristoph Lameter 	 * The pair of nodemasks 'to' and 'from' define the map.
11257e2ab150SChristoph Lameter 	 *
11267e2ab150SChristoph Lameter 	 * If no pair of bits is found that way, fallback to picking some
11277e2ab150SChristoph Lameter 	 * pair of 'source' and 'dest' bits that are not the same.  If the
11287e2ab150SChristoph Lameter 	 * 'source' and 'dest' bits are the same, this represents a node
11297e2ab150SChristoph Lameter 	 * that will be migrating to itself, so no pages need move.
11307e2ab150SChristoph Lameter 	 *
11317e2ab150SChristoph Lameter 	 * If no bits are left in 'tmp', or if all remaining bits left
11327e2ab150SChristoph Lameter 	 * in 'tmp' correspond to the same bit in 'to', return false
11337e2ab150SChristoph Lameter 	 * (nothing left to migrate).
11347e2ab150SChristoph Lameter 	 *
11357e2ab150SChristoph Lameter 	 * This lets us pick a pair of nodes to migrate between, such that
11367e2ab150SChristoph Lameter 	 * if possible the dest node is not already occupied by some other
11377e2ab150SChristoph Lameter 	 * source node, minimizing the risk of overloading the memory on a
11387e2ab150SChristoph Lameter 	 * node that would happen if we migrated incoming memory to a node
11397e2ab150SChristoph Lameter 	 * before migrating outgoing memory sourced from that same node.
11407e2ab150SChristoph Lameter 	 *
11417e2ab150SChristoph Lameter 	 * A single scan of tmp is sufficient.  As we go, we remember the
11427e2ab150SChristoph Lameter 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
11437e2ab150SChristoph Lameter 	 * that not only moved, but what's better, moved to an empty slot
11447e2ab150SChristoph Lameter 	 * (d is not set in tmp), then we break out with that pair.
1145ae0e47f0SJustin P. Mattock 	 * Otherwise, when we finish scanning tmp, we at least have the
11467e2ab150SChristoph Lameter 	 * most recent <s, d> pair that moved.  If we get all the way through
11477e2ab150SChristoph Lameter 	 * the scan of tmp without finding any node that moved, much less
11487e2ab150SChristoph Lameter 	 * moved to an empty node, then there is nothing left worth migrating.
11497e2ab150SChristoph Lameter 	 */
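	/*
	 * Illustrative example (hypothetical nodemasks, chosen only to
	 * show the scan order): migrating from = {0,1} to = {1,2}.  The
	 * first scan prefers the pair <1,2> because node 2 is not a
	 * remaining source, so node 1's pages are moved to node 2 before
	 * node 0's pages are moved into node 1, preserving the relative
	 * layout without first overloading node 1.
	 */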
11507e2ab150SChristoph Lameter 
11510ce72d4fSAndrew Morton 	tmp = *from;
11527e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
11537e2ab150SChristoph Lameter 		int s, d;
1154b76ac7e7SJianguo Wu 		int source = NUMA_NO_NODE;
11557e2ab150SChristoph Lameter 		int dest = 0;
11567e2ab150SChristoph Lameter 
11577e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
11584a5b18ccSLarry Woodman 
11594a5b18ccSLarry Woodman 			/*
11604a5b18ccSLarry Woodman 			 * do_migrate_pages() tries to maintain the relative
11614a5b18ccSLarry Woodman 			 * node relationship of the pages established between
11624a5b18ccSLarry Woodman 			 * threads and memory areas.
11634a5b18ccSLarry Woodman 			 *
11644a5b18ccSLarry Woodman 			 * However, if the number of source nodes is not equal to
11654a5b18ccSLarry Woodman 			 * the number of destination nodes, we cannot preserve
11664a5b18ccSLarry Woodman 			 * this node-relative relationship.  In that case, skip
11674a5b18ccSLarry Woodman 			 * copying memory from a node that is in the destination
11684a5b18ccSLarry Woodman 			 * mask.
11694a5b18ccSLarry Woodman 			 *
11704a5b18ccSLarry Woodman 			 * Example: [2,3,4] -> [3,4,5] moves everything.
11714a5b18ccSLarry Woodman 			 *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
11724a5b18ccSLarry Woodman 			 */
11734a5b18ccSLarry Woodman 
11740ce72d4fSAndrew Morton 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
11750ce72d4fSAndrew Morton 						(node_isset(s, *to)))
11764a5b18ccSLarry Woodman 				continue;
11774a5b18ccSLarry Woodman 
11780ce72d4fSAndrew Morton 			d = node_remap(s, *from, *to);
11797e2ab150SChristoph Lameter 			if (s == d)
11807e2ab150SChristoph Lameter 				continue;
11817e2ab150SChristoph Lameter 
11827e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
11837e2ab150SChristoph Lameter 			dest = d;
11847e2ab150SChristoph Lameter 
11857e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
11867e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
11877e2ab150SChristoph Lameter 				break;
11887e2ab150SChristoph Lameter 		}
1189b76ac7e7SJianguo Wu 		if (source == NUMA_NO_NODE)
11907e2ab150SChristoph Lameter 			break;
11917e2ab150SChristoph Lameter 
11927e2ab150SChristoph Lameter 		node_clear(source, tmp);
11937e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
11947e2ab150SChristoph Lameter 		if (err > 0)
11951cb5d11aSHugh Dickins 			nr_failed += err;
11967e2ab150SChristoph Lameter 		if (err < 0)
11977e2ab150SChristoph Lameter 			break;
119839743889SChristoph Lameter 	}
1199d479960eSMinchan Kim 
1200361a2a22SMinchan Kim 	lru_cache_enable();
12017e2ab150SChristoph Lameter 	if (err < 0)
12027e2ab150SChristoph Lameter 		return err;
12031cb5d11aSHugh Dickins 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
120439743889SChristoph Lameter }
120539743889SChristoph Lameter 
12063ad33b24SLee Schermerhorn /*
120772e315f7SHugh Dickins  * Allocate a new folio for page migration, according to NUMA mempolicy.
12083ad33b24SLee Schermerhorn  */
120972e315f7SHugh Dickins static struct folio *alloc_migration_target_by_mpol(struct folio *src,
121072e315f7SHugh Dickins 						    unsigned long private)
121195a402c3SChristoph Lameter {
121288c91dc5SHugh Dickins 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
121388c91dc5SHugh Dickins 	struct mempolicy *pol = mmpol->pol;
121488c91dc5SHugh Dickins 	pgoff_t ilx = mmpol->ilx;
121572e315f7SHugh Dickins 	struct page *page;
121672e315f7SHugh Dickins 	unsigned int order;
121772e315f7SHugh Dickins 	int nid = numa_node_id();
121872e315f7SHugh Dickins 	gfp_t gfp;
121995a402c3SChristoph Lameter 
122072e315f7SHugh Dickins 	order = folio_order(src);
122172e315f7SHugh Dickins 	ilx += src->index >> order;
12223ad33b24SLee Schermerhorn 
1223d0ce0e47SSidhartha Kumar 	if (folio_test_hugetlb(src)) {
122472e315f7SHugh Dickins 		nodemask_t *nodemask;
122572e315f7SHugh Dickins 		struct hstate *h;
122672e315f7SHugh Dickins 
122772e315f7SHugh Dickins 		h = folio_hstate(src);
122872e315f7SHugh Dickins 		gfp = htlb_alloc_mask(h);
122972e315f7SHugh Dickins 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
123072e315f7SHugh Dickins 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
1231d0ce0e47SSidhartha Kumar 	}
1232c8633798SNaoya Horiguchi 
1233ec4858e0SMatthew Wilcox (Oracle) 	if (folio_test_large(src))
1234ec4858e0SMatthew Wilcox (Oracle) 		gfp = GFP_TRANSHUGE;
123572e315f7SHugh Dickins 	else
123672e315f7SHugh Dickins 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1237ec4858e0SMatthew Wilcox (Oracle) 
123872e315f7SHugh Dickins 	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
123972e315f7SHugh Dickins 	return page_rmappable_folio(page);
124095a402c3SChristoph Lameter }
1241b20a3503SChristoph Lameter #else
1242b20a3503SChristoph Lameter 
12431cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1244b20a3503SChristoph Lameter 				unsigned long flags)
1245b20a3503SChristoph Lameter {
12461cb5d11aSHugh Dickins 	return false;
1247b20a3503SChristoph Lameter }
1248b20a3503SChristoph Lameter 
12490ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
12500ce72d4fSAndrew Morton 		     const nodemask_t *to, int flags)
1251b20a3503SChristoph Lameter {
1252b20a3503SChristoph Lameter 	return -ENOSYS;
1253b20a3503SChristoph Lameter }
125495a402c3SChristoph Lameter 
125572e315f7SHugh Dickins static struct folio *alloc_migration_target_by_mpol(struct folio *src,
125672e315f7SHugh Dickins 						    unsigned long private)
125795a402c3SChristoph Lameter {
125895a402c3SChristoph Lameter 	return NULL;
125995a402c3SChristoph Lameter }
1260b20a3503SChristoph Lameter #endif
1261b20a3503SChristoph Lameter 
1262dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
1263028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
1264028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
12656ce3c4c0SChristoph Lameter {
12666ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
1267f4e9e0e6SLiam R. Howlett 	struct vm_area_struct *vma, *prev;
1268f4e9e0e6SLiam R. Howlett 	struct vma_iterator vmi;
126988c91dc5SHugh Dickins 	struct migration_mpol mmpol;
12706ce3c4c0SChristoph Lameter 	struct mempolicy *new;
12716ce3c4c0SChristoph Lameter 	unsigned long end;
12721cb5d11aSHugh Dickins 	long err;
12731cb5d11aSHugh Dickins 	long nr_failed;
12746ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
12756ce3c4c0SChristoph Lameter 
1276b24f53a0SLee Schermerhorn 	if (flags & ~(unsigned long)MPOL_MF_VALID)
12776ce3c4c0SChristoph Lameter 		return -EINVAL;
127874c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
12796ce3c4c0SChristoph Lameter 		return -EPERM;
12806ce3c4c0SChristoph Lameter 
12816ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
12826ce3c4c0SChristoph Lameter 		return -EINVAL;
12836ce3c4c0SChristoph Lameter 
12846ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
12856ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
12866ce3c4c0SChristoph Lameter 
1287aaa31e05Sze zuo 	len = PAGE_ALIGN(len);
12886ce3c4c0SChristoph Lameter 	end = start + len;
12896ce3c4c0SChristoph Lameter 
12906ce3c4c0SChristoph Lameter 	if (end < start)
12916ce3c4c0SChristoph Lameter 		return -EINVAL;
12926ce3c4c0SChristoph Lameter 	if (end == start)
12936ce3c4c0SChristoph Lameter 		return 0;
12946ce3c4c0SChristoph Lameter 
1295028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
12966ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
12976ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
12986ce3c4c0SChristoph Lameter 
12996ce3c4c0SChristoph Lameter 	/*
13006ce3c4c0SChristoph Lameter 	 * If we are using the default policy, then operating
13016ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all.
13026ce3c4c0SChristoph Lameter 	 */
13036ce3c4c0SChristoph Lameter 	if (!new)
13046ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
13056ce3c4c0SChristoph Lameter 
13061cb5d11aSHugh Dickins 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1307361a2a22SMinchan Kim 		lru_cache_disable();
13084bfc4495SKAMEZAWA Hiroyuki 	{
13094bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH(scratch);
13104bfc4495SKAMEZAWA Hiroyuki 		if (scratch) {
1311d8ed45c5SMichel Lespinasse 			mmap_write_lock(mm);
13124bfc4495SKAMEZAWA Hiroyuki 			err = mpol_set_nodemask(new, nmask, scratch);
13134bfc4495SKAMEZAWA Hiroyuki 			if (err)
1314d8ed45c5SMichel Lespinasse 				mmap_write_unlock(mm);
13154bfc4495SKAMEZAWA Hiroyuki 		} else
13164bfc4495SKAMEZAWA Hiroyuki 			err = -ENOMEM;
13174bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH_FREE(scratch);
13184bfc4495SKAMEZAWA Hiroyuki 	}
1319b05ca738SKOSAKI Motohiro 	if (err)
1320b05ca738SKOSAKI Motohiro 		goto mpol_out;
1321b05ca738SKOSAKI Motohiro 
13226c21e066SJann Horn 	/*
13231cb5d11aSHugh Dickins 	 * Lock the VMAs before scanning for pages to migrate,
13241cb5d11aSHugh Dickins 	 * to ensure we don't miss a concurrently inserted page.
13256c21e066SJann Horn 	 */
13261cb5d11aSHugh Dickins 	nr_failed = queue_pages_range(mm, start, end, nmask,
13271cb5d11aSHugh Dickins 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1328d8835445SYang Shi 
13291cb5d11aSHugh Dickins 	if (nr_failed < 0) {
13301cb5d11aSHugh Dickins 		err = nr_failed;
133172e315f7SHugh Dickins 		nr_failed = 0;
13321cb5d11aSHugh Dickins 	} else {
1333f4e9e0e6SLiam R. Howlett 		vma_iter_init(&vmi, mm, start);
1334f4e9e0e6SLiam R. Howlett 		prev = vma_prev(&vmi);
1335f4e9e0e6SLiam R. Howlett 		for_each_vma_range(vmi, vma, end) {
1336f4e9e0e6SLiam R. Howlett 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1337f4e9e0e6SLiam R. Howlett 			if (err)
1338f4e9e0e6SLiam R. Howlett 				break;
1339f4e9e0e6SLiam R. Howlett 		}
1340cf608ac1SMinchan Kim 	}
13416ce3c4c0SChristoph Lameter 
134272e315f7SHugh Dickins 	if (!err && !list_empty(&pagelist)) {
134372e315f7SHugh Dickins 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
134472e315f7SHugh Dickins 		if (!new) {
134572e315f7SHugh Dickins 			new = get_task_policy(current);
134672e315f7SHugh Dickins 			mpol_get(new);
13471cb5d11aSHugh Dickins 		}
134888c91dc5SHugh Dickins 		mmpol.pol = new;
134988c91dc5SHugh Dickins 		mmpol.ilx = 0;
135088c91dc5SHugh Dickins 
135188c91dc5SHugh Dickins 		/*
135288c91dc5SHugh Dickins 		 * In the interleaved case, attempt to allocate on exactly the
135388c91dc5SHugh Dickins 		 * targeted nodes, for the first VMA to be migrated; for later
135488c91dc5SHugh Dickins 		 * VMAs, the nodes will still be interleaved from the targeted
135588c91dc5SHugh Dickins 		 * nodemask, but one by one may be selected differently.
135688c91dc5SHugh Dickins 		 */
1357fa3bea4eSGregory Price 		if (new->mode == MPOL_INTERLEAVE ||
1358fa3bea4eSGregory Price 		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1359*f1cce6f7SMatthew Wilcox (Oracle) 			struct folio *folio;
136088c91dc5SHugh Dickins 			unsigned int order;
136188c91dc5SHugh Dickins 			unsigned long addr = -EFAULT;
136288c91dc5SHugh Dickins 
1363*f1cce6f7SMatthew Wilcox (Oracle) 			list_for_each_entry(folio, &pagelist, lru) {
1364*f1cce6f7SMatthew Wilcox (Oracle) 				if (!folio_test_ksm(folio))
136588c91dc5SHugh Dickins 					break;
136688c91dc5SHugh Dickins 			}
1367*f1cce6f7SMatthew Wilcox (Oracle) 			if (!list_entry_is_head(folio, &pagelist, lru)) {
136888c91dc5SHugh Dickins 				vma_iter_init(&vmi, mm, start);
136988c91dc5SHugh Dickins 				for_each_vma_range(vmi, vma, end) {
1370*f1cce6f7SMatthew Wilcox (Oracle) 					addr = page_address_in_vma(
1371*f1cce6f7SMatthew Wilcox (Oracle) 						folio_page(folio, 0), vma);
137288c91dc5SHugh Dickins 					if (addr != -EFAULT)
137388c91dc5SHugh Dickins 						break;
137488c91dc5SHugh Dickins 				}
137588c91dc5SHugh Dickins 			}
137688c91dc5SHugh Dickins 			if (addr != -EFAULT) {
1377*f1cce6f7SMatthew Wilcox (Oracle) 				order = folio_order(folio);
137888c91dc5SHugh Dickins 				/* We already know the pol, but not the ilx */
137988c91dc5SHugh Dickins 				mpol_cond_put(get_vma_policy(vma, addr, order,
138088c91dc5SHugh Dickins 							     &mmpol.ilx));
138188c91dc5SHugh Dickins 				/* Set base from which to increment by index */
1382*f1cce6f7SMatthew Wilcox (Oracle) 				mmpol.ilx -= folio->index >> order;
138388c91dc5SHugh Dickins 			}
138488c91dc5SHugh Dickins 		}
1385a85dfc30SYang Shi 	}
1386a85dfc30SYang Shi 
1387d8ed45c5SMichel Lespinasse 	mmap_write_unlock(mm);
138888c91dc5SHugh Dickins 
138988c91dc5SHugh Dickins 	if (!err && !list_empty(&pagelist)) {
139072e315f7SHugh Dickins 		nr_failed |= migrate_pages(&pagelist,
139172e315f7SHugh Dickins 				alloc_migration_target_by_mpol, NULL,
139288c91dc5SHugh Dickins 				(unsigned long)&mmpol, MIGRATE_SYNC,
139372e315f7SHugh Dickins 				MR_MEMPOLICY_MBIND, NULL);
139472e315f7SHugh Dickins 	}
139572e315f7SHugh Dickins 
13961cb5d11aSHugh Dickins 	if (nr_failed && (flags & MPOL_MF_STRICT))
13971cb5d11aSHugh Dickins 		err = -EIO;
13986ce3c4c0SChristoph Lameter 	if (!list_empty(&pagelist))
1399b05ca738SKOSAKI Motohiro 		putback_movable_pages(&pagelist);
14006ce3c4c0SChristoph Lameter mpol_out:
1401f0be3d32SLee Schermerhorn 	mpol_put(new);
1402d479960eSMinchan Kim 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1403361a2a22SMinchan Kim 		lru_cache_enable();
14046ce3c4c0SChristoph Lameter 	return err;
14056ce3c4c0SChristoph Lameter }
14066ce3c4c0SChristoph Lameter 
140739743889SChristoph Lameter /*
14088bccd85fSChristoph Lameter  * User space interface with variable sized bitmaps for nodelists.
14098bccd85fSChristoph Lameter  */
1410e130242dSArnd Bergmann static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1411e130242dSArnd Bergmann 		      unsigned long maxnode)
1412e130242dSArnd Bergmann {
1413e130242dSArnd Bergmann 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1414e130242dSArnd Bergmann 	int ret;
1415e130242dSArnd Bergmann 
1416e130242dSArnd Bergmann 	if (in_compat_syscall())
1417e130242dSArnd Bergmann 		ret = compat_get_bitmap(mask,
1418e130242dSArnd Bergmann 					(const compat_ulong_t __user *)nmask,
1419e130242dSArnd Bergmann 					maxnode);
1420e130242dSArnd Bergmann 	else
1421e130242dSArnd Bergmann 		ret = copy_from_user(mask, nmask,
1422e130242dSArnd Bergmann 				     nlongs * sizeof(unsigned long));
1423e130242dSArnd Bergmann 
1424e130242dSArnd Bergmann 	if (ret)
1425e130242dSArnd Bergmann 		return -EFAULT;
1426e130242dSArnd Bergmann 
1427e130242dSArnd Bergmann 	if (maxnode % BITS_PER_LONG)
1428e130242dSArnd Bergmann 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1429e130242dSArnd Bergmann 
1430e130242dSArnd Bergmann 	return 0;
1431e130242dSArnd Bergmann }
14328bccd85fSChristoph Lameter 
14338bccd85fSChristoph Lameter /* Copy a node mask from user space. */
143439743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
14358bccd85fSChristoph Lameter 		     unsigned long maxnode)
14368bccd85fSChristoph Lameter {
14378bccd85fSChristoph Lameter 	--maxnode;
14388bccd85fSChristoph Lameter 	nodes_clear(*nodes);
14398bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
14408bccd85fSChristoph Lameter 		return 0;
1441a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1442636f13c1SChris Wright 		return -EINVAL;
14438bccd85fSChristoph Lameter 
144456521e7aSYisheng Xie 	/*
144556521e7aSYisheng Xie 	 * When the user specifies more nodes than supported, just check
1446e130242dSArnd Bergmann 	 * that the unsupported part is all zero, one word at a time,
1447e130242dSArnd Bergmann 	 * starting at the end.
144856521e7aSYisheng Xie 	 */
1449e130242dSArnd Bergmann 	while (maxnode > MAX_NUMNODES) {
1450e130242dSArnd Bergmann 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1451e130242dSArnd Bergmann 		unsigned long t;
14528bccd85fSChristoph Lameter 
1453000eca5dSTianyu Li 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
145456521e7aSYisheng Xie 			return -EFAULT;
1455e130242dSArnd Bergmann 
1456e130242dSArnd Bergmann 		if (maxnode - bits >= MAX_NUMNODES) {
1457e130242dSArnd Bergmann 			maxnode -= bits;
1458e130242dSArnd Bergmann 		} else {
1459e130242dSArnd Bergmann 			maxnode = MAX_NUMNODES;
1460e130242dSArnd Bergmann 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1461e130242dSArnd Bergmann 		}
1462e130242dSArnd Bergmann 		if (t)
146356521e7aSYisheng Xie 			return -EINVAL;
146456521e7aSYisheng Xie 	}
146556521e7aSYisheng Xie 
1466e130242dSArnd Bergmann 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
14678bccd85fSChristoph Lameter }
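/*
 * Worked example (assuming a kernel built with MAX_NUMNODES == 64): if
 * userspace passes maxnode == 1024, get_nodes() first verifies, one word
 * at a time starting from the end, that every bit above node 63 in the
 * user mask is clear (returning -EINVAL otherwise), and only then copies
 * the first 64 bits into *nodes.
 */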
14688bccd85fSChristoph Lameter 
14698bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
14708bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
14718bccd85fSChristoph Lameter 			      nodemask_t *nodes)
14728bccd85fSChristoph Lameter {
14738bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1474050c17f2SRalph Campbell 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1475e130242dSArnd Bergmann 	bool compat = in_compat_syscall();
1476e130242dSArnd Bergmann 
1477e130242dSArnd Bergmann 	if (compat)
1478e130242dSArnd Bergmann 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
14798bccd85fSChristoph Lameter 
14808bccd85fSChristoph Lameter 	if (copy > nbytes) {
14818bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
14828bccd85fSChristoph Lameter 			return -EINVAL;
14838bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
14848bccd85fSChristoph Lameter 			return -EFAULT;
14858bccd85fSChristoph Lameter 		copy = nbytes;
1486e130242dSArnd Bergmann 		maxnode = nr_node_ids;
14878bccd85fSChristoph Lameter 	}
1488e130242dSArnd Bergmann 
1489e130242dSArnd Bergmann 	if (compat)
1490e130242dSArnd Bergmann 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1491e130242dSArnd Bergmann 					 nodes_addr(*nodes), maxnode);
1492e130242dSArnd Bergmann 
14938bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
14948bccd85fSChristoph Lameter }
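/*
 * Worked example for the native (non-compat) path, assuming nr_node_ids
 * == 64 and a userspace maxnode of 1024: nbytes is 8 while copy starts
 * at 128, so the 120 user-buffer bytes beyond the kernel's node mask are
 * zeroed with clear_user() and only the first 8 bytes are copied out.
 */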
14958bccd85fSChristoph Lameter 
149695837924SFeng Tang /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
149795837924SFeng Tang static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
149895837924SFeng Tang {
149995837924SFeng Tang 	*flags = *mode & MPOL_MODE_FLAGS;
150095837924SFeng Tang 	*mode &= ~MPOL_MODE_FLAGS;
1501b27abaccSDave Hansen 
1502a38a59fdSBen Widawsky 	if ((unsigned int)(*mode) >=  MPOL_MAX)
150395837924SFeng Tang 		return -EINVAL;
150495837924SFeng Tang 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
150595837924SFeng Tang 		return -EINVAL;
15066d2aec9eSEric Dumazet 	if (*flags & MPOL_F_NUMA_BALANCING) {
15076d2aec9eSEric Dumazet 		if (*mode != MPOL_BIND)
15086d2aec9eSEric Dumazet 			return -EINVAL;
15096d2aec9eSEric Dumazet 		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
15106d2aec9eSEric Dumazet 	}
151195837924SFeng Tang 	return 0;
151295837924SFeng Tang }
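/*
 * For example, sanitize_mpol_flags() splits a userspace value of
 * (MPOL_INTERLEAVE | MPOL_F_STATIC_NODES) into *mode = MPOL_INTERLEAVE
 * and *flags = MPOL_F_STATIC_NODES, while a value combining
 * MPOL_F_STATIC_NODES with MPOL_F_RELATIVE_NODES is rejected with -EINVAL.
 */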
151395837924SFeng Tang 
1514e7dc9ad6SDominik Brodowski static long kernel_mbind(unsigned long start, unsigned long len,
1515e7dc9ad6SDominik Brodowski 			 unsigned long mode, const unsigned long __user *nmask,
1516e7dc9ad6SDominik Brodowski 			 unsigned long maxnode, unsigned int flags)
15178bccd85fSChristoph Lameter {
1518028fec41SDavid Rientjes 	unsigned short mode_flags;
151995837924SFeng Tang 	nodemask_t nodes;
152095837924SFeng Tang 	int lmode = mode;
152195837924SFeng Tang 	int err;
15228bccd85fSChristoph Lameter 
1523057d3389SAndrey Konovalov 	start = untagged_addr(start);
152495837924SFeng Tang 	err = sanitize_mpol_flags(&lmode, &mode_flags);
152595837924SFeng Tang 	if (err)
152695837924SFeng Tang 		return err;
152795837924SFeng Tang 
15288bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
15298bccd85fSChristoph Lameter 	if (err)
15308bccd85fSChristoph Lameter 		return err;
153195837924SFeng Tang 
153295837924SFeng Tang 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
15338bccd85fSChristoph Lameter }
15348bccd85fSChristoph Lameter 
1535c6018b4bSAneesh Kumar K.V SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1536c6018b4bSAneesh Kumar K.V 		unsigned long, home_node, unsigned long, flags)
1537c6018b4bSAneesh Kumar K.V {
1538c6018b4bSAneesh Kumar K.V 	struct mm_struct *mm = current->mm;
1539f4e9e0e6SLiam R. Howlett 	struct vm_area_struct *vma, *prev;
1540e976936cSMichal Hocko 	struct mempolicy *new, *old;
1541c6018b4bSAneesh Kumar K.V 	unsigned long end;
1542c6018b4bSAneesh Kumar K.V 	int err = -ENOENT;
154366850be5SLiam R. Howlett 	VMA_ITERATOR(vmi, mm, start);
1544c6018b4bSAneesh Kumar K.V 
1545c6018b4bSAneesh Kumar K.V 	start = untagged_addr(start);
1546c6018b4bSAneesh Kumar K.V 	if (start & ~PAGE_MASK)
1547c6018b4bSAneesh Kumar K.V 		return -EINVAL;
1548c6018b4bSAneesh Kumar K.V 	/*
1549c6018b4bSAneesh Kumar K.V 	 * flags is reserved for future extension and must currently be zero.
1550c6018b4bSAneesh Kumar K.V 	 */
1551c6018b4bSAneesh Kumar K.V 	if (flags != 0)
1552c6018b4bSAneesh Kumar K.V 		return -EINVAL;
1553c6018b4bSAneesh Kumar K.V 
1554c6018b4bSAneesh Kumar K.V 	/*
1555c6018b4bSAneesh Kumar K.V 	 * Check home_node is online to avoid accessing uninitialized
1556c6018b4bSAneesh Kumar K.V 	 * NODE_DATA.
1557c6018b4bSAneesh Kumar K.V 	 */
1558c6018b4bSAneesh Kumar K.V 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1559c6018b4bSAneesh Kumar K.V 		return -EINVAL;
1560c6018b4bSAneesh Kumar K.V 
1561aaa31e05Sze zuo 	len = PAGE_ALIGN(len);
1562c6018b4bSAneesh Kumar K.V 	end = start + len;
1563c6018b4bSAneesh Kumar K.V 
1564c6018b4bSAneesh Kumar K.V 	if (end < start)
1565c6018b4bSAneesh Kumar K.V 		return -EINVAL;
1566c6018b4bSAneesh Kumar K.V 	if (end == start)
1567c6018b4bSAneesh Kumar K.V 		return 0;
1568c6018b4bSAneesh Kumar K.V 	mmap_write_lock(mm);
1569f4e9e0e6SLiam R. Howlett 	prev = vma_prev(&vmi);
157066850be5SLiam R. Howlett 	for_each_vma_range(vmi, vma, end) {
1571c6018b4bSAneesh Kumar K.V 		/*
1572c6018b4bSAneesh Kumar K.V 		 * If any vma in the range has a policy other than MPOL_BIND
1573c6018b4bSAneesh Kumar K.V 		 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1574c6018b4bSAneesh Kumar K.V 		 * the home node for vmas we already updated before.
1575c6018b4bSAneesh Kumar K.V 		 */
1576e976936cSMichal Hocko 		old = vma_policy(vma);
157751f62537SLiam R. Howlett 		if (!old) {
157851f62537SLiam R. Howlett 			prev = vma;
1579e976936cSMichal Hocko 			continue;
158051f62537SLiam R. Howlett 		}
1581e976936cSMichal Hocko 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1582c6018b4bSAneesh Kumar K.V 			err = -EOPNOTSUPP;
1583c6018b4bSAneesh Kumar K.V 			break;
1584c6018b4bSAneesh Kumar K.V 		}
1585e976936cSMichal Hocko 		new = mpol_dup(old);
1586e976936cSMichal Hocko 		if (IS_ERR(new)) {
1587e976936cSMichal Hocko 			err = PTR_ERR(new);
1588e976936cSMichal Hocko 			break;
1589e976936cSMichal Hocko 		}
1590c6018b4bSAneesh Kumar K.V 
15916c21e066SJann Horn 		vma_start_write(vma);
1592c6018b4bSAneesh Kumar K.V 		new->home_node = home_node;
1593f4e9e0e6SLiam R. Howlett 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1594c6018b4bSAneesh Kumar K.V 		mpol_put(new);
1595c6018b4bSAneesh Kumar K.V 		if (err)
1596c6018b4bSAneesh Kumar K.V 			break;
1597c6018b4bSAneesh Kumar K.V 	}
1598c6018b4bSAneesh Kumar K.V 	mmap_write_unlock(mm);
1599c6018b4bSAneesh Kumar K.V 	return err;
1600c6018b4bSAneesh Kumar K.V }
1601c6018b4bSAneesh Kumar K.V 
1602e7dc9ad6SDominik Brodowski SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1603e7dc9ad6SDominik Brodowski 		unsigned long, mode, const unsigned long __user *, nmask,
1604e7dc9ad6SDominik Brodowski 		unsigned long, maxnode, unsigned int, flags)
1605e7dc9ad6SDominik Brodowski {
1606e7dc9ad6SDominik Brodowski 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1607e7dc9ad6SDominik Brodowski }
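/*
 * Sketch of a typical userspace call (via libnuma's <numaif.h> wrapper;
 * addr and len are hypothetical values of the caller's choosing):
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	mbind(addr, len, MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes),
 *	      MPOL_MF_MOVE);
 *
 * This interleaves future allocations in [addr, addr + len) over nodes
 * 0 and 1 and attempts to move already-present pages that do not
 * conform to the new policy.
 */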
1608e7dc9ad6SDominik Brodowski 
16098bccd85fSChristoph Lameter /* Set the process memory policy */
1610af03c4acSDominik Brodowski static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1611af03c4acSDominik Brodowski 				 unsigned long maxnode)
16128bccd85fSChristoph Lameter {
161395837924SFeng Tang 	unsigned short mode_flags;
16148bccd85fSChristoph Lameter 	nodemask_t nodes;
161595837924SFeng Tang 	int lmode = mode;
161695837924SFeng Tang 	int err;
16178bccd85fSChristoph Lameter 
161895837924SFeng Tang 	err = sanitize_mpol_flags(&lmode, &mode_flags);
161995837924SFeng Tang 	if (err)
162095837924SFeng Tang 		return err;
162195837924SFeng Tang 
16228bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
16238bccd85fSChristoph Lameter 	if (err)
16248bccd85fSChristoph Lameter 		return err;
162595837924SFeng Tang 
162695837924SFeng Tang 	return do_set_mempolicy(lmode, mode_flags, &nodes);
16278bccd85fSChristoph Lameter }
16288bccd85fSChristoph Lameter 
1629af03c4acSDominik Brodowski SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1630af03c4acSDominik Brodowski 		unsigned long, maxnode)
1631af03c4acSDominik Brodowski {
1632af03c4acSDominik Brodowski 	return kernel_set_mempolicy(mode, nmask, maxnode);
1633af03c4acSDominik Brodowski }
1634af03c4acSDominik Brodowski 
1635b6e9b0baSDominik Brodowski static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1636b6e9b0baSDominik Brodowski 				const unsigned long __user *old_nodes,
1637b6e9b0baSDominik Brodowski 				const unsigned long __user *new_nodes)
163839743889SChristoph Lameter {
1639596d7cfaSKOSAKI Motohiro 	struct mm_struct *mm = NULL;
164039743889SChristoph Lameter 	struct task_struct *task;
164139743889SChristoph Lameter 	nodemask_t task_nodes;
164239743889SChristoph Lameter 	int err;
1643596d7cfaSKOSAKI Motohiro 	nodemask_t *old;
1644596d7cfaSKOSAKI Motohiro 	nodemask_t *new;
1645596d7cfaSKOSAKI Motohiro 	NODEMASK_SCRATCH(scratch);
164639743889SChristoph Lameter 
1647596d7cfaSKOSAKI Motohiro 	if (!scratch)
1648596d7cfaSKOSAKI Motohiro 		return -ENOMEM;
164939743889SChristoph Lameter 
1650596d7cfaSKOSAKI Motohiro 	old = &scratch->mask1;
1651596d7cfaSKOSAKI Motohiro 	new = &scratch->mask2;
1652596d7cfaSKOSAKI Motohiro 
1653596d7cfaSKOSAKI Motohiro 	err = get_nodes(old, old_nodes, maxnode);
165439743889SChristoph Lameter 	if (err)
1655596d7cfaSKOSAKI Motohiro 		goto out;
1656596d7cfaSKOSAKI Motohiro 
1657596d7cfaSKOSAKI Motohiro 	err = get_nodes(new, new_nodes, maxnode);
1658596d7cfaSKOSAKI Motohiro 	if (err)
1659596d7cfaSKOSAKI Motohiro 		goto out;
166039743889SChristoph Lameter 
166139743889SChristoph Lameter 	/* Find the mm_struct */
166255cfaa3cSZeng Zhaoming 	rcu_read_lock();
1663228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
166439743889SChristoph Lameter 	if (!task) {
166555cfaa3cSZeng Zhaoming 		rcu_read_unlock();
1666596d7cfaSKOSAKI Motohiro 		err = -ESRCH;
1667596d7cfaSKOSAKI Motohiro 		goto out;
166839743889SChristoph Lameter 	}
16693268c63eSChristoph Lameter 	get_task_struct(task);
167039743889SChristoph Lameter 
1671596d7cfaSKOSAKI Motohiro 	err = -EINVAL;
167239743889SChristoph Lameter 
167339743889SChristoph Lameter 	/*
167431367466SOtto Ebeling 	 * Check if this process has the right to modify the specified process.
167531367466SOtto Ebeling 	 * Use the regular "ptrace_may_access()" checks.
167639743889SChristoph Lameter 	 */
167731367466SOtto Ebeling 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1678c69e8d9cSDavid Howells 		rcu_read_unlock();
167939743889SChristoph Lameter 		err = -EPERM;
16803268c63eSChristoph Lameter 		goto out_put;
168139743889SChristoph Lameter 	}
1682c69e8d9cSDavid Howells 	rcu_read_unlock();
168339743889SChristoph Lameter 
168439743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
168539743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
1686596d7cfaSKOSAKI Motohiro 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
168739743889SChristoph Lameter 		err = -EPERM;
16883268c63eSChristoph Lameter 		goto out_put;
168939743889SChristoph Lameter 	}
169039743889SChristoph Lameter 
16910486a38bSYisheng Xie 	task_nodes = cpuset_mems_allowed(current);
16920486a38bSYisheng Xie 	nodes_and(*new, *new, task_nodes);
16930486a38bSYisheng Xie 	if (nodes_empty(*new))
16943268c63eSChristoph Lameter 		goto out_put;
16950486a38bSYisheng Xie 
169686c3a764SDavid Quigley 	err = security_task_movememory(task);
169786c3a764SDavid Quigley 	if (err)
16983268c63eSChristoph Lameter 		goto out_put;
169986c3a764SDavid Quigley 
17003268c63eSChristoph Lameter 	mm = get_task_mm(task);
17013268c63eSChristoph Lameter 	put_task_struct(task);
1702f2a9ef88SSasha Levin 
1703f2a9ef88SSasha Levin 	if (!mm) {
1704f2a9ef88SSasha Levin 		err = -EINVAL;
1705f2a9ef88SSasha Levin 		goto out;
1706f2a9ef88SSasha Levin 	}
1707f2a9ef88SSasha Levin 
1708596d7cfaSKOSAKI Motohiro 	err = do_migrate_pages(mm, old, new,
170974c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
17103268c63eSChristoph Lameter 
171139743889SChristoph Lameter 	mmput(mm);
17123268c63eSChristoph Lameter out:
1713596d7cfaSKOSAKI Motohiro 	NODEMASK_SCRATCH_FREE(scratch);
1714596d7cfaSKOSAKI Motohiro 
171539743889SChristoph Lameter 	return err;
17163268c63eSChristoph Lameter 
17173268c63eSChristoph Lameter out_put:
17183268c63eSChristoph Lameter 	put_task_struct(task);
17193268c63eSChristoph Lameter 	goto out;
172039743889SChristoph Lameter }
172139743889SChristoph Lameter 
1722b6e9b0baSDominik Brodowski SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1723b6e9b0baSDominik Brodowski 		const unsigned long __user *, old_nodes,
1724b6e9b0baSDominik Brodowski 		const unsigned long __user *, new_nodes)
1725b6e9b0baSDominik Brodowski {
1726b6e9b0baSDominik Brodowski 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1727b6e9b0baSDominik Brodowski }
1728b6e9b0baSDominik Brodowski 
17298bccd85fSChristoph Lameter /* Retrieve NUMA policy */
1730af03c4acSDominik Brodowski static int kernel_get_mempolicy(int __user *policy,
1731af03c4acSDominik Brodowski 				unsigned long __user *nmask,
1732af03c4acSDominik Brodowski 				unsigned long maxnode,
1733af03c4acSDominik Brodowski 				unsigned long addr,
1734af03c4acSDominik Brodowski 				unsigned long flags)
17358bccd85fSChristoph Lameter {
1736dbcb0f19SAdrian Bunk 	int err;
17373f649ab7SKees Cook 	int pval;
17388bccd85fSChristoph Lameter 	nodemask_t nodes;
17398bccd85fSChristoph Lameter 
1740050c17f2SRalph Campbell 	if (nmask != NULL && maxnode < nr_node_ids)
17418bccd85fSChristoph Lameter 		return -EINVAL;
17428bccd85fSChristoph Lameter 
17434605f057SWenchao Hao 	addr = untagged_addr(addr);
17444605f057SWenchao Hao 
17458bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
17468bccd85fSChristoph Lameter 
17478bccd85fSChristoph Lameter 	if (err)
17488bccd85fSChristoph Lameter 		return err;
17498bccd85fSChristoph Lameter 
17508bccd85fSChristoph Lameter 	if (policy && put_user(pval, policy))
17518bccd85fSChristoph Lameter 		return -EFAULT;
17528bccd85fSChristoph Lameter 
17538bccd85fSChristoph Lameter 	if (nmask)
17548bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
17558bccd85fSChristoph Lameter 
17568bccd85fSChristoph Lameter 	return err;
17578bccd85fSChristoph Lameter }
17588bccd85fSChristoph Lameter 
1759af03c4acSDominik Brodowski SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1760af03c4acSDominik Brodowski 		unsigned long __user *, nmask, unsigned long, maxnode,
1761af03c4acSDominik Brodowski 		unsigned long, addr, unsigned long, flags)
1762af03c4acSDominik Brodowski {
1763af03c4acSDominik Brodowski 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1764af03c4acSDominik Brodowski }
1765af03c4acSDominik Brodowski 
176620ca87f2SLi Xinhai bool vma_migratable(struct vm_area_struct *vma)
176720ca87f2SLi Xinhai {
176820ca87f2SLi Xinhai 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
176920ca87f2SLi Xinhai 		return false;
177020ca87f2SLi Xinhai 
177120ca87f2SLi Xinhai 	/*
177220ca87f2SLi Xinhai 	 * DAX device mappings require predictable access latency, so avoid
177320ca87f2SLi Xinhai 	 * incurring periodic faults.
177420ca87f2SLi Xinhai 	 */
177520ca87f2SLi Xinhai 	if (vma_is_dax(vma))
177620ca87f2SLi Xinhai 		return false;
177720ca87f2SLi Xinhai 
177820ca87f2SLi Xinhai 	if (is_vm_hugetlb_page(vma) &&
177920ca87f2SLi Xinhai 		!hugepage_migration_supported(hstate_vma(vma)))
178020ca87f2SLi Xinhai 		return false;
178120ca87f2SLi Xinhai 
178220ca87f2SLi Xinhai 	/*
178320ca87f2SLi Xinhai 	 * Migration allocates pages in the highest zone. If we cannot
178420ca87f2SLi Xinhai 	 * do so then migration (at least from node to node) is not
178520ca87f2SLi Xinhai 	 * possible.
178620ca87f2SLi Xinhai 	 */
178720ca87f2SLi Xinhai 	if (vma->vm_file &&
178820ca87f2SLi Xinhai 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
178920ca87f2SLi Xinhai 			< policy_zone)
179020ca87f2SLi Xinhai 		return false;
179120ca87f2SLi Xinhai 	return true;
179220ca87f2SLi Xinhai }
179320ca87f2SLi Xinhai 
179474d2c3a0SOleg Nesterov struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1795ddc1a5cbSHugh Dickins 				   unsigned long addr, pgoff_t *ilx)
17961da177e4SLinus Torvalds {
1797ddc1a5cbSHugh Dickins 	*ilx = 0;
1798ddc1a5cbSHugh Dickins 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
1799ddc1a5cbSHugh Dickins 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
180074d2c3a0SOleg Nesterov }
180174d2c3a0SOleg Nesterov 
180274d2c3a0SOleg Nesterov /*
1803ddc1a5cbSHugh Dickins  * get_vma_policy(@vma, @addr, @order, @ilx)
180474d2c3a0SOleg Nesterov  * @vma: virtual memory area whose policy is sought
180574d2c3a0SOleg Nesterov  * @addr: address in @vma for shared policy lookup
1806ddc1a5cbSHugh Dickins  * @order: 0, or appropriate huge_page_order for interleaving
1807fa3bea4eSGregory Price  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1808fa3bea4eSGregory Price  *       MPOL_WEIGHTED_INTERLEAVE
180974d2c3a0SOleg Nesterov  *
181074d2c3a0SOleg Nesterov  * Returns effective policy for a VMA at specified address.
1811dd6eecb9SOleg Nesterov  * Falls back to current->mempolicy or system default policy, as necessary.
181274d2c3a0SOleg Nesterov  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
181374d2c3a0SOleg Nesterov  * count--added by the get_policy() vm_op, as appropriate--to protect against
181474d2c3a0SOleg Nesterov  * freeing by another task.  It is the caller's responsibility to free the
181574d2c3a0SOleg Nesterov  * extra reference for shared policies.
181674d2c3a0SOleg Nesterov  */
1817ddc1a5cbSHugh Dickins struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1818ddc1a5cbSHugh Dickins 				 unsigned long addr, int order, pgoff_t *ilx)
181974d2c3a0SOleg Nesterov {
1820ddc1a5cbSHugh Dickins 	struct mempolicy *pol;
182174d2c3a0SOleg Nesterov 
1822ddc1a5cbSHugh Dickins 	pol = __get_vma_policy(vma, addr, ilx);
18238d90274bSOleg Nesterov 	if (!pol)
1824dd6eecb9SOleg Nesterov 		pol = get_task_policy(current);
1825fa3bea4eSGregory Price 	if (pol->mode == MPOL_INTERLEAVE ||
1826fa3bea4eSGregory Price 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1827ddc1a5cbSHugh Dickins 		*ilx += vma->vm_pgoff >> order;
1828ddc1a5cbSHugh Dickins 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1829ddc1a5cbSHugh Dickins 	}
18301da177e4SLinus Torvalds 	return pol;
18311da177e4SLinus Torvalds }
18321da177e4SLinus Torvalds 
18336b6482bbSOleg Nesterov bool vma_policy_mof(struct vm_area_struct *vma)
1834fc314724SMel Gorman {
18356b6482bbSOleg Nesterov 	struct mempolicy *pol;
1836f15ca78eSOleg Nesterov 
1837fc314724SMel Gorman 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1838fc314724SMel Gorman 		bool ret = false;
1839ddc1a5cbSHugh Dickins 		pgoff_t ilx;		/* ignored here */
1840fc314724SMel Gorman 
1841ddc1a5cbSHugh Dickins 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1842fc314724SMel Gorman 		if (pol && (pol->flags & MPOL_F_MOF))
1843fc314724SMel Gorman 			ret = true;
1844fc314724SMel Gorman 		mpol_cond_put(pol);
1845fc314724SMel Gorman 
1846fc314724SMel Gorman 		return ret;
18478d90274bSOleg Nesterov 	}
18488d90274bSOleg Nesterov 
1849fc314724SMel Gorman 	pol = vma->vm_policy;
18508d90274bSOleg Nesterov 	if (!pol)
18516b6482bbSOleg Nesterov 		pol = get_task_policy(current);
1852fc314724SMel Gorman 
1853fc314724SMel Gorman 	return pol->flags & MPOL_F_MOF;
1854fc314724SMel Gorman }
1855fc314724SMel Gorman 
1856d2226ebdSFeng Tang bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1857d3eb1570SLai Jiangshan {
1858d3eb1570SLai Jiangshan 	enum zone_type dynamic_policy_zone = policy_zone;
1859d3eb1570SLai Jiangshan 
1860d3eb1570SLai Jiangshan 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1861d3eb1570SLai Jiangshan 
1862d3eb1570SLai Jiangshan 	/*
1863269fbe72SBen Widawsky 	 * If policy->nodes has movable memory only,
1864d3eb1570SLai Jiangshan 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1865d3eb1570SLai Jiangshan 	 *
1866269fbe72SBen Widawsky 	 * policy->nodes is intersected with node_states[N_MEMORY],
1867f0953a1bSIngo Molnar 	 * so if the following test fails, it implies
1868269fbe72SBen Widawsky 	 * policy->nodes has movable memory only.
1869d3eb1570SLai Jiangshan 	 */
1870269fbe72SBen Widawsky 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1871d3eb1570SLai Jiangshan 		dynamic_policy_zone = ZONE_MOVABLE;
1872d3eb1570SLai Jiangshan 
1873d3eb1570SLai Jiangshan 	return zone >= dynamic_policy_zone;
1874d3eb1570SLai Jiangshan }
1875d3eb1570SLai Jiangshan 
1876fa3bea4eSGregory Price static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
1877fa3bea4eSGregory Price {
1878274519edSGregory Price 	unsigned int node;
1879274519edSGregory Price 	unsigned int cpuset_mems_cookie;
1880fa3bea4eSGregory Price 
1881274519edSGregory Price retry:
1882274519edSGregory Price 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1883274519edSGregory Price 	cpuset_mems_cookie = read_mems_allowed_begin();
1884274519edSGregory Price 	node = current->il_prev;
1885fa3bea4eSGregory Price 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
1886fa3bea4eSGregory Price 		node = next_node_in(node, policy->nodes);
1887274519edSGregory Price 		if (read_mems_allowed_retry(cpuset_mems_cookie))
1888274519edSGregory Price 			goto retry;
1889fa3bea4eSGregory Price 		if (node == MAX_NUMNODES)
1890fa3bea4eSGregory Price 			return node;
1891fa3bea4eSGregory Price 		current->il_prev = node;
1892fa3bea4eSGregory Price 		current->il_weight = get_il_weight(node);
1893fa3bea4eSGregory Price 	}
1894fa3bea4eSGregory Price 	current->il_weight--;
1895fa3bea4eSGregory Price 	return node;
1896fa3bea4eSGregory Price }
1897fa3bea4eSGregory Price 
18981da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
1899c36f6e6dSHugh Dickins static unsigned int interleave_nodes(struct mempolicy *policy)
19001da177e4SLinus Torvalds {
1901c36f6e6dSHugh Dickins 	unsigned int nid;
1902274519edSGregory Price 	unsigned int cpuset_mems_cookie;
19031da177e4SLinus Torvalds 
1904274519edSGregory Price 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1905274519edSGregory Price 	do {
1906274519edSGregory Price 		cpuset_mems_cookie = read_mems_allowed_begin();
1907c36f6e6dSHugh Dickins 		nid = next_node_in(current->il_prev, policy->nodes);
1908274519edSGregory Price 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1909274519edSGregory Price 
1910c36f6e6dSHugh Dickins 	if (nid < MAX_NUMNODES)
1911c36f6e6dSHugh Dickins 		current->il_prev = nid;
1912c36f6e6dSHugh Dickins 	return nid;
19131da177e4SLinus Torvalds }
19141da177e4SLinus Torvalds 
1915dc85da15SChristoph Lameter /*
1916dc85da15SChristoph Lameter  * Depending on the memory policy, provide a node from which to allocate the
1917dc85da15SChristoph Lameter  * next slab entry.
1918dc85da15SChristoph Lameter  */
19192a389610SDavid Rientjes unsigned int mempolicy_slab_node(void)
1920dc85da15SChristoph Lameter {
1921e7b691b0SAndi Kleen 	struct mempolicy *policy;
19222a389610SDavid Rientjes 	int node = numa_mem_id();
1923e7b691b0SAndi Kleen 
192438b031ddSVasily Averin 	if (!in_task())
19252a389610SDavid Rientjes 		return node;
1926e7b691b0SAndi Kleen 
1927e7b691b0SAndi Kleen 	policy = current->mempolicy;
19287858d7bcSFeng Tang 	if (!policy)
19292a389610SDavid Rientjes 		return node;
1930765c4507SChristoph Lameter 
1931bea904d5SLee Schermerhorn 	switch (policy->mode) {
1932bea904d5SLee Schermerhorn 	case MPOL_PREFERRED:
1933269fbe72SBen Widawsky 		return first_node(policy->nodes);
1934bea904d5SLee Schermerhorn 
1935dc85da15SChristoph Lameter 	case MPOL_INTERLEAVE:
1936dc85da15SChristoph Lameter 		return interleave_nodes(policy);
1937dc85da15SChristoph Lameter 
1938fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
1939fa3bea4eSGregory Price 		return weighted_interleave_nodes(policy);
1940fa3bea4eSGregory Price 
1941b27abaccSDave Hansen 	case MPOL_BIND:
1942b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
1943b27abaccSDave Hansen 	{
1944c33d6c06SMel Gorman 		struct zoneref *z;
1945c33d6c06SMel Gorman 
1946dc85da15SChristoph Lameter 		/*
1947dc85da15SChristoph Lameter 		 * Follow bind policy behavior and start allocation at the
1948dc85da15SChristoph Lameter 		 * first node.
1949dc85da15SChristoph Lameter 		 */
195019770b32SMel Gorman 		struct zonelist *zonelist;
195119770b32SMel Gorman 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1952c9634cf0SAneesh Kumar K.V 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1953c33d6c06SMel Gorman 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1954269fbe72SBen Widawsky 							&policy->nodes);
1955c1093b74SPavel Tatashin 		return z->zone ? zone_to_nid(z->zone) : node;
1956dd1a239fSMel Gorman 	}
19577858d7bcSFeng Tang 	case MPOL_LOCAL:
19587858d7bcSFeng Tang 		return node;
1959dc85da15SChristoph Lameter 
1960dc85da15SChristoph Lameter 	default:
1961bea904d5SLee Schermerhorn 		BUG();
1962dc85da15SChristoph Lameter 	}
1963dc85da15SChristoph Lameter }
1964dc85da15SChristoph Lameter 
19659685e6e3SGregory Price static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
19669685e6e3SGregory Price 					      nodemask_t *mask)
19679685e6e3SGregory Price {
19689685e6e3SGregory Price 	/*
19699685e6e3SGregory Price 	 * barrier stabilizes the nodemask locally so that it can be iterated
19709685e6e3SGregory Price 	 * over safely without concern for changes. Allocators validate node
19719685e6e3SGregory Price 	 * selection does not violate mems_allowed, so this is safe.
19729685e6e3SGregory Price 	 */
19739685e6e3SGregory Price 	barrier();
19749685e6e3SGregory Price 	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
19759685e6e3SGregory Price 	barrier();
19769685e6e3SGregory Price 	return nodes_weight(*mask);
19779685e6e3SGregory Price }
19789685e6e3SGregory Price 
1979fa3bea4eSGregory Price static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
1980fa3bea4eSGregory Price {
1981fa3bea4eSGregory Price 	nodemask_t nodemask;
1982fa3bea4eSGregory Price 	unsigned int target, nr_nodes;
1983fa3bea4eSGregory Price 	u8 *table;
1984fa3bea4eSGregory Price 	unsigned int weight_total = 0;
1985fa3bea4eSGregory Price 	u8 weight;
1986fa3bea4eSGregory Price 	int nid;
1987fa3bea4eSGregory Price 
1988fa3bea4eSGregory Price 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
1989fa3bea4eSGregory Price 	if (!nr_nodes)
1990fa3bea4eSGregory Price 		return numa_node_id();
1991fa3bea4eSGregory Price 
1992fa3bea4eSGregory Price 	rcu_read_lock();
1993fa3bea4eSGregory Price 	table = rcu_dereference(iw_table);
1994fa3bea4eSGregory Price 	/* calculate the total weight */
1995fa3bea4eSGregory Price 	for_each_node_mask(nid, nodemask) {
1996fa3bea4eSGregory Price 		/* detect system default usage */
1997fa3bea4eSGregory Price 		weight = table ? table[nid] : 1;
1998fa3bea4eSGregory Price 		weight = weight ? weight : 1;
1999fa3bea4eSGregory Price 		weight_total += weight;
2000fa3bea4eSGregory Price 	}
2001fa3bea4eSGregory Price 
2002fa3bea4eSGregory Price 	/* Calculate the node offset based on totals */
2003fa3bea4eSGregory Price 	target = ilx % weight_total;
2004fa3bea4eSGregory Price 	nid = first_node(nodemask);
2005fa3bea4eSGregory Price 	while (target) {
2006fa3bea4eSGregory Price 		/* detect system default usage */
2007fa3bea4eSGregory Price 		weight = table ? table[nid] : 1;
2008fa3bea4eSGregory Price 		weight = weight ? weight : 1;
2009fa3bea4eSGregory Price 		if (target < weight)
2010fa3bea4eSGregory Price 			break;
2011fa3bea4eSGregory Price 		target -= weight;
2012fa3bea4eSGregory Price 		nid = next_node_in(nid, nodemask);
2013fa3bea4eSGregory Price 	}
2014fa3bea4eSGregory Price 	rcu_read_unlock();
2015fa3bea4eSGregory Price 	return nid;
2016fa3bea4eSGregory Price }
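/*
 * For example, weighted_interleave_nid() with a nodemask of {0,1} and
 * iw_table weights of 2 for node 0 and 1 for node 1 (example values)
 * gets weight_total == 3: indices with ilx % 3 == 0 or 1 resolve to
 * node 0 and ilx % 3 == 2 resolves to node 1, i.e. two pages on node 0
 * for every page on node 1.
 */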
2017fa3bea4eSGregory Price 
2018fee83b3aSAndrew Morton /*
2019ddc1a5cbSHugh Dickins  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2020ddc1a5cbSHugh Dickins  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2021ddc1a5cbSHugh Dickins  * exceeds the number of present nodes.
2022fee83b3aSAndrew Morton  */
2023ddc1a5cbSHugh Dickins static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
20241da177e4SLinus Torvalds {
20259685e6e3SGregory Price 	nodemask_t nodemask;
2026276aeee1Syanghui 	unsigned int target, nnodes;
2027fee83b3aSAndrew Morton 	int i;
2028fee83b3aSAndrew Morton 	int nid;
20291da177e4SLinus Torvalds 
20309685e6e3SGregory Price 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2031f5b087b5SDavid Rientjes 	if (!nnodes)
2032f5b087b5SDavid Rientjes 		return numa_node_id();
2033ddc1a5cbSHugh Dickins 	target = ilx % nnodes;
2034276aeee1Syanghui 	nid = first_node(nodemask);
2035fee83b3aSAndrew Morton 	for (i = 0; i < target; i++)
2036276aeee1Syanghui 		nid = next_node(nid, nodemask);
20371da177e4SLinus Torvalds 	return nid;
20381da177e4SLinus Torvalds }
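/*
 * For example, interleave_nid() with pol->nodes == {0,2,5} (example
 * values) and ilx == 7 computes target = 7 % 3 = 1 and returns node 2.
 */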
20391da177e4SLinus Torvalds 
20403b98b087SNishanth Aravamudan /*
2041ddc1a5cbSHugh Dickins  * Return a nodemask representing a mempolicy for filtering nodes for
2042ddc1a5cbSHugh Dickins  * page allocation, together with preferred node id (or the input node id).
20433b98b087SNishanth Aravamudan  */
2044ddc1a5cbSHugh Dickins static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2045ddc1a5cbSHugh Dickins 				   pgoff_t ilx, int *nid)
2046ddc1a5cbSHugh Dickins {
2047ddc1a5cbSHugh Dickins 	nodemask_t *nodemask = NULL;
2048ddc1a5cbSHugh Dickins 
2049ddc1a5cbSHugh Dickins 	switch (pol->mode) {
2050ddc1a5cbSHugh Dickins 	case MPOL_PREFERRED:
2051ddc1a5cbSHugh Dickins 		/* Override input node id */
2052ddc1a5cbSHugh Dickins 		*nid = first_node(pol->nodes);
2053ddc1a5cbSHugh Dickins 		break;
2054ddc1a5cbSHugh Dickins 	case MPOL_PREFERRED_MANY:
2055ddc1a5cbSHugh Dickins 		nodemask = &pol->nodes;
2056ddc1a5cbSHugh Dickins 		if (pol->home_node != NUMA_NO_NODE)
2057ddc1a5cbSHugh Dickins 			*nid = pol->home_node;
2058ddc1a5cbSHugh Dickins 		break;
2059ddc1a5cbSHugh Dickins 	case MPOL_BIND:
2060ddc1a5cbSHugh Dickins 		/* Restrict to nodemask (but not on lower zones) */
2061ddc1a5cbSHugh Dickins 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2062ddc1a5cbSHugh Dickins 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2063ddc1a5cbSHugh Dickins 			nodemask = &pol->nodes;
2064ddc1a5cbSHugh Dickins 		if (pol->home_node != NUMA_NO_NODE)
2065ddc1a5cbSHugh Dickins 			*nid = pol->home_node;
2066ddc1a5cbSHugh Dickins 		/*
2067ddc1a5cbSHugh Dickins 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2068ddc1a5cbSHugh Dickins 		 * because we might easily break the expectation to stay on the
2069ddc1a5cbSHugh Dickins 		 * requested node and not break the policy.
2070ddc1a5cbSHugh Dickins 		 */
2071ddc1a5cbSHugh Dickins 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2072ddc1a5cbSHugh Dickins 		break;
2073ddc1a5cbSHugh Dickins 	case MPOL_INTERLEAVE:
2074ddc1a5cbSHugh Dickins 		/* Override input node id */
2075ddc1a5cbSHugh Dickins 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2076ddc1a5cbSHugh Dickins 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2077ddc1a5cbSHugh Dickins 		break;
2078fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
2079fa3bea4eSGregory Price 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2080fa3bea4eSGregory Price 			weighted_interleave_nodes(pol) :
2081fa3bea4eSGregory Price 			weighted_interleave_nid(pol, ilx);
2082fa3bea4eSGregory Price 		break;
2083ddc1a5cbSHugh Dickins 	}
2084ddc1a5cbSHugh Dickins 
2085ddc1a5cbSHugh Dickins 	return nodemask;
20865da7ca86SChristoph Lameter }
20875da7ca86SChristoph Lameter 
208800ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
2089480eccf9SLee Schermerhorn /*
209004ec6264SVlastimil Babka  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2091b46e14acSFabian Frederick  * @vma: virtual memory area whose policy is sought
2092b46e14acSFabian Frederick  * @addr: address in @vma for shared policy lookup and interleave policy
2093b46e14acSFabian Frederick  * @gfp_flags: for requested zone
2094b46e14acSFabian Frederick  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2095b27abaccSDave Hansen  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2096480eccf9SLee Schermerhorn  *
209704ec6264SVlastimil Babka  * Returns a nid suitable for a huge page allocation and a pointer
209852cd3b07SLee Schermerhorn  * to the struct mempolicy for conditional unref after allocation.
2099b27abaccSDave Hansen  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2100b27abaccSDave Hansen  * to the mempolicy's @nodemask for filtering the zonelist.
2101480eccf9SLee Schermerhorn  */
210204ec6264SVlastimil Babka int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
210304ec6264SVlastimil Babka 		struct mempolicy **mpol, nodemask_t **nodemask)
21045da7ca86SChristoph Lameter {
2105ddc1a5cbSHugh Dickins 	pgoff_t ilx;
210604ec6264SVlastimil Babka 	int nid;
21075da7ca86SChristoph Lameter 
2108ddc1a5cbSHugh Dickins 	nid = numa_node_id();
2109ddc1a5cbSHugh Dickins 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2110ddc1a5cbSHugh Dickins 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
211104ec6264SVlastimil Babka 	return nid;
21125da7ca86SChristoph Lameter }
211306808b08SLee Schermerhorn 
211406808b08SLee Schermerhorn /*
211506808b08SLee Schermerhorn  * init_nodemask_of_mempolicy
211606808b08SLee Schermerhorn  *
211706808b08SLee Schermerhorn  * If the current task's mempolicy is "default" [NULL], return 'false'
211806808b08SLee Schermerhorn  * to indicate default policy.  Otherwise, copy the policy nodemask for
211906808b08SLee Schermerhorn  * 'bind', 'interleave', 'weighted interleave', 'preferred' and
212006808b08SLee Schermerhorn  * 'preferred (many)' policies into the argument nodemask, or initialize
212106808b08SLee Schermerhorn  * the argument nodemask to the single local node for 'local' policy,
212206808b08SLee Schermerhorn  * and return 'true' to indicate presence of a non-default mempolicy.
212306808b08SLee Schermerhorn  *
212406808b08SLee Schermerhorn  * We don't bother with reference counting the mempolicy [mpol_get/put]
212506808b08SLee Schermerhorn  * because the current task is examining its own mempolicy and a task's
212606808b08SLee Schermerhorn  * mempolicy is only ever changed by the task itself.
212706808b08SLee Schermerhorn  *
212806808b08SLee Schermerhorn  * N.B., it is the caller's responsibility to free a returned nodemask.
212906808b08SLee Schermerhorn  */
213006808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask)
213106808b08SLee Schermerhorn {
213206808b08SLee Schermerhorn 	struct mempolicy *mempolicy;
213306808b08SLee Schermerhorn 
213406808b08SLee Schermerhorn 	if (!(mask && current->mempolicy))
213506808b08SLee Schermerhorn 		return false;
213606808b08SLee Schermerhorn 
2137c0ff7453SMiao Xie 	task_lock(current);
213806808b08SLee Schermerhorn 	mempolicy = current->mempolicy;
213906808b08SLee Schermerhorn 	switch (mempolicy->mode) {
214006808b08SLee Schermerhorn 	case MPOL_PREFERRED:
2141b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
214206808b08SLee Schermerhorn 	case MPOL_BIND:
214306808b08SLee Schermerhorn 	case MPOL_INTERLEAVE:
2144fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
2145269fbe72SBen Widawsky 		*mask = mempolicy->nodes;
214606808b08SLee Schermerhorn 		break;
214706808b08SLee Schermerhorn 
21487858d7bcSFeng Tang 	case MPOL_LOCAL:
2149269fbe72SBen Widawsky 		init_nodemask_of_node(mask, numa_node_id());
21507858d7bcSFeng Tang 		break;
21517858d7bcSFeng Tang 
215206808b08SLee Schermerhorn 	default:
215306808b08SLee Schermerhorn 		BUG();
215406808b08SLee Schermerhorn 	}
2155c0ff7453SMiao Xie 	task_unlock(current);
215606808b08SLee Schermerhorn 
215706808b08SLee Schermerhorn 	return true;
215806808b08SLee Schermerhorn }
215900ac59adSChen, Kenneth W #endif
21605da7ca86SChristoph Lameter 
21616f48d0ebSDavid Rientjes /*
2162b26e517aSFeng Tang  * mempolicy_in_oom_domain
21636f48d0ebSDavid Rientjes  *
2164b26e517aSFeng Tang  * If tsk's mempolicy is "bind", check for intersection between mask and
2165b26e517aSFeng Tang  * the policy nodemask. Otherwise, return true for all other policies
2166b26e517aSFeng Tang  * including "interleave", as a tsk with "interleave" policy may have
2167b26e517aSFeng Tang  * memory allocated from all nodes in the system.
21686f48d0ebSDavid Rientjes  *
21696f48d0ebSDavid Rientjes  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
21706f48d0ebSDavid Rientjes  */
2171b26e517aSFeng Tang bool mempolicy_in_oom_domain(struct task_struct *tsk,
21726f48d0ebSDavid Rientjes 					const nodemask_t *mask)
21736f48d0ebSDavid Rientjes {
21746f48d0ebSDavid Rientjes 	struct mempolicy *mempolicy;
21756f48d0ebSDavid Rientjes 	bool ret = true;
21766f48d0ebSDavid Rientjes 
21776f48d0ebSDavid Rientjes 	if (!mask)
21786f48d0ebSDavid Rientjes 		return ret;
2179b26e517aSFeng Tang 
21806f48d0ebSDavid Rientjes 	task_lock(tsk);
21816f48d0ebSDavid Rientjes 	mempolicy = tsk->mempolicy;
2182b26e517aSFeng Tang 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2183269fbe72SBen Widawsky 		ret = nodes_intersects(mempolicy->nodes, *mask);
21846f48d0ebSDavid Rientjes 	task_unlock(tsk);
2185b26e517aSFeng Tang 
21866f48d0ebSDavid Rientjes 	return ret;
21876f48d0ebSDavid Rientjes }
21886f48d0ebSDavid Rientjes 
21894c54d949SFeng Tang static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2190ddc1a5cbSHugh Dickins 						int nid, nodemask_t *nodemask)
21914c54d949SFeng Tang {
21924c54d949SFeng Tang 	struct page *page;
21934c54d949SFeng Tang 	gfp_t preferred_gfp;
21944c54d949SFeng Tang 
21954c54d949SFeng Tang 	/*
21964c54d949SFeng Tang 	 * This is a two-pass approach. The first pass only tries the
21974c54d949SFeng Tang 	 * preferred nodes, skipping direct reclaim and allowing the
21984c54d949SFeng Tang 	 * allocation to fail, while the second pass tries all the
21994c54d949SFeng Tang 	 * nodes in the system.
22004c54d949SFeng Tang 	 */
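	/*
	 * Illustrative example: a GFP_KERNEL request is first tried as
	 * (GFP_KERNEL | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM restricted to
	 * the given nodemask; only if that returns NULL is the caller's
	 * original gfp used with no nodemask restriction.
	 */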
22014c54d949SFeng Tang 	preferred_gfp = gfp | __GFP_NOWARN;
22024c54d949SFeng Tang 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2203ddc1a5cbSHugh Dickins 	page = __alloc_pages(preferred_gfp, order, nid, nodemask);
22044c54d949SFeng Tang 	if (!page)
2205c0455116SAneesh Kumar K.V 		page = __alloc_pages(gfp, order, nid, NULL);
22064c54d949SFeng Tang 
22074c54d949SFeng Tang 	return page;
22084c54d949SFeng Tang }
22094c54d949SFeng Tang 
22101da177e4SLinus Torvalds /**
2211ddc1a5cbSHugh Dickins  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2212eb350739SMatthew Wilcox (Oracle)  * @gfp: GFP flags.
2213ddc1a5cbSHugh Dickins  * @order: Order of the page allocation.
2214ddc1a5cbSHugh Dickins  * @pol: Pointer to the NUMA mempolicy.
2215ddc1a5cbSHugh Dickins  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2216ddc1a5cbSHugh Dickins  * @nid: Preferred node (usually numa_node_id() but @pol may override it).
22171da177e4SLinus Torvalds  *
2218ddc1a5cbSHugh Dickins  * Return: The page on success or NULL if allocation fails.
22191da177e4SLinus Torvalds  */
2220ddc1a5cbSHugh Dickins struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2221ddc1a5cbSHugh Dickins 		struct mempolicy *pol, pgoff_t ilx, int nid)
22221da177e4SLinus Torvalds {
2223ddc1a5cbSHugh Dickins 	nodemask_t *nodemask;
2224adf88aa8SMatthew Wilcox (Oracle) 	struct page *page;
2225adf88aa8SMatthew Wilcox (Oracle) 
2226ddc1a5cbSHugh Dickins 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
22274c54d949SFeng Tang 
2228ddc1a5cbSHugh Dickins 	if (pol->mode == MPOL_PREFERRED_MANY)
2229ddc1a5cbSHugh Dickins 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
223019deb769SDavid Rientjes 
2231ddc1a5cbSHugh Dickins 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2232ddc1a5cbSHugh Dickins 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2233ddc1a5cbSHugh Dickins 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
223419deb769SDavid Rientjes 		/*
223519deb769SDavid Rientjes 		 * For hugepage allocation and non-interleave policy which
223619deb769SDavid Rientjes 		 * allows the current node (or other explicitly preferred
223719deb769SDavid Rientjes 		 * node) we only try to allocate from the current/preferred
223819deb769SDavid Rientjes 		 * node and don't fall back to other nodes, as the cost of
223919deb769SDavid Rientjes 		 * remote accesses would likely offset THP benefits.
224019deb769SDavid Rientjes 		 *
2241b27abaccSDave Hansen 		 * If the policy is interleave or does not allow the current
224219deb769SDavid Rientjes 		 * node in its nodemask, we allocate the standard way.
224319deb769SDavid Rientjes 		 */
2244ddc1a5cbSHugh Dickins 		if (pol->mode != MPOL_INTERLEAVE &&
2245fa3bea4eSGregory Price 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2246ddc1a5cbSHugh Dickins 		    (!nodemask || node_isset(nid, *nodemask))) {
2247cc638f32SVlastimil Babka 			/*
2248cc638f32SVlastimil Babka 			 * First, try to allocate THP only on local node, but
2249cc638f32SVlastimil Babka 			 * don't reclaim unnecessarily, just compact.
2250cc638f32SVlastimil Babka 			 */
2251ddc1a5cbSHugh Dickins 			page = __alloc_pages_node(nid,
2252ddc1a5cbSHugh Dickins 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2253ddc1a5cbSHugh Dickins 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2254ddc1a5cbSHugh Dickins 				return page;
225576e654ccSDavid Rientjes 			/*
225676e654ccSDavid Rientjes 			 * If hugepage allocations are configured to always
225776e654ccSDavid Rientjes 			 * synchronous compact or the vma has been madvised
225876e654ccSDavid Rientjes 			 * to prefer hugepage backing, retry allowing remote
2259cc638f32SVlastimil Babka 			 * memory with both reclaim and compact as well.
226076e654ccSDavid Rientjes 			 */
226119deb769SDavid Rientjes 		}
226219deb769SDavid Rientjes 	}
226319deb769SDavid Rientjes 
2264ddc1a5cbSHugh Dickins 	page = __alloc_pages(gfp, order, nid, nodemask);
2265ddc1a5cbSHugh Dickins 
2266ddc1a5cbSHugh Dickins 	if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
2267ddc1a5cbSHugh Dickins 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2268ddc1a5cbSHugh Dickins 		if (static_branch_likely(&vm_numa_stat_key) &&
2269ddc1a5cbSHugh Dickins 		    page_to_nid(page) == nid) {
2270ddc1a5cbSHugh Dickins 			preempt_disable();
2271ddc1a5cbSHugh Dickins 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2272ddc1a5cbSHugh Dickins 			preempt_enable();
2273ddc1a5cbSHugh Dickins 		}
2274ddc1a5cbSHugh Dickins 	}
2275ddc1a5cbSHugh Dickins 
2276ddc1a5cbSHugh Dickins 	return page;
2277ddc1a5cbSHugh Dickins }
2278ddc1a5cbSHugh Dickins 
2279ddc1a5cbSHugh Dickins /**
2280ddc1a5cbSHugh Dickins  * vma_alloc_folio - Allocate a folio for a VMA.
2281ddc1a5cbSHugh Dickins  * @gfp: GFP flags.
2282ddc1a5cbSHugh Dickins  * @order: Order of the folio.
2283ddc1a5cbSHugh Dickins  * @vma: Pointer to VMA.
2284ddc1a5cbSHugh Dickins  * @addr: Virtual address of the allocation.  Must be inside @vma.
2285ddc1a5cbSHugh Dickins  * @hugepage: Unused (was: For hugepages try only preferred node if possible).
2286ddc1a5cbSHugh Dickins  *
2287ddc1a5cbSHugh Dickins  * Allocate a folio for a specific address in @vma, using the appropriate
2288ddc1a5cbSHugh Dickins  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2289ddc1a5cbSHugh Dickins  * VMA to prevent it from going away.  Should be used for all allocations
2290ddc1a5cbSHugh Dickins  * for folios that will be mapped into user space, excepting hugetlbfs, and
2291ddc1a5cbSHugh Dickins  * excepting where direct use of alloc_pages_mpol() is more appropriate.
2292ddc1a5cbSHugh Dickins  *
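 * Illustrative call, e.g. on an anonymous page fault (the GFP flags below
 * are an assumption for the example, not a requirement):
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0,
 *				vma, addr, false);
 *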
2293ddc1a5cbSHugh Dickins  * Return: The folio on success or NULL if allocation fails.
2294ddc1a5cbSHugh Dickins  */
2295ddc1a5cbSHugh Dickins struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2296ddc1a5cbSHugh Dickins 		unsigned long addr, bool hugepage)
2297ddc1a5cbSHugh Dickins {
2298ddc1a5cbSHugh Dickins 	struct mempolicy *pol;
2299ddc1a5cbSHugh Dickins 	pgoff_t ilx;
2300ddc1a5cbSHugh Dickins 	struct page *page;
2301ddc1a5cbSHugh Dickins 
2302ddc1a5cbSHugh Dickins 	pol = get_vma_policy(vma, addr, order, &ilx);
2303ddc1a5cbSHugh Dickins 	page = alloc_pages_mpol(gfp | __GFP_COMP, order,
2304ddc1a5cbSHugh Dickins 				pol, ilx, numa_node_id());
2305d51e9894SVlastimil Babka 	mpol_cond_put(pol);
2306ddc1a5cbSHugh Dickins 	return page_rmappable_folio(page);
2307f584b680SMatthew Wilcox (Oracle) }
2308adf88aa8SMatthew Wilcox (Oracle) EXPORT_SYMBOL(vma_alloc_folio);
2309f584b680SMatthew Wilcox (Oracle) 
23101da177e4SLinus Torvalds /**
2311d7f946d0SMatthew Wilcox (Oracle)  * alloc_pages - Allocate pages.
23126421ec76SMatthew Wilcox (Oracle)  * @gfp: GFP flags.
23136421ec76SMatthew Wilcox (Oracle)  * @order: Power of two of number of pages to allocate.
23141da177e4SLinus Torvalds  *
23156421ec76SMatthew Wilcox (Oracle)  * Allocate 1 << @order contiguous pages.  The physical address of the
23166421ec76SMatthew Wilcox (Oracle)  * first page is naturally aligned (e.g. an order-3 allocation will be aligned
23176421ec76SMatthew Wilcox (Oracle)  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
23186421ec76SMatthew Wilcox (Oracle)  * process is honoured when in process context.
23191da177e4SLinus Torvalds  *
23206421ec76SMatthew Wilcox (Oracle)  * Context: Can be called from any context, providing the appropriate GFP
23216421ec76SMatthew Wilcox (Oracle)  * flags are used.
23226421ec76SMatthew Wilcox (Oracle)  * Return: The page on success or NULL if allocation fails.
23231da177e4SLinus Torvalds  */
2324ddc1a5cbSHugh Dickins struct page *alloc_pages(gfp_t gfp, unsigned int order)
23251da177e4SLinus Torvalds {
23268d90274bSOleg Nesterov 	struct mempolicy *pol = &default_policy;
232752cd3b07SLee Schermerhorn 
232852cd3b07SLee Schermerhorn 	/*
232952cd3b07SLee Schermerhorn 	 * No reference counting needed for current->mempolicy
233052cd3b07SLee Schermerhorn 	 * nor system default_policy
233152cd3b07SLee Schermerhorn 	 */
2332ddc1a5cbSHugh Dickins 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2333ddc1a5cbSHugh Dickins 		pol = get_task_policy(current);
2334cc9a6c87SMel Gorman 
2335ddc1a5cbSHugh Dickins 	return alloc_pages_mpol(gfp, order,
2336ddc1a5cbSHugh Dickins 				pol, NO_INTERLEAVE_INDEX, numa_node_id());
23371da177e4SLinus Torvalds }
2338d7f946d0SMatthew Wilcox (Oracle) EXPORT_SYMBOL(alloc_pages);
23391da177e4SLinus Torvalds 
2340ddc1a5cbSHugh Dickins struct folio *folio_alloc(gfp_t gfp, unsigned int order)
2341cc09cb13SMatthew Wilcox (Oracle) {
234223e48832SHugh Dickins 	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
2343cc09cb13SMatthew Wilcox (Oracle) }
2344cc09cb13SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_alloc);
2345cc09cb13SMatthew Wilcox (Oracle) 
2346c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2347c00b6b96SChen Wandun 		struct mempolicy *pol, unsigned long nr_pages,
2348c00b6b96SChen Wandun 		struct page **page_array)
2349c00b6b96SChen Wandun {
2350c00b6b96SChen Wandun 	int nodes;
2351c00b6b96SChen Wandun 	unsigned long nr_pages_per_node;
2352c00b6b96SChen Wandun 	int delta;
2353c00b6b96SChen Wandun 	int i;
2354c00b6b96SChen Wandun 	unsigned long nr_allocated;
2355c00b6b96SChen Wandun 	unsigned long total_allocated = 0;
2356c00b6b96SChen Wandun 
2357c00b6b96SChen Wandun 	nodes = nodes_weight(pol->nodes);
2358c00b6b96SChen Wandun 	nr_pages_per_node = nr_pages / nodes;
2359c00b6b96SChen Wandun 	delta = nr_pages - nodes * nr_pages_per_node;
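	/*
	 * Worked example (illustrative): nr_pages = 10 over 3 nodes gives
	 * nr_pages_per_node = 3 and delta = 1, so the first node visited
	 * receives 4 pages and the remaining two receive 3 pages each.
	 */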
2360c00b6b96SChen Wandun 
2361c00b6b96SChen Wandun 	for (i = 0; i < nodes; i++) {
2362c00b6b96SChen Wandun 		if (delta) {
2363c00b6b96SChen Wandun 			nr_allocated = __alloc_pages_bulk(gfp,
2364c00b6b96SChen Wandun 					interleave_nodes(pol), NULL,
2365c00b6b96SChen Wandun 					nr_pages_per_node + 1, NULL,
2366c00b6b96SChen Wandun 					page_array);
2367c00b6b96SChen Wandun 			delta--;
2368c00b6b96SChen Wandun 		} else {
2369c00b6b96SChen Wandun 			nr_allocated = __alloc_pages_bulk(gfp,
2370c00b6b96SChen Wandun 					interleave_nodes(pol), NULL,
2371c00b6b96SChen Wandun 					nr_pages_per_node, NULL, page_array);
2372c00b6b96SChen Wandun 		}
2373c00b6b96SChen Wandun 
2374c00b6b96SChen Wandun 		page_array += nr_allocated;
2375c00b6b96SChen Wandun 		total_allocated += nr_allocated;
2376c00b6b96SChen Wandun 	}
2377c00b6b96SChen Wandun 
2378c00b6b96SChen Wandun 	return total_allocated;
2379c00b6b96SChen Wandun }
2380c00b6b96SChen Wandun 
2381fa3bea4eSGregory Price static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
2382fa3bea4eSGregory Price 		struct mempolicy *pol, unsigned long nr_pages,
2383fa3bea4eSGregory Price 		struct page **page_array)
2384fa3bea4eSGregory Price {
2385fa3bea4eSGregory Price 	struct task_struct *me = current;
2386274519edSGregory Price 	unsigned int cpuset_mems_cookie;
2387fa3bea4eSGregory Price 	unsigned long total_allocated = 0;
2388fa3bea4eSGregory Price 	unsigned long nr_allocated = 0;
2389fa3bea4eSGregory Price 	unsigned long rounds;
2390fa3bea4eSGregory Price 	unsigned long node_pages, delta;
2391fa3bea4eSGregory Price 	u8 *table, *weights, weight;
2392fa3bea4eSGregory Price 	unsigned int weight_total = 0;
2393fa3bea4eSGregory Price 	unsigned long rem_pages = nr_pages;
2394fa3bea4eSGregory Price 	nodemask_t nodes;
2395fa3bea4eSGregory Price 	int nnodes, node;
2396fa3bea4eSGregory Price 	int resume_node = MAX_NUMNODES - 1;
2397fa3bea4eSGregory Price 	u8 resume_weight = 0;
2398fa3bea4eSGregory Price 	int prev_node;
2399fa3bea4eSGregory Price 	int i;
2400fa3bea4eSGregory Price 
2401fa3bea4eSGregory Price 	if (!nr_pages)
2402fa3bea4eSGregory Price 		return 0;
2403fa3bea4eSGregory Price 
2404274519edSGregory Price 	/* read the nodes onto the stack, retry if done during rebind */
2405274519edSGregory Price 	do {
2406274519edSGregory Price 		cpuset_mems_cookie = read_mems_allowed_begin();
2407fa3bea4eSGregory Price 		nnodes = read_once_policy_nodemask(pol, &nodes);
2408274519edSGregory Price 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2409274519edSGregory Price 
2410274519edSGregory Price 	/* if the nodemask has become invalid, we cannot do anything */
2411fa3bea4eSGregory Price 	if (!nnodes)
2412fa3bea4eSGregory Price 		return 0;
2413fa3bea4eSGregory Price 
2414fa3bea4eSGregory Price 	/* Continue allocating from most recent node and adjust the nr_pages */
2415fa3bea4eSGregory Price 	node = me->il_prev;
2416fa3bea4eSGregory Price 	weight = me->il_weight;
2417fa3bea4eSGregory Price 	if (weight && node_isset(node, nodes)) {
2418fa3bea4eSGregory Price 		node_pages = min(rem_pages, weight);
2419fa3bea4eSGregory Price 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2420fa3bea4eSGregory Price 						  NULL, page_array);
2421fa3bea4eSGregory Price 		page_array += nr_allocated;
2422fa3bea4eSGregory Price 		total_allocated += nr_allocated;
2423fa3bea4eSGregory Price 		/* if that's all the pages, no need to interleave */
2424fa3bea4eSGregory Price 		if (rem_pages <= weight) {
2425fa3bea4eSGregory Price 			me->il_weight -= rem_pages;
2426fa3bea4eSGregory Price 			return total_allocated;
2427fa3bea4eSGregory Price 		}
2428fa3bea4eSGregory Price 		/* Otherwise we adjust remaining pages, continue from there */
2429fa3bea4eSGregory Price 		rem_pages -= weight;
2430fa3bea4eSGregory Price 	}
2431fa3bea4eSGregory Price 	/* clear active weight in case of an allocation failure */
2432fa3bea4eSGregory Price 	me->il_weight = 0;
2433fa3bea4eSGregory Price 	prev_node = node;
2434fa3bea4eSGregory Price 
2435fa3bea4eSGregory Price 	/* create a local copy of node weights to operate on outside rcu */
2436fa3bea4eSGregory Price 	weights = kzalloc(nr_node_ids, GFP_KERNEL);
2437fa3bea4eSGregory Price 	if (!weights)
2438fa3bea4eSGregory Price 		return total_allocated;
2439fa3bea4eSGregory Price 
2440fa3bea4eSGregory Price 	rcu_read_lock();
2441fa3bea4eSGregory Price 	table = rcu_dereference(iw_table);
2442fa3bea4eSGregory Price 	if (table)
2443fa3bea4eSGregory Price 		memcpy(weights, table, nr_node_ids);
2444fa3bea4eSGregory Price 	rcu_read_unlock();
2445fa3bea4eSGregory Price 
2446fa3bea4eSGregory Price 	/* calculate total, detect system default usage */
2447fa3bea4eSGregory Price 	for_each_node_mask(node, nodes) {
2448fa3bea4eSGregory Price 		if (!weights[node])
2449fa3bea4eSGregory Price 			weights[node] = 1;
2450fa3bea4eSGregory Price 		weight_total += weights[node];
2451fa3bea4eSGregory Price 	}
2452fa3bea4eSGregory Price 
2453fa3bea4eSGregory Price 	/*
2454fa3bea4eSGregory Price 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2455fa3bea4eSGregory Price 	 * Track which node weighted interleave should resume from.
2456fa3bea4eSGregory Price 	 *
2457fa3bea4eSGregory Price 	 * If (rounds > 0) and (delta == 0), resume_node will always be
2458fa3bea4eSGregory Price 	 * the node following prev_node, and resume_weight its full weight.
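	 *
	 * Worked example (illustrative): nodes (0,1) with weights (3,1) and
	 * rem_pages = 10 give weight_total = 4, rounds = 2, delta = 2;
	 * node 0 receives 3 * 2 + 2 = 8 pages, node 1 receives 1 * 2 = 2,
	 * and the task resumes on node 0 with a residual weight of 1.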
2459fa3bea4eSGregory Price 	 */
2460fa3bea4eSGregory Price 	rounds = rem_pages / weight_total;
2461fa3bea4eSGregory Price 	delta = rem_pages % weight_total;
2462fa3bea4eSGregory Price 	resume_node = next_node_in(prev_node, nodes);
2463fa3bea4eSGregory Price 	resume_weight = weights[resume_node];
2464fa3bea4eSGregory Price 	for (i = 0; i < nnodes; i++) {
2465fa3bea4eSGregory Price 		node = next_node_in(prev_node, nodes);
2466fa3bea4eSGregory Price 		weight = weights[node];
2467fa3bea4eSGregory Price 		node_pages = weight * rounds;
2468fa3bea4eSGregory Price 		/* If a delta exists, add this node's portion of the delta */
2469fa3bea4eSGregory Price 		if (delta > weight) {
2470fa3bea4eSGregory Price 			node_pages += weight;
2471fa3bea4eSGregory Price 			delta -= weight;
2472fa3bea4eSGregory Price 		} else if (delta) {
2473fa3bea4eSGregory Price 			/* when delta is depleted, resume from that node */
2474fa3bea4eSGregory Price 			node_pages += delta;
2475fa3bea4eSGregory Price 			resume_node = node;
2476fa3bea4eSGregory Price 			resume_weight = weight - delta;
2477fa3bea4eSGregory Price 			delta = 0;
2478fa3bea4eSGregory Price 		}
2479fa3bea4eSGregory Price 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
2480fa3bea4eSGregory Price 		if (!node_pages)
2481fa3bea4eSGregory Price 			break;
2482fa3bea4eSGregory Price 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2483fa3bea4eSGregory Price 						  NULL, page_array);
2484fa3bea4eSGregory Price 		page_array += nr_allocated;
2485fa3bea4eSGregory Price 		total_allocated += nr_allocated;
2486fa3bea4eSGregory Price 		if (total_allocated == nr_pages)
2487fa3bea4eSGregory Price 			break;
2488fa3bea4eSGregory Price 		prev_node = node;
2489fa3bea4eSGregory Price 	}
2490fa3bea4eSGregory Price 	me->il_prev = resume_node;
2491fa3bea4eSGregory Price 	me->il_weight = resume_weight;
2492fa3bea4eSGregory Price 	kfree(weights);
2493fa3bea4eSGregory Price 	return total_allocated;
2494fa3bea4eSGregory Price }
2495fa3bea4eSGregory Price 
2496c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2497c00b6b96SChen Wandun 		struct mempolicy *pol, unsigned long nr_pages,
2498c00b6b96SChen Wandun 		struct page **page_array)
2499c00b6b96SChen Wandun {
2500c00b6b96SChen Wandun 	gfp_t preferred_gfp;
2501c00b6b96SChen Wandun 	unsigned long nr_allocated = 0;
2502c00b6b96SChen Wandun 
2503c00b6b96SChen Wandun 	preferred_gfp = gfp | __GFP_NOWARN;
2504c00b6b96SChen Wandun 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2505c00b6b96SChen Wandun 
2506c00b6b96SChen Wandun 	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2507c00b6b96SChen Wandun 					   nr_pages, NULL, page_array);
2508c00b6b96SChen Wandun 
2509c00b6b96SChen Wandun 	if (nr_allocated < nr_pages)
2510c00b6b96SChen Wandun 		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2511c00b6b96SChen Wandun 				nr_pages - nr_allocated, NULL,
2512c00b6b96SChen Wandun 				page_array + nr_allocated);
2513c00b6b96SChen Wandun 	return nr_allocated;
2514c00b6b96SChen Wandun }
2515c00b6b96SChen Wandun 
2516c00b6b96SChen Wandun /* Bulk page allocation and mempolicy sometimes need to be considered
2517c00b6b96SChen Wandun  * together, in situations such as vmalloc.
2518c00b6b96SChen Wandun  *
2519c00b6b96SChen Wandun  * Handling both here can accelerate memory allocation, especially for
2520c00b6b96SChen Wandun  * interleaved allocations.
2521c00b6b96SChen Wandun  */
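/*
 * Note (illustrative, not a new guarantee): the return value is the number
 * of pages made available in page_array, which may be fewer than nr_pages;
 * callers such as vmalloc's page-array allocator treat a short count as a
 * partial allocation.
 */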
2522c00b6b96SChen Wandun unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2523c00b6b96SChen Wandun 		unsigned long nr_pages, struct page **page_array)
2524c00b6b96SChen Wandun {
2525c00b6b96SChen Wandun 	struct mempolicy *pol = &default_policy;
2526ddc1a5cbSHugh Dickins 	nodemask_t *nodemask;
2527ddc1a5cbSHugh Dickins 	int nid;
2528c00b6b96SChen Wandun 
2529c00b6b96SChen Wandun 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2530c00b6b96SChen Wandun 		pol = get_task_policy(current);
2531c00b6b96SChen Wandun 
2532c00b6b96SChen Wandun 	if (pol->mode == MPOL_INTERLEAVE)
2533c00b6b96SChen Wandun 		return alloc_pages_bulk_array_interleave(gfp, pol,
2534c00b6b96SChen Wandun 							 nr_pages, page_array);
2535c00b6b96SChen Wandun 
2536fa3bea4eSGregory Price 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2537fa3bea4eSGregory Price 		return alloc_pages_bulk_array_weighted_interleave(
2538fa3bea4eSGregory Price 				  gfp, pol, nr_pages, page_array);
2539fa3bea4eSGregory Price 
2540c00b6b96SChen Wandun 	if (pol->mode == MPOL_PREFERRED_MANY)
2541c00b6b96SChen Wandun 		return alloc_pages_bulk_array_preferred_many(gfp,
2542c00b6b96SChen Wandun 				numa_node_id(), pol, nr_pages, page_array);
2543c00b6b96SChen Wandun 
2544ddc1a5cbSHugh Dickins 	nid = numa_node_id();
2545ddc1a5cbSHugh Dickins 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2546ddc1a5cbSHugh Dickins 	return __alloc_pages_bulk(gfp, nid, nodemask,
2547ddc1a5cbSHugh Dickins 				  nr_pages, NULL, page_array);
2548c00b6b96SChen Wandun }
2549c00b6b96SChen Wandun 
2550ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2551ef0855d3SOleg Nesterov {
2552c36f6e6dSHugh Dickins 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2553ef0855d3SOleg Nesterov 
2554ef0855d3SOleg Nesterov 	if (IS_ERR(pol))
2555ef0855d3SOleg Nesterov 		return PTR_ERR(pol);
2556ef0855d3SOleg Nesterov 	dst->vm_policy = pol;
2557ef0855d3SOleg Nesterov 	return 0;
2558ef0855d3SOleg Nesterov }
2559ef0855d3SOleg Nesterov 
25604225399aSPaul Jackson /*
2561846a16bfSLee Schermerhorn  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
25624225399aSPaul Jackson  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
25634225399aSPaul Jackson  * with the mems_allowed returned by cpuset_mems_allowed().  This
25644225399aSPaul Jackson  * keeps mempolicies cpuset relative after its cpuset moves.  See
25654225399aSPaul Jackson  * further kernel/cpuset.c update_nodemask().
2566708c1bbcSMiao Xie  *
2567708c1bbcSMiao Xie  * current's mempolicy may be rebound by another task (the task that changes
2568708c1bbcSMiao Xie  * cpuset's mems), so we needn't do rebind work for the current task.
25694225399aSPaul Jackson  */
25704225399aSPaul Jackson 
2571846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */
2572846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old)
25731da177e4SLinus Torvalds {
25741da177e4SLinus Torvalds 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
25751da177e4SLinus Torvalds 
25761da177e4SLinus Torvalds 	if (!new)
25771da177e4SLinus Torvalds 		return ERR_PTR(-ENOMEM);
2578708c1bbcSMiao Xie 
2579708c1bbcSMiao Xie 	/* task's mempolicy is protected by alloc_lock */
2580708c1bbcSMiao Xie 	if (old == current->mempolicy) {
2581708c1bbcSMiao Xie 		task_lock(current);
2582708c1bbcSMiao Xie 		*new = *old;
2583708c1bbcSMiao Xie 		task_unlock(current);
2584708c1bbcSMiao Xie 	} else
2585708c1bbcSMiao Xie 		*new = *old;
2586708c1bbcSMiao Xie 
25874225399aSPaul Jackson 	if (current_cpuset_is_being_rebound()) {
25884225399aSPaul Jackson 		nodemask_t mems = cpuset_mems_allowed(current);
2589213980c0SVlastimil Babka 		mpol_rebind_policy(new, &mems);
25904225399aSPaul Jackson 	}
25911da177e4SLinus Torvalds 	atomic_set(&new->refcnt, 1);
25921da177e4SLinus Torvalds 	return new;
25931da177e4SLinus Torvalds }
25941da177e4SLinus Torvalds 
25951da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
2596fcfb4dccSKOSAKI Motohiro bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
25971da177e4SLinus Torvalds {
25981da177e4SLinus Torvalds 	if (!a || !b)
2599fcfb4dccSKOSAKI Motohiro 		return false;
260045c4745aSLee Schermerhorn 	if (a->mode != b->mode)
2601fcfb4dccSKOSAKI Motohiro 		return false;
260219800502SBob Liu 	if (a->flags != b->flags)
2603fcfb4dccSKOSAKI Motohiro 		return false;
2604c6018b4bSAneesh Kumar K.V 	if (a->home_node != b->home_node)
2605c6018b4bSAneesh Kumar K.V 		return false;
260619800502SBob Liu 	if (mpol_store_user_nodemask(a))
260719800502SBob Liu 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2608fcfb4dccSKOSAKI Motohiro 			return false;
260919800502SBob Liu 
261045c4745aSLee Schermerhorn 	switch (a->mode) {
261119770b32SMel Gorman 	case MPOL_BIND:
26121da177e4SLinus Torvalds 	case MPOL_INTERLEAVE:
26131da177e4SLinus Torvalds 	case MPOL_PREFERRED:
2614b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
2615fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
2616269fbe72SBen Widawsky 		return !!nodes_equal(a->nodes, b->nodes);
26177858d7bcSFeng Tang 	case MPOL_LOCAL:
26187858d7bcSFeng Tang 		return true;
26191da177e4SLinus Torvalds 	default:
26201da177e4SLinus Torvalds 		BUG();
2621fcfb4dccSKOSAKI Motohiro 		return false;
26221da177e4SLinus Torvalds 	}
26231da177e4SLinus Torvalds }
26241da177e4SLinus Torvalds 
26251da177e4SLinus Torvalds /*
26261da177e4SLinus Torvalds  * Shared memory backing store policy support.
26271da177e4SLinus Torvalds  *
26281da177e4SLinus Torvalds  * Remember policies even when nobody has shared memory mapped.
26291da177e4SLinus Torvalds  * The policies are kept in a red-black tree linked from the inode.
26304a8c7bb5SNathan Zimmer  * They are protected by the sp->lock rwlock, which should be held
26311da177e4SLinus Torvalds  * for any accesses to the tree.
26321da177e4SLinus Torvalds  */
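/*
 * Illustrative usage (by convention of the callers, not enforced here):
 * tmpfs installs per-range policies via mpol_set_shared_policy() when
 * mbind() is applied to a mapping of the file, and looks them up at
 * allocation time via mpol_shared_policy_lookup().
 */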
26331da177e4SLinus Torvalds 
26344a8c7bb5SNathan Zimmer /*
26354a8c7bb5SNathan Zimmer  * Look up the first element intersecting start-end.  Caller holds sp->lock for
26364a8c7bb5SNathan Zimmer  * reading or for writing
26374a8c7bb5SNathan Zimmer  */
263893397c3bSHugh Dickins static struct sp_node *sp_lookup(struct shared_policy *sp,
263993397c3bSHugh Dickins 					pgoff_t start, pgoff_t end)
26401da177e4SLinus Torvalds {
26411da177e4SLinus Torvalds 	struct rb_node *n = sp->root.rb_node;
26421da177e4SLinus Torvalds 
26431da177e4SLinus Torvalds 	while (n) {
26441da177e4SLinus Torvalds 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
26451da177e4SLinus Torvalds 
26461da177e4SLinus Torvalds 		if (start >= p->end)
26471da177e4SLinus Torvalds 			n = n->rb_right;
26481da177e4SLinus Torvalds 		else if (end <= p->start)
26491da177e4SLinus Torvalds 			n = n->rb_left;
26501da177e4SLinus Torvalds 		else
26511da177e4SLinus Torvalds 			break;
26521da177e4SLinus Torvalds 	}
26531da177e4SLinus Torvalds 	if (!n)
26541da177e4SLinus Torvalds 		return NULL;
26551da177e4SLinus Torvalds 	for (;;) {
26561da177e4SLinus Torvalds 		struct sp_node *w = NULL;
26571da177e4SLinus Torvalds 		struct rb_node *prev = rb_prev(n);
26581da177e4SLinus Torvalds 		if (!prev)
26591da177e4SLinus Torvalds 			break;
26601da177e4SLinus Torvalds 		w = rb_entry(prev, struct sp_node, nd);
26611da177e4SLinus Torvalds 		if (w->end <= start)
26621da177e4SLinus Torvalds 			break;
26631da177e4SLinus Torvalds 		n = prev;
26641da177e4SLinus Torvalds 	}
26651da177e4SLinus Torvalds 	return rb_entry(n, struct sp_node, nd);
26661da177e4SLinus Torvalds }
26671da177e4SLinus Torvalds 
26684a8c7bb5SNathan Zimmer /*
26694a8c7bb5SNathan Zimmer  * Insert a new shared policy into the list.  Caller holds sp->lock for
26704a8c7bb5SNathan Zimmer  * writing.
26714a8c7bb5SNathan Zimmer  */
26721da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
26731da177e4SLinus Torvalds {
26741da177e4SLinus Torvalds 	struct rb_node **p = &sp->root.rb_node;
26751da177e4SLinus Torvalds 	struct rb_node *parent = NULL;
26761da177e4SLinus Torvalds 	struct sp_node *nd;
26771da177e4SLinus Torvalds 
26781da177e4SLinus Torvalds 	while (*p) {
26791da177e4SLinus Torvalds 		parent = *p;
26801da177e4SLinus Torvalds 		nd = rb_entry(parent, struct sp_node, nd);
26811da177e4SLinus Torvalds 		if (new->start < nd->start)
26821da177e4SLinus Torvalds 			p = &(*p)->rb_left;
26831da177e4SLinus Torvalds 		else if (new->end > nd->end)
26841da177e4SLinus Torvalds 			p = &(*p)->rb_right;
26851da177e4SLinus Torvalds 		else
26861da177e4SLinus Torvalds 			BUG();
26871da177e4SLinus Torvalds 	}
26881da177e4SLinus Torvalds 	rb_link_node(&new->nd, parent, p);
26891da177e4SLinus Torvalds 	rb_insert_color(&new->nd, &sp->root);
26901da177e4SLinus Torvalds }
26911da177e4SLinus Torvalds 
26921da177e4SLinus Torvalds /* Find shared policy intersecting idx */
269393397c3bSHugh Dickins struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
269493397c3bSHugh Dickins 						pgoff_t idx)
26951da177e4SLinus Torvalds {
26961da177e4SLinus Torvalds 	struct mempolicy *pol = NULL;
26971da177e4SLinus Torvalds 	struct sp_node *sn;
26981da177e4SLinus Torvalds 
26991da177e4SLinus Torvalds 	if (!sp->root.rb_node)
27001da177e4SLinus Torvalds 		return NULL;
27014a8c7bb5SNathan Zimmer 	read_lock(&sp->lock);
27021da177e4SLinus Torvalds 	sn = sp_lookup(sp, idx, idx+1);
27031da177e4SLinus Torvalds 	if (sn) {
27041da177e4SLinus Torvalds 		mpol_get(sn->policy);
27051da177e4SLinus Torvalds 		pol = sn->policy;
27061da177e4SLinus Torvalds 	}
27074a8c7bb5SNathan Zimmer 	read_unlock(&sp->lock);
27081da177e4SLinus Torvalds 	return pol;
27091da177e4SLinus Torvalds }
27101da177e4SLinus Torvalds 
271163f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n)
271263f74ca2SKOSAKI Motohiro {
271363f74ca2SKOSAKI Motohiro 	mpol_put(n->policy);
271463f74ca2SKOSAKI Motohiro 	kmem_cache_free(sn_cache, n);
271563f74ca2SKOSAKI Motohiro }
271663f74ca2SKOSAKI Motohiro 
2717771fb4d8SLee Schermerhorn /**
271875c70128SKefeng Wang  * mpol_misplaced - check whether current folio node is valid in policy
2719771fb4d8SLee Schermerhorn  *
272075c70128SKefeng Wang  * @folio: folio to be checked
272175c70128SKefeng Wang  * @vma: vm area where folio mapped
272275c70128SKefeng Wang  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2723771fb4d8SLee Schermerhorn  *
272475c70128SKefeng Wang  * Look up the current policy node id for vma, addr and "compare to" folio's
27255f076944SMatthew Wilcox (Oracle)  * node id.  Policy determination "mimics" alloc_page_vma().
2726771fb4d8SLee Schermerhorn  * Called from fault path where we know the vma and faulting address.
27275f076944SMatthew Wilcox (Oracle)  *
2728062db293SBaolin Wang  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
272975c70128SKefeng Wang  * policy, or a suitable node ID to allocate a replacement folio from.
2730771fb4d8SLee Schermerhorn  */
273175c70128SKefeng Wang int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
273275c70128SKefeng Wang 		   unsigned long addr)
2733771fb4d8SLee Schermerhorn {
2734771fb4d8SLee Schermerhorn 	struct mempolicy *pol;
2735ddc1a5cbSHugh Dickins 	pgoff_t ilx;
2736c33d6c06SMel Gorman 	struct zoneref *z;
273775c70128SKefeng Wang 	int curnid = folio_nid(folio);
273890572890SPeter Zijlstra 	int thiscpu = raw_smp_processor_id();
273990572890SPeter Zijlstra 	int thisnid = cpu_to_node(thiscpu);
274098fa15f3SAnshuman Khandual 	int polnid = NUMA_NO_NODE;
2741062db293SBaolin Wang 	int ret = NUMA_NO_NODE;
2742771fb4d8SLee Schermerhorn 
2743ddc1a5cbSHugh Dickins 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2744771fb4d8SLee Schermerhorn 	if (!(pol->flags & MPOL_F_MOF))
2745771fb4d8SLee Schermerhorn 		goto out;
2746771fb4d8SLee Schermerhorn 
2747771fb4d8SLee Schermerhorn 	switch (pol->mode) {
2748771fb4d8SLee Schermerhorn 	case MPOL_INTERLEAVE:
2749ddc1a5cbSHugh Dickins 		polnid = interleave_nid(pol, ilx);
2750771fb4d8SLee Schermerhorn 		break;
2751771fb4d8SLee Schermerhorn 
2752fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
2753fa3bea4eSGregory Price 		polnid = weighted_interleave_nid(pol, ilx);
2754fa3bea4eSGregory Price 		break;
2755fa3bea4eSGregory Price 
2756771fb4d8SLee Schermerhorn 	case MPOL_PREFERRED:
2757b27abaccSDave Hansen 		if (node_isset(curnid, pol->nodes))
2758b27abaccSDave Hansen 			goto out;
2759269fbe72SBen Widawsky 		polnid = first_node(pol->nodes);
2760771fb4d8SLee Schermerhorn 		break;
2761771fb4d8SLee Schermerhorn 
27627858d7bcSFeng Tang 	case MPOL_LOCAL:
27637858d7bcSFeng Tang 		polnid = numa_node_id();
27647858d7bcSFeng Tang 		break;
27657858d7bcSFeng Tang 
2766771fb4d8SLee Schermerhorn 	case MPOL_BIND:
2767bda420b9SHuang Ying 		/* Optimize placement among multiple nodes via NUMA balancing */
2768bda420b9SHuang Ying 		if (pol->flags & MPOL_F_MORON) {
2769269fbe72SBen Widawsky 			if (node_isset(thisnid, pol->nodes))
2770bda420b9SHuang Ying 				break;
2771bda420b9SHuang Ying 			goto out;
2772bda420b9SHuang Ying 		}
2773b27abaccSDave Hansen 		fallthrough;
2774c33d6c06SMel Gorman 
2775b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
2776771fb4d8SLee Schermerhorn 		/*
2777771fb4d8SLee Schermerhorn 		 * use current page if in policy nodemask,
2778771fb4d8SLee Schermerhorn 		 * else select nearest allowed node, if any.
2779771fb4d8SLee Schermerhorn 		 * If no allowed nodes, use current [!misplaced].
2780771fb4d8SLee Schermerhorn 		 */
2781269fbe72SBen Widawsky 		if (node_isset(curnid, pol->nodes))
2782771fb4d8SLee Schermerhorn 			goto out;
2783c33d6c06SMel Gorman 		z = first_zones_zonelist(
2784771fb4d8SLee Schermerhorn 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2785771fb4d8SLee Schermerhorn 				gfp_zone(GFP_HIGHUSER),
2786269fbe72SBen Widawsky 				&pol->nodes);
2787c1093b74SPavel Tatashin 		polnid = zone_to_nid(z->zone);
2788771fb4d8SLee Schermerhorn 		break;
2789771fb4d8SLee Schermerhorn 
2790771fb4d8SLee Schermerhorn 	default:
2791771fb4d8SLee Schermerhorn 		BUG();
2792771fb4d8SLee Schermerhorn 	}
27935606e387SMel Gorman 
279475c70128SKefeng Wang 	/* Migrate the folio towards the node whose CPU is referencing it */
2795e42c8ff2SMel Gorman 	if (pol->flags & MPOL_F_MORON) {
279690572890SPeter Zijlstra 		polnid = thisnid;
27975606e387SMel Gorman 
27988c9ae56dSKefeng Wang 		if (!should_numa_migrate_memory(current, folio, curnid,
279975c70128SKefeng Wang 						thiscpu))
2800de1c9ce6SRik van Riel 			goto out;
2801de1c9ce6SRik van Riel 	}
2802e42c8ff2SMel Gorman 
2803771fb4d8SLee Schermerhorn 	if (curnid != polnid)
2804771fb4d8SLee Schermerhorn 		ret = polnid;
2805771fb4d8SLee Schermerhorn out:
2806771fb4d8SLee Schermerhorn 	mpol_cond_put(pol);
2807771fb4d8SLee Schermerhorn 
2808771fb4d8SLee Schermerhorn 	return ret;
2809771fb4d8SLee Schermerhorn }
2810771fb4d8SLee Schermerhorn 
2811c11600e4SDavid Rientjes /*
2812c11600e4SDavid Rientjes  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2813c11600e4SDavid Rientjes  * dropped after task->mempolicy is set to NULL so that any allocation done as
2814c11600e4SDavid Rientjes  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2815c11600e4SDavid Rientjes  * policy.
2816c11600e4SDavid Rientjes  */
2817c11600e4SDavid Rientjes void mpol_put_task_policy(struct task_struct *task)
2818c11600e4SDavid Rientjes {
2819c11600e4SDavid Rientjes 	struct mempolicy *pol;
2820c11600e4SDavid Rientjes 
2821c11600e4SDavid Rientjes 	task_lock(task);
2822c11600e4SDavid Rientjes 	pol = task->mempolicy;
2823c11600e4SDavid Rientjes 	task->mempolicy = NULL;
2824c11600e4SDavid Rientjes 	task_unlock(task);
2825c11600e4SDavid Rientjes 	mpol_put(pol);
2826c11600e4SDavid Rientjes }
2827c11600e4SDavid Rientjes 
28281da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
28291da177e4SLinus Torvalds {
28301da177e4SLinus Torvalds 	rb_erase(&n->nd, &sp->root);
283163f74ca2SKOSAKI Motohiro 	sp_free(n);
28321da177e4SLinus Torvalds }
28331da177e4SLinus Torvalds 
283442288fe3SMel Gorman static void sp_node_init(struct sp_node *node, unsigned long start,
283542288fe3SMel Gorman 			unsigned long end, struct mempolicy *pol)
283642288fe3SMel Gorman {
283742288fe3SMel Gorman 	node->start = start;
283842288fe3SMel Gorman 	node->end = end;
283942288fe3SMel Gorman 	node->policy = pol;
284042288fe3SMel Gorman }
284142288fe3SMel Gorman 
2842dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2843dbcb0f19SAdrian Bunk 				struct mempolicy *pol)
28441da177e4SLinus Torvalds {
2845869833f2SKOSAKI Motohiro 	struct sp_node *n;
2846869833f2SKOSAKI Motohiro 	struct mempolicy *newpol;
28471da177e4SLinus Torvalds 
2848869833f2SKOSAKI Motohiro 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
28491da177e4SLinus Torvalds 	if (!n)
28501da177e4SLinus Torvalds 		return NULL;
2851869833f2SKOSAKI Motohiro 
2852869833f2SKOSAKI Motohiro 	newpol = mpol_dup(pol);
2853869833f2SKOSAKI Motohiro 	if (IS_ERR(newpol)) {
2854869833f2SKOSAKI Motohiro 		kmem_cache_free(sn_cache, n);
2855869833f2SKOSAKI Motohiro 		return NULL;
2856869833f2SKOSAKI Motohiro 	}
2857869833f2SKOSAKI Motohiro 	newpol->flags |= MPOL_F_SHARED;
285842288fe3SMel Gorman 	sp_node_init(n, start, end, newpol);
2859869833f2SKOSAKI Motohiro 
28601da177e4SLinus Torvalds 	return n;
28611da177e4SLinus Torvalds }
28621da177e4SLinus Torvalds 
28631da177e4SLinus Torvalds /* Replace a policy range. */
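/*
 * Worked example (illustrative): with an existing node covering pgoff
 * [0, 10) and a replacement for [3, 6), the old node is trimmed to [0, 3),
 * the new node [3, 6) is inserted, and a copy of the old policy is
 * re-inserted for the remainder [6, 10) via n_new/mpol_new.
 */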
286493397c3bSHugh Dickins static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
286593397c3bSHugh Dickins 				 pgoff_t end, struct sp_node *new)
28661da177e4SLinus Torvalds {
2867b22d127aSMel Gorman 	struct sp_node *n;
286842288fe3SMel Gorman 	struct sp_node *n_new = NULL;
286942288fe3SMel Gorman 	struct mempolicy *mpol_new = NULL;
2870b22d127aSMel Gorman 	int ret = 0;
28711da177e4SLinus Torvalds 
287242288fe3SMel Gorman restart:
28734a8c7bb5SNathan Zimmer 	write_lock(&sp->lock);
28741da177e4SLinus Torvalds 	n = sp_lookup(sp, start, end);
28751da177e4SLinus Torvalds 	/* Take care of old policies in the same range. */
28761da177e4SLinus Torvalds 	while (n && n->start < end) {
28771da177e4SLinus Torvalds 		struct rb_node *next = rb_next(&n->nd);
28781da177e4SLinus Torvalds 		if (n->start >= start) {
28791da177e4SLinus Torvalds 			if (n->end <= end)
28801da177e4SLinus Torvalds 				sp_delete(sp, n);
28811da177e4SLinus Torvalds 			else
28821da177e4SLinus Torvalds 				n->start = end;
28831da177e4SLinus Torvalds 		} else {
28841da177e4SLinus Torvalds 			/* Old policy spanning whole new range. */
28851da177e4SLinus Torvalds 			if (n->end > end) {
288642288fe3SMel Gorman 				if (!n_new)
288742288fe3SMel Gorman 					goto alloc_new;
288842288fe3SMel Gorman 
288942288fe3SMel Gorman 				*mpol_new = *n->policy;
289042288fe3SMel Gorman 				atomic_set(&mpol_new->refcnt, 1);
28917880639cSKOSAKI Motohiro 				sp_node_init(n_new, end, n->end, mpol_new);
28921da177e4SLinus Torvalds 				n->end = start;
28935ca39575SHillf Danton 				sp_insert(sp, n_new);
289442288fe3SMel Gorman 				n_new = NULL;
289542288fe3SMel Gorman 				mpol_new = NULL;
28961da177e4SLinus Torvalds 				break;
28971da177e4SLinus Torvalds 			} else
28981da177e4SLinus Torvalds 				n->end = start;
28991da177e4SLinus Torvalds 		}
29001da177e4SLinus Torvalds 		if (!next)
29011da177e4SLinus Torvalds 			break;
29021da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
29031da177e4SLinus Torvalds 	}
29041da177e4SLinus Torvalds 	if (new)
29051da177e4SLinus Torvalds 		sp_insert(sp, new);
29064a8c7bb5SNathan Zimmer 	write_unlock(&sp->lock);
290742288fe3SMel Gorman 	ret = 0;
290842288fe3SMel Gorman 
290942288fe3SMel Gorman err_out:
291042288fe3SMel Gorman 	if (mpol_new)
291142288fe3SMel Gorman 		mpol_put(mpol_new);
291242288fe3SMel Gorman 	if (n_new)
291342288fe3SMel Gorman 		kmem_cache_free(sn_cache, n_new);
291442288fe3SMel Gorman 
2915b22d127aSMel Gorman 	return ret;
291642288fe3SMel Gorman 
291742288fe3SMel Gorman alloc_new:
29184a8c7bb5SNathan Zimmer 	write_unlock(&sp->lock);
291942288fe3SMel Gorman 	ret = -ENOMEM;
292042288fe3SMel Gorman 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
292142288fe3SMel Gorman 	if (!n_new)
292242288fe3SMel Gorman 		goto err_out;
292342288fe3SMel Gorman 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
292442288fe3SMel Gorman 	if (!mpol_new)
292542288fe3SMel Gorman 		goto err_out;
29264ad09955SMiaohe Lin 	atomic_set(&mpol_new->refcnt, 1);
292742288fe3SMel Gorman 	goto restart;
29281da177e4SLinus Torvalds }
29291da177e4SLinus Torvalds 
293071fe804bSLee Schermerhorn /**
293171fe804bSLee Schermerhorn  * mpol_shared_policy_init - initialize shared policy for inode
293271fe804bSLee Schermerhorn  * @sp: pointer to inode shared policy
293371fe804bSLee Schermerhorn  * @mpol:  struct mempolicy to install
293471fe804bSLee Schermerhorn  *
293571fe804bSLee Schermerhorn  * Install non-NULL @mpol in inode's shared policy rb-tree.
293671fe804bSLee Schermerhorn  * On entry, the current task has a reference on a non-NULL @mpol.
293771fe804bSLee Schermerhorn  * This must be released on exit.
29384bfc4495SKAMEZAWA Hiroyuki  * This is called at get_inode() time, so GFP_KERNEL allocations can be used.
293971fe804bSLee Schermerhorn  */
294071fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
29417339ff83SRobin Holt {
294258568d2aSMiao Xie 	int ret;
294358568d2aSMiao Xie 
294471fe804bSLee Schermerhorn 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
29454a8c7bb5SNathan Zimmer 	rwlock_init(&sp->lock);
29467339ff83SRobin Holt 
294771fe804bSLee Schermerhorn 	if (mpol) {
294835ec8fa0SHugh Dickins 		struct sp_node *sn;
294935ec8fa0SHugh Dickins 		struct mempolicy *npol;
29504bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH(scratch);
29517339ff83SRobin Holt 
29524bfc4495SKAMEZAWA Hiroyuki 		if (!scratch)
29535c0c1654SLee Schermerhorn 			goto put_mpol;
295435ec8fa0SHugh Dickins 
295535ec8fa0SHugh Dickins 		/* contextualize the tmpfs mount point mempolicy to this file */
295635ec8fa0SHugh Dickins 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
295735ec8fa0SHugh Dickins 		if (IS_ERR(npol))
29580cae3457SDan Carpenter 			goto free_scratch; /* no valid nodemask intersection */
295958568d2aSMiao Xie 
296058568d2aSMiao Xie 		task_lock(current);
296135ec8fa0SHugh Dickins 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
296258568d2aSMiao Xie 		task_unlock(current);
296315d77835SLee Schermerhorn 		if (ret)
296435ec8fa0SHugh Dickins 			goto put_npol;
296571fe804bSLee Schermerhorn 
296635ec8fa0SHugh Dickins 		/* alloc node covering entire file; adds ref to file's npol */
296735ec8fa0SHugh Dickins 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
296835ec8fa0SHugh Dickins 		if (sn)
296935ec8fa0SHugh Dickins 			sp_insert(sp, sn);
297035ec8fa0SHugh Dickins put_npol:
297135ec8fa0SHugh Dickins 		mpol_put(npol);	/* drop initial ref on file's npol */
29720cae3457SDan Carpenter free_scratch:
29734bfc4495SKAMEZAWA Hiroyuki 		NODEMASK_SCRATCH_FREE(scratch);
29745c0c1654SLee Schermerhorn put_mpol:
29755c0c1654SLee Schermerhorn 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
29767339ff83SRobin Holt 	}
29777339ff83SRobin Holt }
29787339ff83SRobin Holt 
2979c36f6e6dSHugh Dickins int mpol_set_shared_policy(struct shared_policy *sp,
2980c36f6e6dSHugh Dickins 			struct vm_area_struct *vma, struct mempolicy *pol)
29811da177e4SLinus Torvalds {
29821da177e4SLinus Torvalds 	int err;
29831da177e4SLinus Torvalds 	struct sp_node *new = NULL;
29841da177e4SLinus Torvalds 	unsigned long sz = vma_pages(vma);
29851da177e4SLinus Torvalds 
2986c36f6e6dSHugh Dickins 	if (pol) {
2987c36f6e6dSHugh Dickins 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
29881da177e4SLinus Torvalds 		if (!new)
29891da177e4SLinus Torvalds 			return -ENOMEM;
29901da177e4SLinus Torvalds 	}
2991c36f6e6dSHugh Dickins 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
29921da177e4SLinus Torvalds 	if (err && new)
299363f74ca2SKOSAKI Motohiro 		sp_free(new);
29941da177e4SLinus Torvalds 	return err;
29951da177e4SLinus Torvalds }
29961da177e4SLinus Torvalds 
29971da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
2998c36f6e6dSHugh Dickins void mpol_free_shared_policy(struct shared_policy *sp)
29991da177e4SLinus Torvalds {
30001da177e4SLinus Torvalds 	struct sp_node *n;
30011da177e4SLinus Torvalds 	struct rb_node *next;
30021da177e4SLinus Torvalds 
3003c36f6e6dSHugh Dickins 	if (!sp->root.rb_node)
30041da177e4SLinus Torvalds 		return;
3005c36f6e6dSHugh Dickins 	write_lock(&sp->lock);
3006c36f6e6dSHugh Dickins 	next = rb_first(&sp->root);
30071da177e4SLinus Torvalds 	while (next) {
30081da177e4SLinus Torvalds 		n = rb_entry(next, struct sp_node, nd);
30091da177e4SLinus Torvalds 		next = rb_next(&n->nd);
3010c36f6e6dSHugh Dickins 		sp_delete(sp, n);
30111da177e4SLinus Torvalds 	}
3012c36f6e6dSHugh Dickins 	write_unlock(&sp->lock);
30131da177e4SLinus Torvalds }
30141da177e4SLinus Torvalds 
30151a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING
3016c297663cSMel Gorman static int __initdata numabalancing_override;
30171a687c2eSMel Gorman 
30181a687c2eSMel Gorman static void __init check_numabalancing_enable(void)
30191a687c2eSMel Gorman {
30201a687c2eSMel Gorman 	bool numabalancing_default = false;
30211a687c2eSMel Gorman 
30221a687c2eSMel Gorman 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
30231a687c2eSMel Gorman 		numabalancing_default = true;
30241a687c2eSMel Gorman 
3025c297663cSMel Gorman 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
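	/*
	 * e.g. booting with "numa_balancing=disable" sets the override to -1
	 * and keeps balancing off even if the Kconfig default would enable it.
	 */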
3026c297663cSMel Gorman 	if (numabalancing_override)
3027c297663cSMel Gorman 		set_numabalancing_state(numabalancing_override == 1);
3028c297663cSMel Gorman 
3029b0dc2b9bSMel Gorman 	if (num_online_nodes() > 1 && !numabalancing_override) {
3030756a025fSJoe Perches 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3031c297663cSMel Gorman 			numabalancing_default ? "Enabling" : "Disabling");
30321a687c2eSMel Gorman 		set_numabalancing_state(numabalancing_default);
30331a687c2eSMel Gorman 	}
30341a687c2eSMel Gorman }
30351a687c2eSMel Gorman 
30361a687c2eSMel Gorman static int __init setup_numabalancing(char *str)
30371a687c2eSMel Gorman {
30381a687c2eSMel Gorman 	int ret = 0;
30391a687c2eSMel Gorman 	if (!str)
30401a687c2eSMel Gorman 		goto out;
30411a687c2eSMel Gorman 
30421a687c2eSMel Gorman 	if (!strcmp(str, "enable")) {
3043c297663cSMel Gorman 		numabalancing_override = 1;
30441a687c2eSMel Gorman 		ret = 1;
30451a687c2eSMel Gorman 	} else if (!strcmp(str, "disable")) {
3046c297663cSMel Gorman 		numabalancing_override = -1;
30471a687c2eSMel Gorman 		ret = 1;
30481a687c2eSMel Gorman 	}
30491a687c2eSMel Gorman out:
30501a687c2eSMel Gorman 	if (!ret)
30514a404beaSAndrew Morton 		pr_warn("Unable to parse numa_balancing=\n");
30521a687c2eSMel Gorman 
30531a687c2eSMel Gorman 	return ret;
30541a687c2eSMel Gorman }
30551a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing);
30561a687c2eSMel Gorman #else
30571a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void)
30581a687c2eSMel Gorman {
30591a687c2eSMel Gorman }
30601a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */
30611a687c2eSMel Gorman 
30621da177e4SLinus Torvalds void __init numa_policy_init(void)
30631da177e4SLinus Torvalds {
3064b71636e2SPaul Mundt 	nodemask_t interleave_nodes;
3065b71636e2SPaul Mundt 	unsigned long largest = 0;
3066b71636e2SPaul Mundt 	int nid, prefer = 0;
3067b71636e2SPaul Mundt 
30681da177e4SLinus Torvalds 	policy_cache = kmem_cache_create("numa_policy",
30691da177e4SLinus Torvalds 					 sizeof(struct mempolicy),
307020c2df83SPaul Mundt 					 0, SLAB_PANIC, NULL);
30711da177e4SLinus Torvalds 
30721da177e4SLinus Torvalds 	sn_cache = kmem_cache_create("shared_policy_node",
30731da177e4SLinus Torvalds 				     sizeof(struct sp_node),
307420c2df83SPaul Mundt 				     0, SLAB_PANIC, NULL);
30751da177e4SLinus Torvalds 
30765606e387SMel Gorman 	for_each_node(nid) {
30775606e387SMel Gorman 		preferred_node_policy[nid] = (struct mempolicy) {
30785606e387SMel Gorman 			.refcnt = ATOMIC_INIT(1),
30795606e387SMel Gorman 			.mode = MPOL_PREFERRED,
30805606e387SMel Gorman 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3081269fbe72SBen Widawsky 			.nodes = nodemask_of_node(nid),
30825606e387SMel Gorman 		};
30835606e387SMel Gorman 	}
30845606e387SMel Gorman 
3085b71636e2SPaul Mundt 	/*
3086b71636e2SPaul Mundt 	 * Set interleaving policy for system init. Interleaving is only
3087b71636e2SPaul Mundt 	 * enabled across suitably sized nodes (default is >= 16MB), falling
3088b71636e2SPaul Mundt 	 * back to the largest node if they're all smaller.
3089b71636e2SPaul Mundt 	 */
3090b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
309101f13bd6SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
3092b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
30931da177e4SLinus Torvalds 
3094b71636e2SPaul Mundt 		/* Preserve the largest node */
3095b71636e2SPaul Mundt 		if (largest < total_pages) {
3096b71636e2SPaul Mundt 			largest = total_pages;
3097b71636e2SPaul Mundt 			prefer = nid;
3098b71636e2SPaul Mundt 		}
3099b71636e2SPaul Mundt 
3100b71636e2SPaul Mundt 		/* Interleave this node? */
3101b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3102b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
3103b71636e2SPaul Mundt 	}
3104b71636e2SPaul Mundt 
3105b71636e2SPaul Mundt 	/* All too small, use the largest */
3106b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
3107b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
3108b71636e2SPaul Mundt 
3109028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3110b1de0d13SMitchel Humpherys 		pr_err("%s: interleaving failed\n", __func__);
31111a687c2eSMel Gorman 
31121a687c2eSMel Gorman 	check_numabalancing_enable();
31131da177e4SLinus Torvalds }
31141da177e4SLinus Torvalds 
31158bccd85fSChristoph Lameter /* Reset policy of current process to default */
31161da177e4SLinus Torvalds void numa_default_policy(void)
31171da177e4SLinus Torvalds {
3118028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
31191da177e4SLinus Torvalds }
312068860ec1SPaul Jackson 
31214225399aSPaul Jackson /*
3122095f1fc4SLee Schermerhorn  * Parse and format mempolicy from/to strings
3123095f1fc4SLee Schermerhorn  */
3124345ace9cSLee Schermerhorn static const char * const policy_modes[] =
3125345ace9cSLee Schermerhorn {
3126345ace9cSLee Schermerhorn 	[MPOL_DEFAULT]    = "default",
3127345ace9cSLee Schermerhorn 	[MPOL_PREFERRED]  = "prefer",
3128345ace9cSLee Schermerhorn 	[MPOL_BIND]       = "bind",
3129345ace9cSLee Schermerhorn 	[MPOL_INTERLEAVE] = "interleave",
3130fa3bea4eSGregory Price 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3131d3a71033SLee Schermerhorn 	[MPOL_LOCAL]      = "local",
3132b27abaccSDave Hansen 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
3133345ace9cSLee Schermerhorn };
31341a75a6c8SChristoph Lameter 
3135095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS
3136095f1fc4SLee Schermerhorn /**
3137f2a07f40SHugh Dickins  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3138095f1fc4SLee Schermerhorn  * @str:  string containing mempolicy to parse
313971fe804bSLee Schermerhorn  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3140095f1fc4SLee Schermerhorn  *
3141095f1fc4SLee Schermerhorn  * Format of input:
3142095f1fc4SLee Schermerhorn  *	<mode>[=<flags>][:<nodelist>]
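 *
 *	Illustrative examples (node numbers are assumptions):
 *	"bind:0-3", "interleave=static:0,2", "prefer:1", "local".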
3143095f1fc4SLee Schermerhorn  *
3144dad5b023SRandy Dunlap  * Return: %0 on success, else %1
3145095f1fc4SLee Schermerhorn  */
3146a7a88b23SHugh Dickins int mpol_parse_str(char *str, struct mempolicy **mpol)
3147095f1fc4SLee Schermerhorn {
314871fe804bSLee Schermerhorn 	struct mempolicy *new = NULL;
3149f2a07f40SHugh Dickins 	unsigned short mode_flags;
315071fe804bSLee Schermerhorn 	nodemask_t nodes;
3151095f1fc4SLee Schermerhorn 	char *nodelist = strchr(str, ':');
3152095f1fc4SLee Schermerhorn 	char *flags = strchr(str, '=');
3153dedf2c73Szhong jiang 	int err = 1, mode;
3154095f1fc4SLee Schermerhorn 
3155c7a91bc7SDan Carpenter 	if (flags)
3156c7a91bc7SDan Carpenter 		*flags++ = '\0';	/* terminate mode string */
3157c7a91bc7SDan Carpenter 
3158095f1fc4SLee Schermerhorn 	if (nodelist) {
3159095f1fc4SLee Schermerhorn 		/* NUL-terminate mode or flags string */
3160095f1fc4SLee Schermerhorn 		*nodelist++ = '\0';
316171fe804bSLee Schermerhorn 		if (nodelist_parse(nodelist, nodes))
3162095f1fc4SLee Schermerhorn 			goto out;
316301f13bd6SLai Jiangshan 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3164095f1fc4SLee Schermerhorn 			goto out;
316571fe804bSLee Schermerhorn 	} else
316671fe804bSLee Schermerhorn 		nodes_clear(nodes);
316771fe804bSLee Schermerhorn 
3168dedf2c73Szhong jiang 	mode = match_string(policy_modes, MPOL_MAX, str);
3169dedf2c73Szhong jiang 	if (mode < 0)
3170095f1fc4SLee Schermerhorn 		goto out;
3171095f1fc4SLee Schermerhorn 
317271fe804bSLee Schermerhorn 	switch (mode) {
3173095f1fc4SLee Schermerhorn 	case MPOL_PREFERRED:
317471fe804bSLee Schermerhorn 		/*
3175aa9f7d51SRandy Dunlap 		 * Insist on a nodelist of one node only.  Later we use
3176aa9f7d51SRandy Dunlap 		 * first_node(nodes) to grab a single node, so here the
3177aa9f7d51SRandy Dunlap 		 * nodelist (or nodes) cannot be empty.
317871fe804bSLee Schermerhorn 		 */
3179095f1fc4SLee Schermerhorn 		if (nodelist) {
3180095f1fc4SLee Schermerhorn 			char *rest = nodelist;
3181095f1fc4SLee Schermerhorn 			while (isdigit(*rest))
3182095f1fc4SLee Schermerhorn 				rest++;
3183926f2ae0SKOSAKI Motohiro 			if (*rest)
3184926f2ae0SKOSAKI Motohiro 				goto out;
3185aa9f7d51SRandy Dunlap 			if (nodes_empty(nodes))
3186aa9f7d51SRandy Dunlap 				goto out;
3187095f1fc4SLee Schermerhorn 		}
3188095f1fc4SLee Schermerhorn 		break;
3189095f1fc4SLee Schermerhorn 	case MPOL_INTERLEAVE:
3190fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
3191095f1fc4SLee Schermerhorn 		/*
3192095f1fc4SLee Schermerhorn 		 * Default to online nodes with memory if no nodelist
3193095f1fc4SLee Schermerhorn 		 */
3194095f1fc4SLee Schermerhorn 		if (!nodelist)
319501f13bd6SLai Jiangshan 			nodes = node_states[N_MEMORY];
31963f226aa1SLee Schermerhorn 		break;
319771fe804bSLee Schermerhorn 	case MPOL_LOCAL:
31983f226aa1SLee Schermerhorn 		/*
319971fe804bSLee Schermerhorn 		 * Don't allow a nodelist;  mpol_new() checks flags
32003f226aa1SLee Schermerhorn 		 */
320171fe804bSLee Schermerhorn 		if (nodelist)
32023f226aa1SLee Schermerhorn 			goto out;
32033f226aa1SLee Schermerhorn 		break;
3204413b43deSRavikiran G Thirumalai 	case MPOL_DEFAULT:
3205413b43deSRavikiran G Thirumalai 		/*
3206413b43deSRavikiran G Thirumalai 		 * Insist on an empty nodelist
3207413b43deSRavikiran G Thirumalai 		 */
3208413b43deSRavikiran G Thirumalai 		if (!nodelist)
3209413b43deSRavikiran G Thirumalai 			err = 0;
3210413b43deSRavikiran G Thirumalai 		goto out;
3211b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
3212d69b2e63SKOSAKI Motohiro 	case MPOL_BIND:
321371fe804bSLee Schermerhorn 		/*
3214d69b2e63SKOSAKI Motohiro 		 * Insist on a nodelist
321571fe804bSLee Schermerhorn 		 */
3216d69b2e63SKOSAKI Motohiro 		if (!nodelist)
3217d69b2e63SKOSAKI Motohiro 			goto out;
3218095f1fc4SLee Schermerhorn 	}
3219095f1fc4SLee Schermerhorn 
322071fe804bSLee Schermerhorn 	mode_flags = 0;
3221095f1fc4SLee Schermerhorn 	if (flags) {
3222095f1fc4SLee Schermerhorn 		/*
3223095f1fc4SLee Schermerhorn 		 * Currently, we only support two mutually exclusive
3224095f1fc4SLee Schermerhorn 		 * mode flags.
3225095f1fc4SLee Schermerhorn 		 */
3226095f1fc4SLee Schermerhorn 		if (!strcmp(flags, "static"))
322771fe804bSLee Schermerhorn 			mode_flags |= MPOL_F_STATIC_NODES;
3228095f1fc4SLee Schermerhorn 		else if (!strcmp(flags, "relative"))
322971fe804bSLee Schermerhorn 			mode_flags |= MPOL_F_RELATIVE_NODES;
3230095f1fc4SLee Schermerhorn 		else
3231926f2ae0SKOSAKI Motohiro 			goto out;
3232095f1fc4SLee Schermerhorn 	}
323371fe804bSLee Schermerhorn 
323471fe804bSLee Schermerhorn 	new = mpol_new(mode, mode_flags, &nodes);
323571fe804bSLee Schermerhorn 	if (IS_ERR(new))
3236926f2ae0SKOSAKI Motohiro 		goto out;
3237926f2ae0SKOSAKI Motohiro 
3238f2a07f40SHugh Dickins 	/*
3239f2a07f40SHugh Dickins 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3240f2a07f40SHugh Dickins 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3241f2a07f40SHugh Dickins 	 */
3242269fbe72SBen Widawsky 	if (mode != MPOL_PREFERRED) {
3243269fbe72SBen Widawsky 		new->nodes = nodes;
3244269fbe72SBen Widawsky 	} else if (nodelist) {
3245269fbe72SBen Widawsky 		nodes_clear(new->nodes);
3246269fbe72SBen Widawsky 		node_set(first_node(nodes), new->nodes);
3247269fbe72SBen Widawsky 	} else {
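		/* A bare "prefer" with no nodelist means local allocation */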
32487858d7bcSFeng Tang 		new->mode = MPOL_LOCAL;
3249269fbe72SBen Widawsky 	}
3250f2a07f40SHugh Dickins 
3251f2a07f40SHugh Dickins 	/*
3252f2a07f40SHugh Dickins 	 * Save nodes for contextualization: this will be used to "clone"
3253f2a07f40SHugh Dickins 	 * the mempolicy in a specific context [cpuset] at a later time.
3254f2a07f40SHugh Dickins 	 */
3255e17f74afSLee Schermerhorn 	new->w.user_nodemask = nodes;
3256f2a07f40SHugh Dickins 
3257926f2ae0SKOSAKI Motohiro 	err = 0;
325871fe804bSLee Schermerhorn 
3259095f1fc4SLee Schermerhorn out:
3260095f1fc4SLee Schermerhorn 	/* Restore string for error message */
3261095f1fc4SLee Schermerhorn 	if (nodelist)
3262095f1fc4SLee Schermerhorn 		*--nodelist = ':';
3263095f1fc4SLee Schermerhorn 	if (flags)
3264095f1fc4SLee Schermerhorn 		*--flags = '=';
326571fe804bSLee Schermerhorn 	if (!err)
326671fe804bSLee Schermerhorn 		*mpol = new;
3267095f1fc4SLee Schermerhorn 	return err;
3268095f1fc4SLee Schermerhorn }
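
/*
 * Example (illustrative): for a tmpfs mount such as
 *
 *	mount -t tmpfs -o mpol=interleave=static:0-3 tmpfs /mnt
 *
 * the option value "interleave=static:0-3" is passed here, and on success
 * (provided nodes 0-3 all have memory) *mpol is a policy with mode
 * MPOL_INTERLEAVE, flag MPOL_F_STATIC_NODES and w.user_nodemask containing
 * nodes 0-3.  Note that @str is modified in place during parsing and
 * restored before returning.
 */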
3269095f1fc4SLee Schermerhorn #endif /* CONFIG_TMPFS */
3270095f1fc4SLee Schermerhorn 
327171fe804bSLee Schermerhorn /**
327271fe804bSLee Schermerhorn  * mpol_to_str - format a mempolicy structure for printing
327371fe804bSLee Schermerhorn  * @buffer:  to contain formatted mempolicy string
327471fe804bSLee Schermerhorn  * @maxlen:  length of @buffer
327571fe804bSLee Schermerhorn  * @pol:  pointer to mempolicy to be formatted
327671fe804bSLee Schermerhorn  *
3277948927eeSDavid Rientjes  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3278948927eeSDavid Rientjes  * Recommend a @maxlen of at least 41 for the longest mode, "weighted
3279948927eeSDavid Rientjes  * interleave", the longest flag, "relative", and to display a few node ids.
32801a75a6c8SChristoph Lameter  */
3281948927eeSDavid Rientjes void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
32821a75a6c8SChristoph Lameter {
32831a75a6c8SChristoph Lameter 	char *p = buffer;
3284948927eeSDavid Rientjes 	nodemask_t nodes = NODE_MASK_NONE;
3285948927eeSDavid Rientjes 	unsigned short mode = MPOL_DEFAULT;
3286948927eeSDavid Rientjes 	unsigned short flags = 0;
32871a75a6c8SChristoph Lameter 
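	/*
	 * A NULL @pol, the system default policy and NUMA-balancing
	 * (MPOL_F_MORON) policies are all reported as "default".
	 */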
32888790c71aSDavid Rientjes 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3289bea904d5SLee Schermerhorn 		mode = pol->mode;
3290948927eeSDavid Rientjes 		flags = pol->flags;
3291948927eeSDavid Rientjes 	}
3292bea904d5SLee Schermerhorn 
32931a75a6c8SChristoph Lameter 	switch (mode) {
32941a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
32957858d7bcSFeng Tang 	case MPOL_LOCAL:
32961a75a6c8SChristoph Lameter 		break;
32971a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
3298b27abaccSDave Hansen 	case MPOL_PREFERRED_MANY:
32991a75a6c8SChristoph Lameter 	case MPOL_BIND:
33001a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
3301fa3bea4eSGregory Price 	case MPOL_WEIGHTED_INTERLEAVE:
3302269fbe72SBen Widawsky 		nodes = pol->nodes;
33031a75a6c8SChristoph Lameter 		break;
33041a75a6c8SChristoph Lameter 	default:
3305948927eeSDavid Rientjes 		WARN_ON_ONCE(1);
3306948927eeSDavid Rientjes 		snprintf(p, maxlen, "unknown");
3307948927eeSDavid Rientjes 		return;
33081a75a6c8SChristoph Lameter 	}
33091a75a6c8SChristoph Lameter 
3310b7a9f420SDavid Rientjes 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
33111a75a6c8SChristoph Lameter 
3312fc36b8d3SLee Schermerhorn 	if (flags & MPOL_MODE_FLAGS) {
3313948927eeSDavid Rientjes 		p += snprintf(p, buffer + maxlen - p, "=");
3314f5b087b5SDavid Rientjes 
33152291990aSLee Schermerhorn 		/*
33162291990aSLee Schermerhorn 		 * Currently, the only defined flags are mutually exclusive
33172291990aSLee Schermerhorn 		 */
3318f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
33192291990aSLee Schermerhorn 			p += snprintf(p, buffer + maxlen - p, "static");
33202291990aSLee Schermerhorn 		else if (flags & MPOL_F_RELATIVE_NODES)
33212291990aSLee Schermerhorn 			p += snprintf(p, buffer + maxlen - p, "relative");
3322f5b087b5SDavid Rientjes 	}
3323f5b087b5SDavid Rientjes 
33249e763e0fSTejun Heo 	if (!nodes_empty(nodes))
33259e763e0fSTejun Heo 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
33269e763e0fSTejun Heo 			       nodemask_pr_args(&nodes));
33271a75a6c8SChristoph Lameter }
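
/*
 * Example (illustrative): a policy of MPOL_INTERLEAVE with
 * MPOL_F_RELATIVE_NODES over nodes 0-3 is formatted as
 *
 *	interleave=relative:0-3
 *
 * i.e. the same <mode>[=<flags>][:<nodelist>] syntax that mpol_parse_str()
 * accepts.
 */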
3328dce41f5aSRakie Kim 
3329dce41f5aSRakie Kim #ifdef CONFIG_SYSFS
3330dce41f5aSRakie Kim struct iw_node_attr {
3331dce41f5aSRakie Kim 	struct kobj_attribute kobj_attr;
3332dce41f5aSRakie Kim 	int nid;
3333dce41f5aSRakie Kim };
3334dce41f5aSRakie Kim 
3335dce41f5aSRakie Kim static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3336dce41f5aSRakie Kim 			 char *buf)
3337dce41f5aSRakie Kim {
3338dce41f5aSRakie Kim 	struct iw_node_attr *node_attr;
3339dce41f5aSRakie Kim 	u8 weight;
3340dce41f5aSRakie Kim 
3341dce41f5aSRakie Kim 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3342dce41f5aSRakie Kim 	weight = get_il_weight(node_attr->nid);
3343dce41f5aSRakie Kim 	return sysfs_emit(buf, "%d\n", weight);
3344dce41f5aSRakie Kim }
3345dce41f5aSRakie Kim 
3346dce41f5aSRakie Kim static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3347dce41f5aSRakie Kim 			  const char *buf, size_t count)
3348dce41f5aSRakie Kim {
3349dce41f5aSRakie Kim 	struct iw_node_attr *node_attr;
3350dce41f5aSRakie Kim 	u8 *new;
3351dce41f5aSRakie Kim 	u8 *old;
3352dce41f5aSRakie Kim 	u8 weight = 0;
3353dce41f5aSRakie Kim 
3354dce41f5aSRakie Kim 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3355dce41f5aSRakie Kim 	if (count == 0 || sysfs_streq(buf, ""))
3356dce41f5aSRakie Kim 		weight = 0;
3357dce41f5aSRakie Kim 	else if (kstrtou8(buf, 0, &weight))
3358dce41f5aSRakie Kim 		return -EINVAL;
3359dce41f5aSRakie Kim 
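	/*
	 * Swap in the new weight with a copy/update/publish scheme: copy
	 * the current table into a freshly allocated one, patch this
	 * node's entry, publish it with rcu_assign_pointer(), then wait
	 * for readers of the old table before freeing it.
	 */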
3360dce41f5aSRakie Kim 	new = kzalloc(nr_node_ids, GFP_KERNEL);
3361dce41f5aSRakie Kim 	if (!new)
3362dce41f5aSRakie Kim 		return -ENOMEM;
3363dce41f5aSRakie Kim 
3364dce41f5aSRakie Kim 	mutex_lock(&iw_table_lock);
3365dce41f5aSRakie Kim 	old = rcu_dereference_protected(iw_table,
3366dce41f5aSRakie Kim 					lockdep_is_held(&iw_table_lock));
3367dce41f5aSRakie Kim 	if (old)
3368dce41f5aSRakie Kim 		memcpy(new, old, nr_node_ids);
3369dce41f5aSRakie Kim 	new[node_attr->nid] = weight;
3370dce41f5aSRakie Kim 	rcu_assign_pointer(iw_table, new);
3371dce41f5aSRakie Kim 	mutex_unlock(&iw_table_lock);
3372dce41f5aSRakie Kim 	synchronize_rcu();
3373dce41f5aSRakie Kim 	kfree(old);
3374dce41f5aSRakie Kim 	return count;
3375dce41f5aSRakie Kim }
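
/*
 * Example (illustrative), assuming sysfs is mounted at /sys: each node's
 * weight is exposed as /sys/kernel/mm/mempolicy/weighted_interleave/nodeN,
 * so
 *
 *	echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *
 * gives node 0 an interleave weight of 4, while writing an empty string
 * clears the stored entry back to 0 (the unset/default state).
 */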
3376dce41f5aSRakie Kim 
3377dce41f5aSRakie Kim static struct iw_node_attr **node_attrs;
3378dce41f5aSRakie Kim 
3379dce41f5aSRakie Kim static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
3380dce41f5aSRakie Kim 				  struct kobject *parent)
3381dce41f5aSRakie Kim {
3382dce41f5aSRakie Kim 	if (!node_attr)
3383dce41f5aSRakie Kim 		return;
3384dce41f5aSRakie Kim 	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
3385dce41f5aSRakie Kim 	kfree(node_attr->kobj_attr.attr.name);
3386dce41f5aSRakie Kim 	kfree(node_attr);
3387dce41f5aSRakie Kim }
3388dce41f5aSRakie Kim 
3389dce41f5aSRakie Kim static void sysfs_wi_release(struct kobject *wi_kobj)
3390dce41f5aSRakie Kim {
3391dce41f5aSRakie Kim 	int i;
3392dce41f5aSRakie Kim 
3393dce41f5aSRakie Kim 	for (i = 0; i < nr_node_ids; i++)
3394dce41f5aSRakie Kim 		sysfs_wi_node_release(node_attrs[i], wi_kobj);
3395dce41f5aSRakie Kim 	kobject_put(wi_kobj);
3396dce41f5aSRakie Kim }
3397dce41f5aSRakie Kim 
3398dce41f5aSRakie Kim static const struct kobj_type wi_ktype = {
3399dce41f5aSRakie Kim 	.sysfs_ops = &kobj_sysfs_ops,
3400dce41f5aSRakie Kim 	.release = sysfs_wi_release,
3401dce41f5aSRakie Kim };
3402dce41f5aSRakie Kim 
3403dce41f5aSRakie Kim static int add_weight_node(int nid, struct kobject *wi_kobj)
3404dce41f5aSRakie Kim {
3405dce41f5aSRakie Kim 	struct iw_node_attr *node_attr;
3406dce41f5aSRakie Kim 	char *name;
3407dce41f5aSRakie Kim 
3408dce41f5aSRakie Kim 	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
3409dce41f5aSRakie Kim 	if (!node_attr)
3410dce41f5aSRakie Kim 		return -ENOMEM;
3411dce41f5aSRakie Kim 
3412dce41f5aSRakie Kim 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3413dce41f5aSRakie Kim 	if (!name) {
3414dce41f5aSRakie Kim 		kfree(node_attr);
3415dce41f5aSRakie Kim 		return -ENOMEM;
3416dce41f5aSRakie Kim 	}
3417dce41f5aSRakie Kim 
3418dce41f5aSRakie Kim 	sysfs_attr_init(&node_attr->kobj_attr.attr);
3419dce41f5aSRakie Kim 	node_attr->kobj_attr.attr.name = name;
3420dce41f5aSRakie Kim 	node_attr->kobj_attr.attr.mode = 0644;
3421dce41f5aSRakie Kim 	node_attr->kobj_attr.show = node_show;
3422dce41f5aSRakie Kim 	node_attr->kobj_attr.store = node_store;
3423dce41f5aSRakie Kim 	node_attr->nid = nid;
3424dce41f5aSRakie Kim 
3425dce41f5aSRakie Kim 	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
3426dce41f5aSRakie Kim 		kfree(node_attr->kobj_attr.attr.name);
3427dce41f5aSRakie Kim 		kfree(node_attr);
3428dce41f5aSRakie Kim 		pr_err("failed to add attribute to weighted_interleave\n");
3429dce41f5aSRakie Kim 		return -ENOMEM;
3430dce41f5aSRakie Kim 	}
3431dce41f5aSRakie Kim 
3432dce41f5aSRakie Kim 	node_attrs[nid] = node_attr;
3433dce41f5aSRakie Kim 	return 0;
3434dce41f5aSRakie Kim }
3435dce41f5aSRakie Kim 
3436dce41f5aSRakie Kim static int add_weighted_interleave_group(struct kobject *root_kobj)
3437dce41f5aSRakie Kim {
3438dce41f5aSRakie Kim 	struct kobject *wi_kobj;
3439dce41f5aSRakie Kim 	int nid, err;
3440dce41f5aSRakie Kim 
3441dce41f5aSRakie Kim 	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
3442dce41f5aSRakie Kim 	if (!wi_kobj)
3443dce41f5aSRakie Kim 		return -ENOMEM;
3444dce41f5aSRakie Kim 
3445dce41f5aSRakie Kim 	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
3446dce41f5aSRakie Kim 				   "weighted_interleave");
3447dce41f5aSRakie Kim 	if (err) {
3448dce41f5aSRakie Kim 		kfree(wi_kobj);
3449dce41f5aSRakie Kim 		return err;
3450dce41f5aSRakie Kim 	}
3451dce41f5aSRakie Kim 
3452dce41f5aSRakie Kim 	for_each_node_state(nid, N_POSSIBLE) {
3453dce41f5aSRakie Kim 		err = add_weight_node(nid, wi_kobj);
3454dce41f5aSRakie Kim 		if (err) {
3455dce41f5aSRakie Kim 			pr_err("failed to add sysfs [node%d]\n", nid);
3456dce41f5aSRakie Kim 			break;
3457dce41f5aSRakie Kim 		}
3458dce41f5aSRakie Kim 	}
3459dce41f5aSRakie Kim 	if (err)
3460dce41f5aSRakie Kim 		kobject_put(wi_kobj);
3461dce41f5aSRakie Kim 	return err;
3462dce41f5aSRakie Kim }
3463dce41f5aSRakie Kim 
3464dce41f5aSRakie Kim static void mempolicy_kobj_release(struct kobject *kobj)
3465dce41f5aSRakie Kim {
3466dce41f5aSRakie Kim 	u8 *old;
3467dce41f5aSRakie Kim 
3468dce41f5aSRakie Kim 	mutex_lock(&iw_table_lock);
3469dce41f5aSRakie Kim 	old = rcu_dereference_protected(iw_table,
3470dce41f5aSRakie Kim 					lockdep_is_held(&iw_table_lock));
3471dce41f5aSRakie Kim 	rcu_assign_pointer(iw_table, NULL);
3472dce41f5aSRakie Kim 	mutex_unlock(&iw_table_lock);
3473dce41f5aSRakie Kim 	synchronize_rcu();
3474dce41f5aSRakie Kim 	kfree(old);
3475dce41f5aSRakie Kim 	kfree(node_attrs);
3476dce41f5aSRakie Kim 	kfree(kobj);
3477dce41f5aSRakie Kim }
3478dce41f5aSRakie Kim 
3479dce41f5aSRakie Kim static const struct kobj_type mempolicy_ktype = {
3480dce41f5aSRakie Kim 	.release = mempolicy_kobj_release
3481dce41f5aSRakie Kim };
3482dce41f5aSRakie Kim 
3483dce41f5aSRakie Kim static int __init mempolicy_sysfs_init(void)
3484dce41f5aSRakie Kim {
3485dce41f5aSRakie Kim 	int err;
3486dce41f5aSRakie Kim 	static struct kobject *mempolicy_kobj;
3487dce41f5aSRakie Kim 
3488dce41f5aSRakie Kim 	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
3489dce41f5aSRakie Kim 	if (!mempolicy_kobj) {
3490dce41f5aSRakie Kim 		err = -ENOMEM;
3491dce41f5aSRakie Kim 		goto err_out;
3492dce41f5aSRakie Kim 	}
3493dce41f5aSRakie Kim 
3494dce41f5aSRakie Kim 	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
3495dce41f5aSRakie Kim 			     GFP_KERNEL);
3496dce41f5aSRakie Kim 	if (!node_attrs) {
3497dce41f5aSRakie Kim 		err = -ENOMEM;
3498dce41f5aSRakie Kim 		goto mempol_out;
3499dce41f5aSRakie Kim 	}
3500dce41f5aSRakie Kim 
3501dce41f5aSRakie Kim 	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
3502dce41f5aSRakie Kim 				   "mempolicy");
3503dce41f5aSRakie Kim 	if (err)
3504dce41f5aSRakie Kim 		goto node_out;
3505dce41f5aSRakie Kim 
3506dce41f5aSRakie Kim 	err = add_weighted_interleave_group(mempolicy_kobj);
3507dce41f5aSRakie Kim 	if (err) {
3508dce41f5aSRakie Kim 		pr_err("mempolicy sysfs structure failed to initialize\n");
3509dce41f5aSRakie Kim 		kobject_put(mempolicy_kobj);
3510dce41f5aSRakie Kim 		return err;
3511dce41f5aSRakie Kim 	}
3512dce41f5aSRakie Kim 
3513dce41f5aSRakie Kim 	return err;
3514dce41f5aSRakie Kim node_out:
3515dce41f5aSRakie Kim 	kfree(node_attrs);
3516dce41f5aSRakie Kim mempol_out:
3517dce41f5aSRakie Kim 	kfree(mempolicy_kobj);
3518dce41f5aSRakie Kim err_out:
3519dce41f5aSRakie Kim 	pr_err("failed to add mempolicy kobject to the system\n");
3520dce41f5aSRakie Kim 	return err;
3521dce41f5aSRakie Kim }
3522dce41f5aSRakie Kim 
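/*
 * Runs as a late_initcall so that the core mm kobject (mm_kobj, exposed as
 * /sys/kernel/mm) is already registered by the time this executes.
 */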
3523dce41f5aSRakie Kim late_initcall(mempolicy_sysfs_init);
3524dce41f5aSRakie Kim #endif /* CONFIG_SYSFS */
3525