xref: /linux/mm/mempolicy.c (revision f087b0bad454a91c7d1615f82954a4752843560d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem kernel lowmem allocation don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always grateful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115 
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120 
121 #include "internal.h"
122 
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

/*
 * Slab caches used by this file: policy_cache backs struct mempolicy
 * allocations in mpol_new(); sn_cache is presumably for shared-policy
 * nodes used later in the file — confirm against the shared policy code.
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/* Per-node preferred policies; mode stays 0 until initialised during boot. */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	bool mode_auto;		/* auto (bandwidth-derived) vs. manual weights */
	u8 iw_table[];		/* per-node interleave weight (nr_node_ids entries) */
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);
169 
get_il_weight(int node)170 static u8 get_il_weight(int node)
171 {
172 	struct weighted_interleave_state *state;
173 	u8 weight = 1;
174 
175 	rcu_read_lock();
176 	state = rcu_dereference(wi_state);
177 	if (state)
178 		weight = state->iw_table[node];
179 	rcu_read_unlock();
180 	return weight;
181 }
182 
/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock.
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	/* Total bandwidth across all memory nodes; may exceed 32 bits. */
	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			/* No (or relatively tiny) bandwidth: minimum weight. */
			new_iw[nid] = 1;
		}
		/* Seed the GCD with the first weight computed. */
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}
219 
/*
 * mempolicy_set_node_perf - record @node's bandwidth and, in auto mode,
 * regenerate the weighted-interleave weight table from the new data.
 *
 * The stored bandwidth is min(read_bandwidth, write_bandwidth).
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	/* Start from all-ones weights; overwritten below in auto mode. */
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	/* Replace the bandwidth table wholesale; old one freed at "out". */
	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	/* Readers may still hold the old state: wait a grace period. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	kfree(old_bw);
	return 0;
}
276 
277 /**
278  * numa_nearest_node - Find nearest node by state
279  * @node: Node id to start the search
280  * @state: State to filter the search
281  *
282  * Lookup the closest node by distance if @nid is not in state.
283  *
284  * Return: this @node if it is in state, otherwise the closest node by distance
285  */
numa_nearest_node(int node,unsigned int state)286 int numa_nearest_node(int node, unsigned int state)
287 {
288 	int min_dist = INT_MAX, dist, n, min_node;
289 
290 	if (state >= NR_NODE_STATES)
291 		return -EINVAL;
292 
293 	if (node == NUMA_NO_NODE || node_state(node, state))
294 		return node;
295 
296 	min_node = node;
297 	for_each_node_state(n, state) {
298 		dist = node_distance(node, n);
299 		if (dist < min_dist) {
300 			min_dist = dist;
301 			min_node = n;
302 		}
303 	}
304 
305 	return min_node;
306 }
307 EXPORT_SYMBOL_GPL(numa_nearest_node);
308 
309 /**
310  * nearest_node_nodemask - Find the node in @mask at the nearest distance
311  *			   from @node.
312  *
313  * @node: a valid node ID to start the search from.
314  * @mask: a pointer to a nodemask representing the allowed nodes.
315  *
316  * This function iterates over all nodes in @mask and calculates the
317  * distance from the starting @node, then it returns the node ID that is
318  * the closest to @node, or MAX_NUMNODES if no node is found.
319  *
320  * Note that @node must be a valid node ID usable with node_distance(),
321  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
322  * or unexpected behavior.
323  */
nearest_node_nodemask(int node,nodemask_t * mask)324 int nearest_node_nodemask(int node, nodemask_t *mask)
325 {
326 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
327 
328 	for_each_node_mask(n, *mask) {
329 		dist = node_distance(node, n);
330 		if (dist < min_dist) {
331 			min_dist = dist;
332 			min_node = n;
333 		}
334 	}
335 
336 	return min_node;
337 }
338 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
339 
get_task_policy(struct task_struct * p)340 struct mempolicy *get_task_policy(struct task_struct *p)
341 {
342 	struct mempolicy *pol = p->mempolicy;
343 	int node;
344 
345 	if (pol)
346 		return pol;
347 
348 	node = numa_node_id();
349 	if (node != NUMA_NO_NODE) {
350 		pol = &preferred_node_policy[node];
351 		/* preferred_node_policy is not initialised early in boot */
352 		if (pol->mode)
353 			return pol;
354 	}
355 
356 	return &default_policy;
357 }
358 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
359 
/* Per-mode callbacks; the table itself is defined later in this file. */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Does this policy remember the nodemask the user originally passed in? */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_USER_NODEMASK_FLAGS;
}
369 
/*
 * MPOL_F_RELATIVE_NODES helper: fold @orig down to the number of nodes in
 * @rel, then map the folded mask onto @rel's actual node numbers into @ret.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
377 
/* .create for nodemask-carrying modes: require and store a non-empty mask. */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

/* .create for MPOL_PREFERRED: keep only the first node of the mask. */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}
395 
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	/* Restrict the user's nodes to what the cpuset currently allows. */
	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	/* Remember what future rebinds should be computed against. */
	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}
436 
/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 *
 * Returns NULL for MPOL_DEFAULT, an ERR_PTR() for invalid mode/flags/nodes
 * combinations or allocation failure, otherwise a policy with refcount 1.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	if (mode == MPOL_DEFAULT) {
		/* MPOL_DEFAULT takes no nodes; NULL means "no policy". */
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			/* Empty preferred mask means local allocation. */
			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		/* MPOL_LOCAL accepts neither nodes nor nodemask flags. */
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}
484 
/*
 * Slow path of a mpol destructor: drop a reference and free the policy
 * (after an RCU grace period) when the last reference goes away.
 */
void __mpol_put(struct mempolicy *pol)
{
	if (!atomic_dec_and_test(&pol->refcnt))
		return;
	/*
	 * Required to allow mmap_lock_speculative*() access, see for example
	 * futex_key_to_node_opt(). All accesses are serialized by mmap_lock,
	 * however the speculative lock section unbound by the normal lock
	 * boundaries, requiring RCU freeing.
	 */
	kfree_rcu(pol, rcu);
}
EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
499 
/* MPOL_DEFAULT/MPOL_LOCAL carry no nodemask: nothing to rebind. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

/* Rebind a nodemask-carrying policy (bind/interleave) to new allowed nodes. */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	/* Static: intersect the user's original mask with the new nodes. */
	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	/* Relative: refold the user's mask onto the new node set. */
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/* Otherwise remap old nodes position-wise onto the new set. */
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	/* Never leave the policy with an empty nodemask. */
	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

/* Preferred policies only track the currently allowed nodes. */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
529 
/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	/* Nothing to do when the allowed set has not actually changed. */
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
558 
/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		/* Exclude lock-free vma readers while the policy changes. */
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}
576 
/*
 * Per-mode operations: .create validates/stores the nodemask at policy
 * creation time, .rebind adjusts the policy when the allowed nodes change.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};
605 
606 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
607 				unsigned long flags);
608 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
609 				pgoff_t ilx, int *nid);
610 
strictly_unmovable(unsigned long flags)611 static bool strictly_unmovable(unsigned long flags)
612 {
613 	/*
614 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
615 	 * if any misplaced page is found.
616 	 */
617 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
618 			 MPOL_MF_STRICT;
619 }
620 
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;
	pgoff_t ilx;
};

/* State shared across the page-table-walk callbacks of the queue_pages walk. */
struct queue_pages {
	struct list_head *pagelist;	/* isolated folios queued for migration */
	unsigned long flags;		/* MPOL_MF_* control flags */
	nodemask_t *nmask;		/* nodemask the folios are checked against */
	unsigned long start;		/* walked range; used by */
	unsigned long end;		/* queue_pages_test_walk() hole checks */
	struct vm_area_struct *first;	/* first vma seen by the walk */
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
636 
/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
					struct queue_pages *qp)
{
	int nid = folio_nid(folio);
	unsigned long flags = qp->flags;

	/* MPOL_MF_INVERT flips the sense of the nodemask membership test. */
	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
651 
/*
 * Handle a PMD-mapped THP during the queue_pages walk: queue it for
 * migration if required, counting folios that could not be queued in
 * qp->nr_failed.  Called under the PMD lock taken by the caller.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	/* A THP already under migration cannot be isolated again now. */
	if (unlikely(pmd_is_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	/* The huge zero folio is never queued; skip the whole range. */
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}
673 
/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A PMD-mapped THP is handled as a single unit. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* Page table went away under us; ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			const softleaf_t entry = softleaf_from_pte(ptent);

			/* A folio already under migration counts as failed. */
			if (softleaf_is_migration(entry))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* Step over all consecutive PTEs of this folio at once. */
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, pte, ptent, max_nr);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
770 
/*
 * Handle one hugetlb entry during the queue_pages walk: isolate and queue
 * a misplaced folio, counting folios that cannot be queued in qp->nr_failed.
 * Returns -EIO in the strictly-unmovable case, otherwise 0.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptep;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptep = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptep)) {
		if (!huge_pte_none(ptep)) {
			const softleaf_t entry = softleaf_from_pte(ptep);

			/* A folio already under migration counts as failed. */
			if (unlikely(softleaf_is_migration(entry)))
				qp->nr_failed++;
		}

		goto unlock;
	}
	folio = pfn_folio(pte_pfn(ptep));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
820 
821 #ifdef CONFIG_NUMA_BALANCING
/**
 * folio_can_map_prot_numa() - check whether the folio can map prot numa
 * @folio: The folio whose mapping considered for being made NUMA hintable
 * @vma: The VMA that the folio belongs to.
 * @is_private_single_threaded: Is this a single-threaded private VMA or not
 *
 * This function checks to see if the folio actually indicates that
 * we need to make the mapping one which causes a NUMA hinting fault,
 * as there are cases where it's simply unnecessary, and the folio's
 * access time is adjusted for memory tiering if prot numa needed.
 *
 * Return: True if the mapping of the folio needs to be changed, false otherwise.
 */
bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
		bool is_private_single_threaded)
{
	int nid;

	/* Zone-device and KSM folios are not candidates for NUMA hinting. */
	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
		return false;

	/* Also skip shared copy-on-write folios */
	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
		return false;

	/* Folios are pinned and can't be migrated */
	if (folio_maybe_dma_pinned(folio))
		return false;

	/*
	 * While migration can move some dirty folios,
	 * it cannot move them all from MIGRATE_ASYNC
	 * context.
	 */
	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
		return false;

	/*
	 * Don't mess with PTEs if folio is already on the node
	 * a single-threaded process is running on.
	 */
	nid = folio_nid(folio);
	if (is_private_single_threaded && (nid == numa_node_id()))
		return false;

	/*
	 * Skip scanning top tier node if normal numa
	 * balancing is disabled
	 */
	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	    node_is_toptier(nid))
		return false;

	/* Record the access time used by memory-tiering promotion decisions. */
	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	return true;
}
880 
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns change_protection()'s count of updated PTEs (as unsigned long).
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
908 #endif /* CONFIG_NUMA_BALANCING */
909 
/*
 * Per-vma gate for the queue_pages walk.
 *
 * Returns 0 to scan the vma's page tables, 1 to skip this vma, or
 * -EFAULT when the range has a hole and MPOL_MF_DISCONTIG_OK is not set.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}
950 
/* Walk callbacks used when the caller only holds mmap_lock for read */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};
957 
/*
 * Same callbacks, but write-lock each vma during the walk (MPOL_MF_WRLOCK):
 * prevents concurrent faults from inserting pages the scan would miss.
 */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
964 
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};
	/* MPOL_MF_WRLOCK selects the walk variant that write-locks each vma */
	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err ? : qp.nr_failed;
}
1004 
/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_lock held for writing.
 *
 * Duplicates @pol, pushes the copy into the backing object when the vma
 * has a ->set_policy op, installs it as vma->vm_policy, and drops the
 * reference on the previous policy.  Returns 0 or a negative errno.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	/* take a private copy; the caller keeps ownership of @pol */
	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}
1037 
/*
 * Split or merge the VMA (if required) and apply the new policy to the
 * part of @vma overlapping [start, end).  On success *prev is updated to
 * the (possibly merged) vma so the caller's iteration stays coherent.
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	/* clamp the affected span to this vma */
	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* nothing to do when the vma already carries an equal policy */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	/* split/merge as needed; may hand back a different, merged vma */
	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}
1065 
/*
 * Set the process memory policy.
 *
 * Builds a new mempolicy from @mode/@flags/@nodes, installs it as
 * current->mempolicy under the task's alloc_lock, and drops the old
 * policy.  Returns 0 or a negative errno.
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* task_lock serializes against readers of current->mempolicy */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		/* restart interleaving from the first node in the mask */
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
1105 
1106 /*
1107  * Return nodemask for policy for get_mempolicy() query
1108  *
1109  * Called with task's alloc_lock held
1110  */
get_policy_nodemask(struct mempolicy * pol,nodemask_t * nodes)1111 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1112 {
1113 	nodes_clear(*nodes);
1114 	if (pol == &default_policy)
1115 		return;
1116 
1117 	switch (pol->mode) {
1118 	case MPOL_BIND:
1119 	case MPOL_INTERLEAVE:
1120 	case MPOL_PREFERRED:
1121 	case MPOL_PREFERRED_MANY:
1122 	case MPOL_WEIGHTED_INTERLEAVE:
1123 		*nodes = pol->nodes;
1124 		break;
1125 	case MPOL_LOCAL:
1126 		/* return empty node mask for local allocation */
1127 		break;
1128 	default:
1129 		BUG();
1130 	}
1131 }
1132 
lookup_node(struct mm_struct * mm,unsigned long addr)1133 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1134 {
1135 	struct page *p = NULL;
1136 	int ret;
1137 
1138 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1139 	if (ret > 0) {
1140 		ret = page_to_nid(p);
1141 		put_page(p);
1142 	}
1143 	return ret;
1144 }
1145 
/*
 * Retrieve NUMA policy — backend of get_mempolicy(2).
 *
 * @policy: out: policy mode (or a node id when MPOL_F_NODE is set)
 * @nmask:  out: the policy's nodemask, when non-NULL
 * @addr:   address to query (only valid with MPOL_F_ADDR)
 * @flags:  MPOL_F_NODE, MPOL_F_ADDR and/or MPOL_F_MEMS_ALLOWED
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MEMS_ALLOWED cannot be combined with NODE or ADDR */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* report the node interleaving would use next */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/* il_prev still has weight left: it is the next node */
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1247 
1248 #ifdef CONFIG_MIGRATION
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)1249 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1250 				unsigned long flags)
1251 {
1252 	/*
1253 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1254 	 * Choosing not to migrate a shared folio is not counted as a failure.
1255 	 *
1256 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1257 	 * cannot easily detect if a folio is shared.
1258 	 */
1259 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1260 		if (folio_isolate_lru(folio)) {
1261 			list_add_tail(&folio->lru, foliolist);
1262 			node_stat_mod_folio(folio,
1263 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1264 				folio_nr_pages(folio));
1265 		} else {
1266 			/*
1267 			 * Non-movable folio may reach here.  And, there may be
1268 			 * temporary off LRU folios or non-LRU movable folios.
1269 			 * Treat them as unmovable folios since they can't be
1270 			 * isolated, so they can't be moved at the moment.
1271 			 */
1272 			return false;
1273 		}
1274 	}
1275 	return true;
1276 }
1277 
/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 *
 * @flags must contain MPOL_MF_MOVE or MPOL_MF_MOVE_ALL; the whole of
 * @mm's user address space is scanned for pages on @source.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	/* __GFP_THISNODE: allocate replacement pages strictly on @dest */
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* fold isolation failures into the migration-failure count */
	if (err >= 0)
		err += nr_failed;
	return err;
}
1329 
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of page that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	/* clamp: the (int) return cannot represent larger counts */
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
1430 
/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 *
 * @private carries a struct migration_mpol (policy + interleave index
 * base) cast to unsigned long, as set up by do_mbind().
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	/* bias the interleave index by the folio's offset in its mapping */
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
1465 #else
1466 
/* !CONFIG_MIGRATION stub: folios are never isolated/queued for migration. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}
1472 
/* !CONFIG_MIGRATION stub: page migration is unsupported in this config. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1478 
/* !CONFIG_MIGRATION stub: no migration targets are ever allocated. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
1484 #endif
1485 
/*
 * Apply policy @mode/@mode_flags/@nmask to [start, start+len) — the
 * backend of mbind(2).  Depending on @flags this may also migrate
 * already-present pages to conform to the new policy (MPOL_MF_MOVE /
 * MPOL_MF_MOVE_ALL) and/or fail strictly on misplaced pages
 * (MPOL_MF_STRICT).  Returns 0 or a negative errno.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	/* mpol_new() returns NULL (not an error) for MPOL_DEFAULT */
	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* mmap_write_lock stays held on success */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		/* install the new policy across every vma in the range */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* find the first non-KSM folio queued for migration */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
1630 
1631 /*
1632  * User space interface with variable sized bitmaps for nodelists.
1633  */
/*
 * Copy a user-space bitmap of @maxnode bits into @mask, transparently
 * handling the compat (32-bit word) layout.  Any bits above @maxnode
 * in the last word are cleared.  Returns 0 or -EFAULT.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);

	if (in_compat_syscall()) {
		if (compat_get_bitmap(mask,
				      (const compat_ulong_t __user *)nmask,
				      maxnode))
			return -EFAULT;
	} else if (copy_from_user(mask, nmask,
				  nlongs * sizeof(unsigned long))) {
		return -EFAULT;
	}

	/* clear stray bits beyond maxnode in the final word */
	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}
1656 
/*
 * Copy a node mask from user space.
 *
 * @maxnode counts bits, so after the decrement the highest usable bit
 * index is maxnode-1.  Bits beyond MAX_NUMNODES must all be zero or the
 * call fails with -EINVAL.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/* partial word: only bits >= MAX_NUMNODES must be 0 */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
1692 
/*
 * Copy a kernel node mask to user space.
 *
 * When the user buffer covers more bits than the kernel tracks
 * (nr_node_ids), the excess tail is zero-filled.  Handles the compat
 * (32-bit word) layout.  Returns 0, -EINVAL or -EFAULT.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* zero the user tail beyond what the kernel mask covers */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
1719 
1720 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)1721 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1722 {
1723 	*flags = *mode & MPOL_MODE_FLAGS;
1724 	*mode &= ~MPOL_MODE_FLAGS;
1725 
1726 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1727 		return -EINVAL;
1728 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1729 		return -EINVAL;
1730 	if (*flags & MPOL_F_NUMA_BALANCING) {
1731 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1732 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1733 		else
1734 			return -EINVAL;
1735 	}
1736 	return 0;
1737 }
1738 
/*
 * Common entry for the mbind(2) syscall: validate the mode/flags word,
 * copy the nodemask in from user space, then hand off to do_mbind().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (!err)
		err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
1759 
/*
 * set_mempolicy_home_node(2): set the preferred ("home") node for the
 * MPOL_BIND / MPOL_PREFERRED_MANY policies covering [start, start+len).
 * Any vma in the range with a different policy mode yields -EOPNOTSUPP;
 * vmas without their own policy are skipped.
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* work on a copy so the shared policy is never half-updated */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1826 
/* mbind(2): set the NUMA memory policy for a range of this process. */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1833 
1834 /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1835 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1836 				 unsigned long maxnode)
1837 {
1838 	unsigned short mode_flags;
1839 	nodemask_t nodes;
1840 	int lmode = mode;
1841 	int err;
1842 
1843 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1844 	if (err)
1845 		return err;
1846 
1847 	err = get_nodes(&nodes, nmask, maxnode);
1848 	if (err)
1849 		return err;
1850 
1851 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1852 }
1853 
/* set_mempolicy(2): set the default NUMA policy of the calling process. */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1859 
/*
 * Backend of migrate_pages(2): move all pages of task @pid that reside
 * on the nodes in @old_nodes to the corresponding nodes in @new_nodes,
 * after ptrace, cpuset and LSM permission checks.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* restrict the destination to nodes the caller itself may use */
	task_nodes = cpuset_mems_allowed(current);
	if (!nodes_and(*new, *new, task_nodes))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
1945 
/* Move pages of a process between nodes; see kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1952 
1953 /* Retrieve NUMA policy */
kernel_get_mempolicy(int __user * policy,unsigned long __user * nmask,unsigned long maxnode,unsigned long addr,unsigned long flags)1954 static int kernel_get_mempolicy(int __user *policy,
1955 				unsigned long __user *nmask,
1956 				unsigned long maxnode,
1957 				unsigned long addr,
1958 				unsigned long flags)
1959 {
1960 	int err;
1961 	int pval;
1962 	nodemask_t nodes;
1963 
1964 	if (nmask != NULL && maxnode < nr_node_ids)
1965 		return -EINVAL;
1966 
1967 	addr = untagged_addr(addr);
1968 
1969 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1970 
1971 	if (err)
1972 		return err;
1973 
1974 	if (policy && put_user(pval, policy))
1975 		return -EFAULT;
1976 
1977 	if (nmask)
1978 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1979 
1980 	return err;
1981 }
1982 
/* Query memory policy for the process or a VMA; see kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1989 
vma_migratable(struct vm_area_struct * vma)1990 bool vma_migratable(struct vm_area_struct *vma)
1991 {
1992 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1993 		return false;
1994 
1995 	/*
1996 	 * DAX device mappings require predictable access latency, so avoid
1997 	 * incurring periodic faults.
1998 	 */
1999 	if (vma_is_dax(vma))
2000 		return false;
2001 
2002 	if (is_vm_hugetlb_page(vma) &&
2003 		!hugepage_migration_supported(hstate_vma(vma)))
2004 		return false;
2005 
2006 	/*
2007 	 * Migration allocates pages in the highest zone. If we cannot
2008 	 * do so then migration (at least from node to node) is not
2009 	 * possible.
2010 	 */
2011 	if (vma->vm_file &&
2012 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
2013 			< policy_zone)
2014 		return false;
2015 	return true;
2016 }
2017 
/*
 * Return the policy attached to @vma at @addr without any fallback to the
 * task or system default policy.  *@ilx is reset to 0; a vm_ops
 * get_policy() callback may update it.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr, ilx);
	return vma->vm_policy;
}
2025 
2026 /*
2027  * get_vma_policy(@vma, @addr, @order, @ilx)
2028  * @vma: virtual memory area whose policy is sought
2029  * @addr: address in @vma for shared policy lookup
2030  * @order: 0, or appropriate huge_page_order for interleaving
2031  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2032  *       MPOL_WEIGHTED_INTERLEAVE
2033  *
2034  * Returns effective policy for a VMA at specified address.
2035  * Falls back to current->mempolicy or system default policy, as necessary.
2036  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2037  * count--added by the get_policy() vm_op, as appropriate--to protect against
2038  * freeing by another task.  It is the caller's responsibility to free the
2039  * extra reference for shared policies.
2040  */
get_vma_policy(struct vm_area_struct * vma,unsigned long addr,int order,pgoff_t * ilx)2041 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
2042 				 unsigned long addr, int order, pgoff_t *ilx)
2043 {
2044 	struct mempolicy *pol;
2045 
2046 	pol = __get_vma_policy(vma, addr, ilx);
2047 	if (!pol)
2048 		pol = get_task_policy(current);
2049 	if (pol->mode == MPOL_INTERLEAVE ||
2050 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2051 		*ilx += vma->vm_pgoff >> order;
2052 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2053 	}
2054 	return pol;
2055 }
2056 
vma_policy_mof(struct vm_area_struct * vma)2057 bool vma_policy_mof(struct vm_area_struct *vma)
2058 {
2059 	struct mempolicy *pol;
2060 
2061 	if (vma->vm_ops && vma->vm_ops->get_policy) {
2062 		bool ret = false;
2063 		pgoff_t ilx;		/* ignored here */
2064 
2065 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2066 		if (pol && (pol->flags & MPOL_F_MOF))
2067 			ret = true;
2068 		mpol_cond_put(pol);
2069 
2070 		return ret;
2071 	}
2072 
2073 	pol = vma->vm_policy;
2074 	if (!pol)
2075 		pol = get_task_policy(current);
2076 
2077 	return pol->flags & MPOL_F_MOF;
2078 }
2079 
/*
 * Decide whether @policy's nodemask should constrain an allocation whose
 * gfp maps to @zone: true when @zone is at or above the zone the policy
 * effectively applies to.
 */
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->nodes is intersect with node_states[N_MEMORY].
	 * so if the following test fails, it implies
	 * policy->nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}
2099 
/*
 * Pick the next node for the current task under MPOL_WEIGHTED_INTERLEAVE.
 * Consumes one unit of the per-task weight (current->il_weight); when the
 * weight is exhausted (or the remembered node left the nodemask), advances
 * to the next node in @policy->nodes and reloads its weight.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		if (node == MAX_NUMNODES)
			return node;	/* nodemask was empty */
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}
2121 
/* Do dynamic interleaving for a process: advance to the next node in the
 * policy nodemask, remembering it in current->il_prev for the next call. */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* nid == MAX_NUMNODES means the nodemask was empty; don't record it */
	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}
2138 
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	/* No task context (e.g. interrupt): use the local memory node */
	if (!in_task())
		return node;

	policy = current->mempolicy;
	if (!policy)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/* MPOL_PREFERRED carries a single node: the first set bit */
		return first_node(policy->nodes);

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_WEIGHTED_INTERLEAVE:
		return weighted_interleave_nodes(policy);

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
	{
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->nodes);
		/* Fall back to the local node if no zone matched the mask */
		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
	}
	case MPOL_LOCAL:
		return node;

	default:
		BUG();
	}
}
2188 
/*
 * Snapshot @pol->nodes into @mask and return the number of nodes in the
 * snapshot.  Callers iterate over the private copy, which is stable even
 * if the live nodemask is concurrently rebound.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes. Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}
2202 
/*
 * Map interleave index @ilx to a node under MPOL_WEIGHTED_INTERLEAVE:
 * indices are spread over the policy's nodemask proportionally to each
 * node's weight in the global weighted-interleave table.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();	/* empty nodemask: fall back to local */

	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}
2242 
2243 /*
2244  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2245  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2246  * exceeds the number of present nodes.
2247  */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2248 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2249 {
2250 	nodemask_t nodemask;
2251 	unsigned int target, nnodes;
2252 	int i;
2253 	int nid;
2254 
2255 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2256 	if (!nnodes)
2257 		return numa_node_id();
2258 	target = ilx % nnodes;
2259 	nid = first_node(nodemask);
2260 	for (i = 0; i < target; i++)
2261 		nid = next_node(nid, nodemask);
2262 	return nid;
2263 }
2264 
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				   pgoff_t ilx, int *nid)
{
	nodemask_t *nodemask = NULL;

	switch (pol->mode) {
	case MPOL_PREFERRED:
		/* Override input node id */
		*nid = first_node(pol->nodes);
		break;
	case MPOL_PREFERRED_MANY:
		nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		break;
	case MPOL_BIND:
		/* Restrict to nodemask (but not on lower zones) */
		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
			nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		/*
		 * __GFP_THISNODE shouldn't even be used with the bind policy
		 * because we might easily break the expectation to stay on the
		 * requested node and not break the policy.
		 */
		WARN_ON_ONCE(gfp & __GFP_THISNODE);
		break;
	case MPOL_INTERLEAVE:
		/* Override input node id */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			interleave_nodes(pol) : interleave_nid(pol, ilx);
		break;
	case MPOL_WEIGHTED_INTERLEAVE:
		/* Override input node id, weighted by per-node weights */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			weighted_interleave_nodes(pol) :
			weighted_interleave_nid(pol, ilx);
		break;
	}

	return nodemask;
}
2312 
2313 #ifdef CONFIG_HUGETLBFS
2314 /*
2315  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2316  * @vma: virtual memory area whose policy is sought
2317  * @addr: address in @vma for shared policy lookup and interleave policy
2318  * @gfp_flags: for requested zone
2319  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2320  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2321  *
2322  * Returns a nid suitable for a huge page allocation and a pointer
2323  * to the struct mempolicy for conditional unref after allocation.
2324  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2325  * to the mempolicy's @nodemask for filtering the zonelist.
2326  */
huge_node(struct vm_area_struct * vma,unsigned long addr,gfp_t gfp_flags,struct mempolicy ** mpol,nodemask_t ** nodemask)2327 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2328 		struct mempolicy **mpol, nodemask_t **nodemask)
2329 {
2330 	pgoff_t ilx;
2331 	int nid;
2332 
2333 	nid = numa_node_id();
2334 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2335 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2336 	return nid;
2337 }
2338 
/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining it's own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;

	if (!(mask && current->mempolicy))
		return false;

	/* alloc_lock protects current->mempolicy against concurrent change */
	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		*mask = mempolicy->nodes;
		break;

	case MPOL_LOCAL:
		/* local policy: report the single node we'd allocate on */
		init_nodemask_of_node(mask, numa_node_id());
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
2384 #endif
2385 
2386 /*
2387  * mempolicy_in_oom_domain
2388  *
2389  * If tsk's mempolicy is "bind", check for intersection between mask and
2390  * the policy nodemask. Otherwise, return true for all other policies
2391  * including "interleave", as a tsk with "interleave" policy may have
2392  * memory allocated from all nodes in system.
2393  *
2394  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2395  */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2396 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2397 					const nodemask_t *mask)
2398 {
2399 	struct mempolicy *mempolicy;
2400 	bool ret = true;
2401 
2402 	if (!mask)
2403 		return ret;
2404 
2405 	task_lock(tsk);
2406 	mempolicy = tsk->mempolicy;
2407 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2408 		ret = nodes_intersects(mempolicy->nodes, *mask);
2409 	task_unlock(tsk);
2410 
2411 	return ret;
2412 }
2413 
/*
 * Two-pass allocation for MPOL_PREFERRED_MANY: first try only the
 * preferred nodes without direct reclaim (allowed to fail quietly),
 * then retry with the original gfp over all nodes in the system.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
						int nid, nodemask_t *nodemask)
{
	gfp_t relaxed_gfp;
	struct page *page;

	/* Pass 1: preferred nodes only, no reclaim, no warning on failure */
	relaxed_gfp = (gfp | __GFP_NOWARN) &
		      ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	page = __alloc_frozen_pages_noprof(relaxed_gfp, order, nid, nodemask);
	if (page)
		return page;

	/* Pass 2: full gfp, any node */
	return __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
}
2434 
/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	nodemask_t *nodemask;
	struct page *page;

	/* May override @nid with the policy's preferred/interleave node */
	nodemask = policy_nodemask(gfp, pol, ilx, &nid);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_preferred_many(gfp, order, nid, nodemask);

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode != MPOL_INTERLEAVE &&
		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
		    (!nodemask || node_isset(nid, *nodemask))) {
			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
			page = __alloc_frozen_pages_noprof(
				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
				nid, NULL);
			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
				return page;
			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
		}
	}

	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
		if (static_branch_likely(&vm_numa_stat_key) &&
		    page_to_nid(page) == nid) {
			preempt_disable();
			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
			preempt_enable();
		}
	}

	return page;
}
2505 
/*
 * Allocate a (compound) folio according to @pol, returning it with an
 * elevated refcount, or NULL on failure.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page;

	page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid);
	if (!page)
		return NULL;

	set_page_refcounted(page);
	return page_rmappable_folio(page);
}
2517 
2518 /**
2519  * vma_alloc_folio - Allocate a folio for a VMA.
2520  * @gfp: GFP flags.
2521  * @order: Order of the folio.
2522  * @vma: Pointer to VMA.
2523  * @addr: Virtual address of the allocation.  Must be inside @vma.
2524  *
2525  * Allocate a folio for a specific address in @vma, using the appropriate
2526  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2527  * VMA to prevent it from going away.  Should be used for all allocations
2528  * for folios that will be mapped into user space, excepting hugetlbfs, and
2529  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2530  *
2531  * Return: The folio on success or NULL if allocation fails.
2532  */
vma_alloc_folio_noprof(gfp_t gfp,int order,struct vm_area_struct * vma,unsigned long addr)2533 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2534 		unsigned long addr)
2535 {
2536 	struct mempolicy *pol;
2537 	pgoff_t ilx;
2538 	struct folio *folio;
2539 
2540 	if (vma->vm_flags & VM_DROPPABLE)
2541 		gfp |= __GFP_NOWARN;
2542 
2543 	pol = get_vma_policy(vma, addr, order, &ilx);
2544 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2545 	mpol_cond_put(pol);
2546 	return folio;
2547 }
2548 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2549 
/*
 * Allocate pages (without a refcount) honouring the current task's
 * memory policy where a task context exists.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;
	else
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}
2564 
2565 /**
2566  * alloc_pages - Allocate pages.
2567  * @gfp: GFP flags.
2568  * @order: Power of two of number of pages to allocate.
2569  *
2570  * Allocate 1 << @order contiguous pages.  The physical address of the
2571  * first page is naturally aligned (eg an order-3 allocation will be aligned
2572  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2573  * process is honoured when in process context.
2574  *
2575  * Context: Can be called from any context, providing the appropriate GFP
2576  * flags are used.
2577  * Return: The page on success or NULL if allocation fails.
2578  */
alloc_pages_noprof(gfp_t gfp,unsigned int order)2579 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2580 {
2581 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2582 
2583 	if (page)
2584 		set_page_refcounted(page);
2585 	return page;
2586 }
2587 EXPORT_SYMBOL(alloc_pages_noprof);
2588 
/* Allocate a compound-page folio honouring the current task's policy. */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

	return page_rmappable_folio(page);
}
EXPORT_SYMBOL(folio_alloc_noprof);
2594 
/*
 * Bulk-allocate @nr_pages spread round-robin over the interleave nodes.
 * The remainder (nr_pages % nnodes) is distributed one extra page at a
 * time to the first nodes visited.  Returns the number of pages placed
 * in @page_array.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	unsigned long total_allocated = 0;
	unsigned long base, batch, got;
	int nnodes, extra, i;

	nnodes = nodes_weight(pol->nodes);
	base = nr_pages / nnodes;
	extra = nr_pages - nnodes * base;

	for (i = 0; i < nnodes; i++) {
		batch = base;
		if (extra) {
			batch++;	/* spread the remainder */
			extra--;
		}
		got = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol),
					      NULL, batch, page_array);
		page_array += got;
		total_allocated += got;
	}

	return total_allocated;
}
2629 
/*
 * Bulk allocation under MPOL_WEIGHTED_INTERLEAVE: distribute @nr_pages over
 * the policy nodes proportionally to their weights, resuming from (and
 * afterwards updating) the task's il_prev/il_weight position so that a
 * sequence of bulk calls behaves like repeated single-page interleaving.
 * Returns the number of pages placed in @page_array.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* no table yet: every node gets the default weight of 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	/* record where a later call should pick up the interleave */
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2747 
/*
 * Bulk allocation for MPOL_PREFERRED_MANY: first pass restricted to the
 * preferred nodes without direct reclaim, second pass fills any shortfall
 * from anywhere with the original gfp.  Returns pages placed in
 * @page_array.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t relaxed_gfp = (gfp | __GFP_NOWARN) &
			    ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	unsigned long done;

	done = alloc_pages_bulk_noprof(relaxed_gfp, nid, &pol->nodes,
				       nr_pages, page_array);
	if (done < nr_pages)
		done += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
						nr_pages - done,
						page_array + done);

	return done;
}
2767 
2768 /* alloc pages bulk and mempolicy should be considered at the
2769  * same time in some situation such as vmalloc.
2770  *
2771  * It can accelerate memory allocation especially interleaving
2772  * allocate memory.
2773  */
alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,unsigned long nr_pages,struct page ** page_array)2774 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2775 		unsigned long nr_pages, struct page **page_array)
2776 {
2777 	struct mempolicy *pol = &default_policy;
2778 	nodemask_t *nodemask;
2779 	int nid;
2780 
2781 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2782 		pol = get_task_policy(current);
2783 
2784 	if (pol->mode == MPOL_INTERLEAVE)
2785 		return alloc_pages_bulk_interleave(gfp, pol,
2786 							 nr_pages, page_array);
2787 
2788 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2789 		return alloc_pages_bulk_weighted_interleave(
2790 				  gfp, pol, nr_pages, page_array);
2791 
2792 	if (pol->mode == MPOL_PREFERRED_MANY)
2793 		return alloc_pages_bulk_preferred_many(gfp,
2794 				numa_node_id(), pol, nr_pages, page_array);
2795 
2796 	nid = numa_node_id();
2797 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2798 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2799 				       nr_pages, page_array);
2800 }
2801 
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2802 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2803 {
2804 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2805 
2806 	if (IS_ERR(pol))
2807 		return PTR_ERR(pol);
2808 	dst->vm_policy = pol;
2809 	return 0;
2810 }
2811 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after the cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we need not do the rebind work for the current task.
 */
2822 
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	/* See the comment above: fix up the copy if our cpuset is mid-rebind. */
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	/* The copy starts life with its own, single reference. */
	atomic_set(&new->refcnt, 1);
	return new;
}
2846 
2847 /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2848 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2849 {
2850 	if (!a || !b)
2851 		return false;
2852 	if (a->mode != b->mode)
2853 		return false;
2854 	if (a->flags != b->flags)
2855 		return false;
2856 	if (a->home_node != b->home_node)
2857 		return false;
2858 	if (mpol_store_user_nodemask(a))
2859 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2860 			return false;
2861 
2862 	switch (a->mode) {
2863 	case MPOL_BIND:
2864 	case MPOL_INTERLEAVE:
2865 	case MPOL_PREFERRED:
2866 	case MPOL_PREFERRED_MANY:
2867 	case MPOL_WEIGHTED_INTERLEAVE:
2868 		return !!nodes_equal(a->nodes, b->nodes);
2869 	case MPOL_LOCAL:
2870 		return true;
2871 	default:
2872 		BUG();
2873 		return false;
2874 	}
2875 }
2876 
2877 /*
2878  * Shared memory backing store policy support.
2879  *
2880  * Remember policies even when nobody has shared memory mapped.
2881  * The policies are kept in Red-Black tree linked from the inode.
2882  * They are protected by the sp->lock rwlock, which should be held
2883  * for any accesses to the tree.
2884  */
2885 
/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
					pgoff_t start, pgoff_t end)
{
	struct rb_node *n = sp->root.rb_node;

	/* Standard descent: find *some* node overlapping [start, end). */
	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/*
	 * Walk back to the leftmost node that still intersects the range,
	 * so the caller always gets the first overlapping entry.
	 */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
2919 
2920 /*
2921  * Insert a new shared policy into the list.  Caller holds sp->lock for
2922  * writing.
2923  */
sp_insert(struct shared_policy * sp,struct sp_node * new)2924 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2925 {
2926 	struct rb_node **p = &sp->root.rb_node;
2927 	struct rb_node *parent = NULL;
2928 	struct sp_node *nd;
2929 
2930 	while (*p) {
2931 		parent = *p;
2932 		nd = rb_entry(parent, struct sp_node, nd);
2933 		if (new->start < nd->start)
2934 			p = &(*p)->rb_left;
2935 		else if (new->end > nd->end)
2936 			p = &(*p)->rb_right;
2937 		else
2938 			BUG();
2939 	}
2940 	rb_link_node(&new->nd, parent, p);
2941 	rb_insert_color(&new->nd, &sp->root);
2942 }
2943 
2944 /* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy * sp,pgoff_t idx)2945 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2946 						pgoff_t idx)
2947 {
2948 	struct mempolicy *pol = NULL;
2949 	struct sp_node *sn;
2950 
2951 	if (!sp->root.rb_node)
2952 		return NULL;
2953 	read_lock(&sp->lock);
2954 	sn = sp_lookup(sp, idx, idx+1);
2955 	if (sn) {
2956 		mpol_get(sn->policy);
2957 		pol = sn->policy;
2958 	}
2959 	read_unlock(&sp->lock);
2960 	return pol;
2961 }
2962 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2963 
/* Release an sp_node: drop its policy reference, then free the node. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
2969 
/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	int curnid = folio_nid(folio);
	struct vm_area_struct *vma = vmf->vma;
	int thiscpu = raw_smp_processor_id();
	int thisnid = numa_node_id();
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	/*
	 * Make sure ptl is held so that we don't preempt and we
	 * have a stable smp processor id
	 */
	lockdep_assert_held(vmf->ptl);
	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	/* Only policies flagged "migrate on fault" are of interest here. */
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_WEIGHTED_INTERLEAVE:
		polnid = weighted_interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
		/*
		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
		 * policy nodemask we don't allow numa migration to nodes
		 * outside policy nodemask for now. This is done so that if we
		 * want demotion to slow memory to happen, before allocating
		 * from some DRAM node say 'x', we will end up using a
		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
		 * we should not promote to node 'x' from slow memory node.
		 */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Optimize placement among multiple nodes
			 * via NUMA balancing
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}

		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(thisnid, GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zonelist_node_idx(z);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
3080 
3081 /*
3082  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
3083  * dropped after task->mempolicy is set to NULL so that any allocation done as
3084  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3085  * policy.
3086  */
mpol_put_task_policy(struct task_struct * task)3087 void mpol_put_task_policy(struct task_struct *task)
3088 {
3089 	struct mempolicy *pol;
3090 
3091 	task_lock(task);
3092 	pol = task->mempolicy;
3093 	task->mempolicy = NULL;
3094 	task_unlock(task);
3095 	mpol_put(pol);
3096 }
3097 
/* Unlink @n from the tree and free it.  Caller holds sp->lock for writing. */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
3103 
/* Initialize an already-allocated sp_node to cover [start, end) with @pol. */
static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}
3111 
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3112 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3113 				struct mempolicy *pol)
3114 {
3115 	struct sp_node *n;
3116 	struct mempolicy *newpol;
3117 
3118 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3119 	if (!n)
3120 		return NULL;
3121 
3122 	newpol = mpol_dup(pol);
3123 	if (IS_ERR(newpol)) {
3124 		kmem_cache_free(sn_cache, n);
3125 		return NULL;
3126 	}
3127 	newpol->flags |= MPOL_F_SHARED;
3128 	sp_node_init(n, start, end, newpol);
3129 
3130 	return n;
3131 }
3132 
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;		/* preallocated node for a split */
	struct mempolicy *mpol_new = NULL;	/* preallocated policy for a split */
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);	/* fully covered: remove */
			else
				n->start = end;		/* overlaps tail: trim front */
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Splitting needs a second node+policy.  If we
				 * did not preallocate them, drop the lock,
				 * allocate, and redo the whole walk.
				 */
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;		/* overlaps head: trim tail */
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Release preallocations that ended up unused (or failed halfway). */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
3199 
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct sp_node *sn;
		struct mempolicy *npol;
		NODEMASK_SCRATCH(scratch);

		/* On any failure below the tree stays empty (default policy). */
		if (!scratch)
			goto put_mpol;

		/* contextualize the tmpfs mount point mempolicy to this file */
		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(npol))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_npol;

		/* alloc node covering entire file; adds ref to file's npol */
		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
		if (sn)
			sp_insert(sp, sn);
put_npol:
		mpol_put(npol);	/* drop initial ref on file's npol */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3249 
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3250 int mpol_set_shared_policy(struct shared_policy *sp,
3251 			struct vm_area_struct *vma, struct mempolicy *pol)
3252 {
3253 	int err;
3254 	struct sp_node *new = NULL;
3255 	unsigned long sz = vma_pages(vma);
3256 
3257 	if (pol) {
3258 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3259 		if (!new)
3260 			return -ENOMEM;
3261 	}
3262 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3263 	if (err && new)
3264 		sp_free(new);
3265 	return err;
3266 }
3267 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3268 
3269 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3270 void mpol_free_shared_policy(struct shared_policy *sp)
3271 {
3272 	struct sp_node *n;
3273 	struct rb_node *next;
3274 
3275 	if (!sp->root.rb_node)
3276 		return;
3277 	write_lock(&sp->lock);
3278 	next = rb_first(&sp->root);
3279 	while (next) {
3280 		n = rb_entry(next, struct sp_node, nd);
3281 		next = rb_next(&n->nd);
3282 		sp_delete(sp, n);
3283 	}
3284 	write_unlock(&sp->lock);
3285 }
3286 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3287 
3288 #ifdef CONFIG_NUMA_BALANCING
3289 static int __initdata numabalancing_override;
3290 
check_numabalancing_enable(void)3291 static void __init check_numabalancing_enable(void)
3292 {
3293 	bool numabalancing_default = false;
3294 
3295 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3296 		numabalancing_default = true;
3297 
3298 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3299 	if (numabalancing_override)
3300 		set_numabalancing_state(numabalancing_override == 1);
3301 
3302 	if (num_online_nodes() > 1 && !numabalancing_override) {
3303 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3304 			numabalancing_default ? "Enabling" : "Disabling");
3305 		set_numabalancing_state(numabalancing_default);
3306 	}
3307 }
3308 
setup_numabalancing(char * str)3309 static int __init setup_numabalancing(char *str)
3310 {
3311 	int ret = 0;
3312 	if (!str)
3313 		goto out;
3314 
3315 	if (!strcmp(str, "enable")) {
3316 		numabalancing_override = 1;
3317 		ret = 1;
3318 	} else if (!strcmp(str, "disable")) {
3319 		numabalancing_override = -1;
3320 		ret = 1;
3321 	}
3322 out:
3323 	if (!ret)
3324 		pr_warn("Unable to parse numa_balancing=\n");
3325 
3326 	return ret;
3327 }
3328 __setup("numa_balancing=", setup_numabalancing);
3329 #else
check_numabalancing_enable(void)3330 static inline void __init check_numabalancing_enable(void)
3331 {
3332 }
3333 #endif /* CONFIG_NUMA_BALANCING */
3334 
/* Create mempolicy caches and install the boot-time interleave policy. */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	/*
	 * One static MPOL_PREFERRED policy per node, flagged MOF|MORON
	 * for NUMA-balancing migrate-on-fault handling.
	 */
	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.nodes = nodemask_of_node(nid),
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}
3387 
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	/* MPOL_DEFAULT with a NULL nodemask drops any task-specific policy. */
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
3393 
/*
 * Parse and format mempolicy from/to strings
 */

/* Keyword for each MPOL_* mode, used by mpol_parse_str() and mpol_to_str(). */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
3407 
3408 #ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	/* The separators are overwritten in place and restored at "out:". */
	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" without a nodelist degrades to local allocation */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
3542 #endif /* CONFIG_TMPFS */
3543 
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flag flags, "relative|balancing", and to
 * display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	/*
	 * The static fallback policies (default_policy and the per-node
	 * preferred_node_policy array) are reported as MPOL_DEFAULT.
	 */
	if (pol &&
	    pol != &default_policy &&
	    !(pol >= &preferred_node_policy[0] &&
	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_LOCAL:
		break;
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		nodes = pol->nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Static and relative are mutually exclusive.
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");

		if (flags & MPOL_F_NUMA_BALANCING) {
			/* emit "|" only if another flag was already printed */
			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
				p += snprintf(p, buffer + maxlen - p, "|");
			p += snprintf(p, buffer + maxlen - p, "balancing");
		}
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}
3611 
3612 #ifdef CONFIG_SYSFS
/* One sysfs attribute per NUMA node; @nid is the node it controls. */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;
};

/* The weighted_interleave kobject and its per-node attributes. */
struct sysfs_wi_group {
	struct kobject wi_kobj;
	struct mutex kobj_lock;		/* protects nattrs[] */
	struct iw_node_attr *nattrs[];	/* indexed by node id; NULL if absent */
};

static struct sysfs_wi_group *wi_group;
3625 
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3626 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3627 			 char *buf)
3628 {
3629 	struct iw_node_attr *node_attr;
3630 	u8 weight;
3631 
3632 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3633 	weight = get_il_weight(node_attr->nid);
3634 	return sysfs_emit(buf, "%d\n", weight);
3635 }
3636 
/* sysfs write of a per-node weight file: install a new weight table. */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	struct iw_node_attr *node_attr;
	u8 weight = 0;
	int i;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	/* Only non-zero u8 weights are accepted. */
	if (count == 0 || sysfs_streq(buf, "") ||
	    kstrtou8(buf, 0, &weight) || weight == 0)
		return -EINVAL;

	/* Build a replacement state object and publish it via RCU. */
	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state)
		return -ENOMEM;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state) {
		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
					nr_node_ids * sizeof(u8));
	} else {
		/* No previous table: every other node starts at weight 1. */
		for (i = 0; i < nr_node_ids; i++)
			new_wi_state->iw_table[i] = 1;
	}
	new_wi_state->iw_table[node_attr->nid] = weight;
	new_wi_state->mode_auto = false;	/* manual write disables auto mode */

	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	/* Free the old state only after all RCU readers are done with it. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3675 
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3676 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3677 		struct kobj_attribute *attr, char *buf)
3678 {
3679 	struct weighted_interleave_state *state;
3680 	bool wi_auto = true;
3681 
3682 	rcu_read_lock();
3683 	state = rcu_dereference(wi_state);
3684 	if (state)
3685 		wi_auto = state->mode_auto;
3686 	rcu_read_unlock();
3687 
3688 	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3689 }
3690 
/* sysfs write of "auto": switch between automatic and manual weights. */
static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *bw;
	bool input;
	int i;

	if (kstrtobool(buf, &input))
		return -EINVAL;

	/* Start from a fresh state with all weights defaulted to 1. */
	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state)
		return -ENOMEM;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	mutex_lock(&wi_state_lock);
	if (!input) {
		/* Switching auto off: carry over any existing weights. */
		old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
		if (!old_wi_state)
			goto update_wi_state;
		if (input == old_wi_state->mode_auto) {
			/* Already in manual mode: nothing to change. */
			mutex_unlock(&wi_state_lock);
			return count;
		}

		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
					       nr_node_ids * sizeof(u8));
		goto update_wi_state;
	}

	/* Switching auto on needs bandwidth data to derive weights from. */
	bw = node_bw_table;
	if (!bw) {
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		return -ENODEV;
	}

	new_wi_state->mode_auto = true;
	reduce_interleave_weights(bw, new_wi_state->iw_table);

update_wi_state:
	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	/* The old state may still have RCU readers; free after a grace period. */
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3743 
sysfs_wi_node_delete(int nid)3744 static void sysfs_wi_node_delete(int nid)
3745 {
3746 	struct iw_node_attr *attr;
3747 
3748 	if (nid < 0 || nid >= nr_node_ids)
3749 		return;
3750 
3751 	mutex_lock(&wi_group->kobj_lock);
3752 	attr = wi_group->nattrs[nid];
3753 	if (!attr) {
3754 		mutex_unlock(&wi_group->kobj_lock);
3755 		return;
3756 	}
3757 
3758 	wi_group->nattrs[nid] = NULL;
3759 	mutex_unlock(&wi_group->kobj_lock);
3760 
3761 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3762 	kfree(attr->kobj_attr.attr.name);
3763 	kfree(attr);
3764 }
3765 
sysfs_wi_node_delete_all(void)3766 static void sysfs_wi_node_delete_all(void)
3767 {
3768 	int nid;
3769 
3770 	for (nid = 0; nid < nr_node_ids; nid++)
3771 		sysfs_wi_node_delete(nid);
3772 }
3773 
wi_state_free(void)3774 static void wi_state_free(void)
3775 {
3776 	struct weighted_interleave_state *old_wi_state;
3777 
3778 	mutex_lock(&wi_state_lock);
3779 	old_wi_state = rcu_dereference_protected(wi_state,
3780 			lockdep_is_held(&wi_state_lock));
3781 	rcu_assign_pointer(wi_state, NULL);
3782 	mutex_unlock(&wi_state_lock);
3783 
3784 	if (old_wi_state) {
3785 		synchronize_rcu();
3786 		kfree(old_wi_state);
3787 	}
3788 }
3789 
/* The read/write "auto" attribute (mode 0664) toggling automatic weights. */
static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
			   weighted_interleave_auto_store);
3793 
wi_cleanup(void)3794 static void wi_cleanup(void) {
3795 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3796 	sysfs_wi_node_delete_all();
3797 	wi_state_free();
3798 }
3799 
/* kobject release callback: the group's memory goes away with the kobject. */
static void wi_kobj_release(struct kobject *wi_kobj)
{
	kfree(wi_group);
}
3804 
/* ktype for the weighted_interleave directory kobject. */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};
3809 
/* Create the "nodeN" sysfs attribute for @nid; returns 0 or -errno. */
static int sysfs_wi_node_add(int nid)
{
	int ret;
	char *name;
	struct iw_node_attr *new_attr;

	if (nid < 0 || nid >= nr_node_ids) {
		pr_err("invalid node id: %d\n", nid);
		return -EINVAL;
	}

	new_attr = kzalloc_obj(*new_attr);
	if (!new_attr)
		return -ENOMEM;

	name = kasprintf(GFP_KERNEL, "node%d", nid);
	if (!name) {
		kfree(new_attr);
		return -ENOMEM;
	}

	sysfs_attr_init(&new_attr->kobj_attr.attr);
	new_attr->kobj_attr.attr.name = name;
	new_attr->kobj_attr.attr.mode = 0644;
	new_attr->kobj_attr.show = node_show;
	new_attr->kobj_attr.store = node_store;
	new_attr->nid = nid;

	mutex_lock(&wi_group->kobj_lock);
	/* The slot may already be populated (e.g. a concurrent add). */
	if (wi_group->nattrs[nid]) {
		mutex_unlock(&wi_group->kobj_lock);
		ret = -EEXIST;
		goto out;
	}

	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
	if (ret) {
		mutex_unlock(&wi_group->kobj_lock);
		goto out;
	}
	wi_group->nattrs[nid] = new_attr;
	mutex_unlock(&wi_group->kobj_lock);
	return 0;

out:
	/* Error path: undo the name and attribute allocations. */
	kfree(new_attr->kobj_attr.attr.name);
	kfree(new_attr);
	return ret;
}
3859 
/* Node hotplug callback: keep the per-node sysfs files in sync. */
static int wi_node_notifier(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	int err;
	struct node_notify *nn = data;
	int nid = nn->nid;

	switch (action) {
	case NODE_ADDED_FIRST_MEMORY:
		err = sysfs_wi_node_add(nid);
		if (err)
			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
			       nid, err);
		break;
	case NODE_REMOVED_LAST_MEMORY:
		sysfs_wi_node_delete(nid);
		break;
	}

	/* Failures are logged, not propagated. */
	return NOTIFY_OK;
}
3881 
/* Create the "weighted_interleave" sysfs directory and its files. */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	/* One attribute per node that currently has memory. */
	for_each_online_node(nid) {
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

err_cleanup_kobj:
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	/* kobject_put() frees wi_group via wi_kobj_release(). */
	kobject_put(&wi_group->wi_kobj);
	return err;
}
3922 
mempolicy_sysfs_init(void)3923 static int __init mempolicy_sysfs_init(void)
3924 {
3925 	int err;
3926 	static struct kobject *mempolicy_kobj;
3927 
3928 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3929 	if (!mempolicy_kobj)
3930 		return -ENOMEM;
3931 
3932 	err = add_weighted_interleave_group(mempolicy_kobj);
3933 	if (err)
3934 		goto err_kobj;
3935 
3936 	return 0;
3937 
3938 err_kobj:
3939 	kobject_del(mempolicy_kobj);
3940 	kobject_put(mempolicy_kobj);
3941 	return err;
3942 }
3943 
3944 late_initcall(mempolicy_sysfs_init);
3945 #endif /* CONFIG_SYSFS */
3946