xref: /linux/mm/mempolicy.c (revision 51d90a15fedf8366cb96ef68d0ea2d0bf15417d2)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem kernel lowmem allocation don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always grateful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115 
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120 
121 #include "internal.h"
122 
123 /* Internal flags */
124 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
125 #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
126 #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
127 
128 static struct kmem_cache *policy_cache;
129 static struct kmem_cache *sn_cache;
130 
131 /* Highest zone. An specific allocation for a zone below that is not
132    policied. */
133 enum zone_type policy_zone = 0;
134 
135 /*
136  * run-time system-wide default policy => local allocation
137  */
138 static struct mempolicy default_policy = {
139 	.refcnt = ATOMIC_INIT(1), /* never free it */
140 	.mode = MPOL_LOCAL,
141 };
142 
143 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
144 
145 /*
146  * weightiness balances the tradeoff between small weights (cycles through nodes
147  * faster, more fair/even distribution) and large weights (smaller errors
148  * between actual bandwidth ratios and weight ratios). 32 is a number that has
149  * been found to perform at a reasonable compromise between the two goals.
150  */
151 static const int weightiness = 32;
152 
153 /*
154  * A null weighted_interleave_state is interpreted as having .mode="auto",
155  * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156  */
157 struct weighted_interleave_state {
158 	bool mode_auto;
159 	u8 iw_table[];
160 };
161 static struct weighted_interleave_state __rcu *wi_state;
162 static unsigned int *node_bw_table;
163 
164 /*
165  * wi_state_lock protects both wi_state and node_bw_table.
166  * node_bw_table is only used by writers to update wi_state.
167  */
168 static DEFINE_MUTEX(wi_state_lock);
169 
get_il_weight(int node)170 static u8 get_il_weight(int node)
171 {
172 	struct weighted_interleave_state *state;
173 	u8 weight = 1;
174 
175 	rcu_read_lock();
176 	state = rcu_dereference(wi_state);
177 	if (state)
178 		weight = state->iw_table[node];
179 	rcu_read_unlock();
180 	return weight;
181 }
182 
183 /*
184  * Convert bandwidth values into weighted interleave weights.
185  * Call with wi_state_lock.
186  */
reduce_interleave_weights(unsigned int * bw,u8 * new_iw)187 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
188 {
189 	u64 sum_bw = 0;
190 	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
191 	int nid;
192 
193 	for_each_node_state(nid, N_MEMORY)
194 		sum_bw += bw[nid];
195 
196 	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
197 	for_each_node_state(nid, N_MEMORY) {
198 		/*
199 		 * Try not to perform 64-bit division.
200 		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
201 		 * If sum_bw > scaling_factor, then round the weight up to 1.
202 		 */
203 		scaling_factor = weightiness * bw[nid];
204 		if (bw[nid] && sum_bw < scaling_factor) {
205 			cast_sum_bw = (unsigned int)sum_bw;
206 			new_iw[nid] = scaling_factor / cast_sum_bw;
207 		} else {
208 			new_iw[nid] = 1;
209 		}
210 		if (!iw_gcd)
211 			iw_gcd = new_iw[nid];
212 		iw_gcd = gcd(iw_gcd, new_iw[nid]);
213 	}
214 
215 	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
216 	for_each_node_state(nid, N_MEMORY)
217 		new_iw[nid] /= iw_gcd;
218 }
219 
mempolicy_set_node_perf(unsigned int node,struct access_coordinate * coords)220 int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
221 {
222 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
223 	unsigned int *old_bw, *new_bw;
224 	unsigned int bw_val;
225 	int i;
226 
227 	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
228 	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
229 	if (!new_bw)
230 		return -ENOMEM;
231 
232 	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
233 			       GFP_KERNEL);
234 	if (!new_wi_state) {
235 		kfree(new_bw);
236 		return -ENOMEM;
237 	}
238 	new_wi_state->mode_auto = true;
239 	for (i = 0; i < nr_node_ids; i++)
240 		new_wi_state->iw_table[i] = 1;
241 
242 	/*
243 	 * Update bandwidth info, even in manual mode. That way, when switching
244 	 * to auto mode in the future, iw_table can be overwritten using
245 	 * accurate bw data.
246 	 */
247 	mutex_lock(&wi_state_lock);
248 
249 	old_bw = node_bw_table;
250 	if (old_bw)
251 		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
252 	new_bw[node] = bw_val;
253 	node_bw_table = new_bw;
254 
255 	old_wi_state = rcu_dereference_protected(wi_state,
256 					lockdep_is_held(&wi_state_lock));
257 	if (old_wi_state && !old_wi_state->mode_auto) {
258 		/* Manual mode; skip reducing weights and updating wi_state */
259 		mutex_unlock(&wi_state_lock);
260 		kfree(new_wi_state);
261 		goto out;
262 	}
263 
264 	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
265 	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
266 	rcu_assign_pointer(wi_state, new_wi_state);
267 
268 	mutex_unlock(&wi_state_lock);
269 	if (old_wi_state) {
270 		synchronize_rcu();
271 		kfree(old_wi_state);
272 	}
273 out:
274 	kfree(old_bw);
275 	return 0;
276 }
277 
278 /**
279  * numa_nearest_node - Find nearest node by state
280  * @node: Node id to start the search
281  * @state: State to filter the search
282  *
283  * Lookup the closest node by distance if @nid is not in state.
284  *
285  * Return: this @node if it is in state, otherwise the closest node by distance
286  */
numa_nearest_node(int node,unsigned int state)287 int numa_nearest_node(int node, unsigned int state)
288 {
289 	int min_dist = INT_MAX, dist, n, min_node;
290 
291 	if (state >= NR_NODE_STATES)
292 		return -EINVAL;
293 
294 	if (node == NUMA_NO_NODE || node_state(node, state))
295 		return node;
296 
297 	min_node = node;
298 	for_each_node_state(n, state) {
299 		dist = node_distance(node, n);
300 		if (dist < min_dist) {
301 			min_dist = dist;
302 			min_node = n;
303 		}
304 	}
305 
306 	return min_node;
307 }
308 EXPORT_SYMBOL_GPL(numa_nearest_node);
309 
310 /**
311  * nearest_node_nodemask - Find the node in @mask at the nearest distance
312  *			   from @node.
313  *
314  * @node: a valid node ID to start the search from.
315  * @mask: a pointer to a nodemask representing the allowed nodes.
316  *
317  * This function iterates over all nodes in @mask and calculates the
318  * distance from the starting @node, then it returns the node ID that is
319  * the closest to @node, or MAX_NUMNODES if no node is found.
320  *
321  * Note that @node must be a valid node ID usable with node_distance(),
322  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
323  * or unexpected behavior.
324  */
nearest_node_nodemask(int node,nodemask_t * mask)325 int nearest_node_nodemask(int node, nodemask_t *mask)
326 {
327 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
328 
329 	for_each_node_mask(n, *mask) {
330 		dist = node_distance(node, n);
331 		if (dist < min_dist) {
332 			min_dist = dist;
333 			min_node = n;
334 		}
335 	}
336 
337 	return min_node;
338 }
339 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
340 
get_task_policy(struct task_struct * p)341 struct mempolicy *get_task_policy(struct task_struct *p)
342 {
343 	struct mempolicy *pol = p->mempolicy;
344 	int node;
345 
346 	if (pol)
347 		return pol;
348 
349 	node = numa_node_id();
350 	if (node != NUMA_NO_NODE) {
351 		pol = &preferred_node_policy[node];
352 		/* preferred_node_policy is not initialised early in boot */
353 		if (pol->mode)
354 			return pol;
355 	}
356 
357 	return &default_policy;
358 }
359 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
360 
361 static const struct mempolicy_operations {
362 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
363 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
364 } mpol_ops[MPOL_MAX];
365 
mpol_store_user_nodemask(const struct mempolicy * pol)366 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
367 {
368 	return pol->flags & MPOL_MODE_FLAGS;
369 }
370 
mpol_relative_nodemask(nodemask_t * ret,const nodemask_t * orig,const nodemask_t * rel)371 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
372 				   const nodemask_t *rel)
373 {
374 	nodemask_t tmp;
375 	nodes_fold(tmp, *orig, nodes_weight(*rel));
376 	nodes_onto(*ret, tmp, *rel);
377 }
378 
mpol_new_nodemask(struct mempolicy * pol,const nodemask_t * nodes)379 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
380 {
381 	if (nodes_empty(*nodes))
382 		return -EINVAL;
383 	pol->nodes = *nodes;
384 	return 0;
385 }
386 
mpol_new_preferred(struct mempolicy * pol,const nodemask_t * nodes)387 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
388 {
389 	if (nodes_empty(*nodes))
390 		return -EINVAL;
391 
392 	nodes_clear(pol->nodes);
393 	node_set(first_node(*nodes), pol->nodes);
394 	return 0;
395 }
396 
397 /*
398  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
399  * any, for the new policy.  mpol_new() has already validated the nodes
400  * parameter with respect to the policy mode and flags.
401  *
402  * Must be called holding task's alloc_lock to protect task's mems_allowed
403  * and mempolicy.  May also be called holding the mmap_lock for write.
404  */
mpol_set_nodemask(struct mempolicy * pol,const nodemask_t * nodes,struct nodemask_scratch * nsc)405 static int mpol_set_nodemask(struct mempolicy *pol,
406 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
407 {
408 	int ret;
409 
410 	/*
411 	 * Default (pol==NULL) resp. local memory policies are not a
412 	 * subject of any remapping. They also do not need any special
413 	 * constructor.
414 	 */
415 	if (!pol || pol->mode == MPOL_LOCAL)
416 		return 0;
417 
418 	/* Check N_MEMORY */
419 	nodes_and(nsc->mask1,
420 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
421 
422 	VM_BUG_ON(!nodes);
423 
424 	if (pol->flags & MPOL_F_RELATIVE_NODES)
425 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
426 	else
427 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
428 
429 	if (mpol_store_user_nodemask(pol))
430 		pol->w.user_nodemask = *nodes;
431 	else
432 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
433 
434 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
435 	return ret;
436 }
437 
438 /*
439  * This function just creates a new policy, does some check and simple
440  * initialization. You must invoke mpol_set_nodemask() to set nodes.
441  */
mpol_new(unsigned short mode,unsigned short flags,nodemask_t * nodes)442 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
443 				  nodemask_t *nodes)
444 {
445 	struct mempolicy *policy;
446 
447 	if (mode == MPOL_DEFAULT) {
448 		if (nodes && !nodes_empty(*nodes))
449 			return ERR_PTR(-EINVAL);
450 		return NULL;
451 	}
452 	VM_BUG_ON(!nodes);
453 
454 	/*
455 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
456 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
457 	 * All other modes require a valid pointer to a non-empty nodemask.
458 	 */
459 	if (mode == MPOL_PREFERRED) {
460 		if (nodes_empty(*nodes)) {
461 			if (((flags & MPOL_F_STATIC_NODES) ||
462 			     (flags & MPOL_F_RELATIVE_NODES)))
463 				return ERR_PTR(-EINVAL);
464 
465 			mode = MPOL_LOCAL;
466 		}
467 	} else if (mode == MPOL_LOCAL) {
468 		if (!nodes_empty(*nodes) ||
469 		    (flags & MPOL_F_STATIC_NODES) ||
470 		    (flags & MPOL_F_RELATIVE_NODES))
471 			return ERR_PTR(-EINVAL);
472 	} else if (nodes_empty(*nodes))
473 		return ERR_PTR(-EINVAL);
474 
475 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
476 	if (!policy)
477 		return ERR_PTR(-ENOMEM);
478 	atomic_set(&policy->refcnt, 1);
479 	policy->mode = mode;
480 	policy->flags = flags;
481 	policy->home_node = NUMA_NO_NODE;
482 
483 	return policy;
484 }
485 
486 /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * pol)487 void __mpol_put(struct mempolicy *pol)
488 {
489 	if (!atomic_dec_and_test(&pol->refcnt))
490 		return;
491 	kmem_cache_free(policy_cache, pol);
492 }
493 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
494 
mpol_rebind_default(struct mempolicy * pol,const nodemask_t * nodes)495 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
496 {
497 }
498 
mpol_rebind_nodemask(struct mempolicy * pol,const nodemask_t * nodes)499 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
500 {
501 	nodemask_t tmp;
502 
503 	if (pol->flags & MPOL_F_STATIC_NODES)
504 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
505 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
506 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
507 	else {
508 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
509 								*nodes);
510 		pol->w.cpuset_mems_allowed = *nodes;
511 	}
512 
513 	if (nodes_empty(tmp))
514 		tmp = *nodes;
515 
516 	pol->nodes = tmp;
517 }
518 
mpol_rebind_preferred(struct mempolicy * pol,const nodemask_t * nodes)519 static void mpol_rebind_preferred(struct mempolicy *pol,
520 						const nodemask_t *nodes)
521 {
522 	pol->w.cpuset_mems_allowed = *nodes;
523 }
524 
525 /*
526  * mpol_rebind_policy - Migrate a policy to a different set of nodes
527  *
528  * Per-vma policies are protected by mmap_lock. Allocations using per-task
529  * policies are protected by task->mems_allowed_seq to prevent a premature
530  * OOM/allocation failure due to parallel nodemask modification.
531  */
mpol_rebind_policy(struct mempolicy * pol,const nodemask_t * newmask)532 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
533 {
534 	if (!pol || pol->mode == MPOL_LOCAL)
535 		return;
536 	if (!mpol_store_user_nodemask(pol) &&
537 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
538 		return;
539 
540 	mpol_ops[pol->mode].rebind(pol, newmask);
541 }
542 
543 /*
544  * Wrapper for mpol_rebind_policy() that just requires task
545  * pointer, and updates task mempolicy.
546  *
547  * Called with task's alloc_lock held.
548  */
mpol_rebind_task(struct task_struct * tsk,const nodemask_t * new)549 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
550 {
551 	mpol_rebind_policy(tsk->mempolicy, new);
552 }
553 
554 /*
555  * Rebind each vma in mm to new nodemask.
556  *
557  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
558  */
mpol_rebind_mm(struct mm_struct * mm,nodemask_t * new)559 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
560 {
561 	struct vm_area_struct *vma;
562 	VMA_ITERATOR(vmi, mm, 0);
563 
564 	mmap_write_lock(mm);
565 	for_each_vma(vmi, vma) {
566 		vma_start_write(vma);
567 		mpol_rebind_policy(vma->vm_policy, new);
568 	}
569 	mmap_write_unlock(mm);
570 }
571 
572 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
573 	[MPOL_DEFAULT] = {
574 		.rebind = mpol_rebind_default,
575 	},
576 	[MPOL_INTERLEAVE] = {
577 		.create = mpol_new_nodemask,
578 		.rebind = mpol_rebind_nodemask,
579 	},
580 	[MPOL_PREFERRED] = {
581 		.create = mpol_new_preferred,
582 		.rebind = mpol_rebind_preferred,
583 	},
584 	[MPOL_BIND] = {
585 		.create = mpol_new_nodemask,
586 		.rebind = mpol_rebind_nodemask,
587 	},
588 	[MPOL_LOCAL] = {
589 		.rebind = mpol_rebind_default,
590 	},
591 	[MPOL_PREFERRED_MANY] = {
592 		.create = mpol_new_nodemask,
593 		.rebind = mpol_rebind_preferred,
594 	},
595 	[MPOL_WEIGHTED_INTERLEAVE] = {
596 		.create = mpol_new_nodemask,
597 		.rebind = mpol_rebind_nodemask,
598 	},
599 };
600 
601 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
602 				unsigned long flags);
603 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
604 				pgoff_t ilx, int *nid);
605 
strictly_unmovable(unsigned long flags)606 static bool strictly_unmovable(unsigned long flags)
607 {
608 	/*
609 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
610 	 * if any misplaced page is found.
611 	 */
612 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
613 			 MPOL_MF_STRICT;
614 }
615 
616 struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
617 	struct mempolicy *pol;
618 	pgoff_t ilx;
619 };
620 
621 struct queue_pages {
622 	struct list_head *pagelist;
623 	unsigned long flags;
624 	nodemask_t *nmask;
625 	unsigned long start;
626 	unsigned long end;
627 	struct vm_area_struct *first;
628 	struct folio *large;		/* note last large folio encountered */
629 	long nr_failed;			/* could not be isolated at this time */
630 };
631 
632 /*
633  * Check if the folio's nid is in qp->nmask.
634  *
635  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
636  * in the invert of qp->nmask.
637  */
queue_folio_required(struct folio * folio,struct queue_pages * qp)638 static inline bool queue_folio_required(struct folio *folio,
639 					struct queue_pages *qp)
640 {
641 	int nid = folio_nid(folio);
642 	unsigned long flags = qp->flags;
643 
644 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
645 }
646 
queue_folios_pmd(pmd_t * pmd,struct mm_walk * walk)647 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
648 {
649 	struct folio *folio;
650 	struct queue_pages *qp = walk->private;
651 
652 	if (unlikely(pmd_is_migration_entry(*pmd))) {
653 		qp->nr_failed++;
654 		return;
655 	}
656 	folio = pmd_folio(*pmd);
657 	if (is_huge_zero_folio(folio)) {
658 		walk->action = ACTION_CONTINUE;
659 		return;
660 	}
661 	if (!queue_folio_required(folio, qp))
662 		return;
663 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
664 	    !vma_migratable(walk->vma) ||
665 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
666 		qp->nr_failed++;
667 }
668 
669 /*
670  * Scan through folios, checking if they satisfy the required conditions,
671  * moving them from LRU to local pagelist for migration if they do (or not).
672  *
673  * queue_folios_pte_range() has two possible return values:
674  * 0 - continue walking to scan for more, even if an existing folio on the
675  *     wrong node could not be isolated and queued for migration.
676  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
677  *        and an existing folio was on a node that does not follow the policy.
678  */
queue_folios_pte_range(pmd_t * pmd,unsigned long addr,unsigned long end,struct mm_walk * walk)679 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
680 			unsigned long end, struct mm_walk *walk)
681 {
682 	struct vm_area_struct *vma = walk->vma;
683 	struct folio *folio;
684 	struct queue_pages *qp = walk->private;
685 	unsigned long flags = qp->flags;
686 	pte_t *pte, *mapped_pte;
687 	pte_t ptent;
688 	spinlock_t *ptl;
689 	int max_nr, nr;
690 
691 	ptl = pmd_trans_huge_lock(pmd, vma);
692 	if (ptl) {
693 		queue_folios_pmd(pmd, walk);
694 		spin_unlock(ptl);
695 		goto out;
696 	}
697 
698 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
699 	if (!pte) {
700 		walk->action = ACTION_AGAIN;
701 		return 0;
702 	}
703 	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
704 		max_nr = (end - addr) >> PAGE_SHIFT;
705 		nr = 1;
706 		ptent = ptep_get(pte);
707 		if (pte_none(ptent))
708 			continue;
709 		if (!pte_present(ptent)) {
710 			const softleaf_t entry = softleaf_from_pte(ptent);
711 
712 			if (softleaf_is_migration(entry))
713 				qp->nr_failed++;
714 			continue;
715 		}
716 		folio = vm_normal_folio(vma, addr, ptent);
717 		if (!folio || folio_is_zone_device(folio))
718 			continue;
719 		if (folio_test_large(folio) && max_nr != 1)
720 			nr = folio_pte_batch(folio, pte, ptent, max_nr);
721 		/*
722 		 * vm_normal_folio() filters out zero pages, but there might
723 		 * still be reserved folios to skip, perhaps in a VDSO.
724 		 */
725 		if (folio_test_reserved(folio))
726 			continue;
727 		if (!queue_folio_required(folio, qp))
728 			continue;
729 		if (folio_test_large(folio)) {
730 			/*
731 			 * A large folio can only be isolated from LRU once,
732 			 * but may be mapped by many PTEs (and Copy-On-Write may
733 			 * intersperse PTEs of other, order 0, folios).  This is
734 			 * a common case, so don't mistake it for failure (but
735 			 * there can be other cases of multi-mapped pages which
736 			 * this quick check does not help to filter out - and a
737 			 * search of the pagelist might grow to be prohibitive).
738 			 *
739 			 * migrate_pages(&pagelist) returns nr_failed folios, so
740 			 * check "large" now so that queue_pages_range() returns
741 			 * a comparable nr_failed folios.  This does imply that
742 			 * if folio could not be isolated for some racy reason
743 			 * at its first PTE, later PTEs will not give it another
744 			 * chance of isolation; but keeps the accounting simple.
745 			 */
746 			if (folio == qp->large)
747 				continue;
748 			qp->large = folio;
749 		}
750 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
751 		    !vma_migratable(vma) ||
752 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
753 			qp->nr_failed += nr;
754 			if (strictly_unmovable(flags))
755 				break;
756 		}
757 	}
758 	pte_unmap_unlock(mapped_pte, ptl);
759 	cond_resched();
760 out:
761 	if (qp->nr_failed && strictly_unmovable(flags))
762 		return -EIO;
763 	return 0;
764 }
765 
queue_folios_hugetlb(pte_t * pte,unsigned long hmask,unsigned long addr,unsigned long end,struct mm_walk * walk)766 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
767 			       unsigned long addr, unsigned long end,
768 			       struct mm_walk *walk)
769 {
770 #ifdef CONFIG_HUGETLB_PAGE
771 	struct queue_pages *qp = walk->private;
772 	unsigned long flags = qp->flags;
773 	struct folio *folio;
774 	spinlock_t *ptl;
775 	pte_t ptep;
776 
777 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
778 	ptep = huge_ptep_get(walk->mm, addr, pte);
779 	if (!pte_present(ptep)) {
780 		if (!huge_pte_none(ptep)) {
781 			const softleaf_t entry = softleaf_from_pte(ptep);
782 
783 			if (unlikely(softleaf_is_migration(entry)))
784 				qp->nr_failed++;
785 		}
786 
787 		goto unlock;
788 	}
789 	folio = pfn_folio(pte_pfn(ptep));
790 	if (!queue_folio_required(folio, qp))
791 		goto unlock;
792 	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
793 	    !vma_migratable(walk->vma)) {
794 		qp->nr_failed++;
795 		goto unlock;
796 	}
797 	/*
798 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
799 	 * Choosing not to migrate a shared folio is not counted as a failure.
800 	 *
801 	 * See folio_maybe_mapped_shared() on possible imprecision when we
802 	 * cannot easily detect if a folio is shared.
803 	 */
804 	if ((flags & MPOL_MF_MOVE_ALL) ||
805 	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
806 		if (!folio_isolate_hugetlb(folio, qp->pagelist))
807 			qp->nr_failed++;
808 unlock:
809 	spin_unlock(ptl);
810 	if (qp->nr_failed && strictly_unmovable(flags))
811 		return -EIO;
812 #endif
813 	return 0;
814 }
815 
816 #ifdef CONFIG_NUMA_BALANCING
817 /**
818  * folio_can_map_prot_numa() - check whether the folio can map prot numa
819  * @folio: The folio whose mapping considered for being made NUMA hintable
820  * @vma: The VMA that the folio belongs to.
821  * @is_private_single_threaded: Is this a single-threaded private VMA or not
822  *
823  * This function checks to see if the folio actually indicates that
824  * we need to make the mapping one which causes a NUMA hinting fault,
825  * as there are cases where it's simply unnecessary, and the folio's
826  * access time is adjusted for memory tiering if prot numa needed.
827  *
828  * Return: True if the mapping of the folio needs to be changed, false otherwise.
829  */
folio_can_map_prot_numa(struct folio * folio,struct vm_area_struct * vma,bool is_private_single_threaded)830 bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
831 		bool is_private_single_threaded)
832 {
833 	int nid;
834 
835 	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
836 		return false;
837 
838 	/* Also skip shared copy-on-write folios */
839 	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
840 		return false;
841 
842 	/* Folios are pinned and can't be migrated */
843 	if (folio_maybe_dma_pinned(folio))
844 		return false;
845 
846 	/*
847 	 * While migration can move some dirty folios,
848 	 * it cannot move them all from MIGRATE_ASYNC
849 	 * context.
850 	 */
851 	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
852 		return false;
853 
854 	/*
855 	 * Don't mess with PTEs if folio is already on the node
856 	 * a single-threaded process is running on.
857 	 */
858 	nid = folio_nid(folio);
859 	if (is_private_single_threaded && (nid == numa_node_id()))
860 		return false;
861 
862 	/*
863 	 * Skip scanning top tier node if normal numa
864 	 * balancing is disabled
865 	 */
866 	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
867 	    node_is_toptier(nid))
868 		return false;
869 
870 	if (folio_use_access_time(folio))
871 		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
872 
873 	return true;
874 }
875 
876 /*
877  * This is used to mark a range of virtual addresses to be inaccessible.
878  * These are later cleared by a NUMA hinting fault. Depending on these
879  * faults, pages may be migrated for better NUMA placement.
880  *
881  * This is assuming that NUMA faults are handled using PROT_NONE. If
882  * an architecture makes a different choice, it will need further
883  * changes to the core.
884  */
change_prot_numa(struct vm_area_struct * vma,unsigned long addr,unsigned long end)885 unsigned long change_prot_numa(struct vm_area_struct *vma,
886 			unsigned long addr, unsigned long end)
887 {
888 	struct mmu_gather tlb;
889 	long nr_updated;
890 
891 	tlb_gather_mmu(&tlb, vma->vm_mm);
892 
893 	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
894 	if (nr_updated > 0) {
895 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
896 		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
897 	}
898 
899 	tlb_finish_mmu(&tlb);
900 
901 	return nr_updated;
902 }
903 #endif /* CONFIG_NUMA_BALANCING */
904 
queue_pages_test_walk(unsigned long start,unsigned long end,struct mm_walk * walk)905 static int queue_pages_test_walk(unsigned long start, unsigned long end,
906 				struct mm_walk *walk)
907 {
908 	struct vm_area_struct *next, *vma = walk->vma;
909 	struct queue_pages *qp = walk->private;
910 	unsigned long flags = qp->flags;
911 
912 	/* range check first */
913 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
914 
915 	if (!qp->first) {
916 		qp->first = vma;
917 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
918 			(qp->start < vma->vm_start))
919 			/* hole at head side of range */
920 			return -EFAULT;
921 	}
922 	next = find_vma(vma->vm_mm, vma->vm_end);
923 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
924 		((vma->vm_end < qp->end) &&
925 		(!next || vma->vm_end < next->vm_start)))
926 		/* hole at middle or tail of range */
927 		return -EFAULT;
928 
929 	/*
930 	 * Need check MPOL_MF_STRICT to return -EIO if possible
931 	 * regardless of vma_migratable
932 	 */
933 	if (!vma_migratable(vma) &&
934 	    !(flags & MPOL_MF_STRICT))
935 		return 1;
936 
937 	/*
938 	 * Check page nodes, and queue pages to move, in the current vma.
939 	 * But if no moving, and no strict checking, the scan can be skipped.
940 	 */
941 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
942 		return 0;
943 	return 1;
944 }
945 
946 static const struct mm_walk_ops queue_pages_walk_ops = {
947 	.hugetlb_entry		= queue_folios_hugetlb,
948 	.pmd_entry		= queue_folios_pte_range,
949 	.test_walk		= queue_pages_test_walk,
950 	.walk_lock		= PGWALK_RDLOCK,
951 };
952 
953 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
954 	.hugetlb_entry		= queue_folios_hugetlb,
955 	.pmd_entry		= queue_folios_pte_range,
956 	.test_walk		= queue_pages_test_walk,
957 	.walk_lock		= PGWALK_WRLOCK,
958 };
959 
960 /*
961  * Walk through page tables and collect pages to be migrated.
962  *
963  * If pages found in a given range are not on the required set of @nodes,
964  * and migration is allowed, they are isolated and queued to @pagelist.
965  *
966  * queue_pages_range() may return:
967  * 0 - all pages already on the right node, or successfully queued for moving
968  *     (or neither strict checking nor moving requested: only range checking).
969  * >0 - this number of misplaced folios could not be queued for moving
970  *      (a hugetlbfs page or a transparent huge page being counted as 1).
971  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
972  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
973  */
974 static long
queue_pages_range(struct mm_struct * mm,unsigned long start,unsigned long end,nodemask_t * nodes,unsigned long flags,struct list_head * pagelist)975 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
976 		nodemask_t *nodes, unsigned long flags,
977 		struct list_head *pagelist)
978 {
979 	int err;
980 	struct queue_pages qp = {
981 		.pagelist = pagelist,
982 		.flags = flags,
983 		.nmask = nodes,
984 		.start = start,
985 		.end = end,
986 		.first = NULL,
987 	};
988 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
989 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
990 
991 	err = walk_page_range(mm, start, end, ops, &qp);
992 
993 	if (!qp.first)
994 		/* whole range in hole */
995 		err = -EFAULT;
996 
997 	return err ? : qp.nr_failed;
998 }
999 
1000 /*
1001  * Apply policy to a single VMA
1002  * This must be called with the mmap_lock held for writing.
1003  */
vma_replace_policy(struct vm_area_struct * vma,struct mempolicy * pol)1004 static int vma_replace_policy(struct vm_area_struct *vma,
1005 				struct mempolicy *pol)
1006 {
1007 	int err;
1008 	struct mempolicy *old;
1009 	struct mempolicy *new;
1010 
1011 	vma_assert_write_locked(vma);
1012 
1013 	new = mpol_dup(pol);
1014 	if (IS_ERR(new))
1015 		return PTR_ERR(new);
1016 
1017 	if (vma->vm_ops && vma->vm_ops->set_policy) {
1018 		err = vma->vm_ops->set_policy(vma, new);
1019 		if (err)
1020 			goto err_out;
1021 	}
1022 
1023 	old = vma->vm_policy;
1024 	vma->vm_policy = new; /* protected by mmap_lock */
1025 	mpol_put(old);
1026 
1027 	return 0;
1028  err_out:
1029 	mpol_put(new);
1030 	return err;
1031 }
1032 
1033 /* Split or merge the VMA (if required) and apply the new policy */
mbind_range(struct vma_iterator * vmi,struct vm_area_struct * vma,struct vm_area_struct ** prev,unsigned long start,unsigned long end,struct mempolicy * new_pol)1034 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
1035 		struct vm_area_struct **prev, unsigned long start,
1036 		unsigned long end, struct mempolicy *new_pol)
1037 {
1038 	unsigned long vmstart, vmend;
1039 
1040 	vmend = min(end, vma->vm_end);
1041 	if (start > vma->vm_start) {
1042 		*prev = vma;
1043 		vmstart = start;
1044 	} else {
1045 		vmstart = vma->vm_start;
1046 	}
1047 
1048 	if (mpol_equal(vma->vm_policy, new_pol)) {
1049 		*prev = vma;
1050 		return 0;
1051 	}
1052 
1053 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
1054 	if (IS_ERR(vma))
1055 		return PTR_ERR(vma);
1056 
1057 	*prev = vma;
1058 	return vma_replace_policy(vma, new_pol);
1059 }
1060 
1061 /* Set the process memory policy */
do_set_mempolicy(unsigned short mode,unsigned short flags,nodemask_t * nodes)1062 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
1063 			     nodemask_t *nodes)
1064 {
1065 	struct mempolicy *new, *old;
1066 	NODEMASK_SCRATCH(scratch);
1067 	int ret;
1068 
1069 	if (!scratch)
1070 		return -ENOMEM;
1071 
1072 	new = mpol_new(mode, flags, nodes);
1073 	if (IS_ERR(new)) {
1074 		ret = PTR_ERR(new);
1075 		goto out;
1076 	}
1077 
1078 	task_lock(current);
1079 	ret = mpol_set_nodemask(new, nodes, scratch);
1080 	if (ret) {
1081 		task_unlock(current);
1082 		mpol_put(new);
1083 		goto out;
1084 	}
1085 
1086 	old = current->mempolicy;
1087 	current->mempolicy = new;
1088 	if (new && (new->mode == MPOL_INTERLEAVE ||
1089 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
1090 		current->il_prev = MAX_NUMNODES-1;
1091 		current->il_weight = 0;
1092 	}
1093 	task_unlock(current);
1094 	mpol_put(old);
1095 	ret = 0;
1096 out:
1097 	NODEMASK_SCRATCH_FREE(scratch);
1098 	return ret;
1099 }
1100 
1101 /*
1102  * Return nodemask for policy for get_mempolicy() query
1103  *
1104  * Called with task's alloc_lock held
1105  */
get_policy_nodemask(struct mempolicy * pol,nodemask_t * nodes)1106 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1107 {
1108 	nodes_clear(*nodes);
1109 	if (pol == &default_policy)
1110 		return;
1111 
1112 	switch (pol->mode) {
1113 	case MPOL_BIND:
1114 	case MPOL_INTERLEAVE:
1115 	case MPOL_PREFERRED:
1116 	case MPOL_PREFERRED_MANY:
1117 	case MPOL_WEIGHTED_INTERLEAVE:
1118 		*nodes = pol->nodes;
1119 		break;
1120 	case MPOL_LOCAL:
1121 		/* return empty node mask for local allocation */
1122 		break;
1123 	default:
1124 		BUG();
1125 	}
1126 }
1127 
lookup_node(struct mm_struct * mm,unsigned long addr)1128 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1129 {
1130 	struct page *p = NULL;
1131 	int ret;
1132 
1133 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1134 	if (ret > 0) {
1135 		ret = page_to_nid(p);
1136 		put_page(p);
1137 	}
1138 	return ret;
1139 }
1140 
1141 /* Retrieve NUMA policy */
do_get_mempolicy(int * policy,nodemask_t * nmask,unsigned long addr,unsigned long flags)1142 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
1143 			     unsigned long addr, unsigned long flags)
1144 {
1145 	int err;
1146 	struct mm_struct *mm = current->mm;
1147 	struct vm_area_struct *vma = NULL;
1148 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1149 
1150 	if (flags &
1151 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1152 		return -EINVAL;
1153 
1154 	if (flags & MPOL_F_MEMS_ALLOWED) {
1155 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
1156 			return -EINVAL;
1157 		*policy = 0;	/* just so it's initialized */
1158 		task_lock(current);
1159 		*nmask  = cpuset_current_mems_allowed;
1160 		task_unlock(current);
1161 		return 0;
1162 	}
1163 
1164 	if (flags & MPOL_F_ADDR) {
1165 		pgoff_t ilx;		/* ignored here */
1166 		/*
1167 		 * Do NOT fall back to task policy if the
1168 		 * vma/shared policy at addr is NULL.  We
1169 		 * want to return MPOL_DEFAULT in this case.
1170 		 */
1171 		mmap_read_lock(mm);
1172 		vma = vma_lookup(mm, addr);
1173 		if (!vma) {
1174 			mmap_read_unlock(mm);
1175 			return -EFAULT;
1176 		}
1177 		pol = __get_vma_policy(vma, addr, &ilx);
1178 	} else if (addr)
1179 		return -EINVAL;
1180 
1181 	if (!pol)
1182 		pol = &default_policy;	/* indicates default behavior */
1183 
1184 	if (flags & MPOL_F_NODE) {
1185 		if (flags & MPOL_F_ADDR) {
1186 			/*
1187 			 * Take a refcount on the mpol, because we are about to
1188 			 * drop the mmap_lock, after which only "pol" remains
1189 			 * valid, "vma" is stale.
1190 			 */
1191 			pol_refcount = pol;
1192 			vma = NULL;
1193 			mpol_get(pol);
1194 			mmap_read_unlock(mm);
1195 			err = lookup_node(mm, addr);
1196 			if (err < 0)
1197 				goto out;
1198 			*policy = err;
1199 		} else if (pol == current->mempolicy &&
1200 				pol->mode == MPOL_INTERLEAVE) {
1201 			*policy = next_node_in(current->il_prev, pol->nodes);
1202 		} else if (pol == current->mempolicy &&
1203 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1204 			if (current->il_weight)
1205 				*policy = current->il_prev;
1206 			else
1207 				*policy = next_node_in(current->il_prev,
1208 						       pol->nodes);
1209 		} else {
1210 			err = -EINVAL;
1211 			goto out;
1212 		}
1213 	} else {
1214 		*policy = pol == &default_policy ? MPOL_DEFAULT :
1215 						pol->mode;
1216 		/*
1217 		 * Internal mempolicy flags must be masked off before exposing
1218 		 * the policy to userspace.
1219 		 */
1220 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1221 	}
1222 
1223 	err = 0;
1224 	if (nmask) {
1225 		if (mpol_store_user_nodemask(pol)) {
1226 			*nmask = pol->w.user_nodemask;
1227 		} else {
1228 			task_lock(current);
1229 			get_policy_nodemask(pol, nmask);
1230 			task_unlock(current);
1231 		}
1232 	}
1233 
1234  out:
1235 	mpol_cond_put(pol);
1236 	if (vma)
1237 		mmap_read_unlock(mm);
1238 	if (pol_refcount)
1239 		mpol_put(pol_refcount);
1240 	return err;
1241 }
1242 
1243 #ifdef CONFIG_MIGRATION
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)1244 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1245 				unsigned long flags)
1246 {
1247 	/*
1248 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1249 	 * Choosing not to migrate a shared folio is not counted as a failure.
1250 	 *
1251 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1252 	 * cannot easily detect if a folio is shared.
1253 	 */
1254 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1255 		if (folio_isolate_lru(folio)) {
1256 			list_add_tail(&folio->lru, foliolist);
1257 			node_stat_mod_folio(folio,
1258 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1259 				folio_nr_pages(folio));
1260 		} else {
1261 			/*
1262 			 * Non-movable folio may reach here.  And, there may be
1263 			 * temporary off LRU folios or non-LRU movable folios.
1264 			 * Treat them as unmovable folios since they can't be
1265 			 * isolated, so they can't be moved at the moment.
1266 			 */
1267 			return false;
1268 		}
1269 	}
1270 	return true;
1271 }
1272 
1273 /*
1274  * Migrate pages from one node to a target node.
1275  * Returns error or the number of pages not migrated.
1276  */
migrate_to_node(struct mm_struct * mm,int source,int dest,int flags)1277 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1278 			    int flags)
1279 {
1280 	nodemask_t nmask;
1281 	struct vm_area_struct *vma;
1282 	LIST_HEAD(pagelist);
1283 	long nr_failed;
1284 	long err = 0;
1285 	struct migration_target_control mtc = {
1286 		.nid = dest,
1287 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1288 		.reason = MR_SYSCALL,
1289 	};
1290 
1291 	nodes_clear(nmask);
1292 	node_set(source, nmask);
1293 
1294 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1295 
1296 	mmap_read_lock(mm);
1297 	vma = find_vma(mm, 0);
1298 	if (unlikely(!vma)) {
1299 		mmap_read_unlock(mm);
1300 		return 0;
1301 	}
1302 
1303 	/*
1304 	 * This does not migrate the range, but isolates all pages that
1305 	 * need migration.  Between passing in the full user address
1306 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1307 	 * but passes back the count of pages which could not be isolated.
1308 	 */
1309 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1310 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1311 	mmap_read_unlock(mm);
1312 
1313 	if (!list_empty(&pagelist)) {
1314 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1315 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1316 		if (err)
1317 			putback_movable_pages(&pagelist);
1318 	}
1319 
1320 	if (err >= 0)
1321 		err += nr_failed;
1322 	return err;
1323 }
1324 
1325 /*
1326  * Move pages between the two nodesets so as to preserve the physical
1327  * layout as much as possible.
1328  *
1329  * Returns the number of page that could not be moved.
1330  */
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)1331 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1332 		     const nodemask_t *to, int flags)
1333 {
1334 	long nr_failed = 0;
1335 	long err = 0;
1336 	nodemask_t tmp;
1337 
1338 	lru_cache_disable();
1339 
1340 	/*
1341 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1342 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1343 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1344 	 * The pair of nodemasks 'to' and 'from' define the map.
1345 	 *
1346 	 * If no pair of bits is found that way, fallback to picking some
1347 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1348 	 * 'source' and 'dest' bits are the same, this represents a node
1349 	 * that will be migrating to itself, so no pages need move.
1350 	 *
1351 	 * If no bits are left in 'tmp', or if all remaining bits left
1352 	 * in 'tmp' correspond to the same bit in 'to', return false
1353 	 * (nothing left to migrate).
1354 	 *
1355 	 * This lets us pick a pair of nodes to migrate between, such that
1356 	 * if possible the dest node is not already occupied by some other
1357 	 * source node, minimizing the risk of overloading the memory on a
1358 	 * node that would happen if we migrated incoming memory to a node
1359 	 * before migrating outgoing memory source that same node.
1360 	 *
1361 	 * A single scan of tmp is sufficient.  As we go, we remember the
1362 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1363 	 * that not only moved, but what's better, moved to an empty slot
1364 	 * (d is not set in tmp), then we break out then, with that pair.
1365 	 * Otherwise when we finish scanning from_tmp, we at least have the
1366 	 * most recent <s, d> pair that moved.  If we get all the way through
1367 	 * the scan of tmp without finding any node that moved, much less
1368 	 * moved to an empty node, then there is nothing left worth migrating.
1369 	 */
1370 
1371 	tmp = *from;
1372 	while (!nodes_empty(tmp)) {
1373 		int s, d;
1374 		int source = NUMA_NO_NODE;
1375 		int dest = 0;
1376 
1377 		for_each_node_mask(s, tmp) {
1378 
1379 			/*
1380 			 * do_migrate_pages() tries to maintain the relative
1381 			 * node relationship of the pages established between
1382 			 * threads and memory areas.
1383                          *
1384 			 * However if the number of source nodes is not equal to
1385 			 * the number of destination nodes we can not preserve
1386 			 * this node relative relationship.  In that case, skip
1387 			 * copying memory from a node that is in the destination
1388 			 * mask.
1389 			 *
1390 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1391 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1392 			 */
1393 
1394 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1395 						(node_isset(s, *to)))
1396 				continue;
1397 
1398 			d = node_remap(s, *from, *to);
1399 			if (s == d)
1400 				continue;
1401 
1402 			source = s;	/* Node moved. Memorize */
1403 			dest = d;
1404 
1405 			/* dest not in remaining from nodes? */
1406 			if (!node_isset(dest, tmp))
1407 				break;
1408 		}
1409 		if (source == NUMA_NO_NODE)
1410 			break;
1411 
1412 		node_clear(source, tmp);
1413 		err = migrate_to_node(mm, source, dest, flags);
1414 		if (err > 0)
1415 			nr_failed += err;
1416 		if (err < 0)
1417 			break;
1418 	}
1419 
1420 	lru_cache_enable();
1421 	if (err < 0)
1422 		return err;
1423 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1424 }
1425 
1426 /*
1427  * Allocate a new folio for page migration, according to NUMA mempolicy.
1428  */
alloc_migration_target_by_mpol(struct folio * src,unsigned long private)1429 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1430 						    unsigned long private)
1431 {
1432 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1433 	struct mempolicy *pol = mmpol->pol;
1434 	pgoff_t ilx = mmpol->ilx;
1435 	unsigned int order;
1436 	int nid = numa_node_id();
1437 	gfp_t gfp;
1438 
1439 	order = folio_order(src);
1440 	ilx += src->index >> order;
1441 
1442 	if (folio_test_hugetlb(src)) {
1443 		nodemask_t *nodemask;
1444 		struct hstate *h;
1445 
1446 		h = folio_hstate(src);
1447 		gfp = htlb_alloc_mask(h);
1448 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1449 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1450 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1451 	}
1452 
1453 	if (folio_test_large(src))
1454 		gfp = GFP_TRANSHUGE;
1455 	else
1456 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1457 
1458 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1459 }
1460 #else
1461 
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)1462 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1463 				unsigned long flags)
1464 {
1465 	return false;
1466 }
1467 
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)1468 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1469 		     const nodemask_t *to, int flags)
1470 {
1471 	return -ENOSYS;
1472 }
1473 
alloc_migration_target_by_mpol(struct folio * src,unsigned long private)1474 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1475 						    unsigned long private)
1476 {
1477 	return NULL;
1478 }
1479 #endif
1480 
do_mbind(unsigned long start,unsigned long len,unsigned short mode,unsigned short mode_flags,nodemask_t * nmask,unsigned long flags)1481 static long do_mbind(unsigned long start, unsigned long len,
1482 		     unsigned short mode, unsigned short mode_flags,
1483 		     nodemask_t *nmask, unsigned long flags)
1484 {
1485 	struct mm_struct *mm = current->mm;
1486 	struct vm_area_struct *vma, *prev;
1487 	struct vma_iterator vmi;
1488 	struct migration_mpol mmpol;
1489 	struct mempolicy *new;
1490 	unsigned long end;
1491 	long err;
1492 	long nr_failed;
1493 	LIST_HEAD(pagelist);
1494 
1495 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1496 		return -EINVAL;
1497 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1498 		return -EPERM;
1499 
1500 	if (start & ~PAGE_MASK)
1501 		return -EINVAL;
1502 
1503 	if (mode == MPOL_DEFAULT)
1504 		flags &= ~MPOL_MF_STRICT;
1505 
1506 	len = PAGE_ALIGN(len);
1507 	end = start + len;
1508 
1509 	if (end < start)
1510 		return -EINVAL;
1511 	if (end == start)
1512 		return 0;
1513 
1514 	new = mpol_new(mode, mode_flags, nmask);
1515 	if (IS_ERR(new))
1516 		return PTR_ERR(new);
1517 
1518 	/*
1519 	 * If we are using the default policy then operation
1520 	 * on discontinuous address spaces is okay after all
1521 	 */
1522 	if (!new)
1523 		flags |= MPOL_MF_DISCONTIG_OK;
1524 
1525 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1526 		lru_cache_disable();
1527 	{
1528 		NODEMASK_SCRATCH(scratch);
1529 		if (scratch) {
1530 			mmap_write_lock(mm);
1531 			err = mpol_set_nodemask(new, nmask, scratch);
1532 			if (err)
1533 				mmap_write_unlock(mm);
1534 		} else
1535 			err = -ENOMEM;
1536 		NODEMASK_SCRATCH_FREE(scratch);
1537 	}
1538 	if (err)
1539 		goto mpol_out;
1540 
1541 	/*
1542 	 * Lock the VMAs before scanning for pages to migrate,
1543 	 * to ensure we don't miss a concurrently inserted page.
1544 	 */
1545 	nr_failed = queue_pages_range(mm, start, end, nmask,
1546 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1547 
1548 	if (nr_failed < 0) {
1549 		err = nr_failed;
1550 		nr_failed = 0;
1551 	} else {
1552 		vma_iter_init(&vmi, mm, start);
1553 		prev = vma_prev(&vmi);
1554 		for_each_vma_range(vmi, vma, end) {
1555 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1556 			if (err)
1557 				break;
1558 		}
1559 	}
1560 
1561 	if (!err && !list_empty(&pagelist)) {
1562 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
1563 		if (!new) {
1564 			new = get_task_policy(current);
1565 			mpol_get(new);
1566 		}
1567 		mmpol.pol = new;
1568 		mmpol.ilx = 0;
1569 
1570 		/*
1571 		 * In the interleaved case, attempt to allocate on exactly the
1572 		 * targeted nodes, for the first VMA to be migrated; for later
1573 		 * VMAs, the nodes will still be interleaved from the targeted
1574 		 * nodemask, but one by one may be selected differently.
1575 		 */
1576 		if (new->mode == MPOL_INTERLEAVE ||
1577 		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1578 			struct folio *folio;
1579 			unsigned int order;
1580 			unsigned long addr = -EFAULT;
1581 
1582 			list_for_each_entry(folio, &pagelist, lru) {
1583 				if (!folio_test_ksm(folio))
1584 					break;
1585 			}
1586 			if (!list_entry_is_head(folio, &pagelist, lru)) {
1587 				vma_iter_init(&vmi, mm, start);
1588 				for_each_vma_range(vmi, vma, end) {
1589 					addr = page_address_in_vma(folio,
1590 						folio_page(folio, 0), vma);
1591 					if (addr != -EFAULT)
1592 						break;
1593 				}
1594 			}
1595 			if (addr != -EFAULT) {
1596 				order = folio_order(folio);
1597 				/* We already know the pol, but not the ilx */
1598 				mpol_cond_put(get_vma_policy(vma, addr, order,
1599 							     &mmpol.ilx));
1600 				/* Set base from which to increment by index */
1601 				mmpol.ilx -= folio->index >> order;
1602 			}
1603 		}
1604 	}
1605 
1606 	mmap_write_unlock(mm);
1607 
1608 	if (!err && !list_empty(&pagelist)) {
1609 		nr_failed |= migrate_pages(&pagelist,
1610 				alloc_migration_target_by_mpol, NULL,
1611 				(unsigned long)&mmpol, MIGRATE_SYNC,
1612 				MR_MEMPOLICY_MBIND, NULL);
1613 	}
1614 
1615 	if (nr_failed && (flags & MPOL_MF_STRICT))
1616 		err = -EIO;
1617 	if (!list_empty(&pagelist))
1618 		putback_movable_pages(&pagelist);
1619 mpol_out:
1620 	mpol_put(new);
1621 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1622 		lru_cache_enable();
1623 	return err;
1624 }
1625 
1626 /*
1627  * User space interface with variable sized bitmaps for nodelists.
1628  */
get_bitmap(unsigned long * mask,const unsigned long __user * nmask,unsigned long maxnode)1629 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1630 		      unsigned long maxnode)
1631 {
1632 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1633 	int ret;
1634 
1635 	if (in_compat_syscall())
1636 		ret = compat_get_bitmap(mask,
1637 					(const compat_ulong_t __user *)nmask,
1638 					maxnode);
1639 	else
1640 		ret = copy_from_user(mask, nmask,
1641 				     nlongs * sizeof(unsigned long));
1642 
1643 	if (ret)
1644 		return -EFAULT;
1645 
1646 	if (maxnode % BITS_PER_LONG)
1647 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1648 
1649 	return 0;
1650 }
1651 
1652 /* Copy a node mask from user space. */
get_nodes(nodemask_t * nodes,const unsigned long __user * nmask,unsigned long maxnode)1653 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1654 		     unsigned long maxnode)
1655 {
1656 	--maxnode;
1657 	nodes_clear(*nodes);
1658 	if (maxnode == 0 || !nmask)
1659 		return 0;
1660 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1661 		return -EINVAL;
1662 
1663 	/*
1664 	 * When the user specified more nodes than supported just check
1665 	 * if the non supported part is all zero, one word at a time,
1666 	 * starting at the end.
1667 	 */
1668 	while (maxnode > MAX_NUMNODES) {
1669 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1670 		unsigned long t;
1671 
1672 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1673 			return -EFAULT;
1674 
1675 		if (maxnode - bits >= MAX_NUMNODES) {
1676 			maxnode -= bits;
1677 		} else {
1678 			maxnode = MAX_NUMNODES;
1679 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1680 		}
1681 		if (t)
1682 			return -EINVAL;
1683 	}
1684 
1685 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1686 }
1687 
1688 /* Copy a kernel node mask to user space */
copy_nodes_to_user(unsigned long __user * mask,unsigned long maxnode,nodemask_t * nodes)1689 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1690 			      nodemask_t *nodes)
1691 {
1692 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1693 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1694 	bool compat = in_compat_syscall();
1695 
1696 	if (compat)
1697 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1698 
1699 	if (copy > nbytes) {
1700 		if (copy > PAGE_SIZE)
1701 			return -EINVAL;
1702 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1703 			return -EFAULT;
1704 		copy = nbytes;
1705 		maxnode = nr_node_ids;
1706 	}
1707 
1708 	if (compat)
1709 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1710 					 nodes_addr(*nodes), maxnode);
1711 
1712 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1713 }
1714 
1715 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)1716 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1717 {
1718 	*flags = *mode & MPOL_MODE_FLAGS;
1719 	*mode &= ~MPOL_MODE_FLAGS;
1720 
1721 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1722 		return -EINVAL;
1723 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1724 		return -EINVAL;
1725 	if (*flags & MPOL_F_NUMA_BALANCING) {
1726 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1727 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1728 		else
1729 			return -EINVAL;
1730 	}
1731 	return 0;
1732 }
1733 
kernel_mbind(unsigned long start,unsigned long len,unsigned long mode,const unsigned long __user * nmask,unsigned long maxnode,unsigned int flags)1734 static long kernel_mbind(unsigned long start, unsigned long len,
1735 			 unsigned long mode, const unsigned long __user *nmask,
1736 			 unsigned long maxnode, unsigned int flags)
1737 {
1738 	unsigned short mode_flags;
1739 	nodemask_t nodes;
1740 	int lmode = mode;
1741 	int err;
1742 
1743 	start = untagged_addr(start);
1744 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1745 	if (err)
1746 		return err;
1747 
1748 	err = get_nodes(&nodes, nmask, maxnode);
1749 	if (err)
1750 		return err;
1751 
1752 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1753 }
1754 
SYSCALL_DEFINE4(set_mempolicy_home_node,unsigned long,start,unsigned long,len,unsigned long,home_node,unsigned long,flags)1755 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1756 		unsigned long, home_node, unsigned long, flags)
1757 {
1758 	struct mm_struct *mm = current->mm;
1759 	struct vm_area_struct *vma, *prev;
1760 	struct mempolicy *new, *old;
1761 	unsigned long end;
1762 	int err = -ENOENT;
1763 	VMA_ITERATOR(vmi, mm, start);
1764 
1765 	start = untagged_addr(start);
1766 	if (start & ~PAGE_MASK)
1767 		return -EINVAL;
1768 	/*
1769 	 * flags is used for future extension if any.
1770 	 */
1771 	if (flags != 0)
1772 		return -EINVAL;
1773 
1774 	/*
1775 	 * Check home_node is online to avoid accessing uninitialized
1776 	 * NODE_DATA.
1777 	 */
1778 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1779 		return -EINVAL;
1780 
1781 	len = PAGE_ALIGN(len);
1782 	end = start + len;
1783 
1784 	if (end < start)
1785 		return -EINVAL;
1786 	if (end == start)
1787 		return 0;
1788 	mmap_write_lock(mm);
1789 	prev = vma_prev(&vmi);
1790 	for_each_vma_range(vmi, vma, end) {
1791 		/*
1792 		 * If any vma in the range got policy other than MPOL_BIND
1793 		 * or MPOL_PREFERRED_MANY we return error. We don't reset
1794 		 * the home node for vmas we already updated before.
1795 		 */
1796 		old = vma_policy(vma);
1797 		if (!old) {
1798 			prev = vma;
1799 			continue;
1800 		}
1801 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1802 			err = -EOPNOTSUPP;
1803 			break;
1804 		}
1805 		new = mpol_dup(old);
1806 		if (IS_ERR(new)) {
1807 			err = PTR_ERR(new);
1808 			break;
1809 		}
1810 
1811 		vma_start_write(vma);
1812 		new->home_node = home_node;
1813 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1814 		mpol_put(new);
1815 		if (err)
1816 			break;
1817 	}
1818 	mmap_write_unlock(mm);
1819 	return err;
1820 }
1821 
SYSCALL_DEFINE6(mbind,unsigned long,start,unsigned long,len,unsigned long,mode,const unsigned long __user *,nmask,unsigned long,maxnode,unsigned int,flags)1822 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1823 		unsigned long, mode, const unsigned long __user *, nmask,
1824 		unsigned long, maxnode, unsigned int, flags)
1825 {
1826 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1827 }
1828 
1829 /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1830 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1831 				 unsigned long maxnode)
1832 {
1833 	unsigned short mode_flags;
1834 	nodemask_t nodes;
1835 	int lmode = mode;
1836 	int err;
1837 
1838 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1839 	if (err)
1840 		return err;
1841 
1842 	err = get_nodes(&nodes, nmask, maxnode);
1843 	if (err)
1844 		return err;
1845 
1846 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1847 }
1848 
SYSCALL_DEFINE3(set_mempolicy,int,mode,const unsigned long __user *,nmask,unsigned long,maxnode)1849 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1850 		unsigned long, maxnode)
1851 {
1852 	return kernel_set_mempolicy(mode, nmask, maxnode);
1853 }
1854 
kernel_migrate_pages(pid_t pid,unsigned long maxnode,const unsigned long __user * old_nodes,const unsigned long __user * new_nodes)1855 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1856 				const unsigned long __user *old_nodes,
1857 				const unsigned long __user *new_nodes)
1858 {
1859 	struct mm_struct *mm = NULL;
1860 	struct task_struct *task;
1861 	nodemask_t task_nodes;
1862 	int err;
1863 	nodemask_t *old;
1864 	nodemask_t *new;
1865 	NODEMASK_SCRATCH(scratch);
1866 
1867 	if (!scratch)
1868 		return -ENOMEM;
1869 
1870 	old = &scratch->mask1;
1871 	new = &scratch->mask2;
1872 
1873 	err = get_nodes(old, old_nodes, maxnode);
1874 	if (err)
1875 		goto out;
1876 
1877 	err = get_nodes(new, new_nodes, maxnode);
1878 	if (err)
1879 		goto out;
1880 
1881 	/* Find the mm_struct */
1882 	rcu_read_lock();
1883 	task = pid ? find_task_by_vpid(pid) : current;
1884 	if (!task) {
1885 		rcu_read_unlock();
1886 		err = -ESRCH;
1887 		goto out;
1888 	}
1889 	get_task_struct(task);
1890 
1891 	err = -EINVAL;
1892 
1893 	/*
1894 	 * Check if this process has the right to modify the specified process.
1895 	 * Use the regular "ptrace_may_access()" checks.
1896 	 */
1897 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1898 		rcu_read_unlock();
1899 		err = -EPERM;
1900 		goto out_put;
1901 	}
1902 	rcu_read_unlock();
1903 
1904 	task_nodes = cpuset_mems_allowed(task);
1905 	/* Is the user allowed to access the target nodes? */
1906 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1907 		err = -EPERM;
1908 		goto out_put;
1909 	}
1910 
1911 	task_nodes = cpuset_mems_allowed(current);
1912 	nodes_and(*new, *new, task_nodes);
1913 	if (nodes_empty(*new))
1914 		goto out_put;
1915 
1916 	err = security_task_movememory(task);
1917 	if (err)
1918 		goto out_put;
1919 
1920 	mm = get_task_mm(task);
1921 	put_task_struct(task);
1922 
1923 	if (!mm) {
1924 		err = -EINVAL;
1925 		goto out;
1926 	}
1927 
1928 	err = do_migrate_pages(mm, old, new,
1929 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1930 
1931 	mmput(mm);
1932 out:
1933 	NODEMASK_SCRATCH_FREE(scratch);
1934 
1935 	return err;
1936 
1937 out_put:
1938 	put_task_struct(task);
1939 	goto out;
1940 }
1941 
SYSCALL_DEFINE4(migrate_pages,pid_t,pid,unsigned long,maxnode,const unsigned long __user *,old_nodes,const unsigned long __user *,new_nodes)1942 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1943 		const unsigned long __user *, old_nodes,
1944 		const unsigned long __user *, new_nodes)
1945 {
1946 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1947 }
1948 
1949 /* Retrieve NUMA policy */
kernel_get_mempolicy(int __user * policy,unsigned long __user * nmask,unsigned long maxnode,unsigned long addr,unsigned long flags)1950 static int kernel_get_mempolicy(int __user *policy,
1951 				unsigned long __user *nmask,
1952 				unsigned long maxnode,
1953 				unsigned long addr,
1954 				unsigned long flags)
1955 {
1956 	int err;
1957 	int pval;
1958 	nodemask_t nodes;
1959 
1960 	if (nmask != NULL && maxnode < nr_node_ids)
1961 		return -EINVAL;
1962 
1963 	addr = untagged_addr(addr);
1964 
1965 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1966 
1967 	if (err)
1968 		return err;
1969 
1970 	if (policy && put_user(pval, policy))
1971 		return -EFAULT;
1972 
1973 	if (nmask)
1974 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1975 
1976 	return err;
1977 }
1978 
SYSCALL_DEFINE5(get_mempolicy,int __user *,policy,unsigned long __user *,nmask,unsigned long,maxnode,unsigned long,addr,unsigned long,flags)1979 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1980 		unsigned long __user *, nmask, unsigned long, maxnode,
1981 		unsigned long, addr, unsigned long, flags)
1982 {
1983 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1984 }
1985 
vma_migratable(struct vm_area_struct * vma)1986 bool vma_migratable(struct vm_area_struct *vma)
1987 {
1988 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1989 		return false;
1990 
1991 	/*
1992 	 * DAX device mappings require predictable access latency, so avoid
1993 	 * incurring periodic faults.
1994 	 */
1995 	if (vma_is_dax(vma))
1996 		return false;
1997 
1998 	if (is_vm_hugetlb_page(vma) &&
1999 		!hugepage_migration_supported(hstate_vma(vma)))
2000 		return false;
2001 
2002 	/*
2003 	 * Migration allocates pages in the highest zone. If we cannot
2004 	 * do so then migration (at least from node to node) is not
2005 	 * possible.
2006 	 */
2007 	if (vma->vm_file &&
2008 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
2009 			< policy_zone)
2010 		return false;
2011 	return true;
2012 }
2013 
__get_vma_policy(struct vm_area_struct * vma,unsigned long addr,pgoff_t * ilx)2014 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
2015 				   unsigned long addr, pgoff_t *ilx)
2016 {
2017 	*ilx = 0;
2018 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
2019 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
2020 }
2021 
2022 /*
2023  * get_vma_policy(@vma, @addr, @order, @ilx)
2024  * @vma: virtual memory area whose policy is sought
2025  * @addr: address in @vma for shared policy lookup
2026  * @order: 0, or appropriate huge_page_order for interleaving
2027  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2028  *       MPOL_WEIGHTED_INTERLEAVE
2029  *
2030  * Returns effective policy for a VMA at specified address.
2031  * Falls back to current->mempolicy or system default policy, as necessary.
2032  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2033  * count--added by the get_policy() vm_op, as appropriate--to protect against
2034  * freeing by another task.  It is the caller's responsibility to free the
2035  * extra reference for shared policies.
2036  */
get_vma_policy(struct vm_area_struct * vma,unsigned long addr,int order,pgoff_t * ilx)2037 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
2038 				 unsigned long addr, int order, pgoff_t *ilx)
2039 {
2040 	struct mempolicy *pol;
2041 
2042 	pol = __get_vma_policy(vma, addr, ilx);
2043 	if (!pol)
2044 		pol = get_task_policy(current);
2045 	if (pol->mode == MPOL_INTERLEAVE ||
2046 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2047 		*ilx += vma->vm_pgoff >> order;
2048 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2049 	}
2050 	return pol;
2051 }
2052 
vma_policy_mof(struct vm_area_struct * vma)2053 bool vma_policy_mof(struct vm_area_struct *vma)
2054 {
2055 	struct mempolicy *pol;
2056 
2057 	if (vma->vm_ops && vma->vm_ops->get_policy) {
2058 		bool ret = false;
2059 		pgoff_t ilx;		/* ignored here */
2060 
2061 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2062 		if (pol && (pol->flags & MPOL_F_MOF))
2063 			ret = true;
2064 		mpol_cond_put(pol);
2065 
2066 		return ret;
2067 	}
2068 
2069 	pol = vma->vm_policy;
2070 	if (!pol)
2071 		pol = get_task_policy(current);
2072 
2073 	return pol->flags & MPOL_F_MOF;
2074 }
2075 
apply_policy_zone(struct mempolicy * policy,enum zone_type zone)2076 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2077 {
2078 	enum zone_type dynamic_policy_zone = policy_zone;
2079 
2080 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2081 
2082 	/*
2083 	 * if policy->nodes has movable memory only,
2084 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
2085 	 *
2086 	 * policy->nodes is intersect with node_states[N_MEMORY].
2087 	 * so if the following test fails, it implies
2088 	 * policy->nodes has movable memory only.
2089 	 */
2090 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2091 		dynamic_policy_zone = ZONE_MOVABLE;
2092 
2093 	return zone >= dynamic_policy_zone;
2094 }
2095 
weighted_interleave_nodes(struct mempolicy * policy)2096 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2097 {
2098 	unsigned int node;
2099 	unsigned int cpuset_mems_cookie;
2100 
2101 retry:
2102 	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
2103 	cpuset_mems_cookie = read_mems_allowed_begin();
2104 	node = current->il_prev;
2105 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
2106 		node = next_node_in(node, policy->nodes);
2107 		if (read_mems_allowed_retry(cpuset_mems_cookie))
2108 			goto retry;
2109 		if (node == MAX_NUMNODES)
2110 			return node;
2111 		current->il_prev = node;
2112 		current->il_weight = get_il_weight(node);
2113 	}
2114 	current->il_weight--;
2115 	return node;
2116 }
2117 
2118 /* Do dynamic interleaving for a process */
interleave_nodes(struct mempolicy * policy)2119 static unsigned int interleave_nodes(struct mempolicy *policy)
2120 {
2121 	unsigned int nid;
2122 	unsigned int cpuset_mems_cookie;
2123 
2124 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2125 	do {
2126 		cpuset_mems_cookie = read_mems_allowed_begin();
2127 		nid = next_node_in(current->il_prev, policy->nodes);
2128 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2129 
2130 	if (nid < MAX_NUMNODES)
2131 		current->il_prev = nid;
2132 	return nid;
2133 }
2134 
2135 /*
2136  * Depending on the memory policy provide a node from which to allocate the
2137  * next slab entry.
2138  */
mempolicy_slab_node(void)2139 unsigned int mempolicy_slab_node(void)
2140 {
2141 	struct mempolicy *policy;
2142 	int node = numa_mem_id();
2143 
2144 	if (!in_task())
2145 		return node;
2146 
2147 	policy = current->mempolicy;
2148 	if (!policy)
2149 		return node;
2150 
2151 	switch (policy->mode) {
2152 	case MPOL_PREFERRED:
2153 		return first_node(policy->nodes);
2154 
2155 	case MPOL_INTERLEAVE:
2156 		return interleave_nodes(policy);
2157 
2158 	case MPOL_WEIGHTED_INTERLEAVE:
2159 		return weighted_interleave_nodes(policy);
2160 
2161 	case MPOL_BIND:
2162 	case MPOL_PREFERRED_MANY:
2163 	{
2164 		struct zoneref *z;
2165 
2166 		/*
2167 		 * Follow bind policy behavior and start allocation at the
2168 		 * first node.
2169 		 */
2170 		struct zonelist *zonelist;
2171 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2172 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2173 		z = first_zones_zonelist(zonelist, highest_zoneidx,
2174 							&policy->nodes);
2175 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2176 	}
2177 	case MPOL_LOCAL:
2178 		return node;
2179 
2180 	default:
2181 		BUG();
2182 	}
2183 }
2184 
read_once_policy_nodemask(struct mempolicy * pol,nodemask_t * mask)2185 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2186 					      nodemask_t *mask)
2187 {
2188 	/*
2189 	 * barrier stabilizes the nodemask locally so that it can be iterated
2190 	 * over safely without concern for changes. Allocators validate node
2191 	 * selection does not violate mems_allowed, so this is safe.
2192 	 */
2193 	barrier();
2194 	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2195 	barrier();
2196 	return nodes_weight(*mask);
2197 }
2198 
weighted_interleave_nid(struct mempolicy * pol,pgoff_t ilx)2199 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2200 {
2201 	struct weighted_interleave_state *state;
2202 	nodemask_t nodemask;
2203 	unsigned int target, nr_nodes;
2204 	u8 *table = NULL;
2205 	unsigned int weight_total = 0;
2206 	u8 weight;
2207 	int nid = 0;
2208 
2209 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2210 	if (!nr_nodes)
2211 		return numa_node_id();
2212 
2213 	rcu_read_lock();
2214 
2215 	state = rcu_dereference(wi_state);
2216 	/* Uninitialized wi_state means we should assume all weights are 1 */
2217 	if (state)
2218 		table = state->iw_table;
2219 
2220 	/* calculate the total weight */
2221 	for_each_node_mask(nid, nodemask)
2222 		weight_total += table ? table[nid] : 1;
2223 
2224 	/* Calculate the node offset based on totals */
2225 	target = ilx % weight_total;
2226 	nid = first_node(nodemask);
2227 	while (target) {
2228 		/* detect system default usage */
2229 		weight = table ? table[nid] : 1;
2230 		if (target < weight)
2231 			break;
2232 		target -= weight;
2233 		nid = next_node_in(nid, nodemask);
2234 	}
2235 	rcu_read_unlock();
2236 	return nid;
2237 }
2238 
2239 /*
2240  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2241  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2242  * exceeds the number of present nodes.
2243  */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2244 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2245 {
2246 	nodemask_t nodemask;
2247 	unsigned int target, nnodes;
2248 	int i;
2249 	int nid;
2250 
2251 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2252 	if (!nnodes)
2253 		return numa_node_id();
2254 	target = ilx % nnodes;
2255 	nid = first_node(nodemask);
2256 	for (i = 0; i < target; i++)
2257 		nid = next_node(nid, nodemask);
2258 	return nid;
2259 }
2260 
2261 /*
2262  * Return a nodemask representing a mempolicy for filtering nodes for
2263  * page allocation, together with preferred node id (or the input node id).
2264  */
policy_nodemask(gfp_t gfp,struct mempolicy * pol,pgoff_t ilx,int * nid)2265 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2266 				   pgoff_t ilx, int *nid)
2267 {
2268 	nodemask_t *nodemask = NULL;
2269 
2270 	switch (pol->mode) {
2271 	case MPOL_PREFERRED:
2272 		/* Override input node id */
2273 		*nid = first_node(pol->nodes);
2274 		break;
2275 	case MPOL_PREFERRED_MANY:
2276 		nodemask = &pol->nodes;
2277 		if (pol->home_node != NUMA_NO_NODE)
2278 			*nid = pol->home_node;
2279 		break;
2280 	case MPOL_BIND:
2281 		/* Restrict to nodemask (but not on lower zones) */
2282 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2283 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2284 			nodemask = &pol->nodes;
2285 		if (pol->home_node != NUMA_NO_NODE)
2286 			*nid = pol->home_node;
2287 		/*
2288 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2289 		 * because we might easily break the expectation to stay on the
2290 		 * requested node and not break the policy.
2291 		 */
2292 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2293 		break;
2294 	case MPOL_INTERLEAVE:
2295 		/* Override input node id */
2296 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2297 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2298 		break;
2299 	case MPOL_WEIGHTED_INTERLEAVE:
2300 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2301 			weighted_interleave_nodes(pol) :
2302 			weighted_interleave_nid(pol, ilx);
2303 		break;
2304 	}
2305 
2306 	return nodemask;
2307 }
2308 
2309 #ifdef CONFIG_HUGETLBFS
2310 /*
2311  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2312  * @vma: virtual memory area whose policy is sought
2313  * @addr: address in @vma for shared policy lookup and interleave policy
2314  * @gfp_flags: for requested zone
2315  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2316  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2317  *
2318  * Returns a nid suitable for a huge page allocation and a pointer
2319  * to the struct mempolicy for conditional unref after allocation.
2320  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2321  * to the mempolicy's @nodemask for filtering the zonelist.
2322  */
huge_node(struct vm_area_struct * vma,unsigned long addr,gfp_t gfp_flags,struct mempolicy ** mpol,nodemask_t ** nodemask)2323 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2324 		struct mempolicy **mpol, nodemask_t **nodemask)
2325 {
2326 	pgoff_t ilx;
2327 	int nid;
2328 
2329 	nid = numa_node_id();
2330 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2331 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2332 	return nid;
2333 }
2334 
2335 /*
2336  * init_nodemask_of_mempolicy
2337  *
2338  * If the current task's mempolicy is "default" [NULL], return 'false'
2339  * to indicate default policy.  Otherwise, extract the policy nodemask
2340  * for 'bind' or 'interleave' policy into the argument nodemask, or
2341  * initialize the argument nodemask to contain the single node for
2342  * 'preferred' or 'local' policy and return 'true' to indicate presence
2343  * of non-default mempolicy.
2344  *
2345  * We don't bother with reference counting the mempolicy [mpol_get/put]
2346  * because the current task is examining it's own mempolicy and a task's
2347  * mempolicy is only ever changed by the task itself.
2348  *
2349  * N.B., it is the caller's responsibility to free a returned nodemask.
2350  */
init_nodemask_of_mempolicy(nodemask_t * mask)2351 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2352 {
2353 	struct mempolicy *mempolicy;
2354 
2355 	if (!(mask && current->mempolicy))
2356 		return false;
2357 
2358 	task_lock(current);
2359 	mempolicy = current->mempolicy;
2360 	switch (mempolicy->mode) {
2361 	case MPOL_PREFERRED:
2362 	case MPOL_PREFERRED_MANY:
2363 	case MPOL_BIND:
2364 	case MPOL_INTERLEAVE:
2365 	case MPOL_WEIGHTED_INTERLEAVE:
2366 		*mask = mempolicy->nodes;
2367 		break;
2368 
2369 	case MPOL_LOCAL:
2370 		init_nodemask_of_node(mask, numa_node_id());
2371 		break;
2372 
2373 	default:
2374 		BUG();
2375 	}
2376 	task_unlock(current);
2377 
2378 	return true;
2379 }
2380 #endif
2381 
2382 /*
2383  * mempolicy_in_oom_domain
2384  *
2385  * If tsk's mempolicy is "bind", check for intersection between mask and
2386  * the policy nodemask. Otherwise, return true for all other policies
2387  * including "interleave", as a tsk with "interleave" policy may have
2388  * memory allocated from all nodes in system.
2389  *
2390  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2391  */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2392 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2393 					const nodemask_t *mask)
2394 {
2395 	struct mempolicy *mempolicy;
2396 	bool ret = true;
2397 
2398 	if (!mask)
2399 		return ret;
2400 
2401 	task_lock(tsk);
2402 	mempolicy = tsk->mempolicy;
2403 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2404 		ret = nodes_intersects(mempolicy->nodes, *mask);
2405 	task_unlock(tsk);
2406 
2407 	return ret;
2408 }
2409 
alloc_pages_preferred_many(gfp_t gfp,unsigned int order,int nid,nodemask_t * nodemask)2410 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2411 						int nid, nodemask_t *nodemask)
2412 {
2413 	struct page *page;
2414 	gfp_t preferred_gfp;
2415 
2416 	/*
2417 	 * This is a two pass approach. The first pass will only try the
2418 	 * preferred nodes but skip the direct reclaim and allow the
2419 	 * allocation to fail, while the second pass will try all the
2420 	 * nodes in system.
2421 	 */
2422 	preferred_gfp = gfp | __GFP_NOWARN;
2423 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2424 	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2425 	if (!page)
2426 		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2427 
2428 	return page;
2429 }
2430 
2431 /**
2432  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2433  * @gfp: GFP flags.
2434  * @order: Order of the page allocation.
2435  * @pol: Pointer to the NUMA mempolicy.
2436  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2437  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2438  *
2439  * Return: The page on success or NULL if allocation fails.
2440  */
alloc_pages_mpol(gfp_t gfp,unsigned int order,struct mempolicy * pol,pgoff_t ilx,int nid)2441 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2442 		struct mempolicy *pol, pgoff_t ilx, int nid)
2443 {
2444 	nodemask_t *nodemask;
2445 	struct page *page;
2446 
2447 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2448 
2449 	if (pol->mode == MPOL_PREFERRED_MANY)
2450 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2451 
2452 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2453 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2454 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2455 		/*
2456 		 * For hugepage allocation and non-interleave policy which
2457 		 * allows the current node (or other explicitly preferred
2458 		 * node) we only try to allocate from the current/preferred
2459 		 * node and don't fall back to other nodes, as the cost of
2460 		 * remote accesses would likely offset THP benefits.
2461 		 *
2462 		 * If the policy is interleave or does not allow the current
2463 		 * node in its nodemask, we allocate the standard way.
2464 		 */
2465 		if (pol->mode != MPOL_INTERLEAVE &&
2466 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2467 		    (!nodemask || node_isset(nid, *nodemask))) {
2468 			/*
2469 			 * First, try to allocate THP only on local node, but
2470 			 * don't reclaim unnecessarily, just compact.
2471 			 */
2472 			page = __alloc_frozen_pages_noprof(
2473 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2474 				nid, NULL);
2475 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2476 				return page;
2477 			/*
2478 			 * If hugepage allocations are configured to always
2479 			 * synchronous compact or the vma has been madvised
2480 			 * to prefer hugepage backing, retry allowing remote
2481 			 * memory with both reclaim and compact as well.
2482 			 */
2483 		}
2484 	}
2485 
2486 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2487 
2488 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2489 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2490 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2491 		if (static_branch_likely(&vm_numa_stat_key) &&
2492 		    page_to_nid(page) == nid) {
2493 			preempt_disable();
2494 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2495 			preempt_enable();
2496 		}
2497 	}
2498 
2499 	return page;
2500 }
2501 
folio_alloc_mpol_noprof(gfp_t gfp,unsigned int order,struct mempolicy * pol,pgoff_t ilx,int nid)2502 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2503 		struct mempolicy *pol, pgoff_t ilx, int nid)
2504 {
2505 	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2506 			ilx, nid);
2507 	if (!page)
2508 		return NULL;
2509 
2510 	set_page_refcounted(page);
2511 	return page_rmappable_folio(page);
2512 }
2513 
2514 /**
2515  * vma_alloc_folio - Allocate a folio for a VMA.
2516  * @gfp: GFP flags.
2517  * @order: Order of the folio.
2518  * @vma: Pointer to VMA.
2519  * @addr: Virtual address of the allocation.  Must be inside @vma.
2520  *
2521  * Allocate a folio for a specific address in @vma, using the appropriate
2522  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2523  * VMA to prevent it from going away.  Should be used for all allocations
2524  * for folios that will be mapped into user space, excepting hugetlbfs, and
2525  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2526  *
2527  * Return: The folio on success or NULL if allocation fails.
2528  */
vma_alloc_folio_noprof(gfp_t gfp,int order,struct vm_area_struct * vma,unsigned long addr)2529 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2530 		unsigned long addr)
2531 {
2532 	struct mempolicy *pol;
2533 	pgoff_t ilx;
2534 	struct folio *folio;
2535 
2536 	if (vma->vm_flags & VM_DROPPABLE)
2537 		gfp |= __GFP_NOWARN;
2538 
2539 	pol = get_vma_policy(vma, addr, order, &ilx);
2540 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2541 	mpol_cond_put(pol);
2542 	return folio;
2543 }
2544 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2545 
alloc_frozen_pages_noprof(gfp_t gfp,unsigned order)2546 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2547 {
2548 	struct mempolicy *pol = &default_policy;
2549 
2550 	/*
2551 	 * No reference counting needed for current->mempolicy
2552 	 * nor system default_policy
2553 	 */
2554 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2555 		pol = get_task_policy(current);
2556 
2557 	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2558 				       numa_node_id());
2559 }
2560 
2561 /**
2562  * alloc_pages - Allocate pages.
2563  * @gfp: GFP flags.
2564  * @order: Power of two of number of pages to allocate.
2565  *
2566  * Allocate 1 << @order contiguous pages.  The physical address of the
2567  * first page is naturally aligned (eg an order-3 allocation will be aligned
2568  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2569  * process is honoured when in process context.
2570  *
2571  * Context: Can be called from any context, providing the appropriate GFP
2572  * flags are used.
2573  * Return: The page on success or NULL if allocation fails.
2574  */
alloc_pages_noprof(gfp_t gfp,unsigned int order)2575 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2576 {
2577 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2578 
2579 	if (page)
2580 		set_page_refcounted(page);
2581 	return page;
2582 }
2583 EXPORT_SYMBOL(alloc_pages_noprof);
2584 
folio_alloc_noprof(gfp_t gfp,unsigned int order)2585 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2586 {
2587 	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2588 }
2589 EXPORT_SYMBOL(folio_alloc_noprof);
2590 
alloc_pages_bulk_interleave(gfp_t gfp,struct mempolicy * pol,unsigned long nr_pages,struct page ** page_array)2591 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2592 		struct mempolicy *pol, unsigned long nr_pages,
2593 		struct page **page_array)
2594 {
2595 	int nodes;
2596 	unsigned long nr_pages_per_node;
2597 	int delta;
2598 	int i;
2599 	unsigned long nr_allocated;
2600 	unsigned long total_allocated = 0;
2601 
2602 	nodes = nodes_weight(pol->nodes);
2603 	nr_pages_per_node = nr_pages / nodes;
2604 	delta = nr_pages - nodes * nr_pages_per_node;
2605 
2606 	for (i = 0; i < nodes; i++) {
2607 		if (delta) {
2608 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2609 					interleave_nodes(pol), NULL,
2610 					nr_pages_per_node + 1,
2611 					page_array);
2612 			delta--;
2613 		} else {
2614 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2615 					interleave_nodes(pol), NULL,
2616 					nr_pages_per_node, page_array);
2617 		}
2618 
2619 		page_array += nr_allocated;
2620 		total_allocated += nr_allocated;
2621 	}
2622 
2623 	return total_allocated;
2624 }
2625 
alloc_pages_bulk_weighted_interleave(gfp_t gfp,struct mempolicy * pol,unsigned long nr_pages,struct page ** page_array)2626 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2627 		struct mempolicy *pol, unsigned long nr_pages,
2628 		struct page **page_array)
2629 {
2630 	struct weighted_interleave_state *state;
2631 	struct task_struct *me = current;
2632 	unsigned int cpuset_mems_cookie;
2633 	unsigned long total_allocated = 0;
2634 	unsigned long nr_allocated = 0;
2635 	unsigned long rounds;
2636 	unsigned long node_pages, delta;
2637 	u8 *weights, weight;
2638 	unsigned int weight_total = 0;
2639 	unsigned long rem_pages = nr_pages;
2640 	nodemask_t nodes;
2641 	int nnodes, node;
2642 	int resume_node = MAX_NUMNODES - 1;
2643 	u8 resume_weight = 0;
2644 	int prev_node;
2645 	int i;
2646 
2647 	if (!nr_pages)
2648 		return 0;
2649 
2650 	/* read the nodes onto the stack, retry if done during rebind */
2651 	do {
2652 		cpuset_mems_cookie = read_mems_allowed_begin();
2653 		nnodes = read_once_policy_nodemask(pol, &nodes);
2654 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2655 
2656 	/* if the nodemask has become invalid, we cannot do anything */
2657 	if (!nnodes)
2658 		return 0;
2659 
2660 	/* Continue allocating from most recent node and adjust the nr_pages */
2661 	node = me->il_prev;
2662 	weight = me->il_weight;
2663 	if (weight && node_isset(node, nodes)) {
2664 		node_pages = min(rem_pages, weight);
2665 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2666 						  page_array);
2667 		page_array += nr_allocated;
2668 		total_allocated += nr_allocated;
2669 		/* if that's all the pages, no need to interleave */
2670 		if (rem_pages <= weight) {
2671 			me->il_weight -= rem_pages;
2672 			return total_allocated;
2673 		}
2674 		/* Otherwise we adjust remaining pages, continue from there */
2675 		rem_pages -= weight;
2676 	}
2677 	/* clear active weight in case of an allocation failure */
2678 	me->il_weight = 0;
2679 	prev_node = node;
2680 
2681 	/* create a local copy of node weights to operate on outside rcu */
2682 	weights = kzalloc(nr_node_ids, GFP_KERNEL);
2683 	if (!weights)
2684 		return total_allocated;
2685 
2686 	rcu_read_lock();
2687 	state = rcu_dereference(wi_state);
2688 	if (state) {
2689 		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
2690 		rcu_read_unlock();
2691 	} else {
2692 		rcu_read_unlock();
2693 		for (i = 0; i < nr_node_ids; i++)
2694 			weights[i] = 1;
2695 	}
2696 
2697 	/* calculate total, detect system default usage */
2698 	for_each_node_mask(node, nodes)
2699 		weight_total += weights[node];
2700 
2701 	/*
2702 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2703 	 * Track which node weighted interleave should resume from.
2704 	 *
2705 	 * if (rounds > 0) and (delta == 0), resume_node will always be
2706 	 * the node following prev_node and its weight.
2707 	 */
2708 	rounds = rem_pages / weight_total;
2709 	delta = rem_pages % weight_total;
2710 	resume_node = next_node_in(prev_node, nodes);
2711 	resume_weight = weights[resume_node];
2712 	for (i = 0; i < nnodes; i++) {
2713 		node = next_node_in(prev_node, nodes);
2714 		weight = weights[node];
2715 		node_pages = weight * rounds;
2716 		/* If a delta exists, add this node's portion of the delta */
2717 		if (delta > weight) {
2718 			node_pages += weight;
2719 			delta -= weight;
2720 		} else if (delta) {
2721 			/* when delta is depleted, resume from that node */
2722 			node_pages += delta;
2723 			resume_node = node;
2724 			resume_weight = weight - delta;
2725 			delta = 0;
2726 		}
2727 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
2728 		if (!node_pages)
2729 			break;
2730 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2731 						  page_array);
2732 		page_array += nr_allocated;
2733 		total_allocated += nr_allocated;
2734 		if (total_allocated == nr_pages)
2735 			break;
2736 		prev_node = node;
2737 	}
2738 	me->il_prev = resume_node;
2739 	me->il_weight = resume_weight;
2740 	kfree(weights);
2741 	return total_allocated;
2742 }
2743 
alloc_pages_bulk_preferred_many(gfp_t gfp,int nid,struct mempolicy * pol,unsigned long nr_pages,struct page ** page_array)2744 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2745 		struct mempolicy *pol, unsigned long nr_pages,
2746 		struct page **page_array)
2747 {
2748 	gfp_t preferred_gfp;
2749 	unsigned long nr_allocated = 0;
2750 
2751 	preferred_gfp = gfp | __GFP_NOWARN;
2752 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2753 
2754 	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2755 					   nr_pages, page_array);
2756 
2757 	if (nr_allocated < nr_pages)
2758 		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2759 				nr_pages - nr_allocated,
2760 				page_array + nr_allocated);
2761 	return nr_allocated;
2762 }
2763 
2764 /* alloc pages bulk and mempolicy should be considered at the
2765  * same time in some situation such as vmalloc.
2766  *
2767  * It can accelerate memory allocation especially interleaving
2768  * allocate memory.
2769  */
alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,unsigned long nr_pages,struct page ** page_array)2770 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2771 		unsigned long nr_pages, struct page **page_array)
2772 {
2773 	struct mempolicy *pol = &default_policy;
2774 	nodemask_t *nodemask;
2775 	int nid;
2776 
2777 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2778 		pol = get_task_policy(current);
2779 
2780 	if (pol->mode == MPOL_INTERLEAVE)
2781 		return alloc_pages_bulk_interleave(gfp, pol,
2782 							 nr_pages, page_array);
2783 
2784 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2785 		return alloc_pages_bulk_weighted_interleave(
2786 				  gfp, pol, nr_pages, page_array);
2787 
2788 	if (pol->mode == MPOL_PREFERRED_MANY)
2789 		return alloc_pages_bulk_preferred_many(gfp,
2790 				numa_node_id(), pol, nr_pages, page_array);
2791 
2792 	nid = numa_node_id();
2793 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2794 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2795 				       nr_pages, page_array);
2796 }
2797 
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2798 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2799 {
2800 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2801 
2802 	if (IS_ERR(pol))
2803 		return PTR_ERR(pol);
2804 	dst->vm_policy = pol;
2805 	return 0;
2806 }
2807 
2808 /*
2809  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2810  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2811  * with the mems_allowed returned by cpuset_mems_allowed().  This
2812  * keeps mempolicies cpuset relative after its cpuset moves.  See
2813  * further kernel/cpuset.c update_nodemask().
2814  *
2815  * current's mempolicy may be rebinded by the other task(the task that changes
2816  * cpuset's mems), so we needn't do rebind work for current task.
2817  */
2818 
2819 /* Slow path of a mempolicy duplicate */
__mpol_dup(struct mempolicy * old)2820 struct mempolicy *__mpol_dup(struct mempolicy *old)
2821 {
2822 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2823 
2824 	if (!new)
2825 		return ERR_PTR(-ENOMEM);
2826 
2827 	/* task's mempolicy is protected by alloc_lock */
2828 	if (old == current->mempolicy) {
2829 		task_lock(current);
2830 		*new = *old;
2831 		task_unlock(current);
2832 	} else
2833 		*new = *old;
2834 
2835 	if (current_cpuset_is_being_rebound()) {
2836 		nodemask_t mems = cpuset_mems_allowed(current);
2837 		mpol_rebind_policy(new, &mems);
2838 	}
2839 	atomic_set(&new->refcnt, 1);
2840 	return new;
2841 }
2842 
2843 /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2844 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2845 {
2846 	if (!a || !b)
2847 		return false;
2848 	if (a->mode != b->mode)
2849 		return false;
2850 	if (a->flags != b->flags)
2851 		return false;
2852 	if (a->home_node != b->home_node)
2853 		return false;
2854 	if (mpol_store_user_nodemask(a))
2855 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2856 			return false;
2857 
2858 	switch (a->mode) {
2859 	case MPOL_BIND:
2860 	case MPOL_INTERLEAVE:
2861 	case MPOL_PREFERRED:
2862 	case MPOL_PREFERRED_MANY:
2863 	case MPOL_WEIGHTED_INTERLEAVE:
2864 		return !!nodes_equal(a->nodes, b->nodes);
2865 	case MPOL_LOCAL:
2866 		return true;
2867 	default:
2868 		BUG();
2869 		return false;
2870 	}
2871 }
2872 
2873 /*
2874  * Shared memory backing store policy support.
2875  *
2876  * Remember policies even when nobody has shared memory mapped.
2877  * The policies are kept in Red-Black tree linked from the inode.
2878  * They are protected by the sp->lock rwlock, which should be held
2879  * for any accesses to the tree.
2880  */
2881 
2882 /*
2883  * lookup first element intersecting start-end.  Caller holds sp->lock for
2884  * reading or for writing
2885  */
sp_lookup(struct shared_policy * sp,pgoff_t start,pgoff_t end)2886 static struct sp_node *sp_lookup(struct shared_policy *sp,
2887 					pgoff_t start, pgoff_t end)
2888 {
2889 	struct rb_node *n = sp->root.rb_node;
2890 
2891 	while (n) {
2892 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2893 
2894 		if (start >= p->end)
2895 			n = n->rb_right;
2896 		else if (end <= p->start)
2897 			n = n->rb_left;
2898 		else
2899 			break;
2900 	}
2901 	if (!n)
2902 		return NULL;
2903 	for (;;) {
2904 		struct sp_node *w = NULL;
2905 		struct rb_node *prev = rb_prev(n);
2906 		if (!prev)
2907 			break;
2908 		w = rb_entry(prev, struct sp_node, nd);
2909 		if (w->end <= start)
2910 			break;
2911 		n = prev;
2912 	}
2913 	return rb_entry(n, struct sp_node, nd);
2914 }
2915 
2916 /*
2917  * Insert a new shared policy into the list.  Caller holds sp->lock for
2918  * writing.
2919  */
sp_insert(struct shared_policy * sp,struct sp_node * new)2920 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2921 {
2922 	struct rb_node **p = &sp->root.rb_node;
2923 	struct rb_node *parent = NULL;
2924 	struct sp_node *nd;
2925 
2926 	while (*p) {
2927 		parent = *p;
2928 		nd = rb_entry(parent, struct sp_node, nd);
2929 		if (new->start < nd->start)
2930 			p = &(*p)->rb_left;
2931 		else if (new->end > nd->end)
2932 			p = &(*p)->rb_right;
2933 		else
2934 			BUG();
2935 	}
2936 	rb_link_node(&new->nd, parent, p);
2937 	rb_insert_color(&new->nd, &sp->root);
2938 }
2939 
2940 /* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy * sp,pgoff_t idx)2941 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2942 						pgoff_t idx)
2943 {
2944 	struct mempolicy *pol = NULL;
2945 	struct sp_node *sn;
2946 
2947 	if (!sp->root.rb_node)
2948 		return NULL;
2949 	read_lock(&sp->lock);
2950 	sn = sp_lookup(sp, idx, idx+1);
2951 	if (sn) {
2952 		mpol_get(sn->policy);
2953 		pol = sn->policy;
2954 	}
2955 	read_unlock(&sp->lock);
2956 	return pol;
2957 }
2958 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2959 
sp_free(struct sp_node * n)2960 static void sp_free(struct sp_node *n)
2961 {
2962 	mpol_put(n->policy);
2963 	kmem_cache_free(sn_cache, n);
2964 }
2965 
2966 /**
2967  * mpol_misplaced - check whether current folio node is valid in policy
2968  *
2969  * @folio: folio to be checked
2970  * @vmf: structure describing the fault
2971  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2972  *
2973  * Lookup current policy node id for vma,addr and "compare to" folio's
2974  * node id.  Policy determination "mimics" alloc_page_vma().
2975  * Called from fault path where we know the vma and faulting address.
2976  *
2977  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2978  * policy, or a suitable node ID to allocate a replacement folio from.
2979  */
mpol_misplaced(struct folio * folio,struct vm_fault * vmf,unsigned long addr)2980 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2981 		   unsigned long addr)
2982 {
2983 	struct mempolicy *pol;
2984 	pgoff_t ilx;
2985 	struct zoneref *z;
2986 	int curnid = folio_nid(folio);
2987 	struct vm_area_struct *vma = vmf->vma;
2988 	int thiscpu = raw_smp_processor_id();
2989 	int thisnid = numa_node_id();
2990 	int polnid = NUMA_NO_NODE;
2991 	int ret = NUMA_NO_NODE;
2992 
2993 	/*
2994 	 * Make sure ptl is held so that we don't preempt and we
2995 	 * have a stable smp processor id
2996 	 */
2997 	lockdep_assert_held(vmf->ptl);
2998 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2999 	if (!(pol->flags & MPOL_F_MOF))
3000 		goto out;
3001 
3002 	switch (pol->mode) {
3003 	case MPOL_INTERLEAVE:
3004 		polnid = interleave_nid(pol, ilx);
3005 		break;
3006 
3007 	case MPOL_WEIGHTED_INTERLEAVE:
3008 		polnid = weighted_interleave_nid(pol, ilx);
3009 		break;
3010 
3011 	case MPOL_PREFERRED:
3012 		if (node_isset(curnid, pol->nodes))
3013 			goto out;
3014 		polnid = first_node(pol->nodes);
3015 		break;
3016 
3017 	case MPOL_LOCAL:
3018 		polnid = numa_node_id();
3019 		break;
3020 
3021 	case MPOL_BIND:
3022 	case MPOL_PREFERRED_MANY:
3023 		/*
3024 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
3025 		 * policy nodemask we don't allow numa migration to nodes
3026 		 * outside policy nodemask for now. This is done so that if we
3027 		 * want demotion to slow memory to happen, before allocating
3028 		 * from some DRAM node say 'x', we will end up using a
3029 		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
3030 		 * we should not promote to node 'x' from slow memory node.
3031 		 */
3032 		if (pol->flags & MPOL_F_MORON) {
3033 			/*
3034 			 * Optimize placement among multiple nodes
3035 			 * via NUMA balancing
3036 			 */
3037 			if (node_isset(thisnid, pol->nodes))
3038 				break;
3039 			goto out;
3040 		}
3041 
3042 		/*
3043 		 * use current page if in policy nodemask,
3044 		 * else select nearest allowed node, if any.
3045 		 * If no allowed nodes, use current [!misplaced].
3046 		 */
3047 		if (node_isset(curnid, pol->nodes))
3048 			goto out;
3049 		z = first_zones_zonelist(
3050 				node_zonelist(thisnid, GFP_HIGHUSER),
3051 				gfp_zone(GFP_HIGHUSER),
3052 				&pol->nodes);
3053 		polnid = zonelist_node_idx(z);
3054 		break;
3055 
3056 	default:
3057 		BUG();
3058 	}
3059 
3060 	/* Migrate the folio towards the node whose CPU is referencing it */
3061 	if (pol->flags & MPOL_F_MORON) {
3062 		polnid = thisnid;
3063 
3064 		if (!should_numa_migrate_memory(current, folio, curnid,
3065 						thiscpu))
3066 			goto out;
3067 	}
3068 
3069 	if (curnid != polnid)
3070 		ret = polnid;
3071 out:
3072 	mpol_cond_put(pol);
3073 
3074 	return ret;
3075 }
3076 
3077 /*
3078  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
3079  * dropped after task->mempolicy is set to NULL so that any allocation done as
3080  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3081  * policy.
3082  */
mpol_put_task_policy(struct task_struct * task)3083 void mpol_put_task_policy(struct task_struct *task)
3084 {
3085 	struct mempolicy *pol;
3086 
3087 	task_lock(task);
3088 	pol = task->mempolicy;
3089 	task->mempolicy = NULL;
3090 	task_unlock(task);
3091 	mpol_put(pol);
3092 }
3093 
sp_delete(struct shared_policy * sp,struct sp_node * n)3094 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
3095 {
3096 	rb_erase(&n->nd, &sp->root);
3097 	sp_free(n);
3098 }
3099 
sp_node_init(struct sp_node * node,unsigned long start,unsigned long end,struct mempolicy * pol)3100 static void sp_node_init(struct sp_node *node, unsigned long start,
3101 			unsigned long end, struct mempolicy *pol)
3102 {
3103 	node->start = start;
3104 	node->end = end;
3105 	node->policy = pol;
3106 }
3107 
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3108 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3109 				struct mempolicy *pol)
3110 {
3111 	struct sp_node *n;
3112 	struct mempolicy *newpol;
3113 
3114 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3115 	if (!n)
3116 		return NULL;
3117 
3118 	newpol = mpol_dup(pol);
3119 	if (IS_ERR(newpol)) {
3120 		kmem_cache_free(sn_cache, n);
3121 		return NULL;
3122 	}
3123 	newpol->flags |= MPOL_F_SHARED;
3124 	sp_node_init(n, start, end, newpol);
3125 
3126 	return n;
3127 }
3128 
3129 /* Replace a policy range. */
shared_policy_replace(struct shared_policy * sp,pgoff_t start,pgoff_t end,struct sp_node * new)3130 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
3131 				 pgoff_t end, struct sp_node *new)
3132 {
3133 	struct sp_node *n;
3134 	struct sp_node *n_new = NULL;
3135 	struct mempolicy *mpol_new = NULL;
3136 	int ret = 0;
3137 
3138 restart:
3139 	write_lock(&sp->lock);
3140 	n = sp_lookup(sp, start, end);
3141 	/* Take care of old policies in the same range. */
3142 	while (n && n->start < end) {
3143 		struct rb_node *next = rb_next(&n->nd);
3144 		if (n->start >= start) {
3145 			if (n->end <= end)
3146 				sp_delete(sp, n);
3147 			else
3148 				n->start = end;
3149 		} else {
3150 			/* Old policy spanning whole new range. */
3151 			if (n->end > end) {
3152 				if (!n_new)
3153 					goto alloc_new;
3154 
3155 				*mpol_new = *n->policy;
3156 				atomic_set(&mpol_new->refcnt, 1);
3157 				sp_node_init(n_new, end, n->end, mpol_new);
3158 				n->end = start;
3159 				sp_insert(sp, n_new);
3160 				n_new = NULL;
3161 				mpol_new = NULL;
3162 				break;
3163 			} else
3164 				n->end = start;
3165 		}
3166 		if (!next)
3167 			break;
3168 		n = rb_entry(next, struct sp_node, nd);
3169 	}
3170 	if (new)
3171 		sp_insert(sp, new);
3172 	write_unlock(&sp->lock);
3173 	ret = 0;
3174 
3175 err_out:
3176 	if (mpol_new)
3177 		mpol_put(mpol_new);
3178 	if (n_new)
3179 		kmem_cache_free(sn_cache, n_new);
3180 
3181 	return ret;
3182 
3183 alloc_new:
3184 	write_unlock(&sp->lock);
3185 	ret = -ENOMEM;
3186 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3187 	if (!n_new)
3188 		goto err_out;
3189 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3190 	if (!mpol_new)
3191 		goto err_out;
3192 	atomic_set(&mpol_new->refcnt, 1);
3193 	goto restart;
3194 }
3195 
3196 /**
3197  * mpol_shared_policy_init - initialize shared policy for inode
3198  * @sp: pointer to inode shared policy
3199  * @mpol:  struct mempolicy to install
3200  *
3201  * Install non-NULL @mpol in inode's shared policy rb-tree.
3202  * On entry, the current task has a reference on a non-NULL @mpol.
3203  * This must be released on exit.
3204  * This is called at get_inode() calls and we can use GFP_KERNEL.
3205  */
mpol_shared_policy_init(struct shared_policy * sp,struct mempolicy * mpol)3206 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3207 {
3208 	int ret;
3209 
3210 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3211 	rwlock_init(&sp->lock);
3212 
3213 	if (mpol) {
3214 		struct sp_node *sn;
3215 		struct mempolicy *npol;
3216 		NODEMASK_SCRATCH(scratch);
3217 
3218 		if (!scratch)
3219 			goto put_mpol;
3220 
3221 		/* contextualize the tmpfs mount point mempolicy to this file */
3222 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3223 		if (IS_ERR(npol))
3224 			goto free_scratch; /* no valid nodemask intersection */
3225 
3226 		task_lock(current);
3227 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3228 		task_unlock(current);
3229 		if (ret)
3230 			goto put_npol;
3231 
3232 		/* alloc node covering entire file; adds ref to file's npol */
3233 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3234 		if (sn)
3235 			sp_insert(sp, sn);
3236 put_npol:
3237 		mpol_put(npol);	/* drop initial ref on file's npol */
3238 free_scratch:
3239 		NODEMASK_SCRATCH_FREE(scratch);
3240 put_mpol:
3241 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3242 	}
3243 }
3244 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3245 
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3246 int mpol_set_shared_policy(struct shared_policy *sp,
3247 			struct vm_area_struct *vma, struct mempolicy *pol)
3248 {
3249 	int err;
3250 	struct sp_node *new = NULL;
3251 	unsigned long sz = vma_pages(vma);
3252 
3253 	if (pol) {
3254 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3255 		if (!new)
3256 			return -ENOMEM;
3257 	}
3258 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3259 	if (err && new)
3260 		sp_free(new);
3261 	return err;
3262 }
3263 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3264 
3265 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3266 void mpol_free_shared_policy(struct shared_policy *sp)
3267 {
3268 	struct sp_node *n;
3269 	struct rb_node *next;
3270 
3271 	if (!sp->root.rb_node)
3272 		return;
3273 	write_lock(&sp->lock);
3274 	next = rb_first(&sp->root);
3275 	while (next) {
3276 		n = rb_entry(next, struct sp_node, nd);
3277 		next = rb_next(&n->nd);
3278 		sp_delete(sp, n);
3279 	}
3280 	write_unlock(&sp->lock);
3281 }
3282 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3283 
3284 #ifdef CONFIG_NUMA_BALANCING
3285 static int __initdata numabalancing_override;
3286 
check_numabalancing_enable(void)3287 static void __init check_numabalancing_enable(void)
3288 {
3289 	bool numabalancing_default = false;
3290 
3291 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3292 		numabalancing_default = true;
3293 
3294 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3295 	if (numabalancing_override)
3296 		set_numabalancing_state(numabalancing_override == 1);
3297 
3298 	if (num_online_nodes() > 1 && !numabalancing_override) {
3299 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3300 			numabalancing_default ? "Enabling" : "Disabling");
3301 		set_numabalancing_state(numabalancing_default);
3302 	}
3303 }
3304 
setup_numabalancing(char * str)3305 static int __init setup_numabalancing(char *str)
3306 {
3307 	int ret = 0;
3308 	if (!str)
3309 		goto out;
3310 
3311 	if (!strcmp(str, "enable")) {
3312 		numabalancing_override = 1;
3313 		ret = 1;
3314 	} else if (!strcmp(str, "disable")) {
3315 		numabalancing_override = -1;
3316 		ret = 1;
3317 	}
3318 out:
3319 	if (!ret)
3320 		pr_warn("Unable to parse numa_balancing=\n");
3321 
3322 	return ret;
3323 }
3324 __setup("numa_balancing=", setup_numabalancing);
3325 #else
check_numabalancing_enable(void)3326 static inline void __init check_numabalancing_enable(void)
3327 {
3328 }
3329 #endif /* CONFIG_NUMA_BALANCING */
3330 
numa_policy_init(void)3331 void __init numa_policy_init(void)
3332 {
3333 	nodemask_t interleave_nodes;
3334 	unsigned long largest = 0;
3335 	int nid, prefer = 0;
3336 
3337 	policy_cache = kmem_cache_create("numa_policy",
3338 					 sizeof(struct mempolicy),
3339 					 0, SLAB_PANIC, NULL);
3340 
3341 	sn_cache = kmem_cache_create("shared_policy_node",
3342 				     sizeof(struct sp_node),
3343 				     0, SLAB_PANIC, NULL);
3344 
3345 	for_each_node(nid) {
3346 		preferred_node_policy[nid] = (struct mempolicy) {
3347 			.refcnt = ATOMIC_INIT(1),
3348 			.mode = MPOL_PREFERRED,
3349 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3350 			.nodes = nodemask_of_node(nid),
3351 		};
3352 	}
3353 
3354 	/*
3355 	 * Set interleaving policy for system init. Interleaving is only
3356 	 * enabled across suitably sized nodes (default is >= 16MB), or
3357 	 * fall back to the largest node if they're all smaller.
3358 	 */
3359 	nodes_clear(interleave_nodes);
3360 	for_each_node_state(nid, N_MEMORY) {
3361 		unsigned long total_pages = node_present_pages(nid);
3362 
3363 		/* Preserve the largest node */
3364 		if (largest < total_pages) {
3365 			largest = total_pages;
3366 			prefer = nid;
3367 		}
3368 
3369 		/* Interleave this node? */
3370 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3371 			node_set(nid, interleave_nodes);
3372 	}
3373 
3374 	/* All too small, use the largest */
3375 	if (unlikely(nodes_empty(interleave_nodes)))
3376 		node_set(prefer, interleave_nodes);
3377 
3378 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3379 		pr_err("%s: interleaving failed\n", __func__);
3380 
3381 	check_numabalancing_enable();
3382 }
3383 
3384 /* Reset policy of current process to default */
numa_default_policy(void)3385 void numa_default_policy(void)
3386 {
3387 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3388 }
3389 
3390 /*
3391  * Parse and format mempolicy from/to strings
3392  */
3393 static const char * const policy_modes[] =
3394 {
3395 	[MPOL_DEFAULT]    = "default",
3396 	[MPOL_PREFERRED]  = "prefer",
3397 	[MPOL_BIND]       = "bind",
3398 	[MPOL_INTERLEAVE] = "interleave",
3399 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3400 	[MPOL_LOCAL]      = "local",
3401 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
3402 };
3403 
3404 #ifdef CONFIG_TMPFS
3405 /**
3406  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3407  * @str:  string containing mempolicy to parse
3408  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3409  *
3410  * Format of input:
3411  *	<mode>[=<flags>][:<nodelist>]
3412  *
3413  * Return: %0 on success, else %1
3414  */
mpol_parse_str(char * str,struct mempolicy ** mpol)3415 int mpol_parse_str(char *str, struct mempolicy **mpol)
3416 {
3417 	struct mempolicy *new = NULL;
3418 	unsigned short mode_flags;
3419 	nodemask_t nodes;
3420 	char *nodelist = strchr(str, ':');
3421 	char *flags = strchr(str, '=');
3422 	int err = 1, mode;
3423 
3424 	if (flags)
3425 		*flags++ = '\0';	/* terminate mode string */
3426 
3427 	if (nodelist) {
3428 		/* NUL-terminate mode or flags string */
3429 		*nodelist++ = '\0';
3430 		if (nodelist_parse(nodelist, nodes))
3431 			goto out;
3432 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3433 			goto out;
3434 	} else
3435 		nodes_clear(nodes);
3436 
3437 	mode = match_string(policy_modes, MPOL_MAX, str);
3438 	if (mode < 0)
3439 		goto out;
3440 
3441 	switch (mode) {
3442 	case MPOL_PREFERRED:
3443 		/*
3444 		 * Insist on a nodelist of one node only, although later
3445 		 * we use first_node(nodes) to grab a single node, so here
3446 		 * nodelist (or nodes) cannot be empty.
3447 		 */
3448 		if (nodelist) {
3449 			char *rest = nodelist;
3450 			while (isdigit(*rest))
3451 				rest++;
3452 			if (*rest)
3453 				goto out;
3454 			if (nodes_empty(nodes))
3455 				goto out;
3456 		}
3457 		break;
3458 	case MPOL_INTERLEAVE:
3459 	case MPOL_WEIGHTED_INTERLEAVE:
3460 		/*
3461 		 * Default to online nodes with memory if no nodelist
3462 		 */
3463 		if (!nodelist)
3464 			nodes = node_states[N_MEMORY];
3465 		break;
3466 	case MPOL_LOCAL:
3467 		/*
3468 		 * Don't allow a nodelist;  mpol_new() checks flags
3469 		 */
3470 		if (nodelist)
3471 			goto out;
3472 		break;
3473 	case MPOL_DEFAULT:
3474 		/*
3475 		 * Insist on a empty nodelist
3476 		 */
3477 		if (!nodelist)
3478 			err = 0;
3479 		goto out;
3480 	case MPOL_PREFERRED_MANY:
3481 	case MPOL_BIND:
3482 		/*
3483 		 * Insist on a nodelist
3484 		 */
3485 		if (!nodelist)
3486 			goto out;
3487 	}
3488 
3489 	mode_flags = 0;
3490 	if (flags) {
3491 		/*
3492 		 * Currently, we only support two mutually exclusive
3493 		 * mode flags.
3494 		 */
3495 		if (!strcmp(flags, "static"))
3496 			mode_flags |= MPOL_F_STATIC_NODES;
3497 		else if (!strcmp(flags, "relative"))
3498 			mode_flags |= MPOL_F_RELATIVE_NODES;
3499 		else
3500 			goto out;
3501 	}
3502 
3503 	new = mpol_new(mode, mode_flags, &nodes);
3504 	if (IS_ERR(new))
3505 		goto out;
3506 
3507 	/*
3508 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3509 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3510 	 */
3511 	if (mode != MPOL_PREFERRED) {
3512 		new->nodes = nodes;
3513 	} else if (nodelist) {
3514 		nodes_clear(new->nodes);
3515 		node_set(first_node(nodes), new->nodes);
3516 	} else {
3517 		new->mode = MPOL_LOCAL;
3518 	}
3519 
3520 	/*
3521 	 * Save nodes for contextualization: this will be used to "clone"
3522 	 * the mempolicy in a specific context [cpuset] at a later time.
3523 	 */
3524 	new->w.user_nodemask = nodes;
3525 
3526 	err = 0;
3527 
3528 out:
3529 	/* Restore string for error message */
3530 	if (nodelist)
3531 		*--nodelist = ':';
3532 	if (flags)
3533 		*--flags = '=';
3534 	if (!err)
3535 		*mpol = new;
3536 	return err;
3537 }
3538 #endif /* CONFIG_TMPFS */
3539 
3540 /**
3541  * mpol_to_str - format a mempolicy structure for printing
3542  * @buffer:  to contain formatted mempolicy string
3543  * @maxlen:  length of @buffer
3544  * @pol:  pointer to mempolicy to be formatted
3545  *
3546  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3547  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3548  * interleave", plus the longest flag flags, "relative|balancing", and to
3549  * display at least a few node ids.
3550  */
mpol_to_str(char * buffer,int maxlen,struct mempolicy * pol)3551 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3552 {
3553 	char *p = buffer;
3554 	nodemask_t nodes = NODE_MASK_NONE;
3555 	unsigned short mode = MPOL_DEFAULT;
3556 	unsigned short flags = 0;
3557 
3558 	if (pol &&
3559 	    pol != &default_policy &&
3560 	    !(pol >= &preferred_node_policy[0] &&
3561 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3562 		mode = pol->mode;
3563 		flags = pol->flags;
3564 	}
3565 
3566 	switch (mode) {
3567 	case MPOL_DEFAULT:
3568 	case MPOL_LOCAL:
3569 		break;
3570 	case MPOL_PREFERRED:
3571 	case MPOL_PREFERRED_MANY:
3572 	case MPOL_BIND:
3573 	case MPOL_INTERLEAVE:
3574 	case MPOL_WEIGHTED_INTERLEAVE:
3575 		nodes = pol->nodes;
3576 		break;
3577 	default:
3578 		WARN_ON_ONCE(1);
3579 		snprintf(p, maxlen, "unknown");
3580 		return;
3581 	}
3582 
3583 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3584 
3585 	if (flags & MPOL_MODE_FLAGS) {
3586 		p += snprintf(p, buffer + maxlen - p, "=");
3587 
3588 		/*
3589 		 * Static and relative are mutually exclusive.
3590 		 */
3591 		if (flags & MPOL_F_STATIC_NODES)
3592 			p += snprintf(p, buffer + maxlen - p, "static");
3593 		else if (flags & MPOL_F_RELATIVE_NODES)
3594 			p += snprintf(p, buffer + maxlen - p, "relative");
3595 
3596 		if (flags & MPOL_F_NUMA_BALANCING) {
3597 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3598 				p += snprintf(p, buffer + maxlen - p, "|");
3599 			p += snprintf(p, buffer + maxlen - p, "balancing");
3600 		}
3601 	}
3602 
3603 	if (!nodes_empty(nodes))
3604 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3605 			       nodemask_pr_args(&nodes));
3606 }
3607 
3608 #ifdef CONFIG_SYSFS
3609 struct iw_node_attr {
3610 	struct kobj_attribute kobj_attr;
3611 	int nid;
3612 };
3613 
3614 struct sysfs_wi_group {
3615 	struct kobject wi_kobj;
3616 	struct mutex kobj_lock;
3617 	struct iw_node_attr *nattrs[];
3618 };
3619 
3620 static struct sysfs_wi_group *wi_group;
3621 
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3622 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3623 			 char *buf)
3624 {
3625 	struct iw_node_attr *node_attr;
3626 	u8 weight;
3627 
3628 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3629 	weight = get_il_weight(node_attr->nid);
3630 	return sysfs_emit(buf, "%d\n", weight);
3631 }
3632 
node_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3633 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3634 			  const char *buf, size_t count)
3635 {
3636 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3637 	struct iw_node_attr *node_attr;
3638 	u8 weight = 0;
3639 	int i;
3640 
3641 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3642 	if (count == 0 || sysfs_streq(buf, "") ||
3643 	    kstrtou8(buf, 0, &weight) || weight == 0)
3644 		return -EINVAL;
3645 
3646 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3647 			       GFP_KERNEL);
3648 	if (!new_wi_state)
3649 		return -ENOMEM;
3650 
3651 	mutex_lock(&wi_state_lock);
3652 	old_wi_state = rcu_dereference_protected(wi_state,
3653 					lockdep_is_held(&wi_state_lock));
3654 	if (old_wi_state) {
3655 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3656 					nr_node_ids * sizeof(u8));
3657 	} else {
3658 		for (i = 0; i < nr_node_ids; i++)
3659 			new_wi_state->iw_table[i] = 1;
3660 	}
3661 	new_wi_state->iw_table[node_attr->nid] = weight;
3662 	new_wi_state->mode_auto = false;
3663 
3664 	rcu_assign_pointer(wi_state, new_wi_state);
3665 	mutex_unlock(&wi_state_lock);
3666 	if (old_wi_state) {
3667 		synchronize_rcu();
3668 		kfree(old_wi_state);
3669 	}
3670 	return count;
3671 }
3672 
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3673 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3674 		struct kobj_attribute *attr, char *buf)
3675 {
3676 	struct weighted_interleave_state *state;
3677 	bool wi_auto = true;
3678 
3679 	rcu_read_lock();
3680 	state = rcu_dereference(wi_state);
3681 	if (state)
3682 		wi_auto = state->mode_auto;
3683 	rcu_read_unlock();
3684 
3685 	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3686 }
3687 
weighted_interleave_auto_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3688 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3689 		struct kobj_attribute *attr, const char *buf, size_t count)
3690 {
3691 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3692 	unsigned int *bw;
3693 	bool input;
3694 	int i;
3695 
3696 	if (kstrtobool(buf, &input))
3697 		return -EINVAL;
3698 
3699 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3700 			       GFP_KERNEL);
3701 	if (!new_wi_state)
3702 		return -ENOMEM;
3703 	for (i = 0; i < nr_node_ids; i++)
3704 		new_wi_state->iw_table[i] = 1;
3705 
3706 	mutex_lock(&wi_state_lock);
3707 	if (!input) {
3708 		old_wi_state = rcu_dereference_protected(wi_state,
3709 					lockdep_is_held(&wi_state_lock));
3710 		if (!old_wi_state)
3711 			goto update_wi_state;
3712 		if (input == old_wi_state->mode_auto) {
3713 			mutex_unlock(&wi_state_lock);
3714 			return count;
3715 		}
3716 
3717 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3718 					       nr_node_ids * sizeof(u8));
3719 		goto update_wi_state;
3720 	}
3721 
3722 	bw = node_bw_table;
3723 	if (!bw) {
3724 		mutex_unlock(&wi_state_lock);
3725 		kfree(new_wi_state);
3726 		return -ENODEV;
3727 	}
3728 
3729 	new_wi_state->mode_auto = true;
3730 	reduce_interleave_weights(bw, new_wi_state->iw_table);
3731 
3732 update_wi_state:
3733 	rcu_assign_pointer(wi_state, new_wi_state);
3734 	mutex_unlock(&wi_state_lock);
3735 	if (old_wi_state) {
3736 		synchronize_rcu();
3737 		kfree(old_wi_state);
3738 	}
3739 	return count;
3740 }
3741 
sysfs_wi_node_delete(int nid)3742 static void sysfs_wi_node_delete(int nid)
3743 {
3744 	struct iw_node_attr *attr;
3745 
3746 	if (nid < 0 || nid >= nr_node_ids)
3747 		return;
3748 
3749 	mutex_lock(&wi_group->kobj_lock);
3750 	attr = wi_group->nattrs[nid];
3751 	if (!attr) {
3752 		mutex_unlock(&wi_group->kobj_lock);
3753 		return;
3754 	}
3755 
3756 	wi_group->nattrs[nid] = NULL;
3757 	mutex_unlock(&wi_group->kobj_lock);
3758 
3759 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3760 	kfree(attr->kobj_attr.attr.name);
3761 	kfree(attr);
3762 }
3763 
sysfs_wi_node_delete_all(void)3764 static void sysfs_wi_node_delete_all(void)
3765 {
3766 	int nid;
3767 
3768 	for (nid = 0; nid < nr_node_ids; nid++)
3769 		sysfs_wi_node_delete(nid);
3770 }
3771 
wi_state_free(void)3772 static void wi_state_free(void)
3773 {
3774 	struct weighted_interleave_state *old_wi_state;
3775 
3776 	mutex_lock(&wi_state_lock);
3777 	old_wi_state = rcu_dereference_protected(wi_state,
3778 			lockdep_is_held(&wi_state_lock));
3779 	rcu_assign_pointer(wi_state, NULL);
3780 	mutex_unlock(&wi_state_lock);
3781 
3782 	if (old_wi_state) {
3783 		synchronize_rcu();
3784 		kfree(old_wi_state);
3785 	}
3786 }
3787 
3788 static struct kobj_attribute wi_auto_attr =
3789 	__ATTR(auto, 0664, weighted_interleave_auto_show,
3790 			   weighted_interleave_auto_store);
3791 
wi_cleanup(void)3792 static void wi_cleanup(void) {
3793 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3794 	sysfs_wi_node_delete_all();
3795 	wi_state_free();
3796 }
3797 
wi_kobj_release(struct kobject * wi_kobj)3798 static void wi_kobj_release(struct kobject *wi_kobj)
3799 {
3800 	kfree(wi_group);
3801 }
3802 
3803 static const struct kobj_type wi_ktype = {
3804 	.sysfs_ops = &kobj_sysfs_ops,
3805 	.release = wi_kobj_release,
3806 };
3807 
sysfs_wi_node_add(int nid)3808 static int sysfs_wi_node_add(int nid)
3809 {
3810 	int ret;
3811 	char *name;
3812 	struct iw_node_attr *new_attr;
3813 
3814 	if (nid < 0 || nid >= nr_node_ids) {
3815 		pr_err("invalid node id: %d\n", nid);
3816 		return -EINVAL;
3817 	}
3818 
3819 	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3820 	if (!new_attr)
3821 		return -ENOMEM;
3822 
3823 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3824 	if (!name) {
3825 		kfree(new_attr);
3826 		return -ENOMEM;
3827 	}
3828 
3829 	sysfs_attr_init(&new_attr->kobj_attr.attr);
3830 	new_attr->kobj_attr.attr.name = name;
3831 	new_attr->kobj_attr.attr.mode = 0644;
3832 	new_attr->kobj_attr.show = node_show;
3833 	new_attr->kobj_attr.store = node_store;
3834 	new_attr->nid = nid;
3835 
3836 	mutex_lock(&wi_group->kobj_lock);
3837 	if (wi_group->nattrs[nid]) {
3838 		mutex_unlock(&wi_group->kobj_lock);
3839 		ret = -EEXIST;
3840 		goto out;
3841 	}
3842 
3843 	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3844 	if (ret) {
3845 		mutex_unlock(&wi_group->kobj_lock);
3846 		goto out;
3847 	}
3848 	wi_group->nattrs[nid] = new_attr;
3849 	mutex_unlock(&wi_group->kobj_lock);
3850 	return 0;
3851 
3852 out:
3853 	kfree(new_attr->kobj_attr.attr.name);
3854 	kfree(new_attr);
3855 	return ret;
3856 }
3857 
wi_node_notifier(struct notifier_block * nb,unsigned long action,void * data)3858 static int wi_node_notifier(struct notifier_block *nb,
3859 			       unsigned long action, void *data)
3860 {
3861 	int err;
3862 	struct node_notify *nn = data;
3863 	int nid = nn->nid;
3864 
3865 	switch (action) {
3866 	case NODE_ADDED_FIRST_MEMORY:
3867 		err = sysfs_wi_node_add(nid);
3868 		if (err)
3869 			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3870 			       nid, err);
3871 		break;
3872 	case NODE_REMOVED_LAST_MEMORY:
3873 		sysfs_wi_node_delete(nid);
3874 		break;
3875 	}
3876 
3877 	return NOTIFY_OK;
3878 }
3879 
add_weighted_interleave_group(struct kobject * mempolicy_kobj)3880 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3881 {
3882 	int nid, err;
3883 
3884 	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
3885 			   GFP_KERNEL);
3886 	if (!wi_group)
3887 		return -ENOMEM;
3888 	mutex_init(&wi_group->kobj_lock);
3889 
3890 	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
3891 				   "weighted_interleave");
3892 	if (err)
3893 		goto err_put_kobj;
3894 
3895 	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3896 	if (err)
3897 		goto err_put_kobj;
3898 
3899 	for_each_online_node(nid) {
3900 		if (!node_state(nid, N_MEMORY))
3901 			continue;
3902 
3903 		err = sysfs_wi_node_add(nid);
3904 		if (err) {
3905 			pr_err("failed to add sysfs for node%d during init: %d\n",
3906 			       nid, err);
3907 			goto err_cleanup_kobj;
3908 		}
3909 	}
3910 
3911 	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3912 	return 0;
3913 
3914 err_cleanup_kobj:
3915 	wi_cleanup();
3916 	kobject_del(&wi_group->wi_kobj);
3917 err_put_kobj:
3918 	kobject_put(&wi_group->wi_kobj);
3919 	return err;
3920 }
3921 
mempolicy_sysfs_init(void)3922 static int __init mempolicy_sysfs_init(void)
3923 {
3924 	int err;
3925 	static struct kobject *mempolicy_kobj;
3926 
3927 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3928 	if (!mempolicy_kobj)
3929 		return -ENOMEM;
3930 
3931 	err = add_weighted_interleave_group(mempolicy_kobj);
3932 	if (err)
3933 		goto err_kobj;
3934 
3935 	return 0;
3936 
3937 err_kobj:
3938 	kobject_del(mempolicy_kobj);
3939 	kobject_put(mempolicy_kobj);
3940 	return err;
3941 }
3942 
3943 late_initcall(mempolicy_sysfs_init);
3944 #endif /* CONFIG_SYSFS */
3945