xref: /linux/mm/mempolicy.c (revision 32a92f8c89326985e05dce8b22d3f0aa07a3e1bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem, kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always grateful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/sysctl.h>
89 #include <linux/sched/task.h>
90 #include <linux/nodemask.h>
91 #include <linux/cpuset.h>
92 #include <linux/slab.h>
93 #include <linux/string.h>
94 #include <linux/export.h>
95 #include <linux/nsproxy.h>
96 #include <linux/interrupt.h>
97 #include <linux/init.h>
98 #include <linux/compat.h>
99 #include <linux/ptrace.h>
100 #include <linux/swap.h>
101 #include <linux/seq_file.h>
102 #include <linux/proc_fs.h>
103 #include <linux/memory-tiers.h>
104 #include <linux/migrate.h>
105 #include <linux/ksm.h>
106 #include <linux/rmap.h>
107 #include <linux/security.h>
108 #include <linux/syscalls.h>
109 #include <linux/ctype.h>
110 #include <linux/mm_inline.h>
111 #include <linux/mmu_notifier.h>
112 #include <linux/printk.h>
113 #include <linux/leafops.h>
114 #include <linux/gcd.h>
115 
116 #include <asm/tlbflush.h>
117 #include <asm/tlb.h>
118 #include <linux/uaccess.h>
119 #include <linux/memory.h>
120 
121 #include "internal.h"
122 
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;	/* backs struct mempolicy allocations */
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;
134 
/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/* Per-node fallback policies consulted by get_task_policy(). */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;
152 
/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	bool mode_auto;		/* true: weights are derived from bandwidth data */
	u8 iw_table[];		/* per-node interleave weights, nr_node_ids entries */
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);
169 
get_il_weight(int node)170 static u8 get_il_weight(int node)
171 {
172 	struct weighted_interleave_state *state;
173 	u8 weight = 1;
174 
175 	rcu_read_lock();
176 	state = rcu_dereference(wi_state);
177 	if (state)
178 		weight = state->iw_table[node];
179 	rcu_read_unlock();
180 	return weight;
181 }
182 
183 /*
184  * Convert bandwidth values into weighted interleave weights.
185  * Call with wi_state_lock.
186  */
/*
 * Convert raw bandwidth values into per-node interleave weights.
 * Caller must hold wi_state_lock (this writes the table consumed by
 * readers of wi_state).
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			new_iw[nid] = 1;
		}
		/* Seed the GCD with the first weight, then fold in the rest. */
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}
219 
/*
 * Record measured bandwidth for @node and, in "auto" mode, recompute the
 * weighted-interleave weight table from the updated bandwidth data.
 * Returns 0 on success or -ENOMEM.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	/* A node is only as usable as the smaller of its two directions. */
	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		/* Wait out readers of the old state before freeing it. */
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	/* Free the bandwidth table we replaced (NULL on the first call). */
	kfree(old_bw);
	return 0;
}
276 
277 /**
278  * numa_nearest_node - Find nearest node by state
279  * @node: Node id to start the search
280  * @state: State to filter the search
281  *
282  * Lookup the closest node by distance if @nid is not in state.
283  *
284  * Return: this @node if it is in state, otherwise the closest node by distance
285  */
numa_nearest_node(int node,unsigned int state)286 int numa_nearest_node(int node, unsigned int state)
287 {
288 	int min_dist = INT_MAX, dist, n, min_node;
289 
290 	if (state >= NR_NODE_STATES)
291 		return -EINVAL;
292 
293 	if (node == NUMA_NO_NODE || node_state(node, state))
294 		return node;
295 
296 	min_node = node;
297 	for_each_node_state(n, state) {
298 		dist = node_distance(node, n);
299 		if (dist < min_dist) {
300 			min_dist = dist;
301 			min_node = n;
302 		}
303 	}
304 
305 	return min_node;
306 }
307 EXPORT_SYMBOL_GPL(numa_nearest_node);
308 
309 /**
310  * nearest_node_nodemask - Find the node in @mask at the nearest distance
311  *			   from @node.
312  *
313  * @node: a valid node ID to start the search from.
314  * @mask: a pointer to a nodemask representing the allowed nodes.
315  *
316  * This function iterates over all nodes in @mask and calculates the
317  * distance from the starting @node, then it returns the node ID that is
318  * the closest to @node, or MAX_NUMNODES if no node is found.
319  *
320  * Note that @node must be a valid node ID usable with node_distance(),
321  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
322  * or unexpected behavior.
323  */
int nearest_node_nodemask(int node, nodemask_t *mask)
{
	int best = MAX_NUMNODES;	/* returned when *mask is empty */
	int best_dist = INT_MAX;
	int n;

	for_each_node_mask(n, *mask) {
		int d = node_distance(node, n);

		if (d < best_dist) {
			best_dist = d;
			best = n;
		}
	}

	return best;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);
339 
get_task_policy(struct task_struct * p)340 struct mempolicy *get_task_policy(struct task_struct *p)
341 {
342 	struct mempolicy *pol = p->mempolicy;
343 	int node;
344 
345 	if (pol)
346 		return pol;
347 
348 	node = numa_node_id();
349 	if (node != NUMA_NO_NODE) {
350 		pol = &preferred_node_policy[node];
351 		/* preferred_node_policy is not initialised early in boot */
352 		if (pol->mode)
353 			return pol;
354 	}
355 
356 	return &default_policy;
357 }
358 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
359 
/*
 * Per-mode constructor/rebind callbacks; the table itself (mpol_ops) is
 * defined further down, after the individual handlers.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
364 
/* Nonzero if the user-supplied nodemask must be preserved across rebinds. */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_USER_NODEMASK_FLAGS;
}
369 
/*
 * Remap @orig onto the allowed set @rel (MPOL_F_RELATIVE_NODES semantics):
 * fold @orig down to nodes_weight(*rel) bits, then map those bits onto
 * the actual node ids in *rel.  Result goes to *ret.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
377 
/*
 * Constructor for modes that keep the full nodemask (bind, interleave,
 * weighted interleave, preferred-many per the mpol_ops table below).
 * Returns -EINVAL for an empty mask.
 */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}
385 
/*
 * Constructor for MPOL_PREFERRED: only a single preferred node is kept,
 * so record just the first node of the (non-empty) mask.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}
395 
396 /*
397  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
398  * any, for the new policy.  mpol_new() has already validated the nodes
399  * parameter with respect to the policy mode and flags.
400  *
401  * Must be called holding task's alloc_lock to protect task's mems_allowed
402  * and mempolicy.  May also be called holding the mmap_lock for write.
403  */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	/* mask2 = the user's nodes constrained to the cpuset-allowed set */
	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	/*
	 * Remember what future rebinds should be based on: the user's
	 * original mask, or a snapshot of the current cpuset mask.
	 */
	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}
436 
437 /*
438  * This function just creates a new policy, does some check and simple
439  * initialization. You must invoke mpol_set_nodemask() to set nodes.
440  */
/*
 * Allocate and minimally initialise a policy; validates the mode/flags/
 * nodemask combination but leaves pol->nodes for mpol_set_nodemask().
 * Returns NULL for MPOL_DEFAULT, an ERR_PTR on invalid input or OOM.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	/* MPOL_DEFAULT takes no nodes and allocates no policy object. */
	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			/* Empty preferred mask means local allocation. */
			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}
484 
485 /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * pol)486 void __mpol_put(struct mempolicy *pol)
487 {
488 	if (!atomic_dec_and_test(&pol->refcnt))
489 		return;
490 	kmem_cache_free(policy_cache, pol);
491 }
492 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
493 
/* MPOL_DEFAULT and MPOL_LOCAL carry no nodemask: rebinding is a no-op. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
497 
/*
 * Rebind a nodemask-carrying policy to a new allowed set @nodes:
 * MPOL_F_STATIC_NODES intersects the user's original mask with @nodes,
 * MPOL_F_RELATIVE_NODES remaps the user's mask onto @nodes, and the
 * default case remaps the current mask from the remembered cpuset mask.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	/* Never leave the policy with an empty mask; fall back to @nodes. */
	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}
517 
/*
 * Preferred policies keep their node choice on rebind; just remember the
 * new cpuset mask so mpol_rebind_policy() can detect the next change.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
523 
524 /*
525  * mpol_rebind_policy - Migrate a policy to a different set of nodes
526  *
527  * Per-vma policies are protected by mmap_lock. Allocations using per-task
528  * policies are protected by task->mems_allowed_seq to prevent a premature
529  * OOM/allocation failure due to parallel nodemask modification.
530  */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	/* Default/local policies carry no nodemask to rebind. */
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	/* Nothing to do if the remembered cpuset mask is unchanged. */
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}
541 
542 /*
543  * Wrapper for mpol_rebind_policy() that just requires task
544  * pointer, and updates task mempolicy.
545  *
546  * Called with task's alloc_lock held.
547  */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	/* tsk->mempolicy is stable: caller holds the task's alloc_lock. */
	mpol_rebind_policy(tsk->mempolicy, new);
}
552 
553 /*
554  * Rebind each vma in mm to new nodemask.
555  *
556  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
557  */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		/* Write-lock the vma (per-VMA locking) before touching its policy. */
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}
570 
/*
 * Per-mode operations.  Modes without a .create hook (DEFAULT, LOCAL)
 * take no nodemask; their rebind is a no-op (mpol_rebind_default).
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};
599 
/* Forward declarations: defined later in this file, used by the walkers below. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				pgoff_t ilx, int *nid);
604 
strictly_unmovable(unsigned long flags)605 static bool strictly_unmovable(unsigned long flags)
606 {
607 	/*
608 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
609 	 * if any misplaced page is found.
610 	 */
611 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
612 			 MPOL_MF_STRICT;
613 }
614 
struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;	/* policy to allocate the target folio under */
	pgoff_t ilx;		/* index handed to policy_nodemask() with @pol */
};
619 
/* Shared state for the queue_pages_* page-table walk callbacks. */
struct queue_pages {
	struct list_head *pagelist;	/* folios isolated for migration */
	unsigned long flags;		/* MPOL_MF_* incl. internal flags */
	nodemask_t *nmask;		/* nodes folios are checked against */
	unsigned long start;		/* requested range, for hole checks */
	unsigned long end;
	struct vm_area_struct *first;	/* first vma seen by test_walk */
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
630 
631 /*
632  * Check if the folio's nid is in qp->nmask.
633  *
634  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
635  * in the invert of qp->nmask.
636  */
queue_folio_required(struct folio * folio,struct queue_pages * qp)637 static inline bool queue_folio_required(struct folio *folio,
638 					struct queue_pages *qp)
639 {
640 	int nid = folio_nid(folio);
641 	unsigned long flags = qp->flags;
642 
643 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
644 }
645 
/*
 * Examine one PMD-mapped huge folio; queue it for migration if it fails
 * the nodemask check, otherwise account it in qp->nr_failed.
 * Caller holds the PMD lock (see queue_folios_pte_range()).
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	/* An entry already under migration can't be queued again. */
	if (unlikely(pmd_is_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	/* The huge zero folio is never a migration candidate. */
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	/* Queue for migration, or count as failed if moving isn't allowed. */
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}
667 
668 /*
669  * Scan through folios, checking if they satisfy the required conditions,
670  * moving them from LRU to local pagelist for migration if they do (or not).
671  *
672  * queue_folios_pte_range() has two possible return values:
673  * 0 - continue walking to scan for more, even if an existing folio on the
674  *     wrong node could not be isolated and queued for migration.
675  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
676  *        and an existing folio was on a node that does not follow the policy.
677  */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A PMD-mapped huge folio is handled as one unit, then we're done. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* Could not map the page table; ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			const softleaf_t entry = softleaf_from_pte(ptent);

			/* Entries already under migration count as failures. */
			if (softleaf_is_migration(entry))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* Batch over consecutive PTEs mapping the same large folio. */
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, pte, ptent, max_nr);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}
764 
/*
 * hugetlb counterpart of queue_folios_pte_range(): check one huge PTE and
 * isolate its folio for migration if misplaced and migration is allowed.
 * Same return-value convention as queue_folios_pte_range().
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptep;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptep = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptep)) {
		/* A non-none, non-present entry under migration is a failure. */
		if (!huge_pte_none(ptep)) {
			const softleaf_t entry = softleaf_from_pte(ptep);

			if (unlikely(softleaf_is_migration(entry)))
				qp->nr_failed++;
		}

		goto unlock;
	}
	folio = pfn_folio(pte_pfn(ptep));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
814 
815 #ifdef CONFIG_NUMA_BALANCING
816 /**
817  * folio_can_map_prot_numa() - check whether the folio can map prot numa
818  * @folio: The folio whose mapping considered for being made NUMA hintable
819  * @vma: The VMA that the folio belongs to.
820  * @is_private_single_threaded: Is this a single-threaded private VMA or not
821  *
822  * This function checks to see if the folio actually indicates that
823  * we need to make the mapping one which causes a NUMA hinting fault,
824  * as there are cases where it's simply unnecessary, and the folio's
825  * access time is adjusted for memory tiering if prot numa needed.
826  *
827  * Return: True if the mapping of the folio needs to be changed, false otherwise.
828  */
bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
		bool is_private_single_threaded)
{
	int nid;

	/* Zone-device and KSM folios are never NUMA-hint candidates. */
	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
		return false;

	/* Also skip shared copy-on-write folios */
	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
		return false;

	/* Folios are pinned and can't be migrated */
	if (folio_maybe_dma_pinned(folio))
		return false;

	/*
	 * While migration can move some dirty folios,
	 * it cannot move them all from MIGRATE_ASYNC
	 * context.
	 */
	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
		return false;

	/*
	 * Don't mess with PTEs if folio is already on the node
	 * a single-threaded process is running on.
	 */
	nid = folio_nid(folio);
	if (is_private_single_threaded && (nid == numa_node_id()))
		return false;

	/*
	 * Skip scanning top tier node if normal numa
	 * balancing is disabled
	 */
	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	    node_is_toptier(nid))
		return false;

	/* Record scan time for memory-tiering access-time tracking. */
	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	return true;
}
874 
875 /*
876  * This is used to mark a range of virtual addresses to be inaccessible.
877  * These are later cleared by a NUMA hinting fault. Depending on these
878  * faults, pages may be migrated for better NUMA placement.
879  *
880  * This is assuming that NUMA faults are handled using PROT_NONE. If
881  * an architecture makes a different choice, it will need further
882  * changes to the core.
883  */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	/* Count events only when something was actually updated. */
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
902 #endif /* CONFIG_NUMA_BALANCING */
903 
/*
 * Decide whether to walk this vma: 0 = scan its pages, 1 = skip it,
 * -EFAULT = a hole was found in a range that must be contiguous.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}
944 
/* Walk ops for queue_pages_range() when vmas are only read-locked. */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};
951 
/* Same walk, but write-locks each vma (selected by MPOL_MF_WRLOCK). */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};
958 
959 /*
960  * Walk through page tables and collect pages to be migrated.
961  *
962  * If pages found in a given range are not on the required set of @nodes,
963  * and migration is allowed, they are isolated and queued to @pagelist.
964  *
965  * queue_pages_range() may return:
966  * 0 - all pages already on the right node, or successfully queued for moving
967  *     (or neither strict checking nor moving requested: only range checking).
968  * >0 - this number of misplaced folios could not be queued for moving
969  *      (a hugetlbfs page or a transparent huge page being counted as 1).
970  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
971  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
972  */
973 static long
queue_pages_range(struct mm_struct * mm,unsigned long start,unsigned long end,nodemask_t * nodes,unsigned long flags,struct list_head * pagelist)974 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
975 		nodemask_t *nodes, unsigned long flags,
976 		struct list_head *pagelist)
977 {
978 	int err;
979 	struct queue_pages qp = {
980 		.pagelist = pagelist,
981 		.flags = flags,
982 		.nmask = nodes,
983 		.start = start,
984 		.end = end,
985 		.first = NULL,
986 	};
987 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
988 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
989 
990 	err = walk_page_range(mm, start, end, ops, &qp);
991 
992 	if (!qp.first)
993 		/* whole range in hole */
994 		err = -EFAULT;
995 
996 	return err ? : qp.nr_failed;
997 }
998 
999 /*
1000  * Apply policy to a single VMA
1001  * This must be called with the mmap_lock held for writing.
1002  */
vma_replace_policy(struct vm_area_struct * vma,struct mempolicy * pol)1003 static int vma_replace_policy(struct vm_area_struct *vma,
1004 				struct mempolicy *pol)
1005 {
1006 	int err;
1007 	struct mempolicy *old;
1008 	struct mempolicy *new;
1009 
1010 	vma_assert_write_locked(vma);
1011 
1012 	new = mpol_dup(pol);
1013 	if (IS_ERR(new))
1014 		return PTR_ERR(new);
1015 
1016 	if (vma->vm_ops && vma->vm_ops->set_policy) {
1017 		err = vma->vm_ops->set_policy(vma, new);
1018 		if (err)
1019 			goto err_out;
1020 	}
1021 
1022 	old = vma->vm_policy;
1023 	vma->vm_policy = new; /* protected by mmap_lock */
1024 	mpol_put(old);
1025 
1026 	return 0;
1027  err_out:
1028 	mpol_put(new);
1029 	return err;
1030 }
1031 
1032 /* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	/* Clamp the affected span to this vma's extent. */
	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		/* Range begins mid-vma: vma itself stays as the predecessor. */
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* Nothing to do if the vma already carries an equivalent policy. */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	/* Split/merge the vma as needed so [vmstart, vmend) stands alone. */
	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}
1059 
1060 /* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* alloc_lock protects current->mempolicy against concurrent readers. */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		/* Reset the interleave cursor so allocation restarts at the first node. */
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	/* Drop the reference on the policy we just replaced. */
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
1099 
1100 /*
1101  * Return nodemask for policy for get_mempolicy() query
1102  *
1103  * Called with task's alloc_lock held
1104  */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	/* Default policy is reported as an empty mask. */
	if (pol == &default_policy)
		return;

	switch (pol->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		*nodes = pol->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		/* All valid modes are handled above. */
		BUG();
	}
}
1126 
lookup_node(struct mm_struct * mm,unsigned long addr)1127 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1128 {
1129 	struct page *p = NULL;
1130 	int ret;
1131 
1132 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1133 	if (ret > 0) {
1134 		ret = page_to_nid(p);
1135 		put_page(p);
1136 	}
1137 	return ret;
1138 }
1139 
1140 /* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	/* Reject any flag outside the documented set. */
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MEMS_ALLOWED is exclusive with the other query flags. */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			/* Report the node the page at addr actually sits on. */
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* Next node the task's interleave cursor would use. */
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/* Weight remaining: still on il_prev; else advance. */
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			/* Report the mask exactly as the user set it. */
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1241 
1242 #ifdef CONFIG_MIGRATION
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)1243 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1244 				unsigned long flags)
1245 {
1246 	/*
1247 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1248 	 * Choosing not to migrate a shared folio is not counted as a failure.
1249 	 *
1250 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1251 	 * cannot easily detect if a folio is shared.
1252 	 */
1253 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1254 		if (folio_isolate_lru(folio)) {
1255 			list_add_tail(&folio->lru, foliolist);
1256 			node_stat_mod_folio(folio,
1257 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1258 				folio_nr_pages(folio));
1259 		} else {
1260 			/*
1261 			 * Non-movable folio may reach here.  And, there may be
1262 			 * temporary off LRU folios or non-LRU movable folios.
1263 			 * Treat them as unmovable folios since they can't be
1264 			 * isolated, so they can't be moved at the moment.
1265 			 */
1266 			return false;
1267 		}
1268 	}
1269 	return true;
1270 }
1271 
1272 /*
1273  * Migrate pages from one node to a target node.
1274  * Returns error or the number of pages not migrated.
1275  */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		/* __GFP_THISNODE: allocate on @dest only, no fallback node. */
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	/* Only pages currently on @source are candidates. */
	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		/* Put back anything migrate_pages() could not move. */
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* Combine isolation failures with migration failures. */
	if (err >= 0)
		err += nr_failed;
	return err;
}
1323 
1324 /*
1325  * Move pages between the two nodesets so as to preserve the physical
1326  * layout as much as possible.
1327  *
1328  * Returns the number of page that could not be moved.
1329  */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	/* Keep folios off the per-CPU LRU caches so isolation can see them. */
	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		/* No candidate pair left: all remaining moves are no-ops. */
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	/* Clamp to INT_MAX: callers treat the return as an int count. */
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
1424 
1425 /*
1426  * Allocate a new folio for page migration, according to NUMA mempolicy.
1427  */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	/* Offset the interleave index by the folio's position in the mapping. */
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		/* hugetlb has its own allocator; honour the policy's nodemask. */
		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	/* THP source folios get THP gfp; order-0 gets movable user memory. */
	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
1459 #else
1460 
/* !CONFIG_MIGRATION: nothing can be isolated, report folio as unmovable. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}
1466 
/* !CONFIG_MIGRATION: the migrate_pages() syscall is unsupported. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1472 
/* !CONFIG_MIGRATION: no migration targets can ever be allocated. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
1478 #endif
1479 
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	/* MOVE_ALL may touch other users' shared pages: privileged only. */
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	/* mpol_new() returns NULL (not an error) for MPOL_DEFAULT. */
	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* mmap_write_lock stays held on success for mbind_range(). */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		/* Apply the new policy to every vma in [start, end). */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* Use the first non-KSM folio as the anchor for ilx. */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	/* Return any folios that could not be migrated to the LRU. */
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
1624 
1625 /*
1626  * User space interface with variable sized bitmaps for nodelists.
1627  */
/*
 * Copy @maxnode bits of a user bitmap into @mask, honouring the compat
 * layout when invoked from a 32-bit syscall, and clear any stray bits
 * beyond @maxnode in the final word.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	unsigned long tail_bits = maxnode % BITS_PER_LONG;
	int err;

	if (in_compat_syscall()) {
		err = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	} else {
		err = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));
	}

	if (err)
		return -EFAULT;

	/* Mask off bits past maxnode in the last (partial) word. */
	if (tail_bits)
		mask[nlongs - 1] &= (1UL << tail_bits) - 1;

	return 0;
}
1650 
1651 /* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	/* maxnode counts bits inclusively in the ABI; convert to a bit count. */
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		/* Fetch the highest still-unchecked word of the user mask. */
		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			/* Whole word lies above MAX_NUMNODES: must be all zero. */
			maxnode -= bits;
		} else {
			/* Word straddles the limit: ignore the in-range low bits. */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
1686 
1687 /* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	/* Bytes the user asked for, rounded up to a 64-bit boundary. */
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* User buffer is larger than the kernel mask: zero the excess. */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
1713 
1714 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)1715 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1716 {
1717 	*flags = *mode & MPOL_MODE_FLAGS;
1718 	*mode &= ~MPOL_MODE_FLAGS;
1719 
1720 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1721 		return -EINVAL;
1722 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1723 		return -EINVAL;
1724 	if (*flags & MPOL_F_NUMA_BALANCING) {
1725 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1726 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1727 		else
1728 			return -EINVAL;
1729 	}
1730 	return 0;
1731 }
1732 
/* Common entry for the mbind() syscall: validate inputs, then do_mbind(). */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	nodemask_t nodes;
	unsigned short mode_flags;
	int lmode = mode;
	int ret;

	start = untagged_addr(start);

	ret = sanitize_mpol_flags(&lmode, &mode_flags);
	if (!ret)
		ret = get_nodes(&nodes, nmask, maxnode);
	if (ret)
		return ret;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
1753 
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			/* No vma policy: nothing to attach a home node to. */
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* Duplicate, set home node on the copy, then swap it in. */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		/* mbind_range() took its own copy; drop ours. */
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1820 
/* mbind(2): set memory policy for a range of the calling task's address space. */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1827 
1828 /* Set the process memory policy */
/* Common entry for set_mempolicy(): validate, then install the task policy. */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	nodemask_t nodes;
	unsigned short mode_flags;
	int lmode = mode;
	int ret;

	ret = sanitize_mpol_flags(&lmode, &mode_flags);
	if (!ret)
		ret = get_nodes(&nodes, nmask, maxnode);
	if (ret)
		return ret;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}
1847 
/* set_mempolicy(2): set the default memory policy of the calling task. */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1853 
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	/* Hold a task reference so it can't go away once RCU is dropped. */
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* Restrict destinations to nodes the *caller* is allowed to use. */
	task_nodes = cpuset_mems_allowed(current);
	if (!nodes_and(*new, *new, task_nodes))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		/* Target has no mm (e.g. kernel thread or already exited). */
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
1939 
/* migrate_pages(2): thin syscall wrapper around kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1946 
/*
 * Retrieve NUMA policy: common implementation of get_mempolicy(2).
 * Copies the policy mode to @policy and the policy nodemask to @nmask
 * (each may be NULL if the caller is not interested).
 */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	/* The user buffer must be able to hold a bit for every possible node. */
	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	/* Strip tag bits (e.g. arm64 MTE/TBI) before treating it as an address. */
	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
1976 
/* get_mempolicy(2): thin syscall wrapper around kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1983 
vma_migratable(struct vm_area_struct * vma)1984 bool vma_migratable(struct vm_area_struct *vma)
1985 {
1986 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1987 		return false;
1988 
1989 	/*
1990 	 * DAX device mappings require predictable access latency, so avoid
1991 	 * incurring periodic faults.
1992 	 */
1993 	if (vma_is_dax(vma))
1994 		return false;
1995 
1996 	if (is_vm_hugetlb_page(vma) &&
1997 		!hugepage_migration_supported(hstate_vma(vma)))
1998 		return false;
1999 
2000 	/*
2001 	 * Migration allocates pages in the highest zone. If we cannot
2002 	 * do so then migration (at least from node to node) is not
2003 	 * possible.
2004 	 */
2005 	if (vma->vm_file &&
2006 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
2007 			< policy_zone)
2008 		return false;
2009 	return true;
2010 }
2011 
/*
 * Look up the raw policy of @vma at @addr: ask the vm_ops get_policy()
 * hook when one exists, otherwise fall back to the VMA's own policy.
 * *ilx is initialized to 0; the hook may overwrite it.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr, ilx);
	return vma->vm_policy;
}
2019 
2020 /*
2021  * get_vma_policy(@vma, @addr, @order, @ilx)
2022  * @vma: virtual memory area whose policy is sought
2023  * @addr: address in @vma for shared policy lookup
2024  * @order: 0, or appropriate huge_page_order for interleaving
2025  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2026  *       MPOL_WEIGHTED_INTERLEAVE
2027  *
2028  * Returns effective policy for a VMA at specified address.
2029  * Falls back to current->mempolicy or system default policy, as necessary.
2030  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2031  * count--added by the get_policy() vm_op, as appropriate--to protect against
2032  * freeing by another task.  It is the caller's responsibility to free the
2033  * extra reference for shared policies.
2034  */
get_vma_policy(struct vm_area_struct * vma,unsigned long addr,int order,pgoff_t * ilx)2035 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
2036 				 unsigned long addr, int order, pgoff_t *ilx)
2037 {
2038 	struct mempolicy *pol;
2039 
2040 	pol = __get_vma_policy(vma, addr, ilx);
2041 	if (!pol)
2042 		pol = get_task_policy(current);
2043 	if (pol->mode == MPOL_INTERLEAVE ||
2044 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2045 		*ilx += vma->vm_pgoff >> order;
2046 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2047 	}
2048 	return pol;
2049 }
2050 
vma_policy_mof(struct vm_area_struct * vma)2051 bool vma_policy_mof(struct vm_area_struct *vma)
2052 {
2053 	struct mempolicy *pol;
2054 
2055 	if (vma->vm_ops && vma->vm_ops->get_policy) {
2056 		bool ret = false;
2057 		pgoff_t ilx;		/* ignored here */
2058 
2059 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2060 		if (pol && (pol->flags & MPOL_F_MOF))
2061 			ret = true;
2062 		mpol_cond_put(pol);
2063 
2064 		return ret;
2065 	}
2066 
2067 	pol = vma->vm_policy;
2068 	if (!pol)
2069 		pol = get_task_policy(current);
2070 
2071 	return pol->flags & MPOL_F_MOF;
2072 }
2073 
/*
 * Should @policy's nodemask be applied to an allocation from @zone?
 * Normally the policy applies from policy_zone upward, but a policy whose
 * nodes contain only ZONE_MOVABLE memory must only constrain movable
 * allocations.
 */
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->nodes is intersect with node_states[N_MEMORY].
	 * so if the following test fails, it implies
	 * policy->nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}
2093 
/*
 * Dynamic weighted interleave for the current task: keep allocating on
 * current->il_prev until its remaining weight (current->il_weight) is
 * consumed, then advance to the next node in the policy and reload the
 * weight from the global weight table.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		/* empty nodemask: return MAX_NUMNODES without touching state */
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	/* consume one unit of the current node's weight */
	current->il_weight--;
	return node;
}
2115 
/*
 * Do dynamic interleaving for a process: round-robin through the policy
 * nodemask, remembering the last node used in current->il_prev.
 */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* nid == MAX_NUMNODES means the nodemask was empty; don't save it */
	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}
2132 
2133 /*
2134  * Depending on the memory policy provide a node from which to allocate the
2135  * next slab entry.
2136  */
mempolicy_slab_node(void)2137 unsigned int mempolicy_slab_node(void)
2138 {
2139 	struct mempolicy *policy;
2140 	int node = numa_mem_id();
2141 
2142 	if (!in_task())
2143 		return node;
2144 
2145 	policy = current->mempolicy;
2146 	if (!policy)
2147 		return node;
2148 
2149 	switch (policy->mode) {
2150 	case MPOL_PREFERRED:
2151 		return first_node(policy->nodes);
2152 
2153 	case MPOL_INTERLEAVE:
2154 		return interleave_nodes(policy);
2155 
2156 	case MPOL_WEIGHTED_INTERLEAVE:
2157 		return weighted_interleave_nodes(policy);
2158 
2159 	case MPOL_BIND:
2160 	case MPOL_PREFERRED_MANY:
2161 	{
2162 		struct zoneref *z;
2163 
2164 		/*
2165 		 * Follow bind policy behavior and start allocation at the
2166 		 * first node.
2167 		 */
2168 		struct zonelist *zonelist;
2169 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2170 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2171 		z = first_zones_zonelist(zonelist, highest_zoneidx,
2172 							&policy->nodes);
2173 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2174 	}
2175 	case MPOL_LOCAL:
2176 		return node;
2177 
2178 	default:
2179 		BUG();
2180 	}
2181 }
2182 
/*
 * Take a stable snapshot of pol->nodes into *mask and return the number
 * of nodes it contains.  The copy may be iterated without worrying about
 * a concurrent rebind changing the policy's nodemask underneath us.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes. Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}
2196 
/*
 * Static weighted interleave for index @ilx: map the index into the
 * policy's nodemask, giving each node a share proportional to its weight
 * in the global weight table (weight 1 per node when the table is unset).
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();

	/* RCU protects the weight table against concurrent replacement */
	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}
2236 
2237 /*
2238  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2239  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2240  * exceeds the number of present nodes.
2241  */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2242 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2243 {
2244 	nodemask_t nodemask;
2245 	unsigned int target, nnodes;
2246 	int i;
2247 	int nid;
2248 
2249 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2250 	if (!nnodes)
2251 		return numa_node_id();
2252 	target = ilx % nnodes;
2253 	nid = first_node(nodemask);
2254 	for (i = 0; i < target; i++)
2255 		nid = next_node(nid, nodemask);
2256 	return nid;
2257 }
2258 
2259 /*
2260  * Return a nodemask representing a mempolicy for filtering nodes for
2261  * page allocation, together with preferred node id (or the input node id).
2262  */
policy_nodemask(gfp_t gfp,struct mempolicy * pol,pgoff_t ilx,int * nid)2263 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2264 				   pgoff_t ilx, int *nid)
2265 {
2266 	nodemask_t *nodemask = NULL;
2267 
2268 	switch (pol->mode) {
2269 	case MPOL_PREFERRED:
2270 		/* Override input node id */
2271 		*nid = first_node(pol->nodes);
2272 		break;
2273 	case MPOL_PREFERRED_MANY:
2274 		nodemask = &pol->nodes;
2275 		if (pol->home_node != NUMA_NO_NODE)
2276 			*nid = pol->home_node;
2277 		break;
2278 	case MPOL_BIND:
2279 		/* Restrict to nodemask (but not on lower zones) */
2280 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2281 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2282 			nodemask = &pol->nodes;
2283 		if (pol->home_node != NUMA_NO_NODE)
2284 			*nid = pol->home_node;
2285 		/*
2286 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2287 		 * because we might easily break the expectation to stay on the
2288 		 * requested node and not break the policy.
2289 		 */
2290 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2291 		break;
2292 	case MPOL_INTERLEAVE:
2293 		/* Override input node id */
2294 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2295 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2296 		break;
2297 	case MPOL_WEIGHTED_INTERLEAVE:
2298 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2299 			weighted_interleave_nodes(pol) :
2300 			weighted_interleave_nid(pol, ilx);
2301 		break;
2302 	}
2303 
2304 	return nodemask;
2305 }
2306 
2307 #ifdef CONFIG_HUGETLBFS
2308 /*
2309  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2310  * @vma: virtual memory area whose policy is sought
2311  * @addr: address in @vma for shared policy lookup and interleave policy
2312  * @gfp_flags: for requested zone
2313  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2314  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2315  *
2316  * Returns a nid suitable for a huge page allocation and a pointer
2317  * to the struct mempolicy for conditional unref after allocation.
2318  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2319  * to the mempolicy's @nodemask for filtering the zonelist.
2320  */
huge_node(struct vm_area_struct * vma,unsigned long addr,gfp_t gfp_flags,struct mempolicy ** mpol,nodemask_t ** nodemask)2321 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2322 		struct mempolicy **mpol, nodemask_t **nodemask)
2323 {
2324 	pgoff_t ilx;
2325 	int nid;
2326 
2327 	nid = numa_node_id();
2328 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2329 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2330 	return nid;
2331 }
2332 
2333 /*
2334  * init_nodemask_of_mempolicy
2335  *
2336  * If the current task's mempolicy is "default" [NULL], return 'false'
2337  * to indicate default policy.  Otherwise, extract the policy nodemask
2338  * for 'bind' or 'interleave' policy into the argument nodemask, or
2339  * initialize the argument nodemask to contain the single node for
2340  * 'preferred' or 'local' policy and return 'true' to indicate presence
2341  * of non-default mempolicy.
2342  *
2343  * We don't bother with reference counting the mempolicy [mpol_get/put]
2344  * because the current task is examining it's own mempolicy and a task's
2345  * mempolicy is only ever changed by the task itself.
2346  *
2347  * N.B., it is the caller's responsibility to free a returned nodemask.
2348  */
init_nodemask_of_mempolicy(nodemask_t * mask)2349 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2350 {
2351 	struct mempolicy *mempolicy;
2352 
2353 	if (!(mask && current->mempolicy))
2354 		return false;
2355 
2356 	task_lock(current);
2357 	mempolicy = current->mempolicy;
2358 	switch (mempolicy->mode) {
2359 	case MPOL_PREFERRED:
2360 	case MPOL_PREFERRED_MANY:
2361 	case MPOL_BIND:
2362 	case MPOL_INTERLEAVE:
2363 	case MPOL_WEIGHTED_INTERLEAVE:
2364 		*mask = mempolicy->nodes;
2365 		break;
2366 
2367 	case MPOL_LOCAL:
2368 		init_nodemask_of_node(mask, numa_node_id());
2369 		break;
2370 
2371 	default:
2372 		BUG();
2373 	}
2374 	task_unlock(current);
2375 
2376 	return true;
2377 }
2378 #endif
2379 
2380 /*
2381  * mempolicy_in_oom_domain
2382  *
2383  * If tsk's mempolicy is "bind", check for intersection between mask and
2384  * the policy nodemask. Otherwise, return true for all other policies
2385  * including "interleave", as a tsk with "interleave" policy may have
2386  * memory allocated from all nodes in system.
2387  *
2388  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2389  */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2390 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2391 					const nodemask_t *mask)
2392 {
2393 	struct mempolicy *mempolicy;
2394 	bool ret = true;
2395 
2396 	if (!mask)
2397 		return ret;
2398 
2399 	task_lock(tsk);
2400 	mempolicy = tsk->mempolicy;
2401 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2402 		ret = nodes_intersects(mempolicy->nodes, *mask);
2403 	task_unlock(tsk);
2404 
2405 	return ret;
2406 }
2407 
/*
 * Allocation for MPOL_PREFERRED_MANY: try the preferred nodes first with a
 * lightweight attempt, then fall back to any node.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
						int nid, nodemask_t *nodemask)
{
	struct page *page;
	gfp_t preferred_gfp;

	/*
	 * This is a two pass approach. The first pass will only try the
	 * preferred nodes but skip the direct reclaim and allow the
	 * allocation to fail, while the second pass will try all the
	 * nodes in system.
	 */
	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
	if (!page)
		/* second pass: unrestricted nodemask, original gfp flags */
		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);

	return page;
}
2428 
2429 /**
2430  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2431  * @gfp: GFP flags.
2432  * @order: Order of the page allocation.
2433  * @pol: Pointer to the NUMA mempolicy.
2434  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2435  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2436  *
2437  * Return: The page on success or NULL if allocation fails.
2438  */
alloc_pages_mpol(gfp_t gfp,unsigned int order,struct mempolicy * pol,pgoff_t ilx,int nid)2439 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2440 		struct mempolicy *pol, pgoff_t ilx, int nid)
2441 {
2442 	nodemask_t *nodemask;
2443 	struct page *page;
2444 
2445 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2446 
2447 	if (pol->mode == MPOL_PREFERRED_MANY)
2448 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2449 
2450 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2451 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2452 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2453 		/*
2454 		 * For hugepage allocation and non-interleave policy which
2455 		 * allows the current node (or other explicitly preferred
2456 		 * node) we only try to allocate from the current/preferred
2457 		 * node and don't fall back to other nodes, as the cost of
2458 		 * remote accesses would likely offset THP benefits.
2459 		 *
2460 		 * If the policy is interleave or does not allow the current
2461 		 * node in its nodemask, we allocate the standard way.
2462 		 */
2463 		if (pol->mode != MPOL_INTERLEAVE &&
2464 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2465 		    (!nodemask || node_isset(nid, *nodemask))) {
2466 			/*
2467 			 * First, try to allocate THP only on local node, but
2468 			 * don't reclaim unnecessarily, just compact.
2469 			 */
2470 			page = __alloc_frozen_pages_noprof(
2471 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2472 				nid, NULL);
2473 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2474 				return page;
2475 			/*
2476 			 * If hugepage allocations are configured to always
2477 			 * synchronous compact or the vma has been madvised
2478 			 * to prefer hugepage backing, retry allowing remote
2479 			 * memory with both reclaim and compact as well.
2480 			 */
2481 		}
2482 	}
2483 
2484 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2485 
2486 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2487 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2488 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2489 		if (static_branch_likely(&vm_numa_stat_key) &&
2490 		    page_to_nid(page) == nid) {
2491 			preempt_disable();
2492 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2493 			preempt_enable();
2494 		}
2495 	}
2496 
2497 	return page;
2498 }
2499 
/*
 * Allocate a folio according to @pol.  Wraps alloc_pages_mpol() with
 * __GFP_COMP, takes the initial reference, and prepares the compound
 * page for reverse mapping.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
			ilx, nid);
	if (!page)
		return NULL;

	set_page_refcounted(page);
	return page_rmappable_folio(page);
}
2511 
2512 /**
2513  * vma_alloc_folio - Allocate a folio for a VMA.
2514  * @gfp: GFP flags.
2515  * @order: Order of the folio.
2516  * @vma: Pointer to VMA.
2517  * @addr: Virtual address of the allocation.  Must be inside @vma.
2518  *
2519  * Allocate a folio for a specific address in @vma, using the appropriate
2520  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2521  * VMA to prevent it from going away.  Should be used for all allocations
2522  * for folios that will be mapped into user space, excepting hugetlbfs, and
2523  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2524  *
2525  * Return: The folio on success or NULL if allocation fails.
2526  */
vma_alloc_folio_noprof(gfp_t gfp,int order,struct vm_area_struct * vma,unsigned long addr)2527 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2528 		unsigned long addr)
2529 {
2530 	struct mempolicy *pol;
2531 	pgoff_t ilx;
2532 	struct folio *folio;
2533 
2534 	if (vma->vm_flags & VM_DROPPABLE)
2535 		gfp |= __GFP_NOWARN;
2536 
2537 	pol = get_vma_policy(vma, addr, order, &ilx);
2538 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2539 	mpol_cond_put(pol);
2540 	return folio;
2541 }
2542 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2543 
/*
 * Allocate pages (without an elevated refcount) honouring the current
 * task's mempolicy where applicable.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				       numa_node_id());
}
2558 
2559 /**
2560  * alloc_pages - Allocate pages.
2561  * @gfp: GFP flags.
2562  * @order: Power of two of number of pages to allocate.
2563  *
2564  * Allocate 1 << @order contiguous pages.  The physical address of the
2565  * first page is naturally aligned (eg an order-3 allocation will be aligned
2566  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2567  * process is honoured when in process context.
2568  *
2569  * Context: Can be called from any context, providing the appropriate GFP
2570  * flags are used.
2571  * Return: The page on success or NULL if allocation fails.
2572  */
alloc_pages_noprof(gfp_t gfp,unsigned int order)2573 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2574 {
2575 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2576 
2577 	if (page)
2578 		set_page_refcounted(page);
2579 	return page;
2580 }
2581 EXPORT_SYMBOL(alloc_pages_noprof);
2582 
/* Allocate a folio of 1 << @order pages per current task's mempolicy. */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc_noprof);
2588 
/*
 * Bulk order-0 allocation for MPOL_INTERLEAVE: split @nr_pages evenly
 * across the policy nodes, giving the first (nr_pages % nodes) nodes one
 * extra page each.  Returns the number of pages actually allocated.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nodes;
	unsigned long nr_pages_per_node;
	int delta;		/* remainder pages to spread one-per-node */
	int i;
	unsigned long nr_allocated;
	unsigned long total_allocated = 0;

	nodes = nodes_weight(pol->nodes);
	nr_pages_per_node = nr_pages / nodes;
	delta = nr_pages - nodes * nr_pages_per_node;

	for (i = 0; i < nodes; i++) {
		/* interleave_nodes() advances current->il_prev per call */
		if (delta) {
			nr_allocated = alloc_pages_bulk_noprof(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node + 1,
					page_array);
			delta--;
		} else {
			nr_allocated = alloc_pages_bulk_noprof(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node, page_array);
		}

		page_array += nr_allocated;
		total_allocated += nr_allocated;
	}

	return total_allocated;
}
2623 
/*
 * Bulk order-0 allocation for MPOL_WEIGHTED_INTERLEAVE.  Distributes
 * @nr_pages across the policy nodes in proportion to their weights,
 * first finishing off any weight left over on current->il_prev, and
 * finally records (il_prev, il_weight) so subsequent single-page
 * allocations resume exactly where the bulk run left off.
 * Returns the number of pages actually allocated.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* no global table: every node gets the default weight of 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	/* persist the resume point for the next (bulk or single) allocation */
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2741 
/*
 * Bulk order-0 allocation for MPOL_PREFERRED_MANY: first a lightweight
 * attempt restricted to the preferred nodes, then top up from any node.
 * Returns the number of pages actually allocated.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t preferred_gfp;
	unsigned long nr_allocated = 0;

	/* first pass: no direct reclaim, allowed to fail quietly */
	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
					   nr_pages, page_array);

	if (nr_allocated < nr_pages)
		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
				nr_pages - nr_allocated,
				page_array + nr_allocated);
	return nr_allocated;
}
2761 
/* alloc pages bulk and mempolicy should be considered at the
 * same time in some situation such as vmalloc.
 *
 * It can accelerate memory allocation especially interleaving
 * allocate memory.
 */
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
		unsigned long nr_pages, struct page **page_array)
{
	struct mempolicy *pol = &default_policy;
	nodemask_t *nodemask;
	int nid;

	/* current->mempolicy is only valid in task context */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	/* dispatch to a per-mode bulk helper where one exists */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_pages_bulk_interleave(gfp, pol,
							 nr_pages, page_array);

	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
		return alloc_pages_bulk_weighted_interleave(
				  gfp, pol, nr_pages, page_array);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_bulk_preferred_many(gfp,
				numa_node_id(), pol, nr_pages, page_array);

	nid = numa_node_id();
	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
				       nr_pages, page_array);
}
2795 
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2796 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2797 {
2798 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2799 
2800 	if (IS_ERR(pol))
2801 		return PTR_ERR(pol);
2802 	dst->vm_policy = pol;
2803 	return 0;
2804 }
2805 
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that
 * changes the cpuset's mems), so no rebind work is needed for the
 * current task.
 */
2816 
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	/* see the comment above: keep the copy cpuset-relative across a rebind */
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	/* the copy starts with its own single reference */
	atomic_set(&new->refcnt, 1);
	return new;
}
2840 
2841 /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2842 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2843 {
2844 	if (!a || !b)
2845 		return false;
2846 	if (a->mode != b->mode)
2847 		return false;
2848 	if (a->flags != b->flags)
2849 		return false;
2850 	if (a->home_node != b->home_node)
2851 		return false;
2852 	if (mpol_store_user_nodemask(a))
2853 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2854 			return false;
2855 
2856 	switch (a->mode) {
2857 	case MPOL_BIND:
2858 	case MPOL_INTERLEAVE:
2859 	case MPOL_PREFERRED:
2860 	case MPOL_PREFERRED_MANY:
2861 	case MPOL_WEIGHTED_INTERLEAVE:
2862 		return !!nodes_equal(a->nodes, b->nodes);
2863 	case MPOL_LOCAL:
2864 		return true;
2865 	default:
2866 		BUG();
2867 		return false;
2868 	}
2869 }
2870 
2871 /*
2872  * Shared memory backing store policy support.
2873  *
2874  * Remember policies even when nobody has shared memory mapped.
2875  * The policies are kept in Red-Black tree linked from the inode.
2876  * They are protected by the sp->lock rwlock, which should be held
2877  * for any accesses to the tree.
2878  */
2879 
2880 /*
2881  * lookup first element intersecting start-end.  Caller holds sp->lock for
2882  * reading or for writing
2883  */
sp_lookup(struct shared_policy * sp,pgoff_t start,pgoff_t end)2884 static struct sp_node *sp_lookup(struct shared_policy *sp,
2885 					pgoff_t start, pgoff_t end)
2886 {
2887 	struct rb_node *n = sp->root.rb_node;
2888 
2889 	while (n) {
2890 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2891 
2892 		if (start >= p->end)
2893 			n = n->rb_right;
2894 		else if (end <= p->start)
2895 			n = n->rb_left;
2896 		else
2897 			break;
2898 	}
2899 	if (!n)
2900 		return NULL;
2901 	for (;;) {
2902 		struct sp_node *w = NULL;
2903 		struct rb_node *prev = rb_prev(n);
2904 		if (!prev)
2905 			break;
2906 		w = rb_entry(prev, struct sp_node, nd);
2907 		if (w->end <= start)
2908 			break;
2909 		n = prev;
2910 	}
2911 	return rb_entry(n, struct sp_node, nd);
2912 }
2913 
2914 /*
2915  * Insert a new shared policy into the list.  Caller holds sp->lock for
2916  * writing.
2917  */
sp_insert(struct shared_policy * sp,struct sp_node * new)2918 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2919 {
2920 	struct rb_node **p = &sp->root.rb_node;
2921 	struct rb_node *parent = NULL;
2922 	struct sp_node *nd;
2923 
2924 	while (*p) {
2925 		parent = *p;
2926 		nd = rb_entry(parent, struct sp_node, nd);
2927 		if (new->start < nd->start)
2928 			p = &(*p)->rb_left;
2929 		else if (new->end > nd->end)
2930 			p = &(*p)->rb_right;
2931 		else
2932 			BUG();
2933 	}
2934 	rb_link_node(&new->nd, parent, p);
2935 	rb_insert_color(&new->nd, &sp->root);
2936 }
2937 
2938 /* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy * sp,pgoff_t idx)2939 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2940 						pgoff_t idx)
2941 {
2942 	struct mempolicy *pol = NULL;
2943 	struct sp_node *sn;
2944 
2945 	if (!sp->root.rb_node)
2946 		return NULL;
2947 	read_lock(&sp->lock);
2948 	sn = sp_lookup(sp, idx, idx+1);
2949 	if (sn) {
2950 		mpol_get(sn->policy);
2951 		pol = sn->policy;
2952 	}
2953 	read_unlock(&sp->lock);
2954 	return pol;
2955 }
2956 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2957 
/* Free an sp_node, dropping its reference on the attached policy. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
2963 
/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	int curnid = folio_nid(folio);		/* where the folio is now */
	struct vm_area_struct *vma = vmf->vma;
	int thiscpu = raw_smp_processor_id();
	int thisnid = numa_node_id();
	int polnid = NUMA_NO_NODE;		/* where policy wants it */
	int ret = NUMA_NO_NODE;

	/*
	 * Make sure ptl is held so that we don't preempt and we
	 * have a stable smp processor id
	 */
	lockdep_assert_held(vmf->ptl);
	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	/* Only policies flagged "migrate on fault" participate. */
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_WEIGHTED_INTERLEAVE:
		polnid = weighted_interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
		/*
		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
		 * policy nodemask we don't allow numa migration to nodes
		 * outside policy nodemask for now. This is done so that if we
		 * want demotion to slow memory to happen, before allocating
		 * from some DRAM node say 'x', we will end up using a
		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
		 * we should not promote to node 'x' from slow memory node.
		 */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Optimize placement among multiple nodes
			 * via NUMA balancing
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}

		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(thisnid, GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zonelist_node_idx(z);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		/* Let the NUMA-balancing heuristic veto the migration. */
		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
3074 
3075 /*
3076  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
3077  * dropped after task->mempolicy is set to NULL so that any allocation done as
3078  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3079  * policy.
3080  */
mpol_put_task_policy(struct task_struct * task)3081 void mpol_put_task_policy(struct task_struct *task)
3082 {
3083 	struct mempolicy *pol;
3084 
3085 	task_lock(task);
3086 	pol = task->mempolicy;
3087 	task->mempolicy = NULL;
3088 	task_unlock(task);
3089 	mpol_put(pol);
3090 }
3091 
/*
 * Unlink @n from the shared policy tree and free it.  Callers hold
 * sp->lock for writing.
 */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
3097 
sp_node_init(struct sp_node * node,unsigned long start,unsigned long end,struct mempolicy * pol)3098 static void sp_node_init(struct sp_node *node, unsigned long start,
3099 			unsigned long end, struct mempolicy *pol)
3100 {
3101 	node->start = start;
3102 	node->end = end;
3103 	node->policy = pol;
3104 }
3105 
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3106 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3107 				struct mempolicy *pol)
3108 {
3109 	struct sp_node *n;
3110 	struct mempolicy *newpol;
3111 
3112 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3113 	if (!n)
3114 		return NULL;
3115 
3116 	newpol = mpol_dup(pol);
3117 	if (IS_ERR(newpol)) {
3118 		kmem_cache_free(sn_cache, n);
3119 		return NULL;
3120 	}
3121 	newpol->flags |= MPOL_F_SHARED;
3122 	sp_node_init(n, start, end, newpol);
3123 
3124 	return n;
3125 }
3126 
/*
 * Replace a policy range.
 *
 * Deletes or trims every node overlapping [start, end), then inserts
 * @new (if any).  An old node fully spanning the new range must be split
 * in two; the second half needs a spare sp_node and mempolicy, which
 * cannot be allocated with GFP_KERNEL while holding the write lock, so
 * the lock is dropped, the spares allocated, and the walk restarted.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;		/* preallocated split node */
	struct mempolicy *mpol_new = NULL;	/* preallocated split policy */
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);	/* fully covered */
			else
				n->start = end;		/* trim the front */
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				/* Split: tail becomes a fresh node. */
				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;		/* trim the tail */
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Drop any unused preallocations (also the error path). */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* Must not sleep under the write lock: allocate, then retry. */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
3193 
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct sp_node *sn;
		struct mempolicy *npol;
		NODEMASK_SCRATCH(scratch);

		/* Scratch allocation failed: fall back to default policy. */
		if (!scratch)
			goto put_mpol;

		/* contextualize the tmpfs mount point mempolicy to this file */
		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(npol))
			goto free_scratch; /* no valid nodemask intersection */

		/* Nodemask contextualization reads current's cpuset state. */
		task_lock(current);
		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_npol;

		/* alloc node covering entire file; adds ref to file's npol */
		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
		if (sn)
			sp_insert(sp, sn);
		/* Labels below fall through: each undoes one earlier step. */
put_npol:
		mpol_put(npol);	/* drop initial ref on file's npol */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3243 
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3244 int mpol_set_shared_policy(struct shared_policy *sp,
3245 			struct vm_area_struct *vma, struct mempolicy *pol)
3246 {
3247 	int err;
3248 	struct sp_node *new = NULL;
3249 	unsigned long sz = vma_pages(vma);
3250 
3251 	if (pol) {
3252 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3253 		if (!new)
3254 			return -ENOMEM;
3255 	}
3256 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3257 	if (err && new)
3258 		sp_free(new);
3259 	return err;
3260 }
3261 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3262 
3263 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3264 void mpol_free_shared_policy(struct shared_policy *sp)
3265 {
3266 	struct sp_node *n;
3267 	struct rb_node *next;
3268 
3269 	if (!sp->root.rb_node)
3270 		return;
3271 	write_lock(&sp->lock);
3272 	next = rb_first(&sp->root);
3273 	while (next) {
3274 		n = rb_entry(next, struct sp_node, nd);
3275 		next = rb_next(&n->nd);
3276 		sp_delete(sp, n);
3277 	}
3278 	write_unlock(&sp->lock);
3279 }
3280 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3281 
3282 #ifdef CONFIG_NUMA_BALANCING
3283 static int __initdata numabalancing_override;
3284 
check_numabalancing_enable(void)3285 static void __init check_numabalancing_enable(void)
3286 {
3287 	bool numabalancing_default = false;
3288 
3289 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3290 		numabalancing_default = true;
3291 
3292 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3293 	if (numabalancing_override)
3294 		set_numabalancing_state(numabalancing_override == 1);
3295 
3296 	if (num_online_nodes() > 1 && !numabalancing_override) {
3297 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3298 			numabalancing_default ? "Enabling" : "Disabling");
3299 		set_numabalancing_state(numabalancing_default);
3300 	}
3301 }
3302 
setup_numabalancing(char * str)3303 static int __init setup_numabalancing(char *str)
3304 {
3305 	int ret = 0;
3306 	if (!str)
3307 		goto out;
3308 
3309 	if (!strcmp(str, "enable")) {
3310 		numabalancing_override = 1;
3311 		ret = 1;
3312 	} else if (!strcmp(str, "disable")) {
3313 		numabalancing_override = -1;
3314 		ret = 1;
3315 	}
3316 out:
3317 	if (!ret)
3318 		pr_warn("Unable to parse numa_balancing=\n");
3319 
3320 	return ret;
3321 }
3322 __setup("numa_balancing=", setup_numabalancing);
3323 #else
/* CONFIG_NUMA_BALANCING=n: nothing to configure at boot. */
static inline void __init check_numabalancing_enable(void)
{
}
3327 #endif /* CONFIG_NUMA_BALANCING */
3328 
numa_policy_init(void)3329 void __init numa_policy_init(void)
3330 {
3331 	nodemask_t interleave_nodes;
3332 	unsigned long largest = 0;
3333 	int nid, prefer = 0;
3334 
3335 	policy_cache = kmem_cache_create("numa_policy",
3336 					 sizeof(struct mempolicy),
3337 					 0, SLAB_PANIC, NULL);
3338 
3339 	sn_cache = kmem_cache_create("shared_policy_node",
3340 				     sizeof(struct sp_node),
3341 				     0, SLAB_PANIC, NULL);
3342 
3343 	for_each_node(nid) {
3344 		preferred_node_policy[nid] = (struct mempolicy) {
3345 			.refcnt = ATOMIC_INIT(1),
3346 			.mode = MPOL_PREFERRED,
3347 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3348 			.nodes = nodemask_of_node(nid),
3349 		};
3350 	}
3351 
3352 	/*
3353 	 * Set interleaving policy for system init. Interleaving is only
3354 	 * enabled across suitably sized nodes (default is >= 16MB), or
3355 	 * fall back to the largest node if they're all smaller.
3356 	 */
3357 	nodes_clear(interleave_nodes);
3358 	for_each_node_state(nid, N_MEMORY) {
3359 		unsigned long total_pages = node_present_pages(nid);
3360 
3361 		/* Preserve the largest node */
3362 		if (largest < total_pages) {
3363 			largest = total_pages;
3364 			prefer = nid;
3365 		}
3366 
3367 		/* Interleave this node? */
3368 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3369 			node_set(nid, interleave_nodes);
3370 	}
3371 
3372 	/* All too small, use the largest */
3373 	if (unlikely(nodes_empty(interleave_nodes)))
3374 		node_set(prefer, interleave_nodes);
3375 
3376 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3377 		pr_err("%s: interleaving failed\n", __func__);
3378 
3379 	check_numabalancing_enable();
3380 }
3381 
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	/* MPOL_DEFAULT takes no flags and no nodemask, hence 0/NULL. */
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
3387 
/*
 * Parse and format mempolicy from/to strings
 */

/*
 * Mode keywords indexed by MPOL_* value; matched verbatim by
 * mpol_parse_str() and printed by mpol_to_str().
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
3401 
3402 #ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * Note: @str is split in place by overwriting the '=' and ':'
 * separators with NULs; they are restored before returning so callers
 * can still print the original option on error.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		/* Every requested node must currently have memory. */
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on a empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		/* MPOL_PREFERRED keeps only the first listed node. */
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" without a node degrades to local allocation. */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
3536 #endif /* CONFIG_TMPFS */
3537 
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flag flags, "relative|balancing", and to
 * display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	/*
	 * NULL, the static default policy, and the static per-node
	 * preferred policies all print as "default".
	 */
	if (pol &&
	    pol != &default_policy &&
	    !(pol >= &preferred_node_policy[0] &&
	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_LOCAL:
		break;
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		nodes = pol->nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Static and relative are mutually exclusive.
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");

		if (flags & MPOL_F_NUMA_BALANCING) {
			/* "balancing" may combine with static/relative. */
			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
				p += snprintf(p, buffer + maxlen - p, "|");
			p += snprintf(p, buffer + maxlen - p, "balancing");
		}
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}
3605 
3606 #ifdef CONFIG_SYSFS
/* One sysfs attribute per node: .../mempolicy/weighted_interleave/nodeN */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;	/* node this attribute reads/writes the weight for */
};

/* The weighted_interleave kobject plus its per-node attributes. */
struct sysfs_wi_group {
	struct kobject wi_kobj;
	struct mutex kobj_lock;		/* serialises nattrs[] updates */
	struct iw_node_attr *nattrs[];	/* indexed by nid; NULL if absent */
};

static struct sysfs_wi_group *wi_group;
3619 
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3620 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3621 			 char *buf)
3622 {
3623 	struct iw_node_attr *node_attr;
3624 	u8 weight;
3625 
3626 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3627 	weight = get_il_weight(node_attr->nid);
3628 	return sysfs_emit(buf, "%d\n", weight);
3629 }
3630 
/*
 * Store a new interleave weight for one node.  Weights live in an
 * RCU-published table: build a fresh table, swap it in under
 * wi_state_lock, and free the old one after a grace period.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	struct iw_node_attr *node_attr;
	u8 weight = 0;
	int i;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	/* Reject empty input, parse failures and zero weights. */
	if (count == 0 || sysfs_streq(buf, "") ||
	    kstrtou8(buf, 0, &weight) || weight == 0)
		return -EINVAL;

	/* Allocate the replacement table before taking the lock. */
	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state)
		return -ENOMEM;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
	if (old_wi_state) {
		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
					nr_node_ids * sizeof(u8));
	} else {
		/* No table yet: every node starts with weight 1. */
		for (i = 0; i < nr_node_ids; i++)
			new_wi_state->iw_table[i] = 1;
	}
	new_wi_state->iw_table[node_attr->nid] = weight;
	/* Any manual write switches the group out of auto mode. */
	new_wi_state->mode_auto = false;

	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		/* Wait for readers of the old table before freeing it. */
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}
3669 
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3670 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3671 		struct kobj_attribute *attr, char *buf)
3672 {
3673 	struct weighted_interleave_state *state;
3674 	bool wi_auto = true;
3675 
3676 	rcu_read_lock();
3677 	state = rcu_dereference(wi_state);
3678 	if (state)
3679 		wi_auto = state->mode_auto;
3680 	rcu_read_unlock();
3681 
3682 	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3683 }
3684 
weighted_interleave_auto_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3685 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3686 		struct kobj_attribute *attr, const char *buf, size_t count)
3687 {
3688 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3689 	unsigned int *bw;
3690 	bool input;
3691 	int i;
3692 
3693 	if (kstrtobool(buf, &input))
3694 		return -EINVAL;
3695 
3696 	new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
3697 	if (!new_wi_state)
3698 		return -ENOMEM;
3699 	for (i = 0; i < nr_node_ids; i++)
3700 		new_wi_state->iw_table[i] = 1;
3701 
3702 	mutex_lock(&wi_state_lock);
3703 	if (!input) {
3704 		old_wi_state = rcu_dereference_protected(wi_state,
3705 					lockdep_is_held(&wi_state_lock));
3706 		if (!old_wi_state)
3707 			goto update_wi_state;
3708 		if (input == old_wi_state->mode_auto) {
3709 			mutex_unlock(&wi_state_lock);
3710 			return count;
3711 		}
3712 
3713 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3714 					       nr_node_ids * sizeof(u8));
3715 		goto update_wi_state;
3716 	}
3717 
3718 	bw = node_bw_table;
3719 	if (!bw) {
3720 		mutex_unlock(&wi_state_lock);
3721 		kfree(new_wi_state);
3722 		return -ENODEV;
3723 	}
3724 
3725 	new_wi_state->mode_auto = true;
3726 	reduce_interleave_weights(bw, new_wi_state->iw_table);
3727 
3728 update_wi_state:
3729 	rcu_assign_pointer(wi_state, new_wi_state);
3730 	mutex_unlock(&wi_state_lock);
3731 	if (old_wi_state) {
3732 		synchronize_rcu();
3733 		kfree(old_wi_state);
3734 	}
3735 	return count;
3736 }
3737 
sysfs_wi_node_delete(int nid)3738 static void sysfs_wi_node_delete(int nid)
3739 {
3740 	struct iw_node_attr *attr;
3741 
3742 	if (nid < 0 || nid >= nr_node_ids)
3743 		return;
3744 
3745 	mutex_lock(&wi_group->kobj_lock);
3746 	attr = wi_group->nattrs[nid];
3747 	if (!attr) {
3748 		mutex_unlock(&wi_group->kobj_lock);
3749 		return;
3750 	}
3751 
3752 	wi_group->nattrs[nid] = NULL;
3753 	mutex_unlock(&wi_group->kobj_lock);
3754 
3755 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3756 	kfree(attr->kobj_attr.attr.name);
3757 	kfree(attr);
3758 }
3759 
sysfs_wi_node_delete_all(void)3760 static void sysfs_wi_node_delete_all(void)
3761 {
3762 	int nid;
3763 
3764 	for (nid = 0; nid < nr_node_ids; nid++)
3765 		sysfs_wi_node_delete(nid);
3766 }
3767 
wi_state_free(void)3768 static void wi_state_free(void)
3769 {
3770 	struct weighted_interleave_state *old_wi_state;
3771 
3772 	mutex_lock(&wi_state_lock);
3773 	old_wi_state = rcu_dereference_protected(wi_state,
3774 			lockdep_is_held(&wi_state_lock));
3775 	rcu_assign_pointer(wi_state, NULL);
3776 	mutex_unlock(&wi_state_lock);
3777 
3778 	if (old_wi_state) {
3779 		synchronize_rcu();
3780 		kfree(old_wi_state);
3781 	}
3782 }
3783 
/* The "auto" file: toggles bandwidth-derived vs. manually set weights. */
static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
			   weighted_interleave_auto_store);
3787 
wi_cleanup(void)3788 static void wi_cleanup(void) {
3789 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3790 	sysfs_wi_node_delete_all();
3791 	wi_state_free();
3792 }
3793 
wi_kobj_release(struct kobject * wi_kobj)3794 static void wi_kobj_release(struct kobject *wi_kobj)
3795 {
3796 	kfree(wi_group);
3797 }
3798 
/* kobject type for the weighted_interleave directory. */
static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};
3803 
sysfs_wi_node_add(int nid)3804 static int sysfs_wi_node_add(int nid)
3805 {
3806 	int ret;
3807 	char *name;
3808 	struct iw_node_attr *new_attr;
3809 
3810 	if (nid < 0 || nid >= nr_node_ids) {
3811 		pr_err("invalid node id: %d\n", nid);
3812 		return -EINVAL;
3813 	}
3814 
3815 	new_attr = kzalloc_obj(*new_attr);
3816 	if (!new_attr)
3817 		return -ENOMEM;
3818 
3819 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3820 	if (!name) {
3821 		kfree(new_attr);
3822 		return -ENOMEM;
3823 	}
3824 
3825 	sysfs_attr_init(&new_attr->kobj_attr.attr);
3826 	new_attr->kobj_attr.attr.name = name;
3827 	new_attr->kobj_attr.attr.mode = 0644;
3828 	new_attr->kobj_attr.show = node_show;
3829 	new_attr->kobj_attr.store = node_store;
3830 	new_attr->nid = nid;
3831 
3832 	mutex_lock(&wi_group->kobj_lock);
3833 	if (wi_group->nattrs[nid]) {
3834 		mutex_unlock(&wi_group->kobj_lock);
3835 		ret = -EEXIST;
3836 		goto out;
3837 	}
3838 
3839 	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3840 	if (ret) {
3841 		mutex_unlock(&wi_group->kobj_lock);
3842 		goto out;
3843 	}
3844 	wi_group->nattrs[nid] = new_attr;
3845 	mutex_unlock(&wi_group->kobj_lock);
3846 	return 0;
3847 
3848 out:
3849 	kfree(new_attr->kobj_attr.attr.name);
3850 	kfree(new_attr);
3851 	return ret;
3852 }
3853 
wi_node_notifier(struct notifier_block * nb,unsigned long action,void * data)3854 static int wi_node_notifier(struct notifier_block *nb,
3855 			       unsigned long action, void *data)
3856 {
3857 	int err;
3858 	struct node_notify *nn = data;
3859 	int nid = nn->nid;
3860 
3861 	switch (action) {
3862 	case NODE_ADDED_FIRST_MEMORY:
3863 		err = sysfs_wi_node_add(nid);
3864 		if (err)
3865 			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3866 			       nid, err);
3867 		break;
3868 	case NODE_REMOVED_LAST_MEMORY:
3869 		sysfs_wi_node_delete(nid);
3870 		break;
3871 	}
3872 
3873 	return NOTIFY_OK;
3874 }
3875 
/*
 * Create the mempolicy/weighted_interleave sysfs directory, its "auto"
 * file and one "nodeN" file per node with memory, then register for
 * node hotplug so the file set tracks future changes.
 */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	for_each_online_node(nid) {
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

err_cleanup_kobj:
	/* Remove created files before dropping the kobject. */
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	/* Final put runs wi_kobj_release(), which frees wi_group. */
	kobject_put(&wi_group->wi_kobj);
	return err;
}
3916 
mempolicy_sysfs_init(void)3917 static int __init mempolicy_sysfs_init(void)
3918 {
3919 	int err;
3920 	static struct kobject *mempolicy_kobj;
3921 
3922 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3923 	if (!mempolicy_kobj)
3924 		return -ENOMEM;
3925 
3926 	err = add_weighted_interleave_group(mempolicy_kobj);
3927 	if (err)
3928 		goto err_kobj;
3929 
3930 	return 0;
3931 
3932 err_kobj:
3933 	kobject_del(mempolicy_kobj);
3934 	kobject_put(mempolicy_kobj);
3935 	return err;
3936 }
3937 
3938 late_initcall(mempolicy_sysfs_init);
3939 #endif /* CONFIG_SYSFS */
3940