xref: /linux/mm/mempolicy.c (revision ebf68996de0ab250c5d520eb2291ab65643e9a1e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem kernel lowmem allocation don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102 
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105 
106 #include "internal.h"
107 
108 /* Internal flags */
109 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
110 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
111 
112 static struct kmem_cache *policy_cache;
113 static struct kmem_cache *sn_cache;
114 
115 /* Highest zone. An specific allocation for a zone below that is not
116    policied. */
117 enum zone_type policy_zone = 0;
118 
119 /*
120  * run-time system-wide default policy => local allocation
121  */
122 static struct mempolicy default_policy = {
123 	.refcnt = ATOMIC_INIT(1), /* never free it */
124 	.mode = MPOL_PREFERRED,
125 	.flags = MPOL_F_LOCAL,
126 };
127 
128 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 
130 struct mempolicy *get_task_policy(struct task_struct *p)
131 {
132 	struct mempolicy *pol = p->mempolicy;
133 	int node;
134 
135 	if (pol)
136 		return pol;
137 
138 	node = numa_node_id();
139 	if (node != NUMA_NO_NODE) {
140 		pol = &preferred_node_policy[node];
141 		/* preferred_node_policy is not initialised early in boot */
142 		if (pol->mode)
143 			return pol;
144 	}
145 
146 	return &default_policy;
147 }
148 
149 static const struct mempolicy_operations {
150 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152 } mpol_ops[MPOL_MAX];
153 
154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155 {
156 	return pol->flags & MPOL_MODE_FLAGS;
157 }
158 
159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 				   const nodemask_t *rel)
161 {
162 	nodemask_t tmp;
163 	nodes_fold(tmp, *orig, nodes_weight(*rel));
164 	nodes_onto(*ret, tmp, *rel);
165 }
166 
167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168 {
169 	if (nodes_empty(*nodes))
170 		return -EINVAL;
171 	pol->v.nodes = *nodes;
172 	return 0;
173 }
174 
175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176 {
177 	if (!nodes)
178 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
179 	else if (nodes_empty(*nodes))
180 		return -EINVAL;			/*  no allowed nodes */
181 	else
182 		pol->v.preferred_node = first_node(*nodes);
183 	return 0;
184 }
185 
186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 	if (nodes_empty(*nodes))
189 		return -EINVAL;
190 	pol->v.nodes = *nodes;
191 	return 0;
192 }
193 
194 /*
195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
196  * any, for the new policy.  mpol_new() has already validated the nodes
197  * parameter with respect to the policy mode and flags.  But, we need to
198  * handle an empty nodemask with MPOL_PREFERRED here.
199  *
200  * Must be called holding task's alloc_lock to protect task's mems_allowed
201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
202  */
203 static int mpol_set_nodemask(struct mempolicy *pol,
204 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
205 {
206 	int ret;
207 
208 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
209 	if (pol == NULL)
210 		return 0;
211 	/* Check N_MEMORY */
212 	nodes_and(nsc->mask1,
213 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
214 
215 	VM_BUG_ON(!nodes);
216 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 		nodes = NULL;	/* explicit local allocation */
218 	else {
219 		if (pol->flags & MPOL_F_RELATIVE_NODES)
220 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 		else
222 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
223 
224 		if (mpol_store_user_nodemask(pol))
225 			pol->w.user_nodemask = *nodes;
226 		else
227 			pol->w.cpuset_mems_allowed =
228 						cpuset_current_mems_allowed;
229 	}
230 
231 	if (nodes)
232 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 	else
234 		ret = mpol_ops[pol->mode].create(pol, NULL);
235 	return ret;
236 }
237 
238 /*
239  * This function just creates a new policy, does some check and simple
240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
241  */
242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 				  nodemask_t *nodes)
244 {
245 	struct mempolicy *policy;
246 
247 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249 
250 	if (mode == MPOL_DEFAULT) {
251 		if (nodes && !nodes_empty(*nodes))
252 			return ERR_PTR(-EINVAL);
253 		return NULL;
254 	}
255 	VM_BUG_ON(!nodes);
256 
257 	/*
258 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
259 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
260 	 * All other modes require a valid pointer to a non-empty nodemask.
261 	 */
262 	if (mode == MPOL_PREFERRED) {
263 		if (nodes_empty(*nodes)) {
264 			if (((flags & MPOL_F_STATIC_NODES) ||
265 			     (flags & MPOL_F_RELATIVE_NODES)))
266 				return ERR_PTR(-EINVAL);
267 		}
268 	} else if (mode == MPOL_LOCAL) {
269 		if (!nodes_empty(*nodes) ||
270 		    (flags & MPOL_F_STATIC_NODES) ||
271 		    (flags & MPOL_F_RELATIVE_NODES))
272 			return ERR_PTR(-EINVAL);
273 		mode = MPOL_PREFERRED;
274 	} else if (nodes_empty(*nodes))
275 		return ERR_PTR(-EINVAL);
276 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 	if (!policy)
278 		return ERR_PTR(-ENOMEM);
279 	atomic_set(&policy->refcnt, 1);
280 	policy->mode = mode;
281 	policy->flags = flags;
282 
283 	return policy;
284 }
285 
286 /* Slow path of a mpol destructor. */
287 void __mpol_put(struct mempolicy *p)
288 {
289 	if (!atomic_dec_and_test(&p->refcnt))
290 		return;
291 	kmem_cache_free(policy_cache, p);
292 }
293 
294 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295 {
296 }
297 
298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299 {
300 	nodemask_t tmp;
301 
302 	if (pol->flags & MPOL_F_STATIC_NODES)
303 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 	else {
307 		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
308 								*nodes);
309 		pol->w.cpuset_mems_allowed = tmp;
310 	}
311 
312 	if (nodes_empty(tmp))
313 		tmp = *nodes;
314 
315 	pol->v.nodes = tmp;
316 }
317 
318 static void mpol_rebind_preferred(struct mempolicy *pol,
319 						const nodemask_t *nodes)
320 {
321 	nodemask_t tmp;
322 
323 	if (pol->flags & MPOL_F_STATIC_NODES) {
324 		int node = first_node(pol->w.user_nodemask);
325 
326 		if (node_isset(node, *nodes)) {
327 			pol->v.preferred_node = node;
328 			pol->flags &= ~MPOL_F_LOCAL;
329 		} else
330 			pol->flags |= MPOL_F_LOCAL;
331 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 		pol->v.preferred_node = first_node(tmp);
334 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
335 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 						   pol->w.cpuset_mems_allowed,
337 						   *nodes);
338 		pol->w.cpuset_mems_allowed = *nodes;
339 	}
340 }
341 
342 /*
343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
344  *
345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
346  * policies are protected by task->mems_allowed_seq to prevent a premature
347  * OOM/allocation failure due to parallel nodemask modification.
348  */
349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350 {
351 	if (!pol)
352 		return;
353 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 		return;
356 
357 	mpol_ops[pol->mode].rebind(pol, newmask);
358 }
359 
360 /*
361  * Wrapper for mpol_rebind_policy() that just requires task
362  * pointer, and updates task mempolicy.
363  *
364  * Called with task's alloc_lock held.
365  */
366 
367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368 {
369 	mpol_rebind_policy(tsk->mempolicy, new);
370 }
371 
372 /*
373  * Rebind each vma in mm to new nodemask.
374  *
375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
376  */
377 
378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379 {
380 	struct vm_area_struct *vma;
381 
382 	down_write(&mm->mmap_sem);
383 	for (vma = mm->mmap; vma; vma = vma->vm_next)
384 		mpol_rebind_policy(vma->vm_policy, new);
385 	up_write(&mm->mmap_sem);
386 }
387 
388 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 	[MPOL_DEFAULT] = {
390 		.rebind = mpol_rebind_default,
391 	},
392 	[MPOL_INTERLEAVE] = {
393 		.create = mpol_new_interleave,
394 		.rebind = mpol_rebind_nodemask,
395 	},
396 	[MPOL_PREFERRED] = {
397 		.create = mpol_new_preferred,
398 		.rebind = mpol_rebind_preferred,
399 	},
400 	[MPOL_BIND] = {
401 		.create = mpol_new_bind,
402 		.rebind = mpol_rebind_nodemask,
403 	},
404 };
405 
406 static void migrate_page_add(struct page *page, struct list_head *pagelist,
407 				unsigned long flags);
408 
409 struct queue_pages {
410 	struct list_head *pagelist;
411 	unsigned long flags;
412 	nodemask_t *nmask;
413 	struct vm_area_struct *prev;
414 };
415 
416 /*
417  * Check if the page's nid is in qp->nmask.
418  *
419  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
420  * in the invert of qp->nmask.
421  */
422 static inline bool queue_pages_required(struct page *page,
423 					struct queue_pages *qp)
424 {
425 	int nid = page_to_nid(page);
426 	unsigned long flags = qp->flags;
427 
428 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
429 }
430 
431 /*
432  * queue_pages_pmd() has three possible return values:
433  * 1 - pages are placed on the right node or queued successfully.
434  * 0 - THP was split.
435  * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
436  *        page was already on a node that does not follow the policy.
437  */
438 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
439 				unsigned long end, struct mm_walk *walk)
440 {
441 	int ret = 0;
442 	struct page *page;
443 	struct queue_pages *qp = walk->private;
444 	unsigned long flags;
445 
446 	if (unlikely(is_pmd_migration_entry(*pmd))) {
447 		ret = -EIO;
448 		goto unlock;
449 	}
450 	page = pmd_page(*pmd);
451 	if (is_huge_zero_page(page)) {
452 		spin_unlock(ptl);
453 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
454 		goto out;
455 	}
456 	if (!queue_pages_required(page, qp)) {
457 		ret = 1;
458 		goto unlock;
459 	}
460 
461 	ret = 1;
462 	flags = qp->flags;
463 	/* go to thp migration */
464 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
465 		if (!vma_migratable(walk->vma)) {
466 			ret = -EIO;
467 			goto unlock;
468 		}
469 
470 		migrate_page_add(page, qp->pagelist, flags);
471 	} else
472 		ret = -EIO;
473 unlock:
474 	spin_unlock(ptl);
475 out:
476 	return ret;
477 }
478 
479 /*
480  * Scan through pages checking if pages follow certain conditions,
481  * and move them to the pagelist if they do.
482  */
483 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
484 			unsigned long end, struct mm_walk *walk)
485 {
486 	struct vm_area_struct *vma = walk->vma;
487 	struct page *page;
488 	struct queue_pages *qp = walk->private;
489 	unsigned long flags = qp->flags;
490 	int ret;
491 	pte_t *pte;
492 	spinlock_t *ptl;
493 
494 	ptl = pmd_trans_huge_lock(pmd, vma);
495 	if (ptl) {
496 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
497 		if (ret > 0)
498 			return 0;
499 		else if (ret < 0)
500 			return ret;
501 	}
502 
503 	if (pmd_trans_unstable(pmd))
504 		return 0;
505 
506 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
507 	for (; addr != end; pte++, addr += PAGE_SIZE) {
508 		if (!pte_present(*pte))
509 			continue;
510 		page = vm_normal_page(vma, addr, *pte);
511 		if (!page)
512 			continue;
513 		/*
514 		 * vm_normal_page() filters out zero pages, but there might
515 		 * still be PageReserved pages to skip, perhaps in a VDSO.
516 		 */
517 		if (PageReserved(page))
518 			continue;
519 		if (!queue_pages_required(page, qp))
520 			continue;
521 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
522 			if (!vma_migratable(vma))
523 				break;
524 			migrate_page_add(page, qp->pagelist, flags);
525 		} else
526 			break;
527 	}
528 	pte_unmap_unlock(pte - 1, ptl);
529 	cond_resched();
530 	return addr != end ? -EIO : 0;
531 }
532 
533 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
534 			       unsigned long addr, unsigned long end,
535 			       struct mm_walk *walk)
536 {
537 #ifdef CONFIG_HUGETLB_PAGE
538 	struct queue_pages *qp = walk->private;
539 	unsigned long flags = qp->flags;
540 	struct page *page;
541 	spinlock_t *ptl;
542 	pte_t entry;
543 
544 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
545 	entry = huge_ptep_get(pte);
546 	if (!pte_present(entry))
547 		goto unlock;
548 	page = pte_page(entry);
549 	if (!queue_pages_required(page, qp))
550 		goto unlock;
551 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
552 	if (flags & (MPOL_MF_MOVE_ALL) ||
553 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
554 		isolate_huge_page(page, qp->pagelist);
555 unlock:
556 	spin_unlock(ptl);
557 #else
558 	BUG();
559 #endif
560 	return 0;
561 }
562 
563 #ifdef CONFIG_NUMA_BALANCING
564 /*
565  * This is used to mark a range of virtual addresses to be inaccessible.
566  * These are later cleared by a NUMA hinting fault. Depending on these
567  * faults, pages may be migrated for better NUMA placement.
568  *
569  * This is assuming that NUMA faults are handled using PROT_NONE. If
570  * an architecture makes a different choice, it will need further
571  * changes to the core.
572  */
573 unsigned long change_prot_numa(struct vm_area_struct *vma,
574 			unsigned long addr, unsigned long end)
575 {
576 	int nr_updated;
577 
578 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
579 	if (nr_updated)
580 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
581 
582 	return nr_updated;
583 }
584 #else
585 static unsigned long change_prot_numa(struct vm_area_struct *vma,
586 			unsigned long addr, unsigned long end)
587 {
588 	return 0;
589 }
590 #endif /* CONFIG_NUMA_BALANCING */
591 
592 static int queue_pages_test_walk(unsigned long start, unsigned long end,
593 				struct mm_walk *walk)
594 {
595 	struct vm_area_struct *vma = walk->vma;
596 	struct queue_pages *qp = walk->private;
597 	unsigned long endvma = vma->vm_end;
598 	unsigned long flags = qp->flags;
599 
600 	/*
601 	 * Need check MPOL_MF_STRICT to return -EIO if possible
602 	 * regardless of vma_migratable
603 	 */
604 	if (!vma_migratable(vma) &&
605 	    !(flags & MPOL_MF_STRICT))
606 		return 1;
607 
608 	if (endvma > end)
609 		endvma = end;
610 	if (vma->vm_start > start)
611 		start = vma->vm_start;
612 
613 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
614 		if (!vma->vm_next && vma->vm_end < end)
615 			return -EFAULT;
616 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
617 			return -EFAULT;
618 	}
619 
620 	qp->prev = vma;
621 
622 	if (flags & MPOL_MF_LAZY) {
623 		/* Similar to task_numa_work, skip inaccessible VMAs */
624 		if (!is_vm_hugetlb_page(vma) &&
625 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
626 			!(vma->vm_flags & VM_MIXEDMAP))
627 			change_prot_numa(vma, start, endvma);
628 		return 1;
629 	}
630 
631 	/* queue pages from current vma */
632 	if (flags & MPOL_MF_VALID)
633 		return 0;
634 	return 1;
635 }
636 
637 /*
638  * Walk through page tables and collect pages to be migrated.
639  *
640  * If pages found in a given range are on a set of nodes (determined by
641  * @nodes and @flags,) it's isolated and queued to the pagelist which is
642  * passed via @private.)
643  */
644 static int
645 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
646 		nodemask_t *nodes, unsigned long flags,
647 		struct list_head *pagelist)
648 {
649 	struct queue_pages qp = {
650 		.pagelist = pagelist,
651 		.flags = flags,
652 		.nmask = nodes,
653 		.prev = NULL,
654 	};
655 	struct mm_walk queue_pages_walk = {
656 		.hugetlb_entry = queue_pages_hugetlb,
657 		.pmd_entry = queue_pages_pte_range,
658 		.test_walk = queue_pages_test_walk,
659 		.mm = mm,
660 		.private = &qp,
661 	};
662 
663 	return walk_page_range(start, end, &queue_pages_walk);
664 }
665 
666 /*
667  * Apply policy to a single VMA
668  * This must be called with the mmap_sem held for writing.
669  */
670 static int vma_replace_policy(struct vm_area_struct *vma,
671 						struct mempolicy *pol)
672 {
673 	int err;
674 	struct mempolicy *old;
675 	struct mempolicy *new;
676 
677 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
678 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
679 		 vma->vm_ops, vma->vm_file,
680 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
681 
682 	new = mpol_dup(pol);
683 	if (IS_ERR(new))
684 		return PTR_ERR(new);
685 
686 	if (vma->vm_ops && vma->vm_ops->set_policy) {
687 		err = vma->vm_ops->set_policy(vma, new);
688 		if (err)
689 			goto err_out;
690 	}
691 
692 	old = vma->vm_policy;
693 	vma->vm_policy = new; /* protected by mmap_sem */
694 	mpol_put(old);
695 
696 	return 0;
697  err_out:
698 	mpol_put(new);
699 	return err;
700 }
701 
702 /* Step 2: apply policy to a range and do splits. */
703 static int mbind_range(struct mm_struct *mm, unsigned long start,
704 		       unsigned long end, struct mempolicy *new_pol)
705 {
706 	struct vm_area_struct *next;
707 	struct vm_area_struct *prev;
708 	struct vm_area_struct *vma;
709 	int err = 0;
710 	pgoff_t pgoff;
711 	unsigned long vmstart;
712 	unsigned long vmend;
713 
714 	vma = find_vma(mm, start);
715 	if (!vma || vma->vm_start > start)
716 		return -EFAULT;
717 
718 	prev = vma->vm_prev;
719 	if (start > vma->vm_start)
720 		prev = vma;
721 
722 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
723 		next = vma->vm_next;
724 		vmstart = max(start, vma->vm_start);
725 		vmend   = min(end, vma->vm_end);
726 
727 		if (mpol_equal(vma_policy(vma), new_pol))
728 			continue;
729 
730 		pgoff = vma->vm_pgoff +
731 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
732 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
733 				 vma->anon_vma, vma->vm_file, pgoff,
734 				 new_pol, vma->vm_userfaultfd_ctx);
735 		if (prev) {
736 			vma = prev;
737 			next = vma->vm_next;
738 			if (mpol_equal(vma_policy(vma), new_pol))
739 				continue;
740 			/* vma_merge() joined vma && vma->next, case 8 */
741 			goto replace;
742 		}
743 		if (vma->vm_start != vmstart) {
744 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
745 			if (err)
746 				goto out;
747 		}
748 		if (vma->vm_end != vmend) {
749 			err = split_vma(vma->vm_mm, vma, vmend, 0);
750 			if (err)
751 				goto out;
752 		}
753  replace:
754 		err = vma_replace_policy(vma, new_pol);
755 		if (err)
756 			goto out;
757 	}
758 
759  out:
760 	return err;
761 }
762 
763 /* Set the process memory policy */
764 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
765 			     nodemask_t *nodes)
766 {
767 	struct mempolicy *new, *old;
768 	NODEMASK_SCRATCH(scratch);
769 	int ret;
770 
771 	if (!scratch)
772 		return -ENOMEM;
773 
774 	new = mpol_new(mode, flags, nodes);
775 	if (IS_ERR(new)) {
776 		ret = PTR_ERR(new);
777 		goto out;
778 	}
779 
780 	task_lock(current);
781 	ret = mpol_set_nodemask(new, nodes, scratch);
782 	if (ret) {
783 		task_unlock(current);
784 		mpol_put(new);
785 		goto out;
786 	}
787 	old = current->mempolicy;
788 	current->mempolicy = new;
789 	if (new && new->mode == MPOL_INTERLEAVE)
790 		current->il_prev = MAX_NUMNODES-1;
791 	task_unlock(current);
792 	mpol_put(old);
793 	ret = 0;
794 out:
795 	NODEMASK_SCRATCH_FREE(scratch);
796 	return ret;
797 }
798 
799 /*
800  * Return nodemask for policy for get_mempolicy() query
801  *
802  * Called with task's alloc_lock held
803  */
804 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
805 {
806 	nodes_clear(*nodes);
807 	if (p == &default_policy)
808 		return;
809 
810 	switch (p->mode) {
811 	case MPOL_BIND:
812 		/* Fall through */
813 	case MPOL_INTERLEAVE:
814 		*nodes = p->v.nodes;
815 		break;
816 	case MPOL_PREFERRED:
817 		if (!(p->flags & MPOL_F_LOCAL))
818 			node_set(p->v.preferred_node, *nodes);
819 		/* else return empty node mask for local allocation */
820 		break;
821 	default:
822 		BUG();
823 	}
824 }
825 
826 static int lookup_node(struct mm_struct *mm, unsigned long addr)
827 {
828 	struct page *p;
829 	int err;
830 
831 	int locked = 1;
832 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
833 	if (err >= 0) {
834 		err = page_to_nid(p);
835 		put_page(p);
836 	}
837 	if (locked)
838 		up_read(&mm->mmap_sem);
839 	return err;
840 }
841 
842 /* Retrieve NUMA policy */
843 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
844 			     unsigned long addr, unsigned long flags)
845 {
846 	int err;
847 	struct mm_struct *mm = current->mm;
848 	struct vm_area_struct *vma = NULL;
849 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
850 
851 	if (flags &
852 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
853 		return -EINVAL;
854 
855 	if (flags & MPOL_F_MEMS_ALLOWED) {
856 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
857 			return -EINVAL;
858 		*policy = 0;	/* just so it's initialized */
859 		task_lock(current);
860 		*nmask  = cpuset_current_mems_allowed;
861 		task_unlock(current);
862 		return 0;
863 	}
864 
865 	if (flags & MPOL_F_ADDR) {
866 		/*
867 		 * Do NOT fall back to task policy if the
868 		 * vma/shared policy at addr is NULL.  We
869 		 * want to return MPOL_DEFAULT in this case.
870 		 */
871 		down_read(&mm->mmap_sem);
872 		vma = find_vma_intersection(mm, addr, addr+1);
873 		if (!vma) {
874 			up_read(&mm->mmap_sem);
875 			return -EFAULT;
876 		}
877 		if (vma->vm_ops && vma->vm_ops->get_policy)
878 			pol = vma->vm_ops->get_policy(vma, addr);
879 		else
880 			pol = vma->vm_policy;
881 	} else if (addr)
882 		return -EINVAL;
883 
884 	if (!pol)
885 		pol = &default_policy;	/* indicates default behavior */
886 
887 	if (flags & MPOL_F_NODE) {
888 		if (flags & MPOL_F_ADDR) {
889 			/*
890 			 * Take a refcount on the mpol, lookup_node()
891 			 * wil drop the mmap_sem, so after calling
892 			 * lookup_node() only "pol" remains valid, "vma"
893 			 * is stale.
894 			 */
895 			pol_refcount = pol;
896 			vma = NULL;
897 			mpol_get(pol);
898 			err = lookup_node(mm, addr);
899 			if (err < 0)
900 				goto out;
901 			*policy = err;
902 		} else if (pol == current->mempolicy &&
903 				pol->mode == MPOL_INTERLEAVE) {
904 			*policy = next_node_in(current->il_prev, pol->v.nodes);
905 		} else {
906 			err = -EINVAL;
907 			goto out;
908 		}
909 	} else {
910 		*policy = pol == &default_policy ? MPOL_DEFAULT :
911 						pol->mode;
912 		/*
913 		 * Internal mempolicy flags must be masked off before exposing
914 		 * the policy to userspace.
915 		 */
916 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
917 	}
918 
919 	err = 0;
920 	if (nmask) {
921 		if (mpol_store_user_nodemask(pol)) {
922 			*nmask = pol->w.user_nodemask;
923 		} else {
924 			task_lock(current);
925 			get_policy_nodemask(pol, nmask);
926 			task_unlock(current);
927 		}
928 	}
929 
930  out:
931 	mpol_cond_put(pol);
932 	if (vma)
933 		up_read(&mm->mmap_sem);
934 	if (pol_refcount)
935 		mpol_put(pol_refcount);
936 	return err;
937 }
938 
939 #ifdef CONFIG_MIGRATION
940 /*
941  * page migration, thp tail pages can be passed.
942  */
943 static void migrate_page_add(struct page *page, struct list_head *pagelist,
944 				unsigned long flags)
945 {
946 	struct page *head = compound_head(page);
947 	/*
948 	 * Avoid migrating a page that is shared with others.
949 	 */
950 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
951 		if (!isolate_lru_page(head)) {
952 			list_add_tail(&head->lru, pagelist);
953 			mod_node_page_state(page_pgdat(head),
954 				NR_ISOLATED_ANON + page_is_file_cache(head),
955 				hpage_nr_pages(head));
956 		}
957 	}
958 }
959 
960 /* page allocation callback for NUMA node migration */
961 struct page *alloc_new_node_page(struct page *page, unsigned long node)
962 {
963 	if (PageHuge(page))
964 		return alloc_huge_page_node(page_hstate(compound_head(page)),
965 					node);
966 	else if (PageTransHuge(page)) {
967 		struct page *thp;
968 
969 		thp = alloc_pages_node(node,
970 			(GFP_TRANSHUGE | __GFP_THISNODE),
971 			HPAGE_PMD_ORDER);
972 		if (!thp)
973 			return NULL;
974 		prep_transhuge_page(thp);
975 		return thp;
976 	} else
977 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
978 						    __GFP_THISNODE, 0);
979 }
980 
981 /*
982  * Migrate pages from one node to a target node.
983  * Returns error or the number of pages not migrated.
984  */
985 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
986 			   int flags)
987 {
988 	nodemask_t nmask;
989 	LIST_HEAD(pagelist);
990 	int err = 0;
991 
992 	nodes_clear(nmask);
993 	node_set(source, nmask);
994 
995 	/*
996 	 * This does not "check" the range but isolates all pages that
997 	 * need migration.  Between passing in the full user address
998 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
999 	 */
1000 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1001 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1002 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1003 
1004 	if (!list_empty(&pagelist)) {
1005 		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1006 					MIGRATE_SYNC, MR_SYSCALL);
1007 		if (err)
1008 			putback_movable_pages(&pagelist);
1009 	}
1010 
1011 	return err;
1012 }
1013 
1014 /*
1015  * Move pages between the two nodesets so as to preserve the physical
1016  * layout as much as possible.
1017  *
1018  * Returns the number of page that could not be moved.
1019  */
1020 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1021 		     const nodemask_t *to, int flags)
1022 {
1023 	int busy = 0;
1024 	int err;
1025 	nodemask_t tmp;
1026 
1027 	err = migrate_prep();
1028 	if (err)
1029 		return err;
1030 
1031 	down_read(&mm->mmap_sem);
1032 
1033 	/*
1034 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1035 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1036 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1037 	 * The pair of nodemasks 'to' and 'from' define the map.
1038 	 *
1039 	 * If no pair of bits is found that way, fallback to picking some
1040 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1041 	 * 'source' and 'dest' bits are the same, this represents a node
1042 	 * that will be migrating to itself, so no pages need move.
1043 	 *
1044 	 * If no bits are left in 'tmp', or if all remaining bits left
1045 	 * in 'tmp' correspond to the same bit in 'to', return false
1046 	 * (nothing left to migrate).
1047 	 *
1048 	 * This lets us pick a pair of nodes to migrate between, such that
1049 	 * if possible the dest node is not already occupied by some other
1050 	 * source node, minimizing the risk of overloading the memory on a
1051 	 * node that would happen if we migrated incoming memory to a node
1052 	 * before migrating outgoing memory source that same node.
1053 	 *
1054 	 * A single scan of tmp is sufficient.  As we go, we remember the
1055 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1056 	 * that not only moved, but what's better, moved to an empty slot
1057 	 * (d is not set in tmp), then we break out then, with that pair.
1058 	 * Otherwise when we finish scanning from_tmp, we at least have the
1059 	 * most recent <s, d> pair that moved.  If we get all the way through
1060 	 * the scan of tmp without finding any node that moved, much less
1061 	 * moved to an empty node, then there is nothing left worth migrating.
1062 	 */
1063 
1064 	tmp = *from;
1065 	while (!nodes_empty(tmp)) {
1066 		int s,d;
1067 		int source = NUMA_NO_NODE;
1068 		int dest = 0;
1069 
1070 		for_each_node_mask(s, tmp) {
1071 
1072 			/*
1073 			 * do_migrate_pages() tries to maintain the relative
1074 			 * node relationship of the pages established between
1075 			 * threads and memory areas.
1076                          *
1077 			 * However if the number of source nodes is not equal to
1078 			 * the number of destination nodes we can not preserve
1079 			 * this node relative relationship.  In that case, skip
1080 			 * copying memory from a node that is in the destination
1081 			 * mask.
1082 			 *
1083 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1084 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1085 			 */
1086 
1087 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1088 						(node_isset(s, *to)))
1089 				continue;
1090 
1091 			d = node_remap(s, *from, *to);
1092 			if (s == d)
1093 				continue;
1094 
1095 			source = s;	/* Node moved. Memorize */
1096 			dest = d;
1097 
1098 			/* dest not in remaining from nodes? */
1099 			if (!node_isset(dest, tmp))
1100 				break;
1101 		}
1102 		if (source == NUMA_NO_NODE)
1103 			break;
1104 
1105 		node_clear(source, tmp);
1106 		err = migrate_to_node(mm, source, dest, flags);
1107 		if (err > 0)
1108 			busy += err;
1109 		if (err < 0)
1110 			break;
1111 	}
1112 	up_read(&mm->mmap_sem);
1113 	if (err < 0)
1114 		return err;
1115 	return busy;
1116 
1117 }
1118 
1119 /*
1120  * Allocate a new page for page migration based on vma policy.
1121  * Start by assuming the page is mapped by the same vma as contains @start.
1122  * Search forward from there, if not.  N.B., this assumes that the
1123  * list of pages handed to migrate_pages()--which is how we get here--
1124  * is in virtual address order.
1125  */
1126 static struct page *new_page(struct page *page, unsigned long start)
1127 {
1128 	struct vm_area_struct *vma;
1129 	unsigned long uninitialized_var(address);
1130 
1131 	vma = find_vma(current->mm, start);
1132 	while (vma) {
1133 		address = page_address_in_vma(page, vma);
1134 		if (address != -EFAULT)
1135 			break;
1136 		vma = vma->vm_next;
1137 	}
1138 
1139 	if (PageHuge(page)) {
1140 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1141 				vma, address);
1142 	} else if (PageTransHuge(page)) {
1143 		struct page *thp;
1144 
1145 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1146 					 HPAGE_PMD_ORDER);
1147 		if (!thp)
1148 			return NULL;
1149 		prep_transhuge_page(thp);
1150 		return thp;
1151 	}
1152 	/*
1153 	 * if !vma, alloc_page_vma() will use task or system default policy
1154 	 */
1155 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1156 			vma, address);
1157 }
1158 #else
1159 
1160 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1161 				unsigned long flags)
1162 {
1163 }
1164 
1165 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1166 		     const nodemask_t *to, int flags)
1167 {
1168 	return -ENOSYS;
1169 }
1170 
1171 static struct page *new_page(struct page *page, unsigned long start)
1172 {
1173 	return NULL;
1174 }
1175 #endif
1176 
1177 static long do_mbind(unsigned long start, unsigned long len,
1178 		     unsigned short mode, unsigned short mode_flags,
1179 		     nodemask_t *nmask, unsigned long flags)
1180 {
1181 	struct mm_struct *mm = current->mm;
1182 	struct mempolicy *new;
1183 	unsigned long end;
1184 	int err;
1185 	LIST_HEAD(pagelist);
1186 
1187 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1188 		return -EINVAL;
1189 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1190 		return -EPERM;
1191 
1192 	if (start & ~PAGE_MASK)
1193 		return -EINVAL;
1194 
1195 	if (mode == MPOL_DEFAULT)
1196 		flags &= ~MPOL_MF_STRICT;
1197 
1198 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1199 	end = start + len;
1200 
1201 	if (end < start)
1202 		return -EINVAL;
1203 	if (end == start)
1204 		return 0;
1205 
1206 	new = mpol_new(mode, mode_flags, nmask);
1207 	if (IS_ERR(new))
1208 		return PTR_ERR(new);
1209 
1210 	if (flags & MPOL_MF_LAZY)
1211 		new->flags |= MPOL_F_MOF;
1212 
1213 	/*
1214 	 * If we are using the default policy then operation
1215 	 * on discontinuous address spaces is okay after all
1216 	 */
1217 	if (!new)
1218 		flags |= MPOL_MF_DISCONTIG_OK;
1219 
1220 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1221 		 start, start + len, mode, mode_flags,
1222 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1223 
1224 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1225 
1226 		err = migrate_prep();
1227 		if (err)
1228 			goto mpol_out;
1229 	}
1230 	{
1231 		NODEMASK_SCRATCH(scratch);
1232 		if (scratch) {
1233 			down_write(&mm->mmap_sem);
1234 			task_lock(current);
1235 			err = mpol_set_nodemask(new, nmask, scratch);
1236 			task_unlock(current);
1237 			if (err)
1238 				up_write(&mm->mmap_sem);
1239 		} else
1240 			err = -ENOMEM;
1241 		NODEMASK_SCRATCH_FREE(scratch);
1242 	}
1243 	if (err)
1244 		goto mpol_out;
1245 
1246 	err = queue_pages_range(mm, start, end, nmask,
1247 			  flags | MPOL_MF_INVERT, &pagelist);
1248 	if (!err)
1249 		err = mbind_range(mm, start, end, new);
1250 
1251 	if (!err) {
1252 		int nr_failed = 0;
1253 
1254 		if (!list_empty(&pagelist)) {
1255 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1256 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1257 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1258 			if (nr_failed)
1259 				putback_movable_pages(&pagelist);
1260 		}
1261 
1262 		if (nr_failed && (flags & MPOL_MF_STRICT))
1263 			err = -EIO;
1264 	} else
1265 		putback_movable_pages(&pagelist);
1266 
1267 	up_write(&mm->mmap_sem);
1268  mpol_out:
1269 	mpol_put(new);
1270 	return err;
1271 }
1272 
1273 /*
1274  * User space interface with variable sized bitmaps for nodelists.
1275  */
1276 
1277 /* Copy a node mask from user space. */
1278 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1279 		     unsigned long maxnode)
1280 {
1281 	unsigned long k;
1282 	unsigned long t;
1283 	unsigned long nlongs;
1284 	unsigned long endmask;
1285 
1286 	--maxnode;
1287 	nodes_clear(*nodes);
1288 	if (maxnode == 0 || !nmask)
1289 		return 0;
1290 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1291 		return -EINVAL;
1292 
1293 	nlongs = BITS_TO_LONGS(maxnode);
1294 	if ((maxnode % BITS_PER_LONG) == 0)
1295 		endmask = ~0UL;
1296 	else
1297 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1298 
1299 	/*
1300 	 * When the user specified more nodes than supported just check
1301 	 * if the non supported part is all zero.
1302 	 *
1303 	 * If maxnode have more longs than MAX_NUMNODES, check
1304 	 * the bits in that area first. And then go through to
1305 	 * check the rest bits which equal or bigger than MAX_NUMNODES.
1306 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1307 	 */
1308 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1309 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1310 			if (get_user(t, nmask + k))
1311 				return -EFAULT;
1312 			if (k == nlongs - 1) {
1313 				if (t & endmask)
1314 					return -EINVAL;
1315 			} else if (t)
1316 				return -EINVAL;
1317 		}
1318 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1319 		endmask = ~0UL;
1320 	}
1321 
1322 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1323 		unsigned long valid_mask = endmask;
1324 
1325 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1326 		if (get_user(t, nmask + nlongs - 1))
1327 			return -EFAULT;
1328 		if (t & valid_mask)
1329 			return -EINVAL;
1330 	}
1331 
1332 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1333 		return -EFAULT;
1334 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1335 	return 0;
1336 }
1337 
1338 /* Copy a kernel node mask to user space */
1339 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1340 			      nodemask_t *nodes)
1341 {
1342 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1343 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1344 
1345 	if (copy > nbytes) {
1346 		if (copy > PAGE_SIZE)
1347 			return -EINVAL;
1348 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1349 			return -EFAULT;
1350 		copy = nbytes;
1351 	}
1352 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1353 }
1354 
1355 static long kernel_mbind(unsigned long start, unsigned long len,
1356 			 unsigned long mode, const unsigned long __user *nmask,
1357 			 unsigned long maxnode, unsigned int flags)
1358 {
1359 	nodemask_t nodes;
1360 	int err;
1361 	unsigned short mode_flags;
1362 
1363 	mode_flags = mode & MPOL_MODE_FLAGS;
1364 	mode &= ~MPOL_MODE_FLAGS;
1365 	if (mode >= MPOL_MAX)
1366 		return -EINVAL;
1367 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1368 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1369 		return -EINVAL;
1370 	err = get_nodes(&nodes, nmask, maxnode);
1371 	if (err)
1372 		return err;
1373 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1374 }
1375 
1376 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1377 		unsigned long, mode, const unsigned long __user *, nmask,
1378 		unsigned long, maxnode, unsigned int, flags)
1379 {
1380 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1381 }
1382 
1383 /* Set the process memory policy */
1384 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1385 				 unsigned long maxnode)
1386 {
1387 	int err;
1388 	nodemask_t nodes;
1389 	unsigned short flags;
1390 
1391 	flags = mode & MPOL_MODE_FLAGS;
1392 	mode &= ~MPOL_MODE_FLAGS;
1393 	if ((unsigned int)mode >= MPOL_MAX)
1394 		return -EINVAL;
1395 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1396 		return -EINVAL;
1397 	err = get_nodes(&nodes, nmask, maxnode);
1398 	if (err)
1399 		return err;
1400 	return do_set_mempolicy(mode, flags, &nodes);
1401 }
1402 
1403 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1404 		unsigned long, maxnode)
1405 {
1406 	return kernel_set_mempolicy(mode, nmask, maxnode);
1407 }
1408 
1409 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1410 				const unsigned long __user *old_nodes,
1411 				const unsigned long __user *new_nodes)
1412 {
1413 	struct mm_struct *mm = NULL;
1414 	struct task_struct *task;
1415 	nodemask_t task_nodes;
1416 	int err;
1417 	nodemask_t *old;
1418 	nodemask_t *new;
1419 	NODEMASK_SCRATCH(scratch);
1420 
1421 	if (!scratch)
1422 		return -ENOMEM;
1423 
1424 	old = &scratch->mask1;
1425 	new = &scratch->mask2;
1426 
1427 	err = get_nodes(old, old_nodes, maxnode);
1428 	if (err)
1429 		goto out;
1430 
1431 	err = get_nodes(new, new_nodes, maxnode);
1432 	if (err)
1433 		goto out;
1434 
1435 	/* Find the mm_struct */
1436 	rcu_read_lock();
1437 	task = pid ? find_task_by_vpid(pid) : current;
1438 	if (!task) {
1439 		rcu_read_unlock();
1440 		err = -ESRCH;
1441 		goto out;
1442 	}
1443 	get_task_struct(task);
1444 
1445 	err = -EINVAL;
1446 
1447 	/*
1448 	 * Check if this process has the right to modify the specified process.
1449 	 * Use the regular "ptrace_may_access()" checks.
1450 	 */
1451 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1452 		rcu_read_unlock();
1453 		err = -EPERM;
1454 		goto out_put;
1455 	}
1456 	rcu_read_unlock();
1457 
1458 	task_nodes = cpuset_mems_allowed(task);
1459 	/* Is the user allowed to access the target nodes? */
1460 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1461 		err = -EPERM;
1462 		goto out_put;
1463 	}
1464 
1465 	task_nodes = cpuset_mems_allowed(current);
1466 	nodes_and(*new, *new, task_nodes);
1467 	if (nodes_empty(*new))
1468 		goto out_put;
1469 
1470 	nodes_and(*new, *new, node_states[N_MEMORY]);
1471 	if (nodes_empty(*new))
1472 		goto out_put;
1473 
1474 	err = security_task_movememory(task);
1475 	if (err)
1476 		goto out_put;
1477 
1478 	mm = get_task_mm(task);
1479 	put_task_struct(task);
1480 
1481 	if (!mm) {
1482 		err = -EINVAL;
1483 		goto out;
1484 	}
1485 
1486 	err = do_migrate_pages(mm, old, new,
1487 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1488 
1489 	mmput(mm);
1490 out:
1491 	NODEMASK_SCRATCH_FREE(scratch);
1492 
1493 	return err;
1494 
1495 out_put:
1496 	put_task_struct(task);
1497 	goto out;
1498 
1499 }
1500 
1501 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1502 		const unsigned long __user *, old_nodes,
1503 		const unsigned long __user *, new_nodes)
1504 {
1505 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1506 }
1507 
1508 
1509 /* Retrieve NUMA policy */
1510 static int kernel_get_mempolicy(int __user *policy,
1511 				unsigned long __user *nmask,
1512 				unsigned long maxnode,
1513 				unsigned long addr,
1514 				unsigned long flags)
1515 {
1516 	int err;
1517 	int uninitialized_var(pval);
1518 	nodemask_t nodes;
1519 
1520 	if (nmask != NULL && maxnode < nr_node_ids)
1521 		return -EINVAL;
1522 
1523 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1524 
1525 	if (err)
1526 		return err;
1527 
1528 	if (policy && put_user(pval, policy))
1529 		return -EFAULT;
1530 
1531 	if (nmask)
1532 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1533 
1534 	return err;
1535 }
1536 
1537 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1538 		unsigned long __user *, nmask, unsigned long, maxnode,
1539 		unsigned long, addr, unsigned long, flags)
1540 {
1541 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1542 }
1543 
1544 #ifdef CONFIG_COMPAT
1545 
1546 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1547 		       compat_ulong_t __user *, nmask,
1548 		       compat_ulong_t, maxnode,
1549 		       compat_ulong_t, addr, compat_ulong_t, flags)
1550 {
1551 	long err;
1552 	unsigned long __user *nm = NULL;
1553 	unsigned long nr_bits, alloc_size;
1554 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1555 
1556 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1557 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1558 
1559 	if (nmask)
1560 		nm = compat_alloc_user_space(alloc_size);
1561 
1562 	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1563 
1564 	if (!err && nmask) {
1565 		unsigned long copy_size;
1566 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1567 		err = copy_from_user(bm, nm, copy_size);
1568 		/* ensure entire bitmap is zeroed */
1569 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1570 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1571 	}
1572 
1573 	return err;
1574 }
1575 
1576 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1577 		       compat_ulong_t, maxnode)
1578 {
1579 	unsigned long __user *nm = NULL;
1580 	unsigned long nr_bits, alloc_size;
1581 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1582 
1583 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1584 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1585 
1586 	if (nmask) {
1587 		if (compat_get_bitmap(bm, nmask, nr_bits))
1588 			return -EFAULT;
1589 		nm = compat_alloc_user_space(alloc_size);
1590 		if (copy_to_user(nm, bm, alloc_size))
1591 			return -EFAULT;
1592 	}
1593 
1594 	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1595 }
1596 
1597 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1598 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1599 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1600 {
1601 	unsigned long __user *nm = NULL;
1602 	unsigned long nr_bits, alloc_size;
1603 	nodemask_t bm;
1604 
1605 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1606 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1607 
1608 	if (nmask) {
1609 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1610 			return -EFAULT;
1611 		nm = compat_alloc_user_space(alloc_size);
1612 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1613 			return -EFAULT;
1614 	}
1615 
1616 	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1617 }
1618 
1619 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1620 		       compat_ulong_t, maxnode,
1621 		       const compat_ulong_t __user *, old_nodes,
1622 		       const compat_ulong_t __user *, new_nodes)
1623 {
1624 	unsigned long __user *old = NULL;
1625 	unsigned long __user *new = NULL;
1626 	nodemask_t tmp_mask;
1627 	unsigned long nr_bits;
1628 	unsigned long size;
1629 
1630 	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1631 	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1632 	if (old_nodes) {
1633 		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1634 			return -EFAULT;
1635 		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1636 		if (new_nodes)
1637 			new = old + size / sizeof(unsigned long);
1638 		if (copy_to_user(old, nodes_addr(tmp_mask), size))
1639 			return -EFAULT;
1640 	}
1641 	if (new_nodes) {
1642 		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1643 			return -EFAULT;
1644 		if (new == NULL)
1645 			new = compat_alloc_user_space(size);
1646 		if (copy_to_user(new, nodes_addr(tmp_mask), size))
1647 			return -EFAULT;
1648 	}
1649 	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1650 }
1651 
1652 #endif /* CONFIG_COMPAT */
1653 
1654 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1655 						unsigned long addr)
1656 {
1657 	struct mempolicy *pol = NULL;
1658 
1659 	if (vma) {
1660 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1661 			pol = vma->vm_ops->get_policy(vma, addr);
1662 		} else if (vma->vm_policy) {
1663 			pol = vma->vm_policy;
1664 
1665 			/*
1666 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1667 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1668 			 * count on these policies which will be dropped by
1669 			 * mpol_cond_put() later
1670 			 */
1671 			if (mpol_needs_cond_ref(pol))
1672 				mpol_get(pol);
1673 		}
1674 	}
1675 
1676 	return pol;
1677 }
1678 
1679 /*
1680  * get_vma_policy(@vma, @addr)
1681  * @vma: virtual memory area whose policy is sought
1682  * @addr: address in @vma for shared policy lookup
1683  *
1684  * Returns effective policy for a VMA at specified address.
1685  * Falls back to current->mempolicy or system default policy, as necessary.
1686  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1687  * count--added by the get_policy() vm_op, as appropriate--to protect against
1688  * freeing by another task.  It is the caller's responsibility to free the
1689  * extra reference for shared policies.
1690  */
1691 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1692 						unsigned long addr)
1693 {
1694 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1695 
1696 	if (!pol)
1697 		pol = get_task_policy(current);
1698 
1699 	return pol;
1700 }
1701 
1702 bool vma_policy_mof(struct vm_area_struct *vma)
1703 {
1704 	struct mempolicy *pol;
1705 
1706 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1707 		bool ret = false;
1708 
1709 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1710 		if (pol && (pol->flags & MPOL_F_MOF))
1711 			ret = true;
1712 		mpol_cond_put(pol);
1713 
1714 		return ret;
1715 	}
1716 
1717 	pol = vma->vm_policy;
1718 	if (!pol)
1719 		pol = get_task_policy(current);
1720 
1721 	return pol->flags & MPOL_F_MOF;
1722 }
1723 
1724 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1725 {
1726 	enum zone_type dynamic_policy_zone = policy_zone;
1727 
1728 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1729 
1730 	/*
1731 	 * if policy->v.nodes has movable memory only,
1732 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1733 	 *
1734 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1735 	 * so if the following test faile, it implies
1736 	 * policy->v.nodes has movable memory only.
1737 	 */
1738 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1739 		dynamic_policy_zone = ZONE_MOVABLE;
1740 
1741 	return zone >= dynamic_policy_zone;
1742 }
1743 
1744 /*
1745  * Return a nodemask representing a mempolicy for filtering nodes for
1746  * page allocation
1747  */
1748 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1749 {
1750 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1751 	if (unlikely(policy->mode == MPOL_BIND) &&
1752 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1753 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1754 		return &policy->v.nodes;
1755 
1756 	return NULL;
1757 }
1758 
1759 /* Return the node id preferred by the given mempolicy, or the given id */
1760 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1761 								int nd)
1762 {
1763 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1764 		nd = policy->v.preferred_node;
1765 	else {
1766 		/*
1767 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1768 		 * because we might easily break the expectation to stay on the
1769 		 * requested node and not break the policy.
1770 		 */
1771 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1772 	}
1773 
1774 	return nd;
1775 }
1776 
1777 /* Do dynamic interleaving for a process */
1778 static unsigned interleave_nodes(struct mempolicy *policy)
1779 {
1780 	unsigned next;
1781 	struct task_struct *me = current;
1782 
1783 	next = next_node_in(me->il_prev, policy->v.nodes);
1784 	if (next < MAX_NUMNODES)
1785 		me->il_prev = next;
1786 	return next;
1787 }
1788 
1789 /*
1790  * Depending on the memory policy provide a node from which to allocate the
1791  * next slab entry.
1792  */
1793 unsigned int mempolicy_slab_node(void)
1794 {
1795 	struct mempolicy *policy;
1796 	int node = numa_mem_id();
1797 
1798 	if (in_interrupt())
1799 		return node;
1800 
1801 	policy = current->mempolicy;
1802 	if (!policy || policy->flags & MPOL_F_LOCAL)
1803 		return node;
1804 
1805 	switch (policy->mode) {
1806 	case MPOL_PREFERRED:
1807 		/*
1808 		 * handled MPOL_F_LOCAL above
1809 		 */
1810 		return policy->v.preferred_node;
1811 
1812 	case MPOL_INTERLEAVE:
1813 		return interleave_nodes(policy);
1814 
1815 	case MPOL_BIND: {
1816 		struct zoneref *z;
1817 
1818 		/*
1819 		 * Follow bind policy behavior and start allocation at the
1820 		 * first node.
1821 		 */
1822 		struct zonelist *zonelist;
1823 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1824 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1825 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1826 							&policy->v.nodes);
1827 		return z->zone ? zone_to_nid(z->zone) : node;
1828 	}
1829 
1830 	default:
1831 		BUG();
1832 	}
1833 }
1834 
1835 /*
1836  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1837  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1838  * number of present nodes.
1839  */
1840 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1841 {
1842 	unsigned nnodes = nodes_weight(pol->v.nodes);
1843 	unsigned target;
1844 	int i;
1845 	int nid;
1846 
1847 	if (!nnodes)
1848 		return numa_node_id();
1849 	target = (unsigned int)n % nnodes;
1850 	nid = first_node(pol->v.nodes);
1851 	for (i = 0; i < target; i++)
1852 		nid = next_node(nid, pol->v.nodes);
1853 	return nid;
1854 }
1855 
1856 /* Determine a node number for interleave */
1857 static inline unsigned interleave_nid(struct mempolicy *pol,
1858 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1859 {
1860 	if (vma) {
1861 		unsigned long off;
1862 
1863 		/*
1864 		 * for small pages, there is no difference between
1865 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1866 		 * for huge pages, since vm_pgoff is in units of small
1867 		 * pages, we need to shift off the always 0 bits to get
1868 		 * a useful offset.
1869 		 */
1870 		BUG_ON(shift < PAGE_SHIFT);
1871 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1872 		off += (addr - vma->vm_start) >> shift;
1873 		return offset_il_node(pol, off);
1874 	} else
1875 		return interleave_nodes(pol);
1876 }
1877 
1878 #ifdef CONFIG_HUGETLBFS
1879 /*
1880  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1881  * @vma: virtual memory area whose policy is sought
1882  * @addr: address in @vma for shared policy lookup and interleave policy
1883  * @gfp_flags: for requested zone
1884  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1885  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1886  *
1887  * Returns a nid suitable for a huge page allocation and a pointer
1888  * to the struct mempolicy for conditional unref after allocation.
1889  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1890  * @nodemask for filtering the zonelist.
1891  *
1892  * Must be protected by read_mems_allowed_begin()
1893  */
1894 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1895 				struct mempolicy **mpol, nodemask_t **nodemask)
1896 {
1897 	int nid;
1898 
1899 	*mpol = get_vma_policy(vma, addr);
1900 	*nodemask = NULL;	/* assume !MPOL_BIND */
1901 
1902 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1903 		nid = interleave_nid(*mpol, vma, addr,
1904 					huge_page_shift(hstate_vma(vma)));
1905 	} else {
1906 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1907 		if ((*mpol)->mode == MPOL_BIND)
1908 			*nodemask = &(*mpol)->v.nodes;
1909 	}
1910 	return nid;
1911 }
1912 
1913 /*
1914  * init_nodemask_of_mempolicy
1915  *
1916  * If the current task's mempolicy is "default" [NULL], return 'false'
1917  * to indicate default policy.  Otherwise, extract the policy nodemask
1918  * for 'bind' or 'interleave' policy into the argument nodemask, or
1919  * initialize the argument nodemask to contain the single node for
1920  * 'preferred' or 'local' policy and return 'true' to indicate presence
1921  * of non-default mempolicy.
1922  *
1923  * We don't bother with reference counting the mempolicy [mpol_get/put]
1924  * because the current task is examining it's own mempolicy and a task's
1925  * mempolicy is only ever changed by the task itself.
1926  *
1927  * N.B., it is the caller's responsibility to free a returned nodemask.
1928  */
1929 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1930 {
1931 	struct mempolicy *mempolicy;
1932 	int nid;
1933 
1934 	if (!(mask && current->mempolicy))
1935 		return false;
1936 
1937 	task_lock(current);
1938 	mempolicy = current->mempolicy;
1939 	switch (mempolicy->mode) {
1940 	case MPOL_PREFERRED:
1941 		if (mempolicy->flags & MPOL_F_LOCAL)
1942 			nid = numa_node_id();
1943 		else
1944 			nid = mempolicy->v.preferred_node;
1945 		init_nodemask_of_node(mask, nid);
1946 		break;
1947 
1948 	case MPOL_BIND:
1949 		/* Fall through */
1950 	case MPOL_INTERLEAVE:
1951 		*mask =  mempolicy->v.nodes;
1952 		break;
1953 
1954 	default:
1955 		BUG();
1956 	}
1957 	task_unlock(current);
1958 
1959 	return true;
1960 }
1961 #endif
1962 
1963 /*
1964  * mempolicy_nodemask_intersects
1965  *
1966  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1967  * policy.  Otherwise, check for intersection between mask and the policy
1968  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1969  * policy, always return true since it may allocate elsewhere on fallback.
1970  *
1971  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1972  */
1973 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1974 					const nodemask_t *mask)
1975 {
1976 	struct mempolicy *mempolicy;
1977 	bool ret = true;
1978 
1979 	if (!mask)
1980 		return ret;
1981 	task_lock(tsk);
1982 	mempolicy = tsk->mempolicy;
1983 	if (!mempolicy)
1984 		goto out;
1985 
1986 	switch (mempolicy->mode) {
1987 	case MPOL_PREFERRED:
1988 		/*
1989 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1990 		 * allocate from, they may fallback to other nodes when oom.
1991 		 * Thus, it's possible for tsk to have allocated memory from
1992 		 * nodes in mask.
1993 		 */
1994 		break;
1995 	case MPOL_BIND:
1996 	case MPOL_INTERLEAVE:
1997 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1998 		break;
1999 	default:
2000 		BUG();
2001 	}
2002 out:
2003 	task_unlock(tsk);
2004 	return ret;
2005 }
2006 
2007 /* Allocate a page in interleaved policy.
2008    Own path because it needs to do special accounting. */
2009 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2010 					unsigned nid)
2011 {
2012 	struct page *page;
2013 
2014 	page = __alloc_pages(gfp, order, nid);
2015 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2016 	if (!static_branch_likely(&vm_numa_stat_key))
2017 		return page;
2018 	if (page && page_to_nid(page) == nid) {
2019 		preempt_disable();
2020 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2021 		preempt_enable();
2022 	}
2023 	return page;
2024 }
2025 
2026 /**
2027  * 	alloc_pages_vma	- Allocate a page for a VMA.
2028  *
2029  * 	@gfp:
2030  *      %GFP_USER    user allocation.
2031  *      %GFP_KERNEL  kernel allocations,
2032  *      %GFP_HIGHMEM highmem/user allocations,
2033  *      %GFP_FS      allocation should not call back into a file system.
2034  *      %GFP_ATOMIC  don't sleep.
2035  *
2036  *	@order:Order of the GFP allocation.
2037  * 	@vma:  Pointer to VMA or NULL if not available.
2038  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2039  *	@node: Which node to prefer for allocation (modulo policy).
2040  *	@hugepage: for hugepages try only the preferred node if possible
2041  *
2042  * 	This function allocates a page from the kernel page pool and applies
2043  *	a NUMA policy associated with the VMA or the current process.
2044  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2045  *	mm_struct of the VMA to prevent it from going away. Should be used for
2046  *	all allocations for pages that will be mapped into user space. Returns
2047  *	NULL when no page can be allocated.
2048  */
2049 struct page *
2050 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2051 		unsigned long addr, int node, bool hugepage)
2052 {
2053 	struct mempolicy *pol;
2054 	struct page *page;
2055 	int preferred_nid;
2056 	nodemask_t *nmask;
2057 
2058 	pol = get_vma_policy(vma, addr);
2059 
2060 	if (pol->mode == MPOL_INTERLEAVE) {
2061 		unsigned nid;
2062 
2063 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2064 		mpol_cond_put(pol);
2065 		page = alloc_page_interleave(gfp, order, nid);
2066 		goto out;
2067 	}
2068 
2069 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2070 		int hpage_node = node;
2071 
2072 		/*
2073 		 * For hugepage allocation and non-interleave policy which
2074 		 * allows the current node (or other explicitly preferred
2075 		 * node) we only try to allocate from the current/preferred
2076 		 * node and don't fall back to other nodes, as the cost of
2077 		 * remote accesses would likely offset THP benefits.
2078 		 *
2079 		 * If the policy is interleave, or does not allow the current
2080 		 * node in its nodemask, we allocate the standard way.
2081 		 */
2082 		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2083 			hpage_node = pol->v.preferred_node;
2084 
2085 		nmask = policy_nodemask(gfp, pol);
2086 		if (!nmask || node_isset(hpage_node, *nmask)) {
2087 			mpol_cond_put(pol);
2088 			page = __alloc_pages_node(hpage_node,
2089 						gfp | __GFP_THISNODE, order);
2090 			goto out;
2091 		}
2092 	}
2093 
2094 	nmask = policy_nodemask(gfp, pol);
2095 	preferred_nid = policy_node(gfp, pol, node);
2096 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2097 	mpol_cond_put(pol);
2098 out:
2099 	return page;
2100 }
2101 
2102 /**
2103  * 	alloc_pages_current - Allocate pages.
2104  *
2105  *	@gfp:
2106  *		%GFP_USER   user allocation,
2107  *      	%GFP_KERNEL kernel allocation,
2108  *      	%GFP_HIGHMEM highmem allocation,
2109  *      	%GFP_FS     don't call back into a file system.
2110  *      	%GFP_ATOMIC don't sleep.
2111  *	@order: Power of two of allocation size in pages. 0 is a single page.
2112  *
2113  *	Allocate a page from the kernel page pool.  When not in
2114  *	interrupt context and apply the current process NUMA policy.
2115  *	Returns NULL when no page can be allocated.
2116  */
2117 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2118 {
2119 	struct mempolicy *pol = &default_policy;
2120 	struct page *page;
2121 
2122 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2123 		pol = get_task_policy(current);
2124 
2125 	/*
2126 	 * No reference counting needed for current->mempolicy
2127 	 * nor system default_policy
2128 	 */
2129 	if (pol->mode == MPOL_INTERLEAVE)
2130 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2131 	else
2132 		page = __alloc_pages_nodemask(gfp, order,
2133 				policy_node(gfp, pol, numa_node_id()),
2134 				policy_nodemask(gfp, pol));
2135 
2136 	return page;
2137 }
2138 EXPORT_SYMBOL(alloc_pages_current);
2139 
2140 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2141 {
2142 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2143 
2144 	if (IS_ERR(pol))
2145 		return PTR_ERR(pol);
2146 	dst->vm_policy = pol;
2147 	return 0;
2148 }
2149 
2150 /*
2151  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2152  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2153  * with the mems_allowed returned by cpuset_mems_allowed().  This
2154  * keeps mempolicies cpuset relative after its cpuset moves.  See
2155  * further kernel/cpuset.c update_nodemask().
2156  *
2157  * current's mempolicy may be rebinded by the other task(the task that changes
2158  * cpuset's mems), so we needn't do rebind work for current task.
2159  */
2160 
2161 /* Slow path of a mempolicy duplicate */
2162 struct mempolicy *__mpol_dup(struct mempolicy *old)
2163 {
2164 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2165 
2166 	if (!new)
2167 		return ERR_PTR(-ENOMEM);
2168 
2169 	/* task's mempolicy is protected by alloc_lock */
2170 	if (old == current->mempolicy) {
2171 		task_lock(current);
2172 		*new = *old;
2173 		task_unlock(current);
2174 	} else
2175 		*new = *old;
2176 
2177 	if (current_cpuset_is_being_rebound()) {
2178 		nodemask_t mems = cpuset_mems_allowed(current);
2179 		mpol_rebind_policy(new, &mems);
2180 	}
2181 	atomic_set(&new->refcnt, 1);
2182 	return new;
2183 }
2184 
2185 /* Slow path of a mempolicy comparison */
2186 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2187 {
2188 	if (!a || !b)
2189 		return false;
2190 	if (a->mode != b->mode)
2191 		return false;
2192 	if (a->flags != b->flags)
2193 		return false;
2194 	if (mpol_store_user_nodemask(a))
2195 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2196 			return false;
2197 
2198 	switch (a->mode) {
2199 	case MPOL_BIND:
2200 		/* Fall through */
2201 	case MPOL_INTERLEAVE:
2202 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2203 	case MPOL_PREFERRED:
2204 		/* a's ->flags is the same as b's */
2205 		if (a->flags & MPOL_F_LOCAL)
2206 			return true;
2207 		return a->v.preferred_node == b->v.preferred_node;
2208 	default:
2209 		BUG();
2210 		return false;
2211 	}
2212 }
2213 
2214 /*
2215  * Shared memory backing store policy support.
2216  *
2217  * Remember policies even when nobody has shared memory mapped.
2218  * The policies are kept in Red-Black tree linked from the inode.
2219  * They are protected by the sp->lock rwlock, which should be held
2220  * for any accesses to the tree.
2221  */
2222 
2223 /*
2224  * lookup first element intersecting start-end.  Caller holds sp->lock for
2225  * reading or for writing
2226  */
2227 static struct sp_node *
2228 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2229 {
2230 	struct rb_node *n = sp->root.rb_node;
2231 
2232 	while (n) {
2233 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2234 
2235 		if (start >= p->end)
2236 			n = n->rb_right;
2237 		else if (end <= p->start)
2238 			n = n->rb_left;
2239 		else
2240 			break;
2241 	}
2242 	if (!n)
2243 		return NULL;
2244 	for (;;) {
2245 		struct sp_node *w = NULL;
2246 		struct rb_node *prev = rb_prev(n);
2247 		if (!prev)
2248 			break;
2249 		w = rb_entry(prev, struct sp_node, nd);
2250 		if (w->end <= start)
2251 			break;
2252 		n = prev;
2253 	}
2254 	return rb_entry(n, struct sp_node, nd);
2255 }
2256 
2257 /*
2258  * Insert a new shared policy into the list.  Caller holds sp->lock for
2259  * writing.
2260  */
2261 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2262 {
2263 	struct rb_node **p = &sp->root.rb_node;
2264 	struct rb_node *parent = NULL;
2265 	struct sp_node *nd;
2266 
2267 	while (*p) {
2268 		parent = *p;
2269 		nd = rb_entry(parent, struct sp_node, nd);
2270 		if (new->start < nd->start)
2271 			p = &(*p)->rb_left;
2272 		else if (new->end > nd->end)
2273 			p = &(*p)->rb_right;
2274 		else
2275 			BUG();
2276 	}
2277 	rb_link_node(&new->nd, parent, p);
2278 	rb_insert_color(&new->nd, &sp->root);
2279 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2280 		 new->policy ? new->policy->mode : 0);
2281 }
2282 
2283 /* Find shared policy intersecting idx */
2284 struct mempolicy *
2285 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2286 {
2287 	struct mempolicy *pol = NULL;
2288 	struct sp_node *sn;
2289 
2290 	if (!sp->root.rb_node)
2291 		return NULL;
2292 	read_lock(&sp->lock);
2293 	sn = sp_lookup(sp, idx, idx+1);
2294 	if (sn) {
2295 		mpol_get(sn->policy);
2296 		pol = sn->policy;
2297 	}
2298 	read_unlock(&sp->lock);
2299 	return pol;
2300 }
2301 
2302 static void sp_free(struct sp_node *n)
2303 {
2304 	mpol_put(n->policy);
2305 	kmem_cache_free(sn_cache, n);
2306 }
2307 
2308 /**
2309  * mpol_misplaced - check whether current page node is valid in policy
2310  *
2311  * @page: page to be checked
2312  * @vma: vm area where page mapped
2313  * @addr: virtual address where page mapped
2314  *
2315  * Lookup current policy node id for vma,addr and "compare to" page's
2316  * node id.
2317  *
2318  * Returns:
2319  *	-1	- not misplaced, page is in the right node
2320  *	node	- node id where the page should be
2321  *
2322  * Policy determination "mimics" alloc_page_vma().
2323  * Called from fault path where we know the vma and faulting address.
2324  */
2325 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2326 {
2327 	struct mempolicy *pol;
2328 	struct zoneref *z;
2329 	int curnid = page_to_nid(page);
2330 	unsigned long pgoff;
2331 	int thiscpu = raw_smp_processor_id();
2332 	int thisnid = cpu_to_node(thiscpu);
2333 	int polnid = NUMA_NO_NODE;
2334 	int ret = -1;
2335 
2336 	pol = get_vma_policy(vma, addr);
2337 	if (!(pol->flags & MPOL_F_MOF))
2338 		goto out;
2339 
2340 	switch (pol->mode) {
2341 	case MPOL_INTERLEAVE:
2342 		pgoff = vma->vm_pgoff;
2343 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2344 		polnid = offset_il_node(pol, pgoff);
2345 		break;
2346 
2347 	case MPOL_PREFERRED:
2348 		if (pol->flags & MPOL_F_LOCAL)
2349 			polnid = numa_node_id();
2350 		else
2351 			polnid = pol->v.preferred_node;
2352 		break;
2353 
2354 	case MPOL_BIND:
2355 
2356 		/*
2357 		 * allows binding to multiple nodes.
2358 		 * use current page if in policy nodemask,
2359 		 * else select nearest allowed node, if any.
2360 		 * If no allowed nodes, use current [!misplaced].
2361 		 */
2362 		if (node_isset(curnid, pol->v.nodes))
2363 			goto out;
2364 		z = first_zones_zonelist(
2365 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2366 				gfp_zone(GFP_HIGHUSER),
2367 				&pol->v.nodes);
2368 		polnid = zone_to_nid(z->zone);
2369 		break;
2370 
2371 	default:
2372 		BUG();
2373 	}
2374 
2375 	/* Migrate the page towards the node whose CPU is referencing it */
2376 	if (pol->flags & MPOL_F_MORON) {
2377 		polnid = thisnid;
2378 
2379 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2380 			goto out;
2381 	}
2382 
2383 	if (curnid != polnid)
2384 		ret = polnid;
2385 out:
2386 	mpol_cond_put(pol);
2387 
2388 	return ret;
2389 }
2390 
2391 /*
2392  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2393  * dropped after task->mempolicy is set to NULL so that any allocation done as
2394  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2395  * policy.
2396  */
2397 void mpol_put_task_policy(struct task_struct *task)
2398 {
2399 	struct mempolicy *pol;
2400 
2401 	task_lock(task);
2402 	pol = task->mempolicy;
2403 	task->mempolicy = NULL;
2404 	task_unlock(task);
2405 	mpol_put(pol);
2406 }
2407 
2408 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2409 {
2410 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2411 	rb_erase(&n->nd, &sp->root);
2412 	sp_free(n);
2413 }
2414 
2415 static void sp_node_init(struct sp_node *node, unsigned long start,
2416 			unsigned long end, struct mempolicy *pol)
2417 {
2418 	node->start = start;
2419 	node->end = end;
2420 	node->policy = pol;
2421 }
2422 
2423 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2424 				struct mempolicy *pol)
2425 {
2426 	struct sp_node *n;
2427 	struct mempolicy *newpol;
2428 
2429 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2430 	if (!n)
2431 		return NULL;
2432 
2433 	newpol = mpol_dup(pol);
2434 	if (IS_ERR(newpol)) {
2435 		kmem_cache_free(sn_cache, n);
2436 		return NULL;
2437 	}
2438 	newpol->flags |= MPOL_F_SHARED;
2439 	sp_node_init(n, start, end, newpol);
2440 
2441 	return n;
2442 }
2443 
2444 /* Replace a policy range. */
2445 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2446 				 unsigned long end, struct sp_node *new)
2447 {
2448 	struct sp_node *n;
2449 	struct sp_node *n_new = NULL;
2450 	struct mempolicy *mpol_new = NULL;
2451 	int ret = 0;
2452 
2453 restart:
2454 	write_lock(&sp->lock);
2455 	n = sp_lookup(sp, start, end);
2456 	/* Take care of old policies in the same range. */
2457 	while (n && n->start < end) {
2458 		struct rb_node *next = rb_next(&n->nd);
2459 		if (n->start >= start) {
2460 			if (n->end <= end)
2461 				sp_delete(sp, n);
2462 			else
2463 				n->start = end;
2464 		} else {
2465 			/* Old policy spanning whole new range. */
2466 			if (n->end > end) {
2467 				if (!n_new)
2468 					goto alloc_new;
2469 
2470 				*mpol_new = *n->policy;
2471 				atomic_set(&mpol_new->refcnt, 1);
2472 				sp_node_init(n_new, end, n->end, mpol_new);
2473 				n->end = start;
2474 				sp_insert(sp, n_new);
2475 				n_new = NULL;
2476 				mpol_new = NULL;
2477 				break;
2478 			} else
2479 				n->end = start;
2480 		}
2481 		if (!next)
2482 			break;
2483 		n = rb_entry(next, struct sp_node, nd);
2484 	}
2485 	if (new)
2486 		sp_insert(sp, new);
2487 	write_unlock(&sp->lock);
2488 	ret = 0;
2489 
2490 err_out:
2491 	if (mpol_new)
2492 		mpol_put(mpol_new);
2493 	if (n_new)
2494 		kmem_cache_free(sn_cache, n_new);
2495 
2496 	return ret;
2497 
2498 alloc_new:
2499 	write_unlock(&sp->lock);
2500 	ret = -ENOMEM;
2501 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2502 	if (!n_new)
2503 		goto err_out;
2504 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2505 	if (!mpol_new)
2506 		goto err_out;
2507 	goto restart;
2508 }
2509 
2510 /**
2511  * mpol_shared_policy_init - initialize shared policy for inode
2512  * @sp: pointer to inode shared policy
2513  * @mpol:  struct mempolicy to install
2514  *
2515  * Install non-NULL @mpol in inode's shared policy rb-tree.
2516  * On entry, the current task has a reference on a non-NULL @mpol.
2517  * This must be released on exit.
2518  * This is called at get_inode() calls and we can use GFP_KERNEL.
2519  */
2520 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2521 {
2522 	int ret;
2523 
2524 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2525 	rwlock_init(&sp->lock);
2526 
2527 	if (mpol) {
2528 		struct vm_area_struct pvma;
2529 		struct mempolicy *new;
2530 		NODEMASK_SCRATCH(scratch);
2531 
2532 		if (!scratch)
2533 			goto put_mpol;
2534 		/* contextualize the tmpfs mount point mempolicy */
2535 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2536 		if (IS_ERR(new))
2537 			goto free_scratch; /* no valid nodemask intersection */
2538 
2539 		task_lock(current);
2540 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2541 		task_unlock(current);
2542 		if (ret)
2543 			goto put_new;
2544 
2545 		/* Create pseudo-vma that contains just the policy */
2546 		vma_init(&pvma, NULL);
2547 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2548 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2549 
2550 put_new:
2551 		mpol_put(new);			/* drop initial ref */
2552 free_scratch:
2553 		NODEMASK_SCRATCH_FREE(scratch);
2554 put_mpol:
2555 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2556 	}
2557 }
2558 
2559 int mpol_set_shared_policy(struct shared_policy *info,
2560 			struct vm_area_struct *vma, struct mempolicy *npol)
2561 {
2562 	int err;
2563 	struct sp_node *new = NULL;
2564 	unsigned long sz = vma_pages(vma);
2565 
2566 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2567 		 vma->vm_pgoff,
2568 		 sz, npol ? npol->mode : -1,
2569 		 npol ? npol->flags : -1,
2570 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2571 
2572 	if (npol) {
2573 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2574 		if (!new)
2575 			return -ENOMEM;
2576 	}
2577 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2578 	if (err && new)
2579 		sp_free(new);
2580 	return err;
2581 }
2582 
2583 /* Free a backing policy store on inode delete. */
2584 void mpol_free_shared_policy(struct shared_policy *p)
2585 {
2586 	struct sp_node *n;
2587 	struct rb_node *next;
2588 
2589 	if (!p->root.rb_node)
2590 		return;
2591 	write_lock(&p->lock);
2592 	next = rb_first(&p->root);
2593 	while (next) {
2594 		n = rb_entry(next, struct sp_node, nd);
2595 		next = rb_next(&n->nd);
2596 		sp_delete(p, n);
2597 	}
2598 	write_unlock(&p->lock);
2599 }
2600 
2601 #ifdef CONFIG_NUMA_BALANCING
2602 static int __initdata numabalancing_override;
2603 
2604 static void __init check_numabalancing_enable(void)
2605 {
2606 	bool numabalancing_default = false;
2607 
2608 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2609 		numabalancing_default = true;
2610 
2611 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2612 	if (numabalancing_override)
2613 		set_numabalancing_state(numabalancing_override == 1);
2614 
2615 	if (num_online_nodes() > 1 && !numabalancing_override) {
2616 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2617 			numabalancing_default ? "Enabling" : "Disabling");
2618 		set_numabalancing_state(numabalancing_default);
2619 	}
2620 }
2621 
2622 static int __init setup_numabalancing(char *str)
2623 {
2624 	int ret = 0;
2625 	if (!str)
2626 		goto out;
2627 
2628 	if (!strcmp(str, "enable")) {
2629 		numabalancing_override = 1;
2630 		ret = 1;
2631 	} else if (!strcmp(str, "disable")) {
2632 		numabalancing_override = -1;
2633 		ret = 1;
2634 	}
2635 out:
2636 	if (!ret)
2637 		pr_warn("Unable to parse numa_balancing=\n");
2638 
2639 	return ret;
2640 }
2641 __setup("numa_balancing=", setup_numabalancing);
2642 #else
2643 static inline void __init check_numabalancing_enable(void)
2644 {
2645 }
2646 #endif /* CONFIG_NUMA_BALANCING */
2647 
2648 /* assumes fs == KERNEL_DS */
2649 void __init numa_policy_init(void)
2650 {
2651 	nodemask_t interleave_nodes;
2652 	unsigned long largest = 0;
2653 	int nid, prefer = 0;
2654 
2655 	policy_cache = kmem_cache_create("numa_policy",
2656 					 sizeof(struct mempolicy),
2657 					 0, SLAB_PANIC, NULL);
2658 
2659 	sn_cache = kmem_cache_create("shared_policy_node",
2660 				     sizeof(struct sp_node),
2661 				     0, SLAB_PANIC, NULL);
2662 
2663 	for_each_node(nid) {
2664 		preferred_node_policy[nid] = (struct mempolicy) {
2665 			.refcnt = ATOMIC_INIT(1),
2666 			.mode = MPOL_PREFERRED,
2667 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2668 			.v = { .preferred_node = nid, },
2669 		};
2670 	}
2671 
2672 	/*
2673 	 * Set interleaving policy for system init. Interleaving is only
2674 	 * enabled across suitably sized nodes (default is >= 16MB), or
2675 	 * fall back to the largest node if they're all smaller.
2676 	 */
2677 	nodes_clear(interleave_nodes);
2678 	for_each_node_state(nid, N_MEMORY) {
2679 		unsigned long total_pages = node_present_pages(nid);
2680 
2681 		/* Preserve the largest node */
2682 		if (largest < total_pages) {
2683 			largest = total_pages;
2684 			prefer = nid;
2685 		}
2686 
2687 		/* Interleave this node? */
2688 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2689 			node_set(nid, interleave_nodes);
2690 	}
2691 
2692 	/* All too small, use the largest */
2693 	if (unlikely(nodes_empty(interleave_nodes)))
2694 		node_set(prefer, interleave_nodes);
2695 
2696 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2697 		pr_err("%s: interleaving failed\n", __func__);
2698 
2699 	check_numabalancing_enable();
2700 }
2701 
2702 /* Reset policy of current process to default */
2703 void numa_default_policy(void)
2704 {
2705 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2706 }
2707 
2708 /*
2709  * Parse and format mempolicy from/to strings
2710  */
2711 
2712 /*
2713  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2714  */
2715 static const char * const policy_modes[] =
2716 {
2717 	[MPOL_DEFAULT]    = "default",
2718 	[MPOL_PREFERRED]  = "prefer",
2719 	[MPOL_BIND]       = "bind",
2720 	[MPOL_INTERLEAVE] = "interleave",
2721 	[MPOL_LOCAL]      = "local",
2722 };
2723 
2724 
2725 #ifdef CONFIG_TMPFS
2726 /**
2727  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2728  * @str:  string containing mempolicy to parse
2729  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2730  *
2731  * Format of input:
2732  *	<mode>[=<flags>][:<nodelist>]
2733  *
2734  * On success, returns 0, else 1
2735  */
2736 int mpol_parse_str(char *str, struct mempolicy **mpol)
2737 {
2738 	struct mempolicy *new = NULL;
2739 	unsigned short mode_flags;
2740 	nodemask_t nodes;
2741 	char *nodelist = strchr(str, ':');
2742 	char *flags = strchr(str, '=');
2743 	int err = 1, mode;
2744 
2745 	if (nodelist) {
2746 		/* NUL-terminate mode or flags string */
2747 		*nodelist++ = '\0';
2748 		if (nodelist_parse(nodelist, nodes))
2749 			goto out;
2750 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2751 			goto out;
2752 	} else
2753 		nodes_clear(nodes);
2754 
2755 	if (flags)
2756 		*flags++ = '\0';	/* terminate mode string */
2757 
2758 	mode = match_string(policy_modes, MPOL_MAX, str);
2759 	if (mode < 0)
2760 		goto out;
2761 
2762 	switch (mode) {
2763 	case MPOL_PREFERRED:
2764 		/*
2765 		 * Insist on a nodelist of one node only
2766 		 */
2767 		if (nodelist) {
2768 			char *rest = nodelist;
2769 			while (isdigit(*rest))
2770 				rest++;
2771 			if (*rest)
2772 				goto out;
2773 		}
2774 		break;
2775 	case MPOL_INTERLEAVE:
2776 		/*
2777 		 * Default to online nodes with memory if no nodelist
2778 		 */
2779 		if (!nodelist)
2780 			nodes = node_states[N_MEMORY];
2781 		break;
2782 	case MPOL_LOCAL:
2783 		/*
2784 		 * Don't allow a nodelist;  mpol_new() checks flags
2785 		 */
2786 		if (nodelist)
2787 			goto out;
2788 		mode = MPOL_PREFERRED;
2789 		break;
2790 	case MPOL_DEFAULT:
2791 		/*
2792 		 * Insist on a empty nodelist
2793 		 */
2794 		if (!nodelist)
2795 			err = 0;
2796 		goto out;
2797 	case MPOL_BIND:
2798 		/*
2799 		 * Insist on a nodelist
2800 		 */
2801 		if (!nodelist)
2802 			goto out;
2803 	}
2804 
2805 	mode_flags = 0;
2806 	if (flags) {
2807 		/*
2808 		 * Currently, we only support two mutually exclusive
2809 		 * mode flags.
2810 		 */
2811 		if (!strcmp(flags, "static"))
2812 			mode_flags |= MPOL_F_STATIC_NODES;
2813 		else if (!strcmp(flags, "relative"))
2814 			mode_flags |= MPOL_F_RELATIVE_NODES;
2815 		else
2816 			goto out;
2817 	}
2818 
2819 	new = mpol_new(mode, mode_flags, &nodes);
2820 	if (IS_ERR(new))
2821 		goto out;
2822 
2823 	/*
2824 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2825 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2826 	 */
2827 	if (mode != MPOL_PREFERRED)
2828 		new->v.nodes = nodes;
2829 	else if (nodelist)
2830 		new->v.preferred_node = first_node(nodes);
2831 	else
2832 		new->flags |= MPOL_F_LOCAL;
2833 
2834 	/*
2835 	 * Save nodes for contextualization: this will be used to "clone"
2836 	 * the mempolicy in a specific context [cpuset] at a later time.
2837 	 */
2838 	new->w.user_nodemask = nodes;
2839 
2840 	err = 0;
2841 
2842 out:
2843 	/* Restore string for error message */
2844 	if (nodelist)
2845 		*--nodelist = ':';
2846 	if (flags)
2847 		*--flags = '=';
2848 	if (!err)
2849 		*mpol = new;
2850 	return err;
2851 }
2852 #endif /* CONFIG_TMPFS */
2853 
2854 /**
2855  * mpol_to_str - format a mempolicy structure for printing
2856  * @buffer:  to contain formatted mempolicy string
2857  * @maxlen:  length of @buffer
2858  * @pol:  pointer to mempolicy to be formatted
2859  *
2860  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2861  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2862  * longest flag, "relative", and to display at least a few node ids.
2863  */
2864 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2865 {
2866 	char *p = buffer;
2867 	nodemask_t nodes = NODE_MASK_NONE;
2868 	unsigned short mode = MPOL_DEFAULT;
2869 	unsigned short flags = 0;
2870 
2871 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2872 		mode = pol->mode;
2873 		flags = pol->flags;
2874 	}
2875 
2876 	switch (mode) {
2877 	case MPOL_DEFAULT:
2878 		break;
2879 	case MPOL_PREFERRED:
2880 		if (flags & MPOL_F_LOCAL)
2881 			mode = MPOL_LOCAL;
2882 		else
2883 			node_set(pol->v.preferred_node, nodes);
2884 		break;
2885 	case MPOL_BIND:
2886 	case MPOL_INTERLEAVE:
2887 		nodes = pol->v.nodes;
2888 		break;
2889 	default:
2890 		WARN_ON_ONCE(1);
2891 		snprintf(p, maxlen, "unknown");
2892 		return;
2893 	}
2894 
2895 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2896 
2897 	if (flags & MPOL_MODE_FLAGS) {
2898 		p += snprintf(p, buffer + maxlen - p, "=");
2899 
2900 		/*
2901 		 * Currently, the only defined flags are mutually exclusive
2902 		 */
2903 		if (flags & MPOL_F_STATIC_NODES)
2904 			p += snprintf(p, buffer + maxlen - p, "static");
2905 		else if (flags & MPOL_F_RELATIVE_NODES)
2906 			p += snprintf(p, buffer + maxlen - p, "relative");
2907 	}
2908 
2909 	if (!nodes_empty(nodes))
2910 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2911 			       nodemask_pr_args(&nodes));
2912 }
2913