xref: /linux/mm/mempolicy.c (revision 954ea91fb68b771dba6d87cfa61b68e09cc2497f)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Simple NUMA memory policy for the Linux kernel.
4   *
5   * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6   * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7   *
8   * NUMA policy allows the user to give hints about the node(s) on which
9   * memory should be allocated.
10   *
11   * The following memory policies are supported, per VMA and per process:
12   *
13   * The VMA policy has priority over the process policy for a page fault.
14   *
15   * interleave     Allocate memory interleaved over a set of nodes,
16   *                with normal fallback if it fails.
17   *                For VMA based allocations this interleaves based on the
18   *                offset into the backing object or offset into the mapping
19   *                for anonymous memory. For process policy, a per-process
20   *                counter is used.
21   *
22   * bind           Only allocate memory on a specific set of nodes,
23   *                no fallback.
24   *                FIXME: memory is allocated starting with the first node
25   * to the last. It would be better if bind truly restricted the
26   * allocation to the given set of memory nodes instead.
27   *
28   * preferred       Try a specific node first before normal fallback.
29   *                As a special case NUMA_NO_NODE here means do the allocation
30   *                on the local CPU. This is normally identical to default,
31   *                but useful to set in a VMA when you have a non default
32   *                process policy.
33   *
34   * preferred many Try a set of nodes first before normal fallback. This is
35   *                similar to preferred without the special case.
36   *
37   * default        Allocate on the local node first, or when on a VMA
38   *                use the process policy. This is what Linux always did
39   *		  in a NUMA aware kernel and still does by, ahem, default.
40   *
41   * The process policy is applied for most non-interrupt memory allocations
42   * in that process' context. Interrupts ignore the policies and always
43   * try to allocate on the local CPU. The VMA policy is only applied for memory
44   * allocations for a VMA in the VM.
45   *
46   * Currently there are a few corner cases in swapping where the policy
47   * is not applied, but the majority should be handled. When process policy
48   * is used it is not remembered over swap outs/swap ins.
49   *
50   * Only the highest zone in the zone hierarchy gets policied. Allocations
51   * requesting a lower zone just use default policy. This implies that
52   * on systems with highmem, kernel lowmem allocations don't get policied.
53   * Same with GFP_DMA allocations.
54   *
55   * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
56   * all users and remembered even when nobody has memory mapped.
57   */
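
/*
 * Illustrative userspace sketch of the policies described above, using the
 * <numaif.h> wrappers from libnuma (link with -lnuma).  This is not part of
 * the kernel code below; node numbers and sizes are assumptions chosen only
 * for the example, and error handling is omitted.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Process policy: interleave this task's future allocations.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	// VMA policy: bind one mapping to node 0 only.
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 1UL << 0;
 *	mbind(buf, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */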
58  
59  /* Notebook:
60     fix mmap readahead to honour policy and enable policy for any page cache
61     object
62     statistics for bigpages
63     global policy for page cache? currently it uses process policy. Requires
64     first item above.
65     handle mremap for shared memory (currently ignored for the policy)
66     grows down?
67     make bind policy root only? It can trigger oom much faster and the
68     kernel does not always handle that gracefully.
69  */
70  
71  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72  
73  #include <linux/mempolicy.h>
74  #include <linux/pagewalk.h>
75  #include <linux/highmem.h>
76  #include <linux/hugetlb.h>
77  #include <linux/kernel.h>
78  #include <linux/sched.h>
79  #include <linux/sched/mm.h>
80  #include <linux/sched/numa_balancing.h>
81  #include <linux/sched/task.h>
82  #include <linux/nodemask.h>
83  #include <linux/cpuset.h>
84  #include <linux/slab.h>
85  #include <linux/string.h>
86  #include <linux/export.h>
87  #include <linux/nsproxy.h>
88  #include <linux/interrupt.h>
89  #include <linux/init.h>
90  #include <linux/compat.h>
91  #include <linux/ptrace.h>
92  #include <linux/swap.h>
93  #include <linux/seq_file.h>
94  #include <linux/proc_fs.h>
95  #include <linux/migrate.h>
96  #include <linux/ksm.h>
97  #include <linux/rmap.h>
98  #include <linux/security.h>
99  #include <linux/syscalls.h>
100  #include <linux/ctype.h>
101  #include <linux/mm_inline.h>
102  #include <linux/mmu_notifier.h>
103  #include <linux/printk.h>
104  #include <linux/swapops.h>
105  
106  #include <asm/tlbflush.h>
107  #include <asm/tlb.h>
108  #include <linux/uaccess.h>
109  
110  #include "internal.h"
111  
112  /* Internal flags */
113  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
114  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
115  
116  static struct kmem_cache *policy_cache;
117  static struct kmem_cache *sn_cache;
118  
119  /* Highest zone. A specific allocation for a zone below that is not
120     policied. */
121  enum zone_type policy_zone = 0;
122  
123  /*
124   * run-time system-wide default policy => local allocation
125   */
126  static struct mempolicy default_policy = {
127  	.refcnt = ATOMIC_INIT(1), /* never free it */
128  	.mode = MPOL_LOCAL,
129  };
130  
131  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
132  
133  /**
134   * numa_map_to_online_node - Find closest online node
135   * @node: Node id to start the search
136   *
137   * Lookup the next closest node by distance if @nid is not online.
138   *
139   * Return: this @node if it is online, otherwise the closest node by distance
140   */
141  int numa_map_to_online_node(int node)
142  {
143  	int min_dist = INT_MAX, dist, n, min_node;
144  
145  	if (node == NUMA_NO_NODE || node_online(node))
146  		return node;
147  
148  	min_node = node;
149  	for_each_online_node(n) {
150  		dist = node_distance(node, n);
151  		if (dist < min_dist) {
152  			min_dist = dist;
153  			min_node = n;
154  		}
155  	}
156  
157  	return min_node;
158  }
159  EXPORT_SYMBOL_GPL(numa_map_to_online_node);
160  
161  struct mempolicy *get_task_policy(struct task_struct *p)
162  {
163  	struct mempolicy *pol = p->mempolicy;
164  	int node;
165  
166  	if (pol)
167  		return pol;
168  
169  	node = numa_node_id();
170  	if (node != NUMA_NO_NODE) {
171  		pol = &preferred_node_policy[node];
172  		/* preferred_node_policy is not initialised early in boot */
173  		if (pol->mode)
174  			return pol;
175  	}
176  
177  	return &default_policy;
178  }
179  
180  static const struct mempolicy_operations {
181  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
182  	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
183  } mpol_ops[MPOL_MAX];
184  
185  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
186  {
187  	return pol->flags & MPOL_MODE_FLAGS;
188  }
189  
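/*
 * Helper for MPOL_F_RELATIVE_NODES: remap the user's relative nodemask
 * @orig onto the currently allowed set @rel.  As a worked example (node
 * numbers are illustrative only): with @rel = {4,5,6} and a relative
 * request of {0,2}, the result is {4,6} -- bit N of @orig selects the
 * N'th set bit of @rel, wrapping modulo nodes_weight(*rel) via
 * nodes_fold().
 */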
190  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
191  				   const nodemask_t *rel)
192  {
193  	nodemask_t tmp;
194  	nodes_fold(tmp, *orig, nodes_weight(*rel));
195  	nodes_onto(*ret, tmp, *rel);
196  }
197  
198  static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
199  {
200  	if (nodes_empty(*nodes))
201  		return -EINVAL;
202  	pol->nodes = *nodes;
203  	return 0;
204  }
205  
206  static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
207  {
208  	if (nodes_empty(*nodes))
209  		return -EINVAL;
210  
211  	nodes_clear(pol->nodes);
212  	node_set(first_node(*nodes), pol->nodes);
213  	return 0;
214  }
215  
216  /*
217   * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
218   * any, for the new policy.  mpol_new() has already validated the nodes
219   * parameter with respect to the policy mode and flags.
220   *
221   * Must be called holding task's alloc_lock to protect task's mems_allowed
222   * and mempolicy.  May also be called holding the mmap_lock for write.
223   */
224  static int mpol_set_nodemask(struct mempolicy *pol,
225  		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
226  {
227  	int ret;
228  
229  	/*
230  	 * Default (pol==NULL) and local memory policies are not
231  	 * subject to any remapping. They also do not need any special
232  	 * constructor.
233  	 */
234  	if (!pol || pol->mode == MPOL_LOCAL)
235  		return 0;
236  
237  	/* Check N_MEMORY */
238  	nodes_and(nsc->mask1,
239  		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
240  
241  	VM_BUG_ON(!nodes);
242  
243  	if (pol->flags & MPOL_F_RELATIVE_NODES)
244  		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
245  	else
246  		nodes_and(nsc->mask2, *nodes, nsc->mask1);
247  
248  	if (mpol_store_user_nodemask(pol))
249  		pol->w.user_nodemask = *nodes;
250  	else
251  		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
252  
253  	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
254  	return ret;
255  }
256  
257  /*
258   * This function just creates a new policy, does some checks and simple
259   * initialization. You must invoke mpol_set_nodemask() to set nodes.
260   */
261  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
262  				  nodemask_t *nodes)
263  {
264  	struct mempolicy *policy;
265  
266  	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
267  		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
268  
269  	if (mode == MPOL_DEFAULT) {
270  		if (nodes && !nodes_empty(*nodes))
271  			return ERR_PTR(-EINVAL);
272  		return NULL;
273  	}
274  	VM_BUG_ON(!nodes);
275  
276  	/*
277  	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
278  	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
279  	 * All other modes require a valid pointer to a non-empty nodemask.
280  	 */
281  	if (mode == MPOL_PREFERRED) {
282  		if (nodes_empty(*nodes)) {
283  			if (((flags & MPOL_F_STATIC_NODES) ||
284  			     (flags & MPOL_F_RELATIVE_NODES)))
285  				return ERR_PTR(-EINVAL);
286  
287  			mode = MPOL_LOCAL;
288  		}
289  	} else if (mode == MPOL_LOCAL) {
290  		if (!nodes_empty(*nodes) ||
291  		    (flags & MPOL_F_STATIC_NODES) ||
292  		    (flags & MPOL_F_RELATIVE_NODES))
293  			return ERR_PTR(-EINVAL);
294  	} else if (nodes_empty(*nodes))
295  		return ERR_PTR(-EINVAL);
296  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
297  	if (!policy)
298  		return ERR_PTR(-ENOMEM);
299  	atomic_set(&policy->refcnt, 1);
300  	policy->mode = mode;
301  	policy->flags = flags;
302  	policy->home_node = NUMA_NO_NODE;
303  
304  	return policy;
305  }
306  
307  /* Slow path of a mpol destructor. */
308  void __mpol_put(struct mempolicy *p)
309  {
310  	if (!atomic_dec_and_test(&p->refcnt))
311  		return;
312  	kmem_cache_free(policy_cache, p);
313  }
314  
315  static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
316  {
317  }
318  
319  static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
320  {
321  	nodemask_t tmp;
322  
323  	if (pol->flags & MPOL_F_STATIC_NODES)
324  		nodes_and(tmp, pol->w.user_nodemask, *nodes);
325  	else if (pol->flags & MPOL_F_RELATIVE_NODES)
326  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
327  	else {
328  		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
329  								*nodes);
330  		pol->w.cpuset_mems_allowed = *nodes;
331  	}
332  
333  	if (nodes_empty(tmp))
334  		tmp = *nodes;
335  
336  	pol->nodes = tmp;
337  }
338  
339  static void mpol_rebind_preferred(struct mempolicy *pol,
340  						const nodemask_t *nodes)
341  {
342  	pol->w.cpuset_mems_allowed = *nodes;
343  }
344  
345  /*
346   * mpol_rebind_policy - Migrate a policy to a different set of nodes
347   *
348   * Per-vma policies are protected by mmap_lock. Allocations using per-task
349   * policies are protected by task->mems_allowed_seq to prevent a premature
350   * OOM/allocation failure due to parallel nodemask modification.
351   */
352  static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
353  {
354  	if (!pol || pol->mode == MPOL_LOCAL)
355  		return;
356  	if (!mpol_store_user_nodemask(pol) &&
357  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
358  		return;
359  
360  	mpol_ops[pol->mode].rebind(pol, newmask);
361  }
362  
363  /*
364   * Wrapper for mpol_rebind_policy() that just requires task
365   * pointer, and updates task mempolicy.
366   *
367   * Called with task's alloc_lock held.
368   */
369  
370  void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
371  {
372  	mpol_rebind_policy(tsk->mempolicy, new);
373  }
374  
375  /*
376   * Rebind each vma in mm to new nodemask.
377   *
378   * Call holding a reference to mm.  Takes mm->mmap_lock during call.
379   */
380  
381  void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
382  {
383  	struct vm_area_struct *vma;
384  	VMA_ITERATOR(vmi, mm, 0);
385  
386  	mmap_write_lock(mm);
387  	for_each_vma(vmi, vma)
388  		mpol_rebind_policy(vma->vm_policy, new);
389  	mmap_write_unlock(mm);
390  }
391  
392  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
393  	[MPOL_DEFAULT] = {
394  		.rebind = mpol_rebind_default,
395  	},
396  	[MPOL_INTERLEAVE] = {
397  		.create = mpol_new_nodemask,
398  		.rebind = mpol_rebind_nodemask,
399  	},
400  	[MPOL_PREFERRED] = {
401  		.create = mpol_new_preferred,
402  		.rebind = mpol_rebind_preferred,
403  	},
404  	[MPOL_BIND] = {
405  		.create = mpol_new_nodemask,
406  		.rebind = mpol_rebind_nodemask,
407  	},
408  	[MPOL_LOCAL] = {
409  		.rebind = mpol_rebind_default,
410  	},
411  	[MPOL_PREFERRED_MANY] = {
412  		.create = mpol_new_nodemask,
413  		.rebind = mpol_rebind_preferred,
414  	},
415  };
416  
417  static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
418  				unsigned long flags);
419  
420  struct queue_pages {
421  	struct list_head *pagelist;
422  	unsigned long flags;
423  	nodemask_t *nmask;
424  	unsigned long start;
425  	unsigned long end;
426  	struct vm_area_struct *first;
427  };
428  
429  /*
430   * Check if the folio's nid is in qp->nmask.
431   *
432   * If MPOL_MF_INVERT is set in qp->flags, the check is inverted:
433   * the folio qualifies if its nid is *not* in qp->nmask.
434   */
435  static inline bool queue_folio_required(struct folio *folio,
436  					struct queue_pages *qp)
437  {
438  	int nid = folio_nid(folio);
439  	unsigned long flags = qp->flags;
440  
441  	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
442  }
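
/*
 * For example, do_mbind() below passes MPOL_MF_INVERT together with the
 * user's nodemask, so a folio sitting on a node *outside* that mask (a
 * misplaced folio) is the one that qualifies and gets queued for
 * migration.
 */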
443  
444  /*
445   * queue_folios_pmd() has three possible return values:
446   * 0 - folios are placed on the right node or queued successfully, or
447   *     a special page was met, e.g. the huge zero page.
448   * 1 - there is an unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
449   *     specified.
450   * -EIO - a migration entry was found, or only MPOL_MF_STRICT was specified and an
451   *        existing folio was already on a node that does not follow the
452   *        policy.
453   */
454  static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
455  				unsigned long end, struct mm_walk *walk)
456  	__releases(ptl)
457  {
458  	int ret = 0;
459  	struct folio *folio;
460  	struct queue_pages *qp = walk->private;
461  	unsigned long flags;
462  
463  	if (unlikely(is_pmd_migration_entry(*pmd))) {
464  		ret = -EIO;
465  		goto unlock;
466  	}
467  	folio = pfn_folio(pmd_pfn(*pmd));
468  	if (is_huge_zero_page(&folio->page)) {
469  		walk->action = ACTION_CONTINUE;
470  		goto unlock;
471  	}
472  	if (!queue_folio_required(folio, qp))
473  		goto unlock;
474  
475  	flags = qp->flags;
476  	/* go to folio migration */
477  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
478  		if (!vma_migratable(walk->vma) ||
479  		    migrate_folio_add(folio, qp->pagelist, flags)) {
480  			ret = 1;
481  			goto unlock;
482  		}
483  	} else
484  		ret = -EIO;
485  unlock:
486  	spin_unlock(ptl);
487  	return ret;
488  }
489  
490  /*
491   * Scan through the pages, check whether they satisfy the required
492   * conditions, and move them to the pagelist if they do.
493   *
494   * queue_folios_pte_range() has three possible return values:
495   * 0 - folios are placed on the right node or queued successfully, or
496   *     a special page was met, e.g. the zero page.
497   * 1 - there is an unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
498   *     specified.
499   * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
500   *        on a node that does not follow the policy.
501   */
502  static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
503  			unsigned long end, struct mm_walk *walk)
504  {
505  	struct vm_area_struct *vma = walk->vma;
506  	struct folio *folio;
507  	struct queue_pages *qp = walk->private;
508  	unsigned long flags = qp->flags;
509  	bool has_unmovable = false;
510  	pte_t *pte, *mapped_pte;
511  	spinlock_t *ptl;
512  
513  	ptl = pmd_trans_huge_lock(pmd, vma);
514  	if (ptl)
515  		return queue_folios_pmd(pmd, ptl, addr, end, walk);
516  
517  	if (pmd_trans_unstable(pmd))
518  		return 0;
519  
520  	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
521  	for (; addr != end; pte++, addr += PAGE_SIZE) {
522  		if (!pte_present(*pte))
523  			continue;
524  		folio = vm_normal_folio(vma, addr, *pte);
525  		if (!folio || folio_is_zone_device(folio))
526  			continue;
527  		/*
528  		 * vm_normal_folio() filters out zero pages, but there might
529  		 * still be reserved folios to skip, perhaps in a VDSO.
530  		 */
531  		if (folio_test_reserved(folio))
532  			continue;
533  		if (!queue_folio_required(folio, qp))
534  			continue;
535  		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
536  			/* MPOL_MF_STRICT must be specified if we get here */
537  			if (!vma_migratable(vma)) {
538  				has_unmovable = true;
539  				break;
540  			}
541  
542  			/*
543  			 * Do not abort immediately since there may be
544  			 * temporarily off-LRU pages in the range.  We still
545  			 * need to migrate the other LRU pages.
546  			 */
547  			if (migrate_folio_add(folio, qp->pagelist, flags))
548  				has_unmovable = true;
549  		} else
550  			break;
551  	}
552  	pte_unmap_unlock(mapped_pte, ptl);
553  	cond_resched();
554  
555  	if (has_unmovable)
556  		return 1;
557  
558  	return addr != end ? -EIO : 0;
559  }
560  
561  static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
562  			       unsigned long addr, unsigned long end,
563  			       struct mm_walk *walk)
564  {
565  	int ret = 0;
566  #ifdef CONFIG_HUGETLB_PAGE
567  	struct queue_pages *qp = walk->private;
568  	unsigned long flags = (qp->flags & MPOL_MF_VALID);
569  	struct folio *folio;
570  	spinlock_t *ptl;
571  	pte_t entry;
572  
573  	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
574  	entry = huge_ptep_get(pte);
575  	if (!pte_present(entry))
576  		goto unlock;
577  	folio = pfn_folio(pte_pfn(entry));
578  	if (!queue_folio_required(folio, qp))
579  		goto unlock;
580  
581  	if (flags == MPOL_MF_STRICT) {
582  		/*
583  		 * STRICT alone means we only detect misplaced folios and there
584  		 * is no need to check further vmas.
585  		 */
586  		ret = -EIO;
587  		goto unlock;
588  	}
589  
590  	if (!vma_migratable(walk->vma)) {
591  		/*
592  		 * Must be STRICT with MOVE*, otherwise .test_walk() would have
593  		 * stopped walking the current vma.
594  		 * Detect the misplaced folio, but allow migrating folios which
595  		 * have already been queued.
596  		 */
597  		ret = 1;
598  		goto unlock;
599  	}
600  
601  	/*
602  	 * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
603  	 * is shared it is likely not worth migrating.
604  	 *
605  	 * To check if the folio is shared, ideally we want to make sure
606  	 * every page is mapped to the same process. Doing that is very
607  	 * expensive, so check the estimated mapcount of the folio instead.
608  	 */
609  	if (flags & (MPOL_MF_MOVE_ALL) ||
610  	    (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
611  	     !hugetlb_pmd_shared(pte))) {
612  		if (!isolate_hugetlb(folio, qp->pagelist) &&
613  			(flags & MPOL_MF_STRICT))
614  			/*
615  			 * Failed to isolate the folio, but allow migrating folios
616  			 * which have already been queued.
617  			 */
618  			ret = 1;
619  	}
620  unlock:
621  	spin_unlock(ptl);
622  #else
623  	BUG();
624  #endif
625  	return ret;
626  }
627  
628  #ifdef CONFIG_NUMA_BALANCING
629  /*
630   * This is used to mark a range of virtual addresses to be inaccessible.
631   * These are later cleared by a NUMA hinting fault. Depending on these
632   * faults, pages may be migrated for better NUMA placement.
633   *
634   * This is assuming that NUMA faults are handled using PROT_NONE. If
635   * an architecture makes a different choice, it will need further
636   * changes to the core.
637   */
638  unsigned long change_prot_numa(struct vm_area_struct *vma,
639  			unsigned long addr, unsigned long end)
640  {
641  	struct mmu_gather tlb;
642  	long nr_updated;
643  
644  	tlb_gather_mmu(&tlb, vma->vm_mm);
645  
646  	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
647  	if (nr_updated > 0)
648  		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
649  
650  	tlb_finish_mmu(&tlb);
651  
652  	return nr_updated;
653  }
654  #else
655  static unsigned long change_prot_numa(struct vm_area_struct *vma,
656  			unsigned long addr, unsigned long end)
657  {
658  	return 0;
659  }
660  #endif /* CONFIG_NUMA_BALANCING */
661  
662  static int queue_pages_test_walk(unsigned long start, unsigned long end,
663  				struct mm_walk *walk)
664  {
665  	struct vm_area_struct *next, *vma = walk->vma;
666  	struct queue_pages *qp = walk->private;
667  	unsigned long endvma = vma->vm_end;
668  	unsigned long flags = qp->flags;
669  
670  	/* range check first */
671  	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
672  
673  	if (!qp->first) {
674  		qp->first = vma;
675  		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
676  			(qp->start < vma->vm_start))
677  			/* hole at head side of range */
678  			return -EFAULT;
679  	}
680  	next = find_vma(vma->vm_mm, vma->vm_end);
681  	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
682  		((vma->vm_end < qp->end) &&
683  		(!next || vma->vm_end < next->vm_start)))
684  		/* hole at middle or tail of range */
685  		return -EFAULT;
686  
687  	/*
688  	 * MPOL_MF_STRICT must still be checked so that -EIO can be returned,
689  	 * if needed, regardless of vma_migratable().
690  	 */
691  	if (!vma_migratable(vma) &&
692  	    !(flags & MPOL_MF_STRICT))
693  		return 1;
694  
695  	if (endvma > end)
696  		endvma = end;
697  
698  	if (flags & MPOL_MF_LAZY) {
699  		/* Similar to task_numa_work, skip inaccessible VMAs */
700  		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
701  			!(vma->vm_flags & VM_MIXEDMAP))
702  			change_prot_numa(vma, start, endvma);
703  		return 1;
704  	}
705  
706  	/* queue pages from current vma */
707  	if (flags & MPOL_MF_VALID)
708  		return 0;
709  	return 1;
710  }
711  
712  static const struct mm_walk_ops queue_pages_walk_ops = {
713  	.hugetlb_entry		= queue_folios_hugetlb,
714  	.pmd_entry		= queue_folios_pte_range,
715  	.test_walk		= queue_pages_test_walk,
716  };
717  
718  /*
719   * Walk through page tables and collect pages to be migrated.
720   *
721   * If pages found in a given range are on a set of nodes (determined by
722   * @nodes and @flags), they are isolated and queued to the pagelist which
723   * is passed via @pagelist.
724   *
725   * queue_pages_range() has three possible return values:
726   * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
727   *     specified.
728   * 0 - queue pages successfully or no misplaced page.
729   * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
730   *         memory range specified by nodemask and maxnode points outside
731   *         your accessible address space (-EFAULT)
732   */
733  static int
734  queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
735  		nodemask_t *nodes, unsigned long flags,
736  		struct list_head *pagelist)
737  {
738  	int err;
739  	struct queue_pages qp = {
740  		.pagelist = pagelist,
741  		.flags = flags,
742  		.nmask = nodes,
743  		.start = start,
744  		.end = end,
745  		.first = NULL,
746  	};
747  
748  	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
749  
750  	if (!qp.first)
751  		/* whole range in hole */
752  		err = -EFAULT;
753  
754  	return err;
755  }
756  
757  /*
758   * Apply policy to a single VMA
759   * This must be called with the mmap_lock held for writing.
760   */
761  static int vma_replace_policy(struct vm_area_struct *vma,
762  						struct mempolicy *pol)
763  {
764  	int err;
765  	struct mempolicy *old;
766  	struct mempolicy *new;
767  
768  	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
769  		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
770  		 vma->vm_ops, vma->vm_file,
771  		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
772  
773  	new = mpol_dup(pol);
774  	if (IS_ERR(new))
775  		return PTR_ERR(new);
776  
777  	if (vma->vm_ops && vma->vm_ops->set_policy) {
778  		err = vma->vm_ops->set_policy(vma, new);
779  		if (err)
780  			goto err_out;
781  	}
782  
783  	old = vma->vm_policy;
784  	vma->vm_policy = new; /* protected by mmap_lock */
785  	mpol_put(old);
786  
787  	return 0;
788   err_out:
789  	mpol_put(new);
790  	return err;
791  }
792  
793  /* Step 2: apply policy to a range and do splits. */
794  static int mbind_range(struct mm_struct *mm, unsigned long start,
795  		       unsigned long end, struct mempolicy *new_pol)
796  {
797  	VMA_ITERATOR(vmi, mm, start);
798  	struct vm_area_struct *prev;
799  	struct vm_area_struct *vma;
800  	int err = 0;
801  	pgoff_t pgoff;
802  
803  	prev = vma_prev(&vmi);
804  	vma = vma_find(&vmi, end);
805  	if (WARN_ON(!vma))
806  		return 0;
807  
808  	if (start > vma->vm_start)
809  		prev = vma;
810  
811  	do {
812  		unsigned long vmstart = max(start, vma->vm_start);
813  		unsigned long vmend = min(end, vma->vm_end);
814  
815  		if (mpol_equal(vma_policy(vma), new_pol))
816  			goto next;
817  
818  		pgoff = vma->vm_pgoff +
819  			((vmstart - vma->vm_start) >> PAGE_SHIFT);
820  		prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags,
821  				 vma->anon_vma, vma->vm_file, pgoff,
822  				 new_pol, vma->vm_userfaultfd_ctx,
823  				 anon_vma_name(vma));
824  		if (prev) {
825  			vma = prev;
826  			goto replace;
827  		}
828  		if (vma->vm_start != vmstart) {
829  			err = split_vma(&vmi, vma, vmstart, 1);
830  			if (err)
831  				goto out;
832  		}
833  		if (vma->vm_end != vmend) {
834  			err = split_vma(&vmi, vma, vmend, 0);
835  			if (err)
836  				goto out;
837  		}
838  replace:
839  		err = vma_replace_policy(vma, new_pol);
840  		if (err)
841  			goto out;
842  next:
843  		prev = vma;
844  	} for_each_vma_range(vmi, vma, end);
845  
846  out:
847  	return err;
848  }
849  
850  /* Set the process memory policy */
851  static long do_set_mempolicy(unsigned short mode, unsigned short flags,
852  			     nodemask_t *nodes)
853  {
854  	struct mempolicy *new, *old;
855  	NODEMASK_SCRATCH(scratch);
856  	int ret;
857  
858  	if (!scratch)
859  		return -ENOMEM;
860  
861  	new = mpol_new(mode, flags, nodes);
862  	if (IS_ERR(new)) {
863  		ret = PTR_ERR(new);
864  		goto out;
865  	}
866  
867  	task_lock(current);
868  	ret = mpol_set_nodemask(new, nodes, scratch);
869  	if (ret) {
870  		task_unlock(current);
871  		mpol_put(new);
872  		goto out;
873  	}
874  
875  	old = current->mempolicy;
876  	current->mempolicy = new;
877  	if (new && new->mode == MPOL_INTERLEAVE)
878  		current->il_prev = MAX_NUMNODES-1;
879  	task_unlock(current);
880  	mpol_put(old);
881  	ret = 0;
882  out:
883  	NODEMASK_SCRATCH_FREE(scratch);
884  	return ret;
885  }
886  
887  /*
888   * Return the nodemask of the policy for a get_mempolicy() query
889   *
890   * Called with task's alloc_lock held
891   */
892  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
893  {
894  	nodes_clear(*nodes);
895  	if (p == &default_policy)
896  		return;
897  
898  	switch (p->mode) {
899  	case MPOL_BIND:
900  	case MPOL_INTERLEAVE:
901  	case MPOL_PREFERRED:
902  	case MPOL_PREFERRED_MANY:
903  		*nodes = p->nodes;
904  		break;
905  	case MPOL_LOCAL:
906  		/* return empty node mask for local allocation */
907  		break;
908  	default:
909  		BUG();
910  	}
911  }
912  
913  static int lookup_node(struct mm_struct *mm, unsigned long addr)
914  {
915  	struct page *p = NULL;
916  	int ret;
917  
918  	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
919  	if (ret > 0) {
920  		ret = page_to_nid(p);
921  		put_page(p);
922  	}
923  	return ret;
924  }
925  
926  /* Retrieve NUMA policy */
927  static long do_get_mempolicy(int *policy, nodemask_t *nmask,
928  			     unsigned long addr, unsigned long flags)
929  {
930  	int err;
931  	struct mm_struct *mm = current->mm;
932  	struct vm_area_struct *vma = NULL;
933  	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
934  
935  	if (flags &
936  		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
937  		return -EINVAL;
938  
939  	if (flags & MPOL_F_MEMS_ALLOWED) {
940  		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
941  			return -EINVAL;
942  		*policy = 0;	/* just so it's initialized */
943  		task_lock(current);
944  		*nmask  = cpuset_current_mems_allowed;
945  		task_unlock(current);
946  		return 0;
947  	}
948  
949  	if (flags & MPOL_F_ADDR) {
950  		/*
951  		 * Do NOT fall back to task policy if the
952  		 * vma/shared policy at addr is NULL.  We
953  		 * want to return MPOL_DEFAULT in this case.
954  		 */
955  		mmap_read_lock(mm);
956  		vma = vma_lookup(mm, addr);
957  		if (!vma) {
958  			mmap_read_unlock(mm);
959  			return -EFAULT;
960  		}
961  		if (vma->vm_ops && vma->vm_ops->get_policy)
962  			pol = vma->vm_ops->get_policy(vma, addr);
963  		else
964  			pol = vma->vm_policy;
965  	} else if (addr)
966  		return -EINVAL;
967  
968  	if (!pol)
969  		pol = &default_policy;	/* indicates default behavior */
970  
971  	if (flags & MPOL_F_NODE) {
972  		if (flags & MPOL_F_ADDR) {
973  			/*
974  			 * Take a refcount on the mpol, because we are about to
975  			 * drop the mmap_lock, after which only "pol" remains
976  			 * valid, "vma" is stale.
977  			 */
978  			pol_refcount = pol;
979  			vma = NULL;
980  			mpol_get(pol);
981  			mmap_read_unlock(mm);
982  			err = lookup_node(mm, addr);
983  			if (err < 0)
984  				goto out;
985  			*policy = err;
986  		} else if (pol == current->mempolicy &&
987  				pol->mode == MPOL_INTERLEAVE) {
988  			*policy = next_node_in(current->il_prev, pol->nodes);
989  		} else {
990  			err = -EINVAL;
991  			goto out;
992  		}
993  	} else {
994  		*policy = pol == &default_policy ? MPOL_DEFAULT :
995  						pol->mode;
996  		/*
997  		 * Internal mempolicy flags must be masked off before exposing
998  		 * the policy to userspace.
999  		 */
1000  		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1001  	}
1002  
1003  	err = 0;
1004  	if (nmask) {
1005  		if (mpol_store_user_nodemask(pol)) {
1006  			*nmask = pol->w.user_nodemask;
1007  		} else {
1008  			task_lock(current);
1009  			get_policy_nodemask(pol, nmask);
1010  			task_unlock(current);
1011  		}
1012  	}
1013  
1014   out:
1015  	mpol_cond_put(pol);
1016  	if (vma)
1017  		mmap_read_unlock(mm);
1018  	if (pol_refcount)
1019  		mpol_put(pol_refcount);
1020  	return err;
1021  }
1022  
1023  #ifdef CONFIG_MIGRATION
1024  static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1025  				unsigned long flags)
1026  {
1027  	/*
1028  	 * We try to migrate only unshared folios. If it is shared it
1029  	 * is likely not worth migrating.
1030  	 *
1031  	 * To check if the folio is shared, ideally we want to make sure
1032  	 * every page is mapped to the same process. Doing that is very
1033  	 * expensive, so check the estimated mapcount of the folio instead.
1034  	 */
1035  	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1036  		if (folio_isolate_lru(folio)) {
1037  			list_add_tail(&folio->lru, foliolist);
1038  			node_stat_mod_folio(folio,
1039  				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1040  				folio_nr_pages(folio));
1041  		} else if (flags & MPOL_MF_STRICT) {
1042  			/*
1043  			 * A non-movable folio may reach here.  And there may be
1044  			 * temporarily off-LRU folios or non-LRU movable folios.
1045  			 * Treat them as unmovable folios since they can't be
1046  			 * isolated, so they can't be moved at the moment.  It
1047  			 * should return -EIO for this case too.
1048  			 */
1049  			return -EIO;
1050  		}
1051  	}
1052  
1053  	return 0;
1054  }
1055  
1056  /*
1057   * Migrate pages from one node to a target node.
1058   * Returns error or the number of pages not migrated.
1059   */
1060  static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1061  			   int flags)
1062  {
1063  	nodemask_t nmask;
1064  	struct vm_area_struct *vma;
1065  	LIST_HEAD(pagelist);
1066  	int err = 0;
1067  	struct migration_target_control mtc = {
1068  		.nid = dest,
1069  		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1070  	};
1071  
1072  	nodes_clear(nmask);
1073  	node_set(source, nmask);
1074  
1075  	/*
1076  	 * This does not "check" the range but isolates all pages that
1077  	 * need migration.  Between passing in the full user address
1078  	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1079  	 */
1080  	vma = find_vma(mm, 0);
1081  	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1082  	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1083  			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1084  
1085  	if (!list_empty(&pagelist)) {
1086  		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1087  				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1088  		if (err)
1089  			putback_movable_pages(&pagelist);
1090  	}
1091  
1092  	return err;
1093  }
1094  
1095  /*
1096   * Move pages between the two nodesets so as to preserve the physical
1097   * layout as much as possible.
1098   *
1099   * Returns the number of pages that could not be moved.
1100   */
1101  int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1102  		     const nodemask_t *to, int flags)
1103  {
1104  	int busy = 0;
1105  	int err = 0;
1106  	nodemask_t tmp;
1107  
1108  	lru_cache_disable();
1109  
1110  	mmap_read_lock(mm);
1111  
1112  	/*
1113  	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1114  	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1115  	 * bit in 'tmp', and return that <source, dest> pair for migration.
1116  	 * The pair of nodemasks 'to' and 'from' define the map.
1117  	 *
1118  	 * If no pair of bits is found that way, fallback to picking some
1119  	 * pair of 'source' and 'dest' bits that are not the same.  If the
1120  	 * 'source' and 'dest' bits are the same, this represents a node
1121  	 * that will be migrating to itself, so no pages need move.
1122  	 *
1123  	 * If no bits are left in 'tmp', or if all remaining bits left
1124  	 * in 'tmp' correspond to the same bit in 'to', return false
1125  	 * (nothing left to migrate).
1126  	 *
1127  	 * This lets us pick a pair of nodes to migrate between, such that
1128  	 * if possible the dest node is not already occupied by some other
1129  	 * source node, minimizing the risk of overloading the memory on a
1130  	 * node that would happen if we migrated incoming memory to a node
1131   * before migrating outgoing memory sourced from that same node.
1132  	 *
1133  	 * A single scan of tmp is sufficient.  As we go, we remember the
1134  	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1135  	 * that not only moved, but what's better, moved to an empty slot
1136  	 * (d is not set in tmp), then we break out then, with that pair.
1137   * Otherwise, when we finish scanning tmp, we at least have the
1138  	 * most recent <s, d> pair that moved.  If we get all the way through
1139  	 * the scan of tmp without finding any node that moved, much less
1140  	 * moved to an empty node, then there is nothing left worth migrating.
1141  	 */
1142  
1143  	tmp = *from;
1144  	while (!nodes_empty(tmp)) {
1145  		int s, d;
1146  		int source = NUMA_NO_NODE;
1147  		int dest = 0;
1148  
1149  		for_each_node_mask(s, tmp) {
1150  
1151  			/*
1152  			 * do_migrate_pages() tries to maintain the relative
1153  			 * node relationship of the pages established between
1154  			 * threads and memory areas.
1155  			 *
1156  			 * However if the number of source nodes is not equal to
1157  			 * the number of destination nodes we can not preserve
1158  			 * this node relative relationship.  In that case, skip
1159  			 * copying memory from a node that is in the destination
1160  			 * mask.
1161  			 *
1162  			 * Example: [2,3,4] -> [3,4,5] moves everything.
1163  			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1164  			 */
1165  
1166  			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1167  						(node_isset(s, *to)))
1168  				continue;
1169  
1170  			d = node_remap(s, *from, *to);
1171  			if (s == d)
1172  				continue;
1173  
1174  			source = s;	/* Node moved. Memorize */
1175  			dest = d;
1176  
1177  			/* dest not in remaining from nodes? */
1178  			if (!node_isset(dest, tmp))
1179  				break;
1180  		}
1181  		if (source == NUMA_NO_NODE)
1182  			break;
1183  
1184  		node_clear(source, tmp);
1185  		err = migrate_to_node(mm, source, dest, flags);
1186  		if (err > 0)
1187  			busy += err;
1188  		if (err < 0)
1189  			break;
1190  	}
1191  	mmap_read_unlock(mm);
1192  
1193  	lru_cache_enable();
1194  	if (err < 0)
1195  		return err;
1196  	return busy;
1197  
1198  }
1199  
1200  /*
1201   * Allocate a new page for page migration based on vma policy.
1202   * Start by assuming the page is mapped by the same vma that contains @start.
1203   * Search forward from there, if not.  N.B., this assumes that the
1204   * list of pages handed to migrate_pages()--which is how we get here--
1205   * is in virtual address order.
1206   */
1207  static struct page *new_page(struct page *page, unsigned long start)
1208  {
1209  	struct folio *dst, *src = page_folio(page);
1210  	struct vm_area_struct *vma;
1211  	unsigned long address;
1212  	VMA_ITERATOR(vmi, current->mm, start);
1213  	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
1214  
1215  	for_each_vma(vmi, vma) {
1216  		address = page_address_in_vma(page, vma);
1217  		if (address != -EFAULT)
1218  			break;
1219  	}
1220  
1221  	if (folio_test_hugetlb(src)) {
1222  		dst = alloc_hugetlb_folio_vma(folio_hstate(src),
1223  				vma, address);
1224  		return &dst->page;
1225  	}
1226  
1227  	if (folio_test_large(src))
1228  		gfp = GFP_TRANSHUGE;
1229  
1230  	/*
1231  	 * if !vma, vma_alloc_folio() will use task or system default policy
1232  	 */
1233  	dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
1234  			folio_test_large(src));
1235  	return &dst->page;
1236  }
1237  #else
1238  
1239  static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1240  				unsigned long flags)
1241  {
1242  	return -EIO;
1243  }
1244  
1245  int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1246  		     const nodemask_t *to, int flags)
1247  {
1248  	return -ENOSYS;
1249  }
1250  
1251  static struct page *new_page(struct page *page, unsigned long start)
1252  {
1253  	return NULL;
1254  }
1255  #endif
1256  
1257  static long do_mbind(unsigned long start, unsigned long len,
1258  		     unsigned short mode, unsigned short mode_flags,
1259  		     nodemask_t *nmask, unsigned long flags)
1260  {
1261  	struct mm_struct *mm = current->mm;
1262  	struct mempolicy *new;
1263  	unsigned long end;
1264  	int err;
1265  	int ret;
1266  	LIST_HEAD(pagelist);
1267  
1268  	if (flags & ~(unsigned long)MPOL_MF_VALID)
1269  		return -EINVAL;
1270  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1271  		return -EPERM;
1272  
1273  	if (start & ~PAGE_MASK)
1274  		return -EINVAL;
1275  
1276  	if (mode == MPOL_DEFAULT)
1277  		flags &= ~MPOL_MF_STRICT;
1278  
1279  	len = PAGE_ALIGN(len);
1280  	end = start + len;
1281  
1282  	if (end < start)
1283  		return -EINVAL;
1284  	if (end == start)
1285  		return 0;
1286  
1287  	new = mpol_new(mode, mode_flags, nmask);
1288  	if (IS_ERR(new))
1289  		return PTR_ERR(new);
1290  
1291  	if (flags & MPOL_MF_LAZY)
1292  		new->flags |= MPOL_F_MOF;
1293  
1294  	/*
1295  	 * If we are using the default policy then operation
1296  	 * on discontinuous address spaces is okay after all
1297  	 */
1298  	if (!new)
1299  		flags |= MPOL_MF_DISCONTIG_OK;
1300  
1301  	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1302  		 start, start + len, mode, mode_flags,
1303  		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1304  
1305  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1306  
1307  		lru_cache_disable();
1308  	}
1309  	{
1310  		NODEMASK_SCRATCH(scratch);
1311  		if (scratch) {
1312  			mmap_write_lock(mm);
1313  			err = mpol_set_nodemask(new, nmask, scratch);
1314  			if (err)
1315  				mmap_write_unlock(mm);
1316  		} else
1317  			err = -ENOMEM;
1318  		NODEMASK_SCRATCH_FREE(scratch);
1319  	}
1320  	if (err)
1321  		goto mpol_out;
1322  
1323  	ret = queue_pages_range(mm, start, end, nmask,
1324  			  flags | MPOL_MF_INVERT, &pagelist);
1325  
1326  	if (ret < 0) {
1327  		err = ret;
1328  		goto up_out;
1329  	}
1330  
1331  	err = mbind_range(mm, start, end, new);
1332  
1333  	if (!err) {
1334  		int nr_failed = 0;
1335  
1336  		if (!list_empty(&pagelist)) {
1337  			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1338  			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1339  				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1340  			if (nr_failed)
1341  				putback_movable_pages(&pagelist);
1342  		}
1343  
1344  		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1345  			err = -EIO;
1346  	} else {
1347  up_out:
1348  		if (!list_empty(&pagelist))
1349  			putback_movable_pages(&pagelist);
1350  	}
1351  
1352  	mmap_write_unlock(mm);
1353  mpol_out:
1354  	mpol_put(new);
1355  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1356  		lru_cache_enable();
1357  	return err;
1358  }
1359  
1360  /*
1361   * User space interface with variable sized bitmaps for nodelists.
1362   */
1363  static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1364  		      unsigned long maxnode)
1365  {
1366  	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1367  	int ret;
1368  
1369  	if (in_compat_syscall())
1370  		ret = compat_get_bitmap(mask,
1371  					(const compat_ulong_t __user *)nmask,
1372  					maxnode);
1373  	else
1374  		ret = copy_from_user(mask, nmask,
1375  				     nlongs * sizeof(unsigned long));
1376  
1377  	if (ret)
1378  		return -EFAULT;
1379  
1380  	if (maxnode % BITS_PER_LONG)
1381  		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1382  
1383  	return 0;
1384  }
1385  
1386  /* Copy a node mask from user space. */
1387  static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1388  		     unsigned long maxnode)
1389  {
1390  	--maxnode;
1391  	nodes_clear(*nodes);
1392  	if (maxnode == 0 || !nmask)
1393  		return 0;
1394  	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1395  		return -EINVAL;
1396  
1397  	/*
1398  	 * When the user specified more nodes than supported, just check
1399  	 * that the non-supported part is all zero, one word at a time,
1400  	 * starting at the end.
1401  	 */
1402  	while (maxnode > MAX_NUMNODES) {
1403  		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1404  		unsigned long t;
1405  
1406  		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1407  			return -EFAULT;
1408  
1409  		if (maxnode - bits >= MAX_NUMNODES) {
1410  			maxnode -= bits;
1411  		} else {
1412  			maxnode = MAX_NUMNODES;
1413  			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1414  		}
1415  		if (t)
1416  			return -EINVAL;
1417  	}
1418  
1419  	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1420  }
1421  
1422  /* Copy a kernel node mask to user space */
1423  static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1424  			      nodemask_t *nodes)
1425  {
1426  	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1427  	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1428  	bool compat = in_compat_syscall();
1429  
1430  	if (compat)
1431  		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1432  
1433  	if (copy > nbytes) {
1434  		if (copy > PAGE_SIZE)
1435  			return -EINVAL;
1436  		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1437  			return -EFAULT;
1438  		copy = nbytes;
1439  		maxnode = nr_node_ids;
1440  	}
1441  
1442  	if (compat)
1443  		return compat_put_bitmap((compat_ulong_t __user *)mask,
1444  					 nodes_addr(*nodes), maxnode);
1445  
1446  	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1447  }
1448  
1449  /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1450  static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1451  {
1452  	*flags = *mode & MPOL_MODE_FLAGS;
1453  	*mode &= ~MPOL_MODE_FLAGS;
1454  
1455  	if ((unsigned int)(*mode) >=  MPOL_MAX)
1456  		return -EINVAL;
1457  	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1458  		return -EINVAL;
1459  	if (*flags & MPOL_F_NUMA_BALANCING) {
1460  		if (*mode != MPOL_BIND)
1461  			return -EINVAL;
1462  		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1463  	}
1464  	return 0;
1465  }
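
/*
 * Example of the encoding unpacked above (values illustrative only): from
 * userspace, mode and mode flags arrive OR'ed into a single integer, e.g.
 * MPOL_INTERLEAVE | MPOL_F_STATIC_NODES.  Here *mode becomes MPOL_INTERLEAVE
 * and *flags becomes MPOL_F_STATIC_NODES; combining MPOL_F_STATIC_NODES with
 * MPOL_F_RELATIVE_NODES is rejected with -EINVAL, as is
 * MPOL_F_NUMA_BALANCING with any mode other than MPOL_BIND.
 */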
1466  
1467  static long kernel_mbind(unsigned long start, unsigned long len,
1468  			 unsigned long mode, const unsigned long __user *nmask,
1469  			 unsigned long maxnode, unsigned int flags)
1470  {
1471  	unsigned short mode_flags;
1472  	nodemask_t nodes;
1473  	int lmode = mode;
1474  	int err;
1475  
1476  	start = untagged_addr(start);
1477  	err = sanitize_mpol_flags(&lmode, &mode_flags);
1478  	if (err)
1479  		return err;
1480  
1481  	err = get_nodes(&nodes, nmask, maxnode);
1482  	if (err)
1483  		return err;
1484  
1485  	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1486  }
1487  
1488  SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1489  		unsigned long, home_node, unsigned long, flags)
1490  {
1491  	struct mm_struct *mm = current->mm;
1492  	struct vm_area_struct *vma;
1493  	struct mempolicy *new, *old;
1494  	unsigned long vmstart;
1495  	unsigned long vmend;
1496  	unsigned long end;
1497  	int err = -ENOENT;
1498  	VMA_ITERATOR(vmi, mm, start);
1499  
1500  	start = untagged_addr(start);
1501  	if (start & ~PAGE_MASK)
1502  		return -EINVAL;
1503  	/*
1504  	 * flags is reserved for future extensions, if any.
1505  	 */
1506  	if (flags != 0)
1507  		return -EINVAL;
1508  
1509  	/*
1510  	 * Check home_node is online to avoid accessing uninitialized
1511  	 * NODE_DATA.
1512  	 */
1513  	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1514  		return -EINVAL;
1515  
1516  	len = PAGE_ALIGN(len);
1517  	end = start + len;
1518  
1519  	if (end < start)
1520  		return -EINVAL;
1521  	if (end == start)
1522  		return 0;
1523  	mmap_write_lock(mm);
1524  	for_each_vma_range(vmi, vma, end) {
1525  		/*
1526  		 * If any vma in the range has a policy other than MPOL_BIND
1527  		 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1528  		 * the home node for vmas we have already updated.
1529  		 */
1530  		old = vma_policy(vma);
1531  		if (!old)
1532  			continue;
1533  		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1534  			err = -EOPNOTSUPP;
1535  			break;
1536  		}
1537  		new = mpol_dup(old);
1538  		if (IS_ERR(new)) {
1539  			err = PTR_ERR(new);
1540  			break;
1541  		}
1542  
1543  		new->home_node = home_node;
1544  		vmstart = max(start, vma->vm_start);
1545  		vmend   = min(end, vma->vm_end);
1546  		err = mbind_range(mm, vmstart, vmend, new);
1547  		mpol_put(new);
1548  		if (err)
1549  			break;
1550  	}
1551  	mmap_write_unlock(mm);
1552  	return err;
1553  }
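
/*
 * Illustrative userspace sketch (assuming the libc headers expose
 * __NR_set_mempolicy_home_node; there may be no dedicated libc wrapper):
 * set a home node for a range that already carries an MPOL_BIND or
 * MPOL_PREFERRED_MANY policy.  The node number is an example only.
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *
 *	// buf/len already covered by an mbind(MPOL_BIND, ...) call
 *	syscall(__NR_set_mempolicy_home_node,
 *		(unsigned long)buf, len, 2UL, 0UL);	// 2 = example home node
 */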
1554  
1555  SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1556  		unsigned long, mode, const unsigned long __user *, nmask,
1557  		unsigned long, maxnode, unsigned int, flags)
1558  {
1559  	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1560  }
1561  
1562  /* Set the process memory policy */
1563  static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1564  				 unsigned long maxnode)
1565  {
1566  	unsigned short mode_flags;
1567  	nodemask_t nodes;
1568  	int lmode = mode;
1569  	int err;
1570  
1571  	err = sanitize_mpol_flags(&lmode, &mode_flags);
1572  	if (err)
1573  		return err;
1574  
1575  	err = get_nodes(&nodes, nmask, maxnode);
1576  	if (err)
1577  		return err;
1578  
1579  	return do_set_mempolicy(lmode, mode_flags, &nodes);
1580  }
1581  
1582  SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1583  		unsigned long, maxnode)
1584  {
1585  	return kernel_set_mempolicy(mode, nmask, maxnode);
1586  }
1587  
1588  static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1589  				const unsigned long __user *old_nodes,
1590  				const unsigned long __user *new_nodes)
1591  {
1592  	struct mm_struct *mm = NULL;
1593  	struct task_struct *task;
1594  	nodemask_t task_nodes;
1595  	int err;
1596  	nodemask_t *old;
1597  	nodemask_t *new;
1598  	NODEMASK_SCRATCH(scratch);
1599  
1600  	if (!scratch)
1601  		return -ENOMEM;
1602  
1603  	old = &scratch->mask1;
1604  	new = &scratch->mask2;
1605  
1606  	err = get_nodes(old, old_nodes, maxnode);
1607  	if (err)
1608  		goto out;
1609  
1610  	err = get_nodes(new, new_nodes, maxnode);
1611  	if (err)
1612  		goto out;
1613  
1614  	/* Find the mm_struct */
1615  	rcu_read_lock();
1616  	task = pid ? find_task_by_vpid(pid) : current;
1617  	if (!task) {
1618  		rcu_read_unlock();
1619  		err = -ESRCH;
1620  		goto out;
1621  	}
1622  	get_task_struct(task);
1623  
1624  	err = -EINVAL;
1625  
1626  	/*
1627  	 * Check if this process has the right to modify the specified process.
1628  	 * Use the regular "ptrace_may_access()" checks.
1629  	 */
1630  	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1631  		rcu_read_unlock();
1632  		err = -EPERM;
1633  		goto out_put;
1634  	}
1635  	rcu_read_unlock();
1636  
1637  	task_nodes = cpuset_mems_allowed(task);
1638  	/* Is the user allowed to access the target nodes? */
1639  	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1640  		err = -EPERM;
1641  		goto out_put;
1642  	}
1643  
1644  	task_nodes = cpuset_mems_allowed(current);
1645  	nodes_and(*new, *new, task_nodes);
1646  	if (nodes_empty(*new))
1647  		goto out_put;
1648  
1649  	err = security_task_movememory(task);
1650  	if (err)
1651  		goto out_put;
1652  
1653  	mm = get_task_mm(task);
1654  	put_task_struct(task);
1655  
1656  	if (!mm) {
1657  		err = -EINVAL;
1658  		goto out;
1659  	}
1660  
1661  	err = do_migrate_pages(mm, old, new,
1662  		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1663  
1664  	mmput(mm);
1665  out:
1666  	NODEMASK_SCRATCH_FREE(scratch);
1667  
1668  	return err;
1669  
1670  out_put:
1671  	put_task_struct(task);
1672  	goto out;
1673  
1674  }
1675  
1676  SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1677  		const unsigned long __user *, old_nodes,
1678  		const unsigned long __user *, new_nodes)
1679  {
1680  	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1681  }
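
/*
 * Illustrative userspace sketch of this syscall via libnuma's <numaif.h>
 * wrapper: move a target task's pages from node 0 to node 1.  The pid and
 * node numbers are assumptions for the example; error handling is omitted.
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long ret = migrate_pages(pid, 8 * sizeof(from), &from, &to);
 *	// negative on error, otherwise the number of pages that
 *	// could not be moved
 */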
1682  
1683  
1684  /* Retrieve NUMA policy */
1685  static int kernel_get_mempolicy(int __user *policy,
1686  				unsigned long __user *nmask,
1687  				unsigned long maxnode,
1688  				unsigned long addr,
1689  				unsigned long flags)
1690  {
1691  	int err;
1692  	int pval;
1693  	nodemask_t nodes;
1694  
1695  	if (nmask != NULL && maxnode < nr_node_ids)
1696  		return -EINVAL;
1697  
1698  	addr = untagged_addr(addr);
1699  
1700  	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1701  
1702  	if (err)
1703  		return err;
1704  
1705  	if (policy && put_user(pval, policy))
1706  		return -EFAULT;
1707  
1708  	if (nmask)
1709  		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1710  
1711  	return err;
1712  }
1713  
1714  SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1715  		unsigned long __user *, nmask, unsigned long, maxnode,
1716  		unsigned long, addr, unsigned long, flags)
1717  {
1718  	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1719  }
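
/*
 * Illustrative userspace sketch via libnuma's <numaif.h> wrapper, matching
 * the MPOL_F_NODE | MPOL_F_ADDR case handled in do_get_mempolicy() above:
 * query which node currently backs the page at @addr.  Variables are
 * assumptions for the example.
 *
 *	#include <stdio.h>
 *	#include <numaif.h>
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */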
1720  
1721  bool vma_migratable(struct vm_area_struct *vma)
1722  {
1723  	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1724  		return false;
1725  
1726  	/*
1727  	 * DAX device mappings require predictable access latency, so avoid
1728  	 * incurring periodic faults.
1729  	 */
1730  	if (vma_is_dax(vma))
1731  		return false;
1732  
1733  	if (is_vm_hugetlb_page(vma) &&
1734  		!hugepage_migration_supported(hstate_vma(vma)))
1735  		return false;
1736  
1737  	/*
1738  	 * Migration allocates pages in the highest zone. If we cannot
1739  	 * do so then migration (at least from node to node) is not
1740  	 * possible.
1741  	 */
1742  	if (vma->vm_file &&
1743  		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1744  			< policy_zone)
1745  		return false;
1746  	return true;
1747  }
1748  
1749  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1750  						unsigned long addr)
1751  {
1752  	struct mempolicy *pol = NULL;
1753  
1754  	if (vma) {
1755  		if (vma->vm_ops && vma->vm_ops->get_policy) {
1756  			pol = vma->vm_ops->get_policy(vma, addr);
1757  		} else if (vma->vm_policy) {
1758  			pol = vma->vm_policy;
1759  
1760  			/*
1761  			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1762  			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1763  			 * count on these policies which will be dropped by
1764  			 * mpol_cond_put() later
1765  			 */
1766  			if (mpol_needs_cond_ref(pol))
1767  				mpol_get(pol);
1768  		}
1769  	}
1770  
1771  	return pol;
1772  }
1773  
1774  /*
1775   * get_vma_policy(@vma, @addr)
1776   * @vma: virtual memory area whose policy is sought
1777   * @addr: address in @vma for shared policy lookup
1778   *
1779   * Returns effective policy for a VMA at specified address.
1780   * Falls back to current->mempolicy or system default policy, as necessary.
1781   * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1782   * count--added by the get_policy() vm_op, as appropriate--to protect against
1783   * freeing by another task.  It is the caller's responsibility to free the
1784   * extra reference for shared policies.
1785   */
1786  static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1787  						unsigned long addr)
1788  {
1789  	struct mempolicy *pol = __get_vma_policy(vma, addr);
1790  
1791  	if (!pol)
1792  		pol = get_task_policy(current);
1793  
1794  	return pol;
1795  }
1796  
1797  bool vma_policy_mof(struct vm_area_struct *vma)
1798  {
1799  	struct mempolicy *pol;
1800  
1801  	if (vma->vm_ops && vma->vm_ops->get_policy) {
1802  		bool ret = false;
1803  
1804  		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1805  		if (pol && (pol->flags & MPOL_F_MOF))
1806  			ret = true;
1807  		mpol_cond_put(pol);
1808  
1809  		return ret;
1810  	}
1811  
1812  	pol = vma->vm_policy;
1813  	if (!pol)
1814  		pol = get_task_policy(current);
1815  
1816  	return pol->flags & MPOL_F_MOF;
1817  }
1818  
1819  bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1820  {
1821  	enum zone_type dynamic_policy_zone = policy_zone;
1822  
1823  	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1824  
1825  	/*
1826  	 * If policy->nodes has movable memory only,
1827  	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1828  	 *
1829  	 * policy->nodes is intersected with node_states[N_MEMORY],
1830  	 * so if the following test fails, it implies that
1831  	 * policy->nodes has movable memory only.
1832  	 */
1833  	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1834  		dynamic_policy_zone = ZONE_MOVABLE;
1835  
1836  	return zone >= dynamic_policy_zone;
1837  }
1838  
1839  /*
1840   * Return a nodemask representing a mempolicy for filtering nodes for
1841   * page allocation
1842   */
1843  nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1844  {
1845  	int mode = policy->mode;
1846  
1847  	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1848  	if (unlikely(mode == MPOL_BIND) &&
1849  		apply_policy_zone(policy, gfp_zone(gfp)) &&
1850  		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1851  		return &policy->nodes;
1852  
1853  	if (mode == MPOL_PREFERRED_MANY)
1854  		return &policy->nodes;
1855  
1856  	return NULL;
1857  }
1858  
1859  /*
1860   * Return the preferred node id for 'prefer' mempolicy, and return
1861   * the given id for all other policies.
1862   *
1863   * policy_node() is always coupled with policy_nodemask(), which
1864   * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1865   */
1866  static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1867  {
1868  	if (policy->mode == MPOL_PREFERRED) {
1869  		nd = first_node(policy->nodes);
1870  	} else {
1871  		/*
1872  		 * __GFP_THISNODE shouldn't even be used with the bind policy
1873  		 * because it could easily break the expectation to stay on the
1874  		 * requested node and thereby violate the policy.
1875  		 */
1876  		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1877  	}
1878  
1879  	if ((policy->mode == MPOL_BIND ||
1880  	     policy->mode == MPOL_PREFERRED_MANY) &&
1881  	    policy->home_node != NUMA_NO_NODE)
1882  		return policy->home_node;
1883  
1884  	return nd;
1885  }
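
/*
 * Illustrative sketch (not from the upstream source): policy_node() and
 * policy_nodemask() are meant to be used together, feeding the preferred
 * node id and the filtering nodemask into the page allocator, as
 * alloc_pages() below does:
 *
 *	page = __alloc_pages(gfp, order,
 *			     policy_node(gfp, pol, numa_node_id()),
 *			     policy_nodemask(gfp, pol));
 */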
1886  
1887  /* Do dynamic interleaving for a process */
1888  static unsigned interleave_nodes(struct mempolicy *policy)
1889  {
1890  	unsigned next;
1891  	struct task_struct *me = current;
1892  
1893  	next = next_node_in(me->il_prev, policy->nodes);
1894  	if (next < MAX_NUMNODES)
1895  		me->il_prev = next;
1896  	return next;
1897  }
1898  
1899  /*
1900   * Depending on the memory policy provide a node from which to allocate the
1901   * next slab entry.
1902   */
1903  unsigned int mempolicy_slab_node(void)
1904  {
1905  	struct mempolicy *policy;
1906  	int node = numa_mem_id();
1907  
1908  	if (!in_task())
1909  		return node;
1910  
1911  	policy = current->mempolicy;
1912  	if (!policy)
1913  		return node;
1914  
1915  	switch (policy->mode) {
1916  	case MPOL_PREFERRED:
1917  		return first_node(policy->nodes);
1918  
1919  	case MPOL_INTERLEAVE:
1920  		return interleave_nodes(policy);
1921  
1922  	case MPOL_BIND:
1923  	case MPOL_PREFERRED_MANY:
1924  	{
1925  		struct zoneref *z;
1926  
1927  		/*
1928  		 * Follow bind policy behavior and start allocation at the
1929  		 * first node.
1930  		 */
1931  		struct zonelist *zonelist;
1932  		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1933  		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1934  		z = first_zones_zonelist(zonelist, highest_zoneidx,
1935  							&policy->nodes);
1936  		return z->zone ? zone_to_nid(z->zone) : node;
1937  	}
1938  	case MPOL_LOCAL:
1939  		return node;
1940  
1941  	default:
1942  		BUG();
1943  	}
1944  }
1945  
1946  /*
1947   * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1948   * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1949   * number of present nodes.
1950   */
1951  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1952  {
1953  	nodemask_t nodemask = pol->nodes;
1954  	unsigned int target, nnodes;
1955  	int i;
1956  	int nid;
1957  	/*
1958  	 * The barrier will stabilize the nodemask in a register or on
1959  	 * the stack so that it will stop changing under the code.
1960  	 *
1961  	 * Between first_node() and next_node(), pol->nodes could be changed
1962  	 * by other threads. So we copy pol->nodes to a local variable.
1963  	 */
1964  	barrier();
1965  
1966  	nnodes = nodes_weight(nodemask);
1967  	if (!nnodes)
1968  		return numa_node_id();
1969  	target = (unsigned int)n % nnodes;
1970  	nid = first_node(nodemask);
1971  	for (i = 0; i < target; i++)
1972  		nid = next_node(nid, nodemask);
1973  	return nid;
1974  }
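
/*
 * Worked example (illustrative only): with pol->nodes = {0,2,5} and n = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop advances once from node 0
 * to node 2 and offset_il_node() returns 2.
 */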
1975  
1976  /* Determine a node number for interleave */
1977  static inline unsigned interleave_nid(struct mempolicy *pol,
1978  		 struct vm_area_struct *vma, unsigned long addr, int shift)
1979  {
1980  	if (vma) {
1981  		unsigned long off;
1982  
1983  		/*
1984  		 * for small pages, there is no difference between
1985  		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1986  		 * for huge pages, since vm_pgoff is in units of small
1987  		 * pages, we need to shift off the always 0 bits to get
1988  		 * a useful offset.
1989  		 */
1990  		BUG_ON(shift < PAGE_SHIFT);
1991  		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1992  		off += (addr - vma->vm_start) >> shift;
1993  		return offset_il_node(pol, off);
1994  	} else
1995  		return interleave_nodes(pol);
1996  }
1997  
1998  #ifdef CONFIG_HUGETLBFS
1999  /*
2000   * huge_node(@vma, @addr, @gfp_flags, @mpol)
2001   * @vma: virtual memory area whose policy is sought
2002   * @addr: address in @vma for shared policy lookup and interleave policy
2003   * @gfp_flags: for requested zone
2004   * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2005   * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2006   *
2007   * Returns a nid suitable for a huge page allocation and a pointer
2008   * to the struct mempolicy for conditional unref after allocation.
2009   * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2010   * to the mempolicy's @nodemask for filtering the zonelist.
2011   *
2012   * Must be protected by read_mems_allowed_begin()
2013   */
2014  int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2015  				struct mempolicy **mpol, nodemask_t **nodemask)
2016  {
2017  	int nid;
2018  	int mode;
2019  
2020  	*mpol = get_vma_policy(vma, addr);
2021  	*nodemask = NULL;
2022  	mode = (*mpol)->mode;
2023  
2024  	if (unlikely(mode == MPOL_INTERLEAVE)) {
2025  		nid = interleave_nid(*mpol, vma, addr,
2026  					huge_page_shift(hstate_vma(vma)));
2027  	} else {
2028  		nid = policy_node(gfp_flags, *mpol, numa_node_id());
2029  		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
2030  			*nodemask = &(*mpol)->nodes;
2031  	}
2032  	return nid;
2033  }
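
/*
 * Illustrative sketch (not from the upstream source) of the typical
 * hugetlb-style caller pattern; the names are schematic:
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... allocate a huge page from nid, filtered by nodemask ...
 *	mpol_cond_put(mpol);
 */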
2034  
2035  /*
2036   * init_nodemask_of_mempolicy
2037   *
2038   * If the current task's mempolicy is "default" [NULL], return 'false'
2039   * to indicate default policy.  Otherwise, extract the policy nodemask
2040   * for 'bind' or 'interleave' policy into the argument nodemask, or
2041   * initialize the argument nodemask to contain the single node for
2042   * 'preferred' or 'local' policy and return 'true' to indicate presence
2043   * of non-default mempolicy.
2044   *
2045   * We don't bother with reference counting the mempolicy [mpol_get/put]
2046   * because the current task is examining its own mempolicy and a task's
2047   * mempolicy is only ever changed by the task itself.
2048   *
2049   * N.B., it is the caller's responsibility to free a returned nodemask.
2050   */
2051  bool init_nodemask_of_mempolicy(nodemask_t *mask)
2052  {
2053  	struct mempolicy *mempolicy;
2054  
2055  	if (!(mask && current->mempolicy))
2056  		return false;
2057  
2058  	task_lock(current);
2059  	mempolicy = current->mempolicy;
2060  	switch (mempolicy->mode) {
2061  	case MPOL_PREFERRED:
2062  	case MPOL_PREFERRED_MANY:
2063  	case MPOL_BIND:
2064  	case MPOL_INTERLEAVE:
2065  		*mask = mempolicy->nodes;
2066  		break;
2067  
2068  	case MPOL_LOCAL:
2069  		init_nodemask_of_node(mask, numa_node_id());
2070  		break;
2071  
2072  	default:
2073  		BUG();
2074  	}
2075  	task_unlock(current);
2076  
2077  	return true;
2078  }
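
/*
 * Illustrative sketch (not from the upstream source): a hugetlb-style use,
 * restricting an operation to the task's mempolicy nodes when one is set:
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
 *
 *	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed))
 *		... operate only on the nodes in *nodes_allowed ...
 *	else
 *		... fall back to all N_MEMORY nodes ...
 */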
2079  #endif
2080  
2081  /*
2082   * mempolicy_in_oom_domain
2083   *
2084   * If tsk's mempolicy is "bind", check for intersection between mask and
2085   * the policy nodemask. Otherwise, return true for all other policies
2086   * including "interleave", as a tsk with "interleave" policy may have
2087   * memory allocated from all nodes in the system.
2088   *
2089   * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2090   */
2091  bool mempolicy_in_oom_domain(struct task_struct *tsk,
2092  					const nodemask_t *mask)
2093  {
2094  	struct mempolicy *mempolicy;
2095  	bool ret = true;
2096  
2097  	if (!mask)
2098  		return ret;
2099  
2100  	task_lock(tsk);
2101  	mempolicy = tsk->mempolicy;
2102  	if (mempolicy && mempolicy->mode == MPOL_BIND)
2103  		ret = nodes_intersects(mempolicy->nodes, *mask);
2104  	task_unlock(tsk);
2105  
2106  	return ret;
2107  }
2108  
2109  /* Allocate a page in interleaved policy.
2110     Own path because it needs to do special accounting. */
2111  static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2112  					unsigned nid)
2113  {
2114  	struct page *page;
2115  
2116  	page = __alloc_pages(gfp, order, nid, NULL);
2117  	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2118  	if (!static_branch_likely(&vm_numa_stat_key))
2119  		return page;
2120  	if (page && page_to_nid(page) == nid) {
2121  		preempt_disable();
2122  		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2123  		preempt_enable();
2124  	}
2125  	return page;
2126  }
2127  
2128  static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2129  						int nid, struct mempolicy *pol)
2130  {
2131  	struct page *page;
2132  	gfp_t preferred_gfp;
2133  
2134  	/*
2135  	 * This is a two pass approach. The first pass will only try the
2136  	 * preferred nodes but skip the direct reclaim and allow the
2137  	 * allocation to fail, while the second pass will try all the
2138   * nodes in the system.
2139  	 */
2140  	preferred_gfp = gfp | __GFP_NOWARN;
2141  	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2142  	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2143  	if (!page)
2144  		page = __alloc_pages(gfp, order, nid, NULL);
2145  
2146  	return page;
2147  }
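
/*
 * Illustrative example (not from the upstream source): for a GFP_KERNEL
 * request, the first pass effectively uses
 * (GFP_KERNEL | __GFP_NOWARN) & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL)
 * restricted to pol->nodes, and only if that fails does the second pass
 * retry with the original GFP_KERNEL and no nodemask.
 */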
2148  
2149  /**
2150   * vma_alloc_folio - Allocate a folio for a VMA.
2151   * @gfp: GFP flags.
2152   * @order: Order of the folio.
2153   * @vma: Pointer to VMA or NULL if not available.
2154   * @addr: Virtual address of the allocation.  Must be inside @vma.
2155   * @hugepage: For hugepages try only the preferred node if possible.
2156   *
2157   * Allocate a folio for a specific address in @vma, using the appropriate
2158   * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2159   * of the mm_struct of the VMA to prevent it from going away.  Should be
2160   * used for all allocations for folios that will be mapped into user space.
2161   *
2162   * Return: The folio on success or NULL if allocation fails.
2163   */
2164  struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2165  		unsigned long addr, bool hugepage)
2166  {
2167  	struct mempolicy *pol;
2168  	int node = numa_node_id();
2169  	struct folio *folio;
2170  	int preferred_nid;
2171  	nodemask_t *nmask;
2172  
2173  	pol = get_vma_policy(vma, addr);
2174  
2175  	if (pol->mode == MPOL_INTERLEAVE) {
2176  		struct page *page;
2177  		unsigned nid;
2178  
2179  		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2180  		mpol_cond_put(pol);
2181  		gfp |= __GFP_COMP;
2182  		page = alloc_page_interleave(gfp, order, nid);
2183  		if (page && order > 1)
2184  			prep_transhuge_page(page);
2185  		folio = (struct folio *)page;
2186  		goto out;
2187  	}
2188  
2189  	if (pol->mode == MPOL_PREFERRED_MANY) {
2190  		struct page *page;
2191  
2192  		node = policy_node(gfp, pol, node);
2193  		gfp |= __GFP_COMP;
2194  		page = alloc_pages_preferred_many(gfp, order, node, pol);
2195  		mpol_cond_put(pol);
2196  		if (page && order > 1)
2197  			prep_transhuge_page(page);
2198  		folio = (struct folio *)page;
2199  		goto out;
2200  	}
2201  
2202  	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2203  		int hpage_node = node;
2204  
2205  		/*
2206  		 * For hugepage allocation and non-interleave policy which
2207  		 * allows the current node (or other explicitly preferred
2208  		 * node) we only try to allocate from the current/preferred
2209  		 * node and don't fall back to other nodes, as the cost of
2210  		 * remote accesses would likely offset THP benefits.
2211  		 *
2212  		 * If the policy is interleave or does not allow the current
2213  		 * node in its nodemask, we allocate the standard way.
2214  		 */
2215  		if (pol->mode == MPOL_PREFERRED)
2216  			hpage_node = first_node(pol->nodes);
2217  
2218  		nmask = policy_nodemask(gfp, pol);
2219  		if (!nmask || node_isset(hpage_node, *nmask)) {
2220  			mpol_cond_put(pol);
2221  			/*
2222  			 * First, try to allocate THP only on local node, but
2223  			 * don't reclaim unnecessarily, just compact.
2224  			 */
2225  			folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2226  					__GFP_NORETRY, order, hpage_node);
2227  
2228  			/*
2229  			 * If hugepage allocations are configured to always use
2230  			 * synchronous compaction or the vma has been madvised
2231  			 * to prefer hugepage backing, retry allowing remote
2232  			 * memory with both reclaim and compact as well.
2233  			 */
2234  			if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2235  				folio = __folio_alloc(gfp, order, hpage_node,
2236  						      nmask);
2237  
2238  			goto out;
2239  		}
2240  	}
2241  
2242  	nmask = policy_nodemask(gfp, pol);
2243  	preferred_nid = policy_node(gfp, pol, node);
2244  	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
2245  	mpol_cond_put(pol);
2246  out:
2247  	return folio;
2248  }
2249  EXPORT_SYMBOL(vma_alloc_folio);
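
/*
 * Illustrative sketch (not from the upstream source): a fault-path style
 * caller, with the mmap_lock of the VMA's mm already held:
 *
 *	struct folio *folio;
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
 *				vmf->address, false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 */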
2250  
2251  /**
2252   * alloc_pages - Allocate pages.
2253   * @gfp: GFP flags.
2254   * @order: Power of two of number of pages to allocate.
2255   *
2256   * Allocate 1 << @order contiguous pages.  The physical address of the
2257   * first page is naturally aligned (eg an order-3 allocation will be aligned
2258   * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2259   * process is honoured when in process context.
2260   *
2261   * Context: Can be called from any context, providing the appropriate GFP
2262   * flags are used.
2263   * Return: The page on success or NULL if allocation fails.
2264   */
2265  struct page *alloc_pages(gfp_t gfp, unsigned order)
2266  {
2267  	struct mempolicy *pol = &default_policy;
2268  	struct page *page;
2269  
2270  	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2271  		pol = get_task_policy(current);
2272  
2273  	/*
2274  	 * No reference counting needed for current->mempolicy
2275  	 * or the system default_policy.
2276  	 */
2277  	if (pol->mode == MPOL_INTERLEAVE)
2278  		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2279  	else if (pol->mode == MPOL_PREFERRED_MANY)
2280  		page = alloc_pages_preferred_many(gfp, order,
2281  				  policy_node(gfp, pol, numa_node_id()), pol);
2282  	else
2283  		page = __alloc_pages(gfp, order,
2284  				policy_node(gfp, pol, numa_node_id()),
2285  				policy_nodemask(gfp, pol));
2286  
2287  	return page;
2288  }
2289  EXPORT_SYMBOL(alloc_pages);
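
/*
 * Illustrative example (not from the upstream source): an order-3 request
 * returns 8 contiguous pages whose first page is aligned to 8 * PAGE_SIZE
 * (32KiB with 4KiB pages):
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 3);
 *
 *	if (page)
 *		__free_pages(page, 3);
 */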
2290  
2291  struct folio *folio_alloc(gfp_t gfp, unsigned order)
2292  {
2293  	struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2294  
2295  	if (page && order > 1)
2296  		prep_transhuge_page(page);
2297  	return (struct folio *)page;
2298  }
2299  EXPORT_SYMBOL(folio_alloc);
2300  
2301  static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2302  		struct mempolicy *pol, unsigned long nr_pages,
2303  		struct page **page_array)
2304  {
2305  	int nodes;
2306  	unsigned long nr_pages_per_node;
2307  	int delta;
2308  	int i;
2309  	unsigned long nr_allocated;
2310  	unsigned long total_allocated = 0;
2311  
2312  	nodes = nodes_weight(pol->nodes);
2313  	nr_pages_per_node = nr_pages / nodes;
2314  	delta = nr_pages - nodes * nr_pages_per_node;
2315  
2316  	for (i = 0; i < nodes; i++) {
2317  		if (delta) {
2318  			nr_allocated = __alloc_pages_bulk(gfp,
2319  					interleave_nodes(pol), NULL,
2320  					nr_pages_per_node + 1, NULL,
2321  					page_array);
2322  			delta--;
2323  		} else {
2324  			nr_allocated = __alloc_pages_bulk(gfp,
2325  					interleave_nodes(pol), NULL,
2326  					nr_pages_per_node, NULL, page_array);
2327  		}
2328  
2329  		page_array += nr_allocated;
2330  		total_allocated += nr_allocated;
2331  	}
2332  
2333  	return total_allocated;
2334  }
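
/*
 * Worked example (illustrative only): nr_pages = 10 spread over 3 nodes
 * gives nr_pages_per_node = 3 and delta = 1, so the first interleave node
 * is asked for 4 pages and the remaining two nodes for 3 pages each.
 */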
2335  
2336  static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2337  		struct mempolicy *pol, unsigned long nr_pages,
2338  		struct page **page_array)
2339  {
2340  	gfp_t preferred_gfp;
2341  	unsigned long nr_allocated = 0;
2342  
2343  	preferred_gfp = gfp | __GFP_NOWARN;
2344  	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2345  
2346  	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2347  					   nr_pages, NULL, page_array);
2348  
2349  	if (nr_allocated < nr_pages)
2350  		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2351  				nr_pages - nr_allocated, NULL,
2352  				page_array + nr_allocated);
2353  	return nr_allocated;
2354  }
2355  
2356  /* Bulk page allocation and the mempolicy should be considered at
2357   * the same time in some situations, such as vmalloc.
2358   *
2359   * It can accelerate memory allocation, especially for interleaved
2360   * allocations.
2361   */
2362  unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2363  		unsigned long nr_pages, struct page **page_array)
2364  {
2365  	struct mempolicy *pol = &default_policy;
2366  
2367  	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2368  		pol = get_task_policy(current);
2369  
2370  	if (pol->mode == MPOL_INTERLEAVE)
2371  		return alloc_pages_bulk_array_interleave(gfp, pol,
2372  							 nr_pages, page_array);
2373  
2374  	if (pol->mode == MPOL_PREFERRED_MANY)
2375  		return alloc_pages_bulk_array_preferred_many(gfp,
2376  				numa_node_id(), pol, nr_pages, page_array);
2377  
2378  	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2379  				  policy_nodemask(gfp, pol), nr_pages, NULL,
2380  				  page_array);
2381  }
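
/*
 * Illustrative sketch (not from the upstream source): a vmalloc-style
 * caller filling a pre-sized page array in one shot:
 *
 *	nr_allocated = alloc_pages_bulk_array_mempolicy(GFP_KERNEL,
 *							nr_pages, pages);
 *	if (nr_allocated < nr_pages)
 *		... allocate the remainder page by page or bail out ...
 */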
2382  
2383  int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2384  {
2385  	struct mempolicy *pol = mpol_dup(vma_policy(src));
2386  
2387  	if (IS_ERR(pol))
2388  		return PTR_ERR(pol);
2389  	dst->vm_policy = pol;
2390  	return 0;
2391  }
2392  
2393  /*
2394   * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2395   * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2396   * with the mems_allowed returned by cpuset_mems_allowed().  This
2397   * keeps mempolicies cpuset relative after its cpuset moves.  See
2398   * further kernel/cpuset.c update_nodemask().
2399   *
2400   * current's mempolicy may be rebound by the other task (the task that changes
2401   * cpuset's mems), so we needn't do rebind work for the current task.
2402   */
2403  
2404  /* Slow path of a mempolicy duplicate */
2405  struct mempolicy *__mpol_dup(struct mempolicy *old)
2406  {
2407  	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2408  
2409  	if (!new)
2410  		return ERR_PTR(-ENOMEM);
2411  
2412  	/* task's mempolicy is protected by alloc_lock */
2413  	if (old == current->mempolicy) {
2414  		task_lock(current);
2415  		*new = *old;
2416  		task_unlock(current);
2417  	} else
2418  		*new = *old;
2419  
2420  	if (current_cpuset_is_being_rebound()) {
2421  		nodemask_t mems = cpuset_mems_allowed(current);
2422  		mpol_rebind_policy(new, &mems);
2423  	}
2424  	atomic_set(&new->refcnt, 1);
2425  	return new;
2426  }
2427  
2428  /* Slow path of a mempolicy comparison */
2429  bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2430  {
2431  	if (!a || !b)
2432  		return false;
2433  	if (a->mode != b->mode)
2434  		return false;
2435  	if (a->flags != b->flags)
2436  		return false;
2437  	if (a->home_node != b->home_node)
2438  		return false;
2439  	if (mpol_store_user_nodemask(a))
2440  		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2441  			return false;
2442  
2443  	switch (a->mode) {
2444  	case MPOL_BIND:
2445  	case MPOL_INTERLEAVE:
2446  	case MPOL_PREFERRED:
2447  	case MPOL_PREFERRED_MANY:
2448  		return !!nodes_equal(a->nodes, b->nodes);
2449  	case MPOL_LOCAL:
2450  		return true;
2451  	default:
2452  		BUG();
2453  		return false;
2454  	}
2455  }
2456  
2457  /*
2458   * Shared memory backing store policy support.
2459   *
2460   * Remember policies even when nobody has shared memory mapped.
2461   * The policies are kept in Red-Black tree linked from the inode.
2462   * They are protected by the sp->lock rwlock, which should be held
2463   * for any accesses to the tree.
2464   */
2465  
2466  /*
2467   * Look up the first element intersecting start-end.  Caller holds sp->lock for
2468   * reading or for writing
2469   */
2470  static struct sp_node *
2471  sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2472  {
2473  	struct rb_node *n = sp->root.rb_node;
2474  
2475  	while (n) {
2476  		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2477  
2478  		if (start >= p->end)
2479  			n = n->rb_right;
2480  		else if (end <= p->start)
2481  			n = n->rb_left;
2482  		else
2483  			break;
2484  	}
2485  	if (!n)
2486  		return NULL;
2487  	for (;;) {
2488  		struct sp_node *w = NULL;
2489  		struct rb_node *prev = rb_prev(n);
2490  		if (!prev)
2491  			break;
2492  		w = rb_entry(prev, struct sp_node, nd);
2493  		if (w->end <= start)
2494  			break;
2495  		n = prev;
2496  	}
2497  	return rb_entry(n, struct sp_node, nd);
2498  }
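
/*
 * Worked example (illustrative only): with ranges [2,5) and [7,9) in the
 * tree, sp_lookup(sp, 4, 8) may first hit [7,9), but the rb_prev() walk
 * backs up to [2,5), the lowest range intersecting [4,8), which is what
 * gets returned.
 */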
2499  
2500  /*
2501   * Insert a new shared policy into the tree.  Caller holds sp->lock for
2502   * writing.
2503   */
2504  static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2505  {
2506  	struct rb_node **p = &sp->root.rb_node;
2507  	struct rb_node *parent = NULL;
2508  	struct sp_node *nd;
2509  
2510  	while (*p) {
2511  		parent = *p;
2512  		nd = rb_entry(parent, struct sp_node, nd);
2513  		if (new->start < nd->start)
2514  			p = &(*p)->rb_left;
2515  		else if (new->end > nd->end)
2516  			p = &(*p)->rb_right;
2517  		else
2518  			BUG();
2519  	}
2520  	rb_link_node(&new->nd, parent, p);
2521  	rb_insert_color(&new->nd, &sp->root);
2522  	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2523  		 new->policy ? new->policy->mode : 0);
2524  }
2525  
2526  /* Find shared policy intersecting idx */
2527  struct mempolicy *
2528  mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2529  {
2530  	struct mempolicy *pol = NULL;
2531  	struct sp_node *sn;
2532  
2533  	if (!sp->root.rb_node)
2534  		return NULL;
2535  	read_lock(&sp->lock);
2536  	sn = sp_lookup(sp, idx, idx+1);
2537  	if (sn) {
2538  		mpol_get(sn->policy);
2539  		pol = sn->policy;
2540  	}
2541  	read_unlock(&sp->lock);
2542  	return pol;
2543  }
2544  
2545  static void sp_free(struct sp_node *n)
2546  {
2547  	mpol_put(n->policy);
2548  	kmem_cache_free(sn_cache, n);
2549  }
2550  
2551  /**
2552   * mpol_misplaced - check whether current page node is valid in policy
2553   *
2554   * @page: page to be checked
2555   * @vma: vm area where page mapped
2556   * @addr: virtual address where page mapped
2557   *
2558   * Lookup current policy node id for vma,addr and "compare to" page's
2559   * node id.  Policy determination "mimics" alloc_page_vma().
2560   * Called from fault path where we know the vma and faulting address.
2561   *
2562   * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2563   * policy, or a suitable node ID to allocate a replacement page from.
2564   */
2565  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2566  {
2567  	struct mempolicy *pol;
2568  	struct zoneref *z;
2569  	int curnid = page_to_nid(page);
2570  	unsigned long pgoff;
2571  	int thiscpu = raw_smp_processor_id();
2572  	int thisnid = cpu_to_node(thiscpu);
2573  	int polnid = NUMA_NO_NODE;
2574  	int ret = NUMA_NO_NODE;
2575  
2576  	pol = get_vma_policy(vma, addr);
2577  	if (!(pol->flags & MPOL_F_MOF))
2578  		goto out;
2579  
2580  	switch (pol->mode) {
2581  	case MPOL_INTERLEAVE:
2582  		pgoff = vma->vm_pgoff;
2583  		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2584  		polnid = offset_il_node(pol, pgoff);
2585  		break;
2586  
2587  	case MPOL_PREFERRED:
2588  		if (node_isset(curnid, pol->nodes))
2589  			goto out;
2590  		polnid = first_node(pol->nodes);
2591  		break;
2592  
2593  	case MPOL_LOCAL:
2594  		polnid = numa_node_id();
2595  		break;
2596  
2597  	case MPOL_BIND:
2598  		/* Optimize placement among multiple nodes via NUMA balancing */
2599  		if (pol->flags & MPOL_F_MORON) {
2600  			if (node_isset(thisnid, pol->nodes))
2601  				break;
2602  			goto out;
2603  		}
2604  		fallthrough;
2605  
2606  	case MPOL_PREFERRED_MANY:
2607  		/*
2608  		 * use current page if in policy nodemask,
2609  		 * else select nearest allowed node, if any.
2610  		 * If no allowed nodes, use current [!misplaced].
2611  		 */
2612  		if (node_isset(curnid, pol->nodes))
2613  			goto out;
2614  		z = first_zones_zonelist(
2615  				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2616  				gfp_zone(GFP_HIGHUSER),
2617  				&pol->nodes);
2618  		polnid = zone_to_nid(z->zone);
2619  		break;
2620  
2621  	default:
2622  		BUG();
2623  	}
2624  
2625  	/* Migrate the page towards the node whose CPU is referencing it */
2626  	if (pol->flags & MPOL_F_MORON) {
2627  		polnid = thisnid;
2628  
2629  		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2630  			goto out;
2631  	}
2632  
2633  	if (curnid != polnid)
2634  		ret = polnid;
2635  out:
2636  	mpol_cond_put(pol);
2637  
2638  	return ret;
2639  }
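
/*
 * Illustrative sketch (not from the upstream source): roughly how a NUMA
 * hinting fault consumes the result:
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == NUMA_NO_NODE)
 *		... page is already on an acceptable node, leave it ...
 *	else
 *		... try to migrate the page to target_nid ...
 */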
2640  
2641  /*
2642   * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2643   * dropped after task->mempolicy is set to NULL so that any allocation done as
2644   * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2645   * policy.
2646   */
2647  void mpol_put_task_policy(struct task_struct *task)
2648  {
2649  	struct mempolicy *pol;
2650  
2651  	task_lock(task);
2652  	pol = task->mempolicy;
2653  	task->mempolicy = NULL;
2654  	task_unlock(task);
2655  	mpol_put(pol);
2656  }
2657  
2658  static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2659  {
2660  	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2661  	rb_erase(&n->nd, &sp->root);
2662  	sp_free(n);
2663  }
2664  
2665  static void sp_node_init(struct sp_node *node, unsigned long start,
2666  			unsigned long end, struct mempolicy *pol)
2667  {
2668  	node->start = start;
2669  	node->end = end;
2670  	node->policy = pol;
2671  }
2672  
2673  static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2674  				struct mempolicy *pol)
2675  {
2676  	struct sp_node *n;
2677  	struct mempolicy *newpol;
2678  
2679  	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2680  	if (!n)
2681  		return NULL;
2682  
2683  	newpol = mpol_dup(pol);
2684  	if (IS_ERR(newpol)) {
2685  		kmem_cache_free(sn_cache, n);
2686  		return NULL;
2687  	}
2688  	newpol->flags |= MPOL_F_SHARED;
2689  	sp_node_init(n, start, end, newpol);
2690  
2691  	return n;
2692  }
2693  
2694  /* Replace a policy range. */
2695  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2696  				 unsigned long end, struct sp_node *new)
2697  {
2698  	struct sp_node *n;
2699  	struct sp_node *n_new = NULL;
2700  	struct mempolicy *mpol_new = NULL;
2701  	int ret = 0;
2702  
2703  restart:
2704  	write_lock(&sp->lock);
2705  	n = sp_lookup(sp, start, end);
2706  	/* Take care of old policies in the same range. */
2707  	while (n && n->start < end) {
2708  		struct rb_node *next = rb_next(&n->nd);
2709  		if (n->start >= start) {
2710  			if (n->end <= end)
2711  				sp_delete(sp, n);
2712  			else
2713  				n->start = end;
2714  		} else {
2715  			/* Old policy spanning whole new range. */
2716  			if (n->end > end) {
2717  				if (!n_new)
2718  					goto alloc_new;
2719  
2720  				*mpol_new = *n->policy;
2721  				atomic_set(&mpol_new->refcnt, 1);
2722  				sp_node_init(n_new, end, n->end, mpol_new);
2723  				n->end = start;
2724  				sp_insert(sp, n_new);
2725  				n_new = NULL;
2726  				mpol_new = NULL;
2727  				break;
2728  			} else
2729  				n->end = start;
2730  		}
2731  		if (!next)
2732  			break;
2733  		n = rb_entry(next, struct sp_node, nd);
2734  	}
2735  	if (new)
2736  		sp_insert(sp, new);
2737  	write_unlock(&sp->lock);
2738  	ret = 0;
2739  
2740  err_out:
2741  	if (mpol_new)
2742  		mpol_put(mpol_new);
2743  	if (n_new)
2744  		kmem_cache_free(sn_cache, n_new);
2745  
2746  	return ret;
2747  
2748  alloc_new:
2749  	write_unlock(&sp->lock);
2750  	ret = -ENOMEM;
2751  	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2752  	if (!n_new)
2753  		goto err_out;
2754  	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2755  	if (!mpol_new)
2756  		goto err_out;
2757  	atomic_set(&mpol_new->refcnt, 1);
2758  	goto restart;
2759  }
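
/*
 * Worked example (illustrative only): if the tree holds one node [0,10)
 * with policy A and the new range is [3,7) with policy B, the old node is
 * trimmed to [0,3), a duplicate of A is inserted for [7,10) (using the
 * preallocated n_new/mpol_new), and B is inserted for [3,7).
 */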
2760  
2761  /**
2762   * mpol_shared_policy_init - initialize shared policy for inode
2763   * @sp: pointer to inode shared policy
2764   * @mpol:  struct mempolicy to install
2765   *
2766   * Install non-NULL @mpol in inode's shared policy rb-tree.
2767   * On entry, the current task has a reference on a non-NULL @mpol.
2768   * This must be released on exit.
2769   * This is called at get_inode() time, so we can use GFP_KERNEL.
2770   */
2771  void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2772  {
2773  	int ret;
2774  
2775  	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2776  	rwlock_init(&sp->lock);
2777  
2778  	if (mpol) {
2779  		struct vm_area_struct pvma;
2780  		struct mempolicy *new;
2781  		NODEMASK_SCRATCH(scratch);
2782  
2783  		if (!scratch)
2784  			goto put_mpol;
2785  		/* contextualize the tmpfs mount point mempolicy */
2786  		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2787  		if (IS_ERR(new))
2788  			goto free_scratch; /* no valid nodemask intersection */
2789  
2790  		task_lock(current);
2791  		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2792  		task_unlock(current);
2793  		if (ret)
2794  			goto put_new;
2795  
2796  		/* Create pseudo-vma that contains just the policy */
2797  		vma_init(&pvma, NULL);
2798  		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2799  		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2800  
2801  put_new:
2802  		mpol_put(new);			/* drop initial ref */
2803  free_scratch:
2804  		NODEMASK_SCRATCH_FREE(scratch);
2805  put_mpol:
2806  		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2807  	}
2808  }
2809  
2810  int mpol_set_shared_policy(struct shared_policy *info,
2811  			struct vm_area_struct *vma, struct mempolicy *npol)
2812  {
2813  	int err;
2814  	struct sp_node *new = NULL;
2815  	unsigned long sz = vma_pages(vma);
2816  
2817  	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2818  		 vma->vm_pgoff,
2819  		 sz, npol ? npol->mode : -1,
2820  		 npol ? npol->flags : -1,
2821  		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2822  
2823  	if (npol) {
2824  		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2825  		if (!new)
2826  			return -ENOMEM;
2827  	}
2828  	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2829  	if (err && new)
2830  		sp_free(new);
2831  	return err;
2832  }
2833  
2834  /* Free a backing policy store on inode delete. */
2835  void mpol_free_shared_policy(struct shared_policy *p)
2836  {
2837  	struct sp_node *n;
2838  	struct rb_node *next;
2839  
2840  	if (!p->root.rb_node)
2841  		return;
2842  	write_lock(&p->lock);
2843  	next = rb_first(&p->root);
2844  	while (next) {
2845  		n = rb_entry(next, struct sp_node, nd);
2846  		next = rb_next(&n->nd);
2847  		sp_delete(p, n);
2848  	}
2849  	write_unlock(&p->lock);
2850  }
2851  
2852  #ifdef CONFIG_NUMA_BALANCING
2853  static int __initdata numabalancing_override;
2854  
2855  static void __init check_numabalancing_enable(void)
2856  {
2857  	bool numabalancing_default = false;
2858  
2859  	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2860  		numabalancing_default = true;
2861  
2862  	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2863  	if (numabalancing_override)
2864  		set_numabalancing_state(numabalancing_override == 1);
2865  
2866  	if (num_online_nodes() > 1 && !numabalancing_override) {
2867  		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2868  			numabalancing_default ? "Enabling" : "Disabling");
2869  		set_numabalancing_state(numabalancing_default);
2870  	}
2871  }
2872  
2873  static int __init setup_numabalancing(char *str)
2874  {
2875  	int ret = 0;
2876  	if (!str)
2877  		goto out;
2878  
2879  	if (!strcmp(str, "enable")) {
2880  		numabalancing_override = 1;
2881  		ret = 1;
2882  	} else if (!strcmp(str, "disable")) {
2883  		numabalancing_override = -1;
2884  		ret = 1;
2885  	}
2886  out:
2887  	if (!ret)
2888  		pr_warn("Unable to parse numa_balancing=\n");
2889  
2890  	return ret;
2891  }
2892  __setup("numa_balancing=", setup_numabalancing);
2893  #else
2894  static inline void __init check_numabalancing_enable(void)
2895  {
2896  }
2897  #endif /* CONFIG_NUMA_BALANCING */
2898  
2899  /* assumes fs == KERNEL_DS */
2900  void __init numa_policy_init(void)
2901  {
2902  	nodemask_t interleave_nodes;
2903  	unsigned long largest = 0;
2904  	int nid, prefer = 0;
2905  
2906  	policy_cache = kmem_cache_create("numa_policy",
2907  					 sizeof(struct mempolicy),
2908  					 0, SLAB_PANIC, NULL);
2909  
2910  	sn_cache = kmem_cache_create("shared_policy_node",
2911  				     sizeof(struct sp_node),
2912  				     0, SLAB_PANIC, NULL);
2913  
2914  	for_each_node(nid) {
2915  		preferred_node_policy[nid] = (struct mempolicy) {
2916  			.refcnt = ATOMIC_INIT(1),
2917  			.mode = MPOL_PREFERRED,
2918  			.flags = MPOL_F_MOF | MPOL_F_MORON,
2919  			.nodes = nodemask_of_node(nid),
2920  		};
2921  	}
2922  
2923  	/*
2924  	 * Set interleaving policy for system init. Interleaving is only
2925  	 * enabled across suitably sized nodes (default is >= 16MB), or
2926  	 * fall back to the largest node if they're all smaller.
2927  	 */
2928  	nodes_clear(interleave_nodes);
2929  	for_each_node_state(nid, N_MEMORY) {
2930  		unsigned long total_pages = node_present_pages(nid);
2931  
2932  		/* Preserve the largest node */
2933  		if (largest < total_pages) {
2934  			largest = total_pages;
2935  			prefer = nid;
2936  		}
2937  
2938  		/* Interleave this node? */
2939  		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2940  			node_set(nid, interleave_nodes);
2941  	}
2942  
2943  	/* All too small, use the largest */
2944  	if (unlikely(nodes_empty(interleave_nodes)))
2945  		node_set(prefer, interleave_nodes);
2946  
2947  	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2948  		pr_err("%s: interleaving failed\n", __func__);
2949  
2950  	check_numabalancing_enable();
2951  }
2952  
2953  /* Reset policy of current process to default */
2954  void numa_default_policy(void)
2955  {
2956  	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2957  }
2958  
2959  /*
2960   * Parse and format mempolicy from/to strings
2961   */
2962  
2963  static const char * const policy_modes[] =
2964  {
2965  	[MPOL_DEFAULT]    = "default",
2966  	[MPOL_PREFERRED]  = "prefer",
2967  	[MPOL_BIND]       = "bind",
2968  	[MPOL_INTERLEAVE] = "interleave",
2969  	[MPOL_LOCAL]      = "local",
2970  	[MPOL_PREFERRED_MANY]  = "prefer (many)",
2971  };
2972  
2973  
2974  #ifdef CONFIG_TMPFS
2975  /**
2976   * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2977   * @str:  string containing mempolicy to parse
2978   * @mpol:  pointer to struct mempolicy pointer, returned on success.
2979   *
2980   * Format of input:
2981   *	<mode>[=<flags>][:<nodelist>]
2982   *
2983   * Return: %0 on success, else %1
2984   */
2985  int mpol_parse_str(char *str, struct mempolicy **mpol)
2986  {
2987  	struct mempolicy *new = NULL;
2988  	unsigned short mode_flags;
2989  	nodemask_t nodes;
2990  	char *nodelist = strchr(str, ':');
2991  	char *flags = strchr(str, '=');
2992  	int err = 1, mode;
2993  
2994  	if (flags)
2995  		*flags++ = '\0';	/* terminate mode string */
2996  
2997  	if (nodelist) {
2998  		/* NUL-terminate mode or flags string */
2999  		*nodelist++ = '\0';
3000  		if (nodelist_parse(nodelist, nodes))
3001  			goto out;
3002  		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3003  			goto out;
3004  	} else
3005  		nodes_clear(nodes);
3006  
3007  	mode = match_string(policy_modes, MPOL_MAX, str);
3008  	if (mode < 0)
3009  		goto out;
3010  
3011  	switch (mode) {
3012  	case MPOL_PREFERRED:
3013  		/*
3014  		 * Insist on a nodelist of one node only, although later
3015  		 * we use first_node(nodes) to grab a single node, so here
3016  		 * nodelist (or nodes) cannot be empty.
3017  		 */
3018  		if (nodelist) {
3019  			char *rest = nodelist;
3020  			while (isdigit(*rest))
3021  				rest++;
3022  			if (*rest)
3023  				goto out;
3024  			if (nodes_empty(nodes))
3025  				goto out;
3026  		}
3027  		break;
3028  	case MPOL_INTERLEAVE:
3029  		/*
3030  		 * Default to online nodes with memory if no nodelist
3031  		 */
3032  		if (!nodelist)
3033  			nodes = node_states[N_MEMORY];
3034  		break;
3035  	case MPOL_LOCAL:
3036  		/*
3037  		 * Don't allow a nodelist;  mpol_new() checks flags
3038  		 */
3039  		if (nodelist)
3040  			goto out;
3041  		break;
3042  	case MPOL_DEFAULT:
3043  		/*
3044  		 * Insist on an empty nodelist
3045  		 */
3046  		if (!nodelist)
3047  			err = 0;
3048  		goto out;
3049  	case MPOL_PREFERRED_MANY:
3050  	case MPOL_BIND:
3051  		/*
3052  		 * Insist on a nodelist
3053  		 */
3054  		if (!nodelist)
3055  			goto out;
3056  	}
3057  
3058  	mode_flags = 0;
3059  	if (flags) {
3060  		/*
3061  		 * Currently, we only support two mutually exclusive
3062  		 * mode flags.
3063  		 */
3064  		if (!strcmp(flags, "static"))
3065  			mode_flags |= MPOL_F_STATIC_NODES;
3066  		else if (!strcmp(flags, "relative"))
3067  			mode_flags |= MPOL_F_RELATIVE_NODES;
3068  		else
3069  			goto out;
3070  	}
3071  
3072  	new = mpol_new(mode, mode_flags, &nodes);
3073  	if (IS_ERR(new))
3074  		goto out;
3075  
3076  	/*
3077  	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3078  	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3079  	 */
3080  	if (mode != MPOL_PREFERRED) {
3081  		new->nodes = nodes;
3082  	} else if (nodelist) {
3083  		nodes_clear(new->nodes);
3084  		node_set(first_node(nodes), new->nodes);
3085  	} else {
3086  		new->mode = MPOL_LOCAL;
3087  	}
3088  
3089  	/*
3090  	 * Save nodes for contextualization: this will be used to "clone"
3091  	 * the mempolicy in a specific context [cpuset] at a later time.
3092  	 */
3093  	new->w.user_nodemask = nodes;
3094  
3095  	err = 0;
3096  
3097  out:
3098  	/* Restore string for error message */
3099  	if (nodelist)
3100  		*--nodelist = ':';
3101  	if (flags)
3102  		*--flags = '=';
3103  	if (!err)
3104  		*mpol = new;
3105  	return err;
3106  }
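
/*
 * Illustrative examples (not from the upstream source) of accepted
 * strings: "interleave:0-3", "bind=static:0,2", "prefer=relative:1",
 * "prefer (many):0-3", "local" and "default" (the last two take no
 * nodelist).
 */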
3107  #endif /* CONFIG_TMPFS */
3108  
3109  /**
3110   * mpol_to_str - format a mempolicy structure for printing
3111   * @buffer:  to contain formatted mempolicy string
3112   * @maxlen:  length of @buffer
3113   * @pol:  pointer to mempolicy to be formatted
3114   *
3115   * Convert @pol into a string.  If @buffer is too short, truncate the string.
3116   * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3117   * longest flag, "relative", and to display at least a few node ids.
3118   */
3119  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3120  {
3121  	char *p = buffer;
3122  	nodemask_t nodes = NODE_MASK_NONE;
3123  	unsigned short mode = MPOL_DEFAULT;
3124  	unsigned short flags = 0;
3125  
3126  	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3127  		mode = pol->mode;
3128  		flags = pol->flags;
3129  	}
3130  
3131  	switch (mode) {
3132  	case MPOL_DEFAULT:
3133  	case MPOL_LOCAL:
3134  		break;
3135  	case MPOL_PREFERRED:
3136  	case MPOL_PREFERRED_MANY:
3137  	case MPOL_BIND:
3138  	case MPOL_INTERLEAVE:
3139  		nodes = pol->nodes;
3140  		break;
3141  	default:
3142  		WARN_ON_ONCE(1);
3143  		snprintf(p, maxlen, "unknown");
3144  		return;
3145  	}
3146  
3147  	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3148  
3149  	if (flags & MPOL_MODE_FLAGS) {
3150  		p += snprintf(p, buffer + maxlen - p, "=");
3151  
3152  		/*
3153  		 * Currently, the only defined flags are mutually exclusive
3154  		 */
3155  		if (flags & MPOL_F_STATIC_NODES)
3156  			p += snprintf(p, buffer + maxlen - p, "static");
3157  		else if (flags & MPOL_F_RELATIVE_NODES)
3158  			p += snprintf(p, buffer + maxlen - p, "relative");
3159  	}
3160  
3161  	if (!nodes_empty(nodes))
3162  		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3163  			       nodemask_pr_args(&nodes));
3164  }
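
/*
 * Illustrative example outputs (not from the upstream source):
 * "default", "local", "prefer:7", "bind=static:0,2" and "interleave:0-3";
 * hence the recommended @maxlen of at least 32.
 */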
3165