146aeb7e6SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 21da177e4SLinus Torvalds /* 31da177e4SLinus Torvalds * Simple NUMA memory policy for the Linux kernel. 41da177e4SLinus Torvalds * 51da177e4SLinus Torvalds * Copyright 2003,2004 Andi Kleen, SuSE Labs. 68bccd85fSChristoph Lameter * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 71da177e4SLinus Torvalds * 81da177e4SLinus Torvalds * NUMA policy allows the user to give hints in which node(s) memory should 91da177e4SLinus Torvalds * be allocated. 101da177e4SLinus Torvalds * 111da177e4SLinus Torvalds * Support four policies per VMA and per process: 121da177e4SLinus Torvalds * 131da177e4SLinus Torvalds * The VMA policy has priority over the process policy for a page fault. 141da177e4SLinus Torvalds * 151da177e4SLinus Torvalds * interleave Allocate memory interleaved over a set of nodes, 161da177e4SLinus Torvalds * with normal fallback if it fails. 171da177e4SLinus Torvalds * For VMA based allocations this interleaves based on the 181da177e4SLinus Torvalds * offset into the backing object or offset into the mapping 191da177e4SLinus Torvalds * for anonymous memory. For process policy an process counter 201da177e4SLinus Torvalds * is used. 218bccd85fSChristoph Lameter * 22fa3bea4eSGregory Price * weighted interleave 23fa3bea4eSGregory Price * Allocate memory interleaved over a set of nodes based on 24fa3bea4eSGregory Price * a set of weights (per-node), with normal fallback if it 25fa3bea4eSGregory Price * fails. Otherwise operates the same as interleave. 26fa3bea4eSGregory Price * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated 27fa3bea4eSGregory Price * on node 0 for every 1 page allocated on node 1. 28fa3bea4eSGregory Price * 291da177e4SLinus Torvalds * bind Only allocate memory on a specific set of nodes, 301da177e4SLinus Torvalds * no fallback. 318bccd85fSChristoph Lameter * FIXME: memory is allocated starting with the first node 328bccd85fSChristoph Lameter * to the last. 
It would be better if bind would truly restrict 338bccd85fSChristoph Lameter * the allocation to memory nodes instead 348bccd85fSChristoph Lameter * 351da177e4SLinus Torvalds * preferred Try a specific node first before normal fallback. 3600ef2d2fSDavid Rientjes * As a special case NUMA_NO_NODE here means do the allocation 371da177e4SLinus Torvalds * on the local CPU. This is normally identical to default, 381da177e4SLinus Torvalds * but useful to set in a VMA when you have a non default 391da177e4SLinus Torvalds * process policy. 408bccd85fSChristoph Lameter * 41b27abaccSDave Hansen * preferred many Try a set of nodes first before normal fallback. This is 42b27abaccSDave Hansen * similar to preferred without the special case. 43b27abaccSDave Hansen * 441da177e4SLinus Torvalds * default Allocate on the local node first, or when on a VMA 451da177e4SLinus Torvalds * use the process policy. This is what Linux always did 461da177e4SLinus Torvalds * in a NUMA aware kernel and still does by, ahem, default. 471da177e4SLinus Torvalds * 481da177e4SLinus Torvalds * The process policy is applied for most non interrupt memory allocations 491da177e4SLinus Torvalds * in that process' context. Interrupts ignore the policies and always 501da177e4SLinus Torvalds * try to allocate on the local CPU. The VMA policy is only applied for memory 511da177e4SLinus Torvalds * allocations for a VMA in the VM. 521da177e4SLinus Torvalds * 531da177e4SLinus Torvalds * Currently there are a few corner cases in swapping where the policy 541da177e4SLinus Torvalds * is not applied, but the majority should be handled. When process policy 551da177e4SLinus Torvalds * is used it is not remembered over swap outs/swap ins. 561da177e4SLinus Torvalds * 571da177e4SLinus Torvalds * Only the highest zone in the zone hierarchy gets policied. Allocations 581da177e4SLinus Torvalds * requesting a lower zone just use default policy. 
This implies that 591da177e4SLinus Torvalds * on systems with highmem kernel lowmem allocation don't get policied. 601da177e4SLinus Torvalds * Same with GFP_DMA allocations. 611da177e4SLinus Torvalds * 62c36f6e6dSHugh Dickins * For shmem/tmpfs shared memory the policy is shared between 631da177e4SLinus Torvalds * all users and remembered even when nobody has memory mapped. 641da177e4SLinus Torvalds */ 651da177e4SLinus Torvalds 661da177e4SLinus Torvalds /* Notebook: 671da177e4SLinus Torvalds fix mmap readahead to honour policy and enable policy for any page cache 681da177e4SLinus Torvalds object 691da177e4SLinus Torvalds statistics for bigpages 701da177e4SLinus Torvalds global policy for page cache? currently it uses process policy. Requires 711da177e4SLinus Torvalds first item above. 721da177e4SLinus Torvalds handle mremap for shared memory (currently ignored for the policy) 731da177e4SLinus Torvalds grows down? 741da177e4SLinus Torvalds make bind policy root only? It can trigger oom much faster and the 751da177e4SLinus Torvalds kernel is not always grateful with that. 
761da177e4SLinus Torvalds */ 771da177e4SLinus Torvalds 78b1de0d13SMitchel Humpherys #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79b1de0d13SMitchel Humpherys 801da177e4SLinus Torvalds #include <linux/mempolicy.h> 81a520110eSChristoph Hellwig #include <linux/pagewalk.h> 821da177e4SLinus Torvalds #include <linux/highmem.h> 831da177e4SLinus Torvalds #include <linux/hugetlb.h> 841da177e4SLinus Torvalds #include <linux/kernel.h> 851da177e4SLinus Torvalds #include <linux/sched.h> 866e84f315SIngo Molnar #include <linux/sched/mm.h> 876a3827d7SIngo Molnar #include <linux/sched/numa_balancing.h> 88f719ff9bSIngo Molnar #include <linux/sched/task.h> 891da177e4SLinus Torvalds #include <linux/nodemask.h> 901da177e4SLinus Torvalds #include <linux/cpuset.h> 911da177e4SLinus Torvalds #include <linux/slab.h> 921da177e4SLinus Torvalds #include <linux/string.h> 93b95f1b31SPaul Gortmaker #include <linux/export.h> 94b488893aSPavel Emelyanov #include <linux/nsproxy.h> 951da177e4SLinus Torvalds #include <linux/interrupt.h> 961da177e4SLinus Torvalds #include <linux/init.h> 971da177e4SLinus Torvalds #include <linux/compat.h> 9831367466SOtto Ebeling #include <linux/ptrace.h> 99dc9aa5b9SChristoph Lameter #include <linux/swap.h> 1001a75a6c8SChristoph Lameter #include <linux/seq_file.h> 1011a75a6c8SChristoph Lameter #include <linux/proc_fs.h> 102b20a3503SChristoph Lameter #include <linux/migrate.h> 10362b61f61SHugh Dickins #include <linux/ksm.h> 10495a402c3SChristoph Lameter #include <linux/rmap.h> 10586c3a764SDavid Quigley #include <linux/security.h> 106dbcb0f19SAdrian Bunk #include <linux/syscalls.h> 107095f1fc4SLee Schermerhorn #include <linux/ctype.h> 1086d9c285aSKOSAKI Motohiro #include <linux/mm_inline.h> 109b24f53a0SLee Schermerhorn #include <linux/mmu_notifier.h> 110b1de0d13SMitchel Humpherys #include <linux/printk.h> 111c8633798SNaoya Horiguchi #include <linux/swapops.h> 112dc9aa5b9SChristoph Lameter 1131da177e4SLinus Torvalds #include <asm/tlbflush.h> 1144a18419fSNadav Amit #include 
<asm/tlb.h> 1157c0f6ba6SLinus Torvalds #include <linux/uaccess.h> 1161da177e4SLinus Torvalds 11762695a84SNick Piggin #include "internal.h" 11862695a84SNick Piggin 11938e35860SChristoph Lameter /* Internal flags */ 120dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 12138e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 1221cb5d11aSHugh Dickins #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ 123dc9aa5b9SChristoph Lameter 124fcc234f8SPekka Enberg static struct kmem_cache *policy_cache; 125fcc234f8SPekka Enberg static struct kmem_cache *sn_cache; 1261da177e4SLinus Torvalds 1271da177e4SLinus Torvalds /* Highest zone. An specific allocation for a zone below that is not 1281da177e4SLinus Torvalds policied. */ 1296267276fSChristoph Lameter enum zone_type policy_zone = 0; 1301da177e4SLinus Torvalds 131bea904d5SLee Schermerhorn /* 132bea904d5SLee Schermerhorn * run-time system-wide default policy => local allocation 133bea904d5SLee Schermerhorn */ 134e754d79dSH Hartley Sweeten static struct mempolicy default_policy = { 1351da177e4SLinus Torvalds .refcnt = ATOMIC_INIT(1), /* never free it */ 1367858d7bcSFeng Tang .mode = MPOL_LOCAL, 1371da177e4SLinus Torvalds }; 1381da177e4SLinus Torvalds 1395606e387SMel Gorman static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 1405606e387SMel Gorman 141dce41f5aSRakie Kim /* 142dce41f5aSRakie Kim * iw_table is the sysfs-set interleave weight table, a value of 0 denotes 143dce41f5aSRakie Kim * system-default value should be used. A NULL iw_table also denotes that 144dce41f5aSRakie Kim * system-default values should be used. Until the system-default table 145dce41f5aSRakie Kim * is implemented, the system-default is always 1. 
146dce41f5aSRakie Kim * 147dce41f5aSRakie Kim * iw_table is RCU protected 148dce41f5aSRakie Kim */ 149dce41f5aSRakie Kim static u8 __rcu *iw_table; 150dce41f5aSRakie Kim static DEFINE_MUTEX(iw_table_lock); 151dce41f5aSRakie Kim 152dce41f5aSRakie Kim static u8 get_il_weight(int node) 153dce41f5aSRakie Kim { 154dce41f5aSRakie Kim u8 *table; 155dce41f5aSRakie Kim u8 weight; 156dce41f5aSRakie Kim 157dce41f5aSRakie Kim rcu_read_lock(); 158dce41f5aSRakie Kim table = rcu_dereference(iw_table); 159dce41f5aSRakie Kim /* if no iw_table, use system default */ 160dce41f5aSRakie Kim weight = table ? table[node] : 1; 161dce41f5aSRakie Kim /* if value in iw_table is 0, use system default */ 162dce41f5aSRakie Kim weight = weight ? weight : 1; 163dce41f5aSRakie Kim rcu_read_unlock(); 164dce41f5aSRakie Kim return weight; 165dce41f5aSRakie Kim } 166dce41f5aSRakie Kim 167b2ca916cSDan Williams /** 168b1f099b1SYury Norov * numa_nearest_node - Find nearest node by state 169f6e92f40SKrzysztof Kozlowski * @node: Node id to start the search 170b1f099b1SYury Norov * @state: State to filter the search 171b2ca916cSDan Williams * 172b1f099b1SYury Norov * Lookup the closest node by distance if @nid is not in state. 
173dad5b023SRandy Dunlap * 174b1f099b1SYury Norov * Return: this @node if it is in state, otherwise the closest node by distance 175b2ca916cSDan Williams */ 176b1f099b1SYury Norov int numa_nearest_node(int node, unsigned int state) 177b2ca916cSDan Williams { 1784fcbe96eSDan Williams int min_dist = INT_MAX, dist, n, min_node; 179b2ca916cSDan Williams 180b1f099b1SYury Norov if (state >= NR_NODE_STATES) 181b1f099b1SYury Norov return -EINVAL; 182b1f099b1SYury Norov 183b1f099b1SYury Norov if (node == NUMA_NO_NODE || node_state(node, state)) 1844fcbe96eSDan Williams return node; 185b2ca916cSDan Williams 186b2ca916cSDan Williams min_node = node; 187b1f099b1SYury Norov for_each_node_state(n, state) { 188b2ca916cSDan Williams dist = node_distance(node, n); 189b2ca916cSDan Williams if (dist < min_dist) { 190b2ca916cSDan Williams min_dist = dist; 191b2ca916cSDan Williams min_node = n; 192b2ca916cSDan Williams } 193b2ca916cSDan Williams } 194b2ca916cSDan Williams 195b2ca916cSDan Williams return min_node; 196b2ca916cSDan Williams } 197b1f099b1SYury Norov EXPORT_SYMBOL_GPL(numa_nearest_node); 198b2ca916cSDan Williams 19974d2c3a0SOleg Nesterov struct mempolicy *get_task_policy(struct task_struct *p) 2005606e387SMel Gorman { 2015606e387SMel Gorman struct mempolicy *pol = p->mempolicy; 202f15ca78eSOleg Nesterov int node; 2035606e387SMel Gorman 204f15ca78eSOleg Nesterov if (pol) 205f15ca78eSOleg Nesterov return pol; 2065606e387SMel Gorman 207f15ca78eSOleg Nesterov node = numa_node_id(); 2081da6f0e1SJianguo Wu if (node != NUMA_NO_NODE) { 2091da6f0e1SJianguo Wu pol = &preferred_node_policy[node]; 210f15ca78eSOleg Nesterov /* preferred_node_policy is not initialised early in boot */ 211f15ca78eSOleg Nesterov if (pol->mode) 212f15ca78eSOleg Nesterov return pol; 2131da6f0e1SJianguo Wu } 2145606e387SMel Gorman 215f15ca78eSOleg Nesterov return &default_policy; 2165606e387SMel Gorman } 2175606e387SMel Gorman 21837012946SDavid Rientjes static const struct mempolicy_operations { 
21937012946SDavid Rientjes int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 220213980c0SVlastimil Babka void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 22137012946SDavid Rientjes } mpol_ops[MPOL_MAX]; 22237012946SDavid Rientjes 223f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 224f5b087b5SDavid Rientjes { 2256d556294SBob Liu return pol->flags & MPOL_MODE_FLAGS; 2264c50bc01SDavid Rientjes } 2274c50bc01SDavid Rientjes 2284c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 2294c50bc01SDavid Rientjes const nodemask_t *rel) 2304c50bc01SDavid Rientjes { 2314c50bc01SDavid Rientjes nodemask_t tmp; 2324c50bc01SDavid Rientjes nodes_fold(tmp, *orig, nodes_weight(*rel)); 2334c50bc01SDavid Rientjes nodes_onto(*ret, tmp, *rel); 234f5b087b5SDavid Rientjes } 235f5b087b5SDavid Rientjes 236be897d48SFeng Tang static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 23737012946SDavid Rientjes { 23837012946SDavid Rientjes if (nodes_empty(*nodes)) 23937012946SDavid Rientjes return -EINVAL; 240269fbe72SBen Widawsky pol->nodes = *nodes; 24137012946SDavid Rientjes return 0; 24237012946SDavid Rientjes } 24337012946SDavid Rientjes 24437012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 24537012946SDavid Rientjes { 2467858d7bcSFeng Tang if (nodes_empty(*nodes)) 2477858d7bcSFeng Tang return -EINVAL; 248269fbe72SBen Widawsky 249269fbe72SBen Widawsky nodes_clear(pol->nodes); 250269fbe72SBen Widawsky node_set(first_node(*nodes), pol->nodes); 25137012946SDavid Rientjes return 0; 25237012946SDavid Rientjes } 25337012946SDavid Rientjes 25458568d2aSMiao Xie /* 25558568d2aSMiao Xie * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 25658568d2aSMiao Xie * any, for the new policy. 
mpol_new() has already validated the nodes 2577858d7bcSFeng Tang * parameter with respect to the policy mode and flags. 25858568d2aSMiao Xie * 25958568d2aSMiao Xie * Must be called holding task's alloc_lock to protect task's mems_allowed 260c1e8d7c6SMichel Lespinasse * and mempolicy. May also be called holding the mmap_lock for write. 26158568d2aSMiao Xie */ 2624bfc4495SKAMEZAWA Hiroyuki static int mpol_set_nodemask(struct mempolicy *pol, 2634bfc4495SKAMEZAWA Hiroyuki const nodemask_t *nodes, struct nodemask_scratch *nsc) 26458568d2aSMiao Xie { 26558568d2aSMiao Xie int ret; 26658568d2aSMiao Xie 2677858d7bcSFeng Tang /* 2687858d7bcSFeng Tang * Default (pol==NULL) resp. local memory policies are not a 2697858d7bcSFeng Tang * subject of any remapping. They also do not need any special 2707858d7bcSFeng Tang * constructor. 2717858d7bcSFeng Tang */ 2727858d7bcSFeng Tang if (!pol || pol->mode == MPOL_LOCAL) 27358568d2aSMiao Xie return 0; 2747858d7bcSFeng Tang 27501f13bd6SLai Jiangshan /* Check N_MEMORY */ 2764bfc4495SKAMEZAWA Hiroyuki nodes_and(nsc->mask1, 27701f13bd6SLai Jiangshan cpuset_current_mems_allowed, node_states[N_MEMORY]); 27858568d2aSMiao Xie 27958568d2aSMiao Xie VM_BUG_ON(!nodes); 2807858d7bcSFeng Tang 28158568d2aSMiao Xie if (pol->flags & MPOL_F_RELATIVE_NODES) 2824bfc4495SKAMEZAWA Hiroyuki mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); 28358568d2aSMiao Xie else 2844bfc4495SKAMEZAWA Hiroyuki nodes_and(nsc->mask2, *nodes, nsc->mask1); 2854bfc4495SKAMEZAWA Hiroyuki 28658568d2aSMiao Xie if (mpol_store_user_nodemask(pol)) 28758568d2aSMiao Xie pol->w.user_nodemask = *nodes; 28858568d2aSMiao Xie else 2897858d7bcSFeng Tang pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; 29058568d2aSMiao Xie 2914bfc4495SKAMEZAWA Hiroyuki ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 29258568d2aSMiao Xie return ret; 29358568d2aSMiao Xie } 29458568d2aSMiao Xie 29558568d2aSMiao Xie /* 29658568d2aSMiao Xie * This function just creates a new policy, does some 
check and simple 29758568d2aSMiao Xie * initialization. You must invoke mpol_set_nodemask() to set nodes. 29858568d2aSMiao Xie */ 299028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 300028fec41SDavid Rientjes nodemask_t *nodes) 3011da177e4SLinus Torvalds { 3021da177e4SLinus Torvalds struct mempolicy *policy; 3031da177e4SLinus Torvalds 3043e1f0645SDavid Rientjes if (mode == MPOL_DEFAULT) { 3053e1f0645SDavid Rientjes if (nodes && !nodes_empty(*nodes)) 30637012946SDavid Rientjes return ERR_PTR(-EINVAL); 307d3a71033SLee Schermerhorn return NULL; 30837012946SDavid Rientjes } 3093e1f0645SDavid Rientjes VM_BUG_ON(!nodes); 3103e1f0645SDavid Rientjes 3113e1f0645SDavid Rientjes /* 3123e1f0645SDavid Rientjes * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 3133e1f0645SDavid Rientjes * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 3143e1f0645SDavid Rientjes * All other modes require a valid pointer to a non-empty nodemask. 
3153e1f0645SDavid Rientjes */ 3163e1f0645SDavid Rientjes if (mode == MPOL_PREFERRED) { 3173e1f0645SDavid Rientjes if (nodes_empty(*nodes)) { 3183e1f0645SDavid Rientjes if (((flags & MPOL_F_STATIC_NODES) || 3193e1f0645SDavid Rientjes (flags & MPOL_F_RELATIVE_NODES))) 3203e1f0645SDavid Rientjes return ERR_PTR(-EINVAL); 3217858d7bcSFeng Tang 3227858d7bcSFeng Tang mode = MPOL_LOCAL; 3233e1f0645SDavid Rientjes } 324479e2802SPeter Zijlstra } else if (mode == MPOL_LOCAL) { 3258d303e44SPiotr Kwapulinski if (!nodes_empty(*nodes) || 3268d303e44SPiotr Kwapulinski (flags & MPOL_F_STATIC_NODES) || 3278d303e44SPiotr Kwapulinski (flags & MPOL_F_RELATIVE_NODES)) 328479e2802SPeter Zijlstra return ERR_PTR(-EINVAL); 3293e1f0645SDavid Rientjes } else if (nodes_empty(*nodes)) 3303e1f0645SDavid Rientjes return ERR_PTR(-EINVAL); 331c36f6e6dSHugh Dickins 3321da177e4SLinus Torvalds policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 3331da177e4SLinus Torvalds if (!policy) 3341da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 3351da177e4SLinus Torvalds atomic_set(&policy->refcnt, 1); 33645c4745aSLee Schermerhorn policy->mode = mode; 33737012946SDavid Rientjes policy->flags = flags; 338c6018b4bSAneesh Kumar K.V policy->home_node = NUMA_NO_NODE; 3393e1f0645SDavid Rientjes 34037012946SDavid Rientjes return policy; 34137012946SDavid Rientjes } 34237012946SDavid Rientjes 34352cd3b07SLee Schermerhorn /* Slow path of a mpol destructor. 
*/ 344c36f6e6dSHugh Dickins void __mpol_put(struct mempolicy *pol) 34552cd3b07SLee Schermerhorn { 346c36f6e6dSHugh Dickins if (!atomic_dec_and_test(&pol->refcnt)) 34752cd3b07SLee Schermerhorn return; 348c36f6e6dSHugh Dickins kmem_cache_free(policy_cache, pol); 34952cd3b07SLee Schermerhorn } 35052cd3b07SLee Schermerhorn 351213980c0SVlastimil Babka static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 35237012946SDavid Rientjes { 35337012946SDavid Rientjes } 35437012946SDavid Rientjes 355213980c0SVlastimil Babka static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 3561d0d2680SDavid Rientjes { 3571d0d2680SDavid Rientjes nodemask_t tmp; 3581d0d2680SDavid Rientjes 35937012946SDavid Rientjes if (pol->flags & MPOL_F_STATIC_NODES) 36037012946SDavid Rientjes nodes_and(tmp, pol->w.user_nodemask, *nodes); 36137012946SDavid Rientjes else if (pol->flags & MPOL_F_RELATIVE_NODES) 36237012946SDavid Rientjes mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 3631d0d2680SDavid Rientjes else { 364269fbe72SBen Widawsky nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, 365213980c0SVlastimil Babka *nodes); 36629b190faSzhong jiang pol->w.cpuset_mems_allowed = *nodes; 3671d0d2680SDavid Rientjes } 36837012946SDavid Rientjes 369708c1bbcSMiao Xie if (nodes_empty(tmp)) 370708c1bbcSMiao Xie tmp = *nodes; 371708c1bbcSMiao Xie 372269fbe72SBen Widawsky pol->nodes = tmp; 37337012946SDavid Rientjes } 37437012946SDavid Rientjes 37537012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol, 376213980c0SVlastimil Babka const nodemask_t *nodes) 37737012946SDavid Rientjes { 37837012946SDavid Rientjes pol->w.cpuset_mems_allowed = *nodes; 3791d0d2680SDavid Rientjes } 38037012946SDavid Rientjes 381708c1bbcSMiao Xie /* 382708c1bbcSMiao Xie * mpol_rebind_policy - Migrate a policy to a different set of nodes 383708c1bbcSMiao Xie * 384c1e8d7c6SMichel Lespinasse * Per-vma policies are protected by mmap_lock. 
Allocations using per-task 385213980c0SVlastimil Babka * policies are protected by task->mems_allowed_seq to prevent a premature 386213980c0SVlastimil Babka * OOM/allocation failure due to parallel nodemask modification. 387708c1bbcSMiao Xie */ 388213980c0SVlastimil Babka static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 38937012946SDavid Rientjes { 390018160adSWang Cheng if (!pol || pol->mode == MPOL_LOCAL) 39137012946SDavid Rientjes return; 3927858d7bcSFeng Tang if (!mpol_store_user_nodemask(pol) && 39337012946SDavid Rientjes nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 39437012946SDavid Rientjes return; 395708c1bbcSMiao Xie 396213980c0SVlastimil Babka mpol_ops[pol->mode].rebind(pol, newmask); 3971d0d2680SDavid Rientjes } 3981d0d2680SDavid Rientjes 3991d0d2680SDavid Rientjes /* 4001d0d2680SDavid Rientjes * Wrapper for mpol_rebind_policy() that just requires task 4011d0d2680SDavid Rientjes * pointer, and updates task mempolicy. 40258568d2aSMiao Xie * 40358568d2aSMiao Xie * Called with task's alloc_lock held. 4041d0d2680SDavid Rientjes */ 405213980c0SVlastimil Babka void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 4061d0d2680SDavid Rientjes { 407213980c0SVlastimil Babka mpol_rebind_policy(tsk->mempolicy, new); 4081d0d2680SDavid Rientjes } 4091d0d2680SDavid Rientjes 4101d0d2680SDavid Rientjes /* 4111d0d2680SDavid Rientjes * Rebind each vma in mm to new nodemask. 4121d0d2680SDavid Rientjes * 413c1e8d7c6SMichel Lespinasse * Call holding a reference to mm. Takes mm->mmap_lock during call. 4141d0d2680SDavid Rientjes */ 4151d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 4161d0d2680SDavid Rientjes { 4171d0d2680SDavid Rientjes struct vm_area_struct *vma; 41866850be5SLiam R. 
Howlett VMA_ITERATOR(vmi, mm, 0); 4191d0d2680SDavid Rientjes 420d8ed45c5SMichel Lespinasse mmap_write_lock(mm); 4216c21e066SJann Horn for_each_vma(vmi, vma) { 4226c21e066SJann Horn vma_start_write(vma); 423213980c0SVlastimil Babka mpol_rebind_policy(vma->vm_policy, new); 4246c21e066SJann Horn } 425d8ed45c5SMichel Lespinasse mmap_write_unlock(mm); 4261d0d2680SDavid Rientjes } 4271d0d2680SDavid Rientjes 42837012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 42937012946SDavid Rientjes [MPOL_DEFAULT] = { 43037012946SDavid Rientjes .rebind = mpol_rebind_default, 43137012946SDavid Rientjes }, 43237012946SDavid Rientjes [MPOL_INTERLEAVE] = { 433be897d48SFeng Tang .create = mpol_new_nodemask, 43437012946SDavid Rientjes .rebind = mpol_rebind_nodemask, 43537012946SDavid Rientjes }, 43637012946SDavid Rientjes [MPOL_PREFERRED] = { 43737012946SDavid Rientjes .create = mpol_new_preferred, 43837012946SDavid Rientjes .rebind = mpol_rebind_preferred, 43937012946SDavid Rientjes }, 44037012946SDavid Rientjes [MPOL_BIND] = { 441be897d48SFeng Tang .create = mpol_new_nodemask, 44237012946SDavid Rientjes .rebind = mpol_rebind_nodemask, 44337012946SDavid Rientjes }, 4447858d7bcSFeng Tang [MPOL_LOCAL] = { 4457858d7bcSFeng Tang .rebind = mpol_rebind_default, 4467858d7bcSFeng Tang }, 447b27abaccSDave Hansen [MPOL_PREFERRED_MANY] = { 448be897d48SFeng Tang .create = mpol_new_nodemask, 449b27abaccSDave Hansen .rebind = mpol_rebind_preferred, 450b27abaccSDave Hansen }, 451fa3bea4eSGregory Price [MPOL_WEIGHTED_INTERLEAVE] = { 452fa3bea4eSGregory Price .create = mpol_new_nodemask, 453fa3bea4eSGregory Price .rebind = mpol_rebind_nodemask, 454fa3bea4eSGregory Price }, 45537012946SDavid Rientjes }; 45637012946SDavid Rientjes 4571cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 458fc301289SChristoph Lameter unsigned long flags); 45972e315f7SHugh Dickins static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy 
*pol, 46072e315f7SHugh Dickins pgoff_t ilx, int *nid); 4611a75a6c8SChristoph Lameter 4621cb5d11aSHugh Dickins static bool strictly_unmovable(unsigned long flags) 4631cb5d11aSHugh Dickins { 4641cb5d11aSHugh Dickins /* 4651cb5d11aSHugh Dickins * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO 4661cb5d11aSHugh Dickins * if any misplaced page is found. 4671cb5d11aSHugh Dickins */ 4681cb5d11aSHugh Dickins return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == 4691cb5d11aSHugh Dickins MPOL_MF_STRICT; 4701cb5d11aSHugh Dickins } 4711cb5d11aSHugh Dickins 47288c91dc5SHugh Dickins struct migration_mpol { /* for alloc_migration_target_by_mpol() */ 47388c91dc5SHugh Dickins struct mempolicy *pol; 47488c91dc5SHugh Dickins pgoff_t ilx; 47588c91dc5SHugh Dickins }; 476dc9aa5b9SChristoph Lameter 4776f4576e3SNaoya Horiguchi struct queue_pages { 4786f4576e3SNaoya Horiguchi struct list_head *pagelist; 4796f4576e3SNaoya Horiguchi unsigned long flags; 4806f4576e3SNaoya Horiguchi nodemask_t *nmask; 481f18da660SLi Xinhai unsigned long start; 482f18da660SLi Xinhai unsigned long end; 483f18da660SLi Xinhai struct vm_area_struct *first; 4841cb5d11aSHugh Dickins struct folio *large; /* note last large folio encountered */ 4851cb5d11aSHugh Dickins long nr_failed; /* could not be isolated at this time */ 4866f4576e3SNaoya Horiguchi }; 4876f4576e3SNaoya Horiguchi 48898094945SNaoya Horiguchi /* 489d451b89dSVishal Moola (Oracle) * Check if the folio's nid is in qp->nmask. 49088aaa2a1SNaoya Horiguchi * 49188aaa2a1SNaoya Horiguchi * If MPOL_MF_INVERT is set in qp->flags, check if the nid is 49288aaa2a1SNaoya Horiguchi * in the invert of qp->nmask. 
49388aaa2a1SNaoya Horiguchi */ 494d451b89dSVishal Moola (Oracle) static inline bool queue_folio_required(struct folio *folio, 49588aaa2a1SNaoya Horiguchi struct queue_pages *qp) 49688aaa2a1SNaoya Horiguchi { 497d451b89dSVishal Moola (Oracle) int nid = folio_nid(folio); 49888aaa2a1SNaoya Horiguchi unsigned long flags = qp->flags; 49988aaa2a1SNaoya Horiguchi 50088aaa2a1SNaoya Horiguchi return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); 50188aaa2a1SNaoya Horiguchi } 50288aaa2a1SNaoya Horiguchi 5031cb5d11aSHugh Dickins static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) 504c8633798SNaoya Horiguchi { 505de1f5055SVishal Moola (Oracle) struct folio *folio; 506c8633798SNaoya Horiguchi struct queue_pages *qp = walk->private; 507c8633798SNaoya Horiguchi 508c8633798SNaoya Horiguchi if (unlikely(is_pmd_migration_entry(*pmd))) { 5091cb5d11aSHugh Dickins qp->nr_failed++; 5101cb5d11aSHugh Dickins return; 511c8633798SNaoya Horiguchi } 512de1f5055SVishal Moola (Oracle) folio = pfn_folio(pmd_pfn(*pmd)); 513de1f5055SVishal Moola (Oracle) if (is_huge_zero_page(&folio->page)) { 514e5947d23SYang Shi walk->action = ACTION_CONTINUE; 5151cb5d11aSHugh Dickins return; 516c8633798SNaoya Horiguchi } 517d451b89dSVishal Moola (Oracle) if (!queue_folio_required(folio, qp)) 5181cb5d11aSHugh Dickins return; 5191cb5d11aSHugh Dickins if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 5201cb5d11aSHugh Dickins !vma_migratable(walk->vma) || 5211cb5d11aSHugh Dickins !migrate_folio_add(folio, qp->pagelist, qp->flags)) 5221cb5d11aSHugh Dickins qp->nr_failed++; 523c8633798SNaoya Horiguchi } 524c8633798SNaoya Horiguchi 52588aaa2a1SNaoya Horiguchi /* 5261cb5d11aSHugh Dickins * Scan through folios, checking if they satisfy the required conditions, 5271cb5d11aSHugh Dickins * moving them from LRU to local pagelist for migration if they do (or not). 
528d8835445SYang Shi * 5291cb5d11aSHugh Dickins * queue_folios_pte_range() has two possible return values: 5301cb5d11aSHugh Dickins * 0 - continue walking to scan for more, even if an existing folio on the 5311cb5d11aSHugh Dickins * wrong node could not be isolated and queued for migration. 5321cb5d11aSHugh Dickins * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, 5331cb5d11aSHugh Dickins * and an existing folio was on a node that does not follow the policy. 53498094945SNaoya Horiguchi */ 5353dae02bbSVishal Moola (Oracle) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, 5366f4576e3SNaoya Horiguchi unsigned long end, struct mm_walk *walk) 5371da177e4SLinus Torvalds { 5386f4576e3SNaoya Horiguchi struct vm_area_struct *vma = walk->vma; 5393dae02bbSVishal Moola (Oracle) struct folio *folio; 5406f4576e3SNaoya Horiguchi struct queue_pages *qp = walk->private; 5416f4576e3SNaoya Horiguchi unsigned long flags = qp->flags; 5423f088420SShijie Luo pte_t *pte, *mapped_pte; 543c33c7948SRyan Roberts pte_t ptent; 544705e87c0SHugh Dickins spinlock_t *ptl; 545941150a3SHugh Dickins 546c8633798SNaoya Horiguchi ptl = pmd_trans_huge_lock(pmd, vma); 5471cb5d11aSHugh Dickins if (ptl) { 5481cb5d11aSHugh Dickins queue_folios_pmd(pmd, walk); 5491cb5d11aSHugh Dickins spin_unlock(ptl); 5501cb5d11aSHugh Dickins goto out; 5511cb5d11aSHugh Dickins } 55291612e0dSHugh Dickins 5533f088420SShijie Luo mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 5547780d040SHugh Dickins if (!pte) { 5557780d040SHugh Dickins walk->action = ACTION_AGAIN; 5567780d040SHugh Dickins return 0; 5577780d040SHugh Dickins } 5586f4576e3SNaoya Horiguchi for (; addr != end; pte++, addr += PAGE_SIZE) { 559c33c7948SRyan Roberts ptent = ptep_get(pte); 5601cb5d11aSHugh Dickins if (pte_none(ptent)) 56191612e0dSHugh Dickins continue; 5621cb5d11aSHugh Dickins if (!pte_present(ptent)) { 5631cb5d11aSHugh Dickins if (is_migration_entry(pte_to_swp_entry(ptent))) 5641cb5d11aSHugh 
Dickins qp->nr_failed++; 5651cb5d11aSHugh Dickins continue; 5661cb5d11aSHugh Dickins } 567c33c7948SRyan Roberts folio = vm_normal_folio(vma, addr, ptent); 5683dae02bbSVishal Moola (Oracle) if (!folio || folio_is_zone_device(folio)) 56991612e0dSHugh Dickins continue; 570053837fcSNick Piggin /* 5713dae02bbSVishal Moola (Oracle) * vm_normal_folio() filters out zero pages, but there might 5723dae02bbSVishal Moola (Oracle) * still be reserved folios to skip, perhaps in a VDSO. 573053837fcSNick Piggin */ 5743dae02bbSVishal Moola (Oracle) if (folio_test_reserved(folio)) 575f4598c8bSChristoph Lameter continue; 576d451b89dSVishal Moola (Oracle) if (!queue_folio_required(folio, qp)) 57738e35860SChristoph Lameter continue; 5781cb5d11aSHugh Dickins if (folio_test_large(folio)) { 57924526268SYang Shi /* 5801cb5d11aSHugh Dickins * A large folio can only be isolated from LRU once, 5811cb5d11aSHugh Dickins * but may be mapped by many PTEs (and Copy-On-Write may 5821cb5d11aSHugh Dickins * intersperse PTEs of other, order 0, folios). This is 5831cb5d11aSHugh Dickins * a common case, so don't mistake it for failure (but 5841cb5d11aSHugh Dickins * there can be other cases of multi-mapped pages which 5851cb5d11aSHugh Dickins * this quick check does not help to filter out - and a 5861cb5d11aSHugh Dickins * search of the pagelist might grow to be prohibitive). 5871cb5d11aSHugh Dickins * 5881cb5d11aSHugh Dickins * migrate_pages(&pagelist) returns nr_failed folios, so 5891cb5d11aSHugh Dickins * check "large" now so that queue_pages_range() returns 5901cb5d11aSHugh Dickins * a comparable nr_failed folios. This does imply that 5911cb5d11aSHugh Dickins * if folio could not be isolated for some racy reason 5921cb5d11aSHugh Dickins * at its first PTE, later PTEs will not give it another 5931cb5d11aSHugh Dickins * chance of isolation; but keeps the accounting simple. 
	 */
		if (folio == qp->large)
			continue;
		qp->large = folio;
	}
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(vma) ||
	    !migrate_folio_add(folio, qp->pagelist, flags)) {
		qp->nr_failed++;
		if (strictly_unmovable(flags))
			break;
	}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}

/*
 * queue_folios_hugetlb() - hugetlb_entry callback for queue_pages_range().
 *
 * Examine one hugetlb entry under the huge_pte lock: if the folio is
 * misplaced per qp->nmask, either count it in qp->nr_failed or (when
 * MPOL_MF_MOVE*) try to isolate it onto qp->pagelist for migration.
 * Returns -EIO only when a failure matters (strictly_unmovable(flags)).
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry)) {
		/* A migration entry means the folio is transiently off-node */
		if (unlikely(is_hugetlb_entry_migration(entry)))
			qp->nr_failed++;
		goto unlock;
	}
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated sharers of the folio instead.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
		if (!isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * queue_pages_test_walk() - test_walk callback for queue_pages_range().
 *
 * Per-VMA gate for the page walk: reject holes in the requested range
 * (unless MPOL_MF_DISCONTIG_OK), then return 0 to scan this VMA's pages
 * or 1 to skip it entirely.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};
	/* MPOL_MF_WRLOCK callers want the VMAs write-locked during the walk */
	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err ? : qp.nr_failed;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* Nothing to do if the policy is unchanged */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	/* Reset the interleave iterator state when switching to interleave */
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (pol == &default_policy)
		return;

	switch (pol->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		*nodes = pol->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

/*
 * Return the node id of the page mapped at @addr, or a negative errno
 * from get_user_pages_fast() on failure.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			/* il_weight != 0 means il_prev still has pages due */
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio onto @foliolist for migration.
 * Returns false only when the folio should have moved but could not be
 * isolated; declining to move a shared folio returns true.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated sharers of the folio instead.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
		if (folio_isolate_lru(folio)) {
			list_add_tail(&folio->lru, foliolist);
			node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		} else {
			/*
			 * Non-movable folio may reach here. And, there may be
			 * temporary off LRU folios or non-LRU movable folios.
			 * Treat them as unmovable folios since they can't be
			 * isolated, so they can't be moved at the moment.
			 */
			return false;
		}
	}
	return true;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* Report isolation failures on top of migration failures */
	if (err >= 0)
		err += nr_failed;
	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of page that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient. As we go, we remember the
	 * most recent <s, d> pair that moved (s != d). If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	struct page *page;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	/* Bias the interleave index by the folio's offset within the mapping */
	order = folio_order(src);
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
	return page_rmappable_folio(page);
}
#else

/* !CONFIG_MIGRATION stub: nothing can be isolated for migration */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
return false; 1247b20a3503SChristoph Lameter } 1248b20a3503SChristoph Lameter 12490ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 12500ce72d4fSAndrew Morton const nodemask_t *to, int flags) 1251b20a3503SChristoph Lameter { 1252b20a3503SChristoph Lameter return -ENOSYS; 1253b20a3503SChristoph Lameter } 125495a402c3SChristoph Lameter 125572e315f7SHugh Dickins static struct folio *alloc_migration_target_by_mpol(struct folio *src, 125672e315f7SHugh Dickins unsigned long private) 125795a402c3SChristoph Lameter { 125895a402c3SChristoph Lameter return NULL; 125995a402c3SChristoph Lameter } 1260b20a3503SChristoph Lameter #endif 1261b20a3503SChristoph Lameter 1262dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len, 1263028fec41SDavid Rientjes unsigned short mode, unsigned short mode_flags, 1264028fec41SDavid Rientjes nodemask_t *nmask, unsigned long flags) 12656ce3c4c0SChristoph Lameter { 12666ce3c4c0SChristoph Lameter struct mm_struct *mm = current->mm; 1267f4e9e0e6SLiam R. Howlett struct vm_area_struct *vma, *prev; 1268f4e9e0e6SLiam R. 
Howlett struct vma_iterator vmi; 126988c91dc5SHugh Dickins struct migration_mpol mmpol; 12706ce3c4c0SChristoph Lameter struct mempolicy *new; 12716ce3c4c0SChristoph Lameter unsigned long end; 12721cb5d11aSHugh Dickins long err; 12731cb5d11aSHugh Dickins long nr_failed; 12746ce3c4c0SChristoph Lameter LIST_HEAD(pagelist); 12756ce3c4c0SChristoph Lameter 1276b24f53a0SLee Schermerhorn if (flags & ~(unsigned long)MPOL_MF_VALID) 12776ce3c4c0SChristoph Lameter return -EINVAL; 127874c00241SChristoph Lameter if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 12796ce3c4c0SChristoph Lameter return -EPERM; 12806ce3c4c0SChristoph Lameter 12816ce3c4c0SChristoph Lameter if (start & ~PAGE_MASK) 12826ce3c4c0SChristoph Lameter return -EINVAL; 12836ce3c4c0SChristoph Lameter 12846ce3c4c0SChristoph Lameter if (mode == MPOL_DEFAULT) 12856ce3c4c0SChristoph Lameter flags &= ~MPOL_MF_STRICT; 12866ce3c4c0SChristoph Lameter 1287aaa31e05Sze zuo len = PAGE_ALIGN(len); 12886ce3c4c0SChristoph Lameter end = start + len; 12896ce3c4c0SChristoph Lameter 12906ce3c4c0SChristoph Lameter if (end < start) 12916ce3c4c0SChristoph Lameter return -EINVAL; 12926ce3c4c0SChristoph Lameter if (end == start) 12936ce3c4c0SChristoph Lameter return 0; 12946ce3c4c0SChristoph Lameter 1295028fec41SDavid Rientjes new = mpol_new(mode, mode_flags, nmask); 12966ce3c4c0SChristoph Lameter if (IS_ERR(new)) 12976ce3c4c0SChristoph Lameter return PTR_ERR(new); 12986ce3c4c0SChristoph Lameter 12996ce3c4c0SChristoph Lameter /* 13006ce3c4c0SChristoph Lameter * If we are using the default policy then operation 13016ce3c4c0SChristoph Lameter * on discontinuous address spaces is okay after all 13026ce3c4c0SChristoph Lameter */ 13036ce3c4c0SChristoph Lameter if (!new) 13046ce3c4c0SChristoph Lameter flags |= MPOL_MF_DISCONTIG_OK; 13056ce3c4c0SChristoph Lameter 13061cb5d11aSHugh Dickins if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1307361a2a22SMinchan Kim lru_cache_disable(); 13084bfc4495SKAMEZAWA Hiroyuki { 13094bfc4495SKAMEZAWA 
Hiroyuki NODEMASK_SCRATCH(scratch); 13104bfc4495SKAMEZAWA Hiroyuki if (scratch) { 1311d8ed45c5SMichel Lespinasse mmap_write_lock(mm); 13124bfc4495SKAMEZAWA Hiroyuki err = mpol_set_nodemask(new, nmask, scratch); 13134bfc4495SKAMEZAWA Hiroyuki if (err) 1314d8ed45c5SMichel Lespinasse mmap_write_unlock(mm); 13154bfc4495SKAMEZAWA Hiroyuki } else 13164bfc4495SKAMEZAWA Hiroyuki err = -ENOMEM; 13174bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch); 13184bfc4495SKAMEZAWA Hiroyuki } 1319b05ca738SKOSAKI Motohiro if (err) 1320b05ca738SKOSAKI Motohiro goto mpol_out; 1321b05ca738SKOSAKI Motohiro 13226c21e066SJann Horn /* 13231cb5d11aSHugh Dickins * Lock the VMAs before scanning for pages to migrate, 13241cb5d11aSHugh Dickins * to ensure we don't miss a concurrently inserted page. 13256c21e066SJann Horn */ 13261cb5d11aSHugh Dickins nr_failed = queue_pages_range(mm, start, end, nmask, 13271cb5d11aSHugh Dickins flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); 1328d8835445SYang Shi 13291cb5d11aSHugh Dickins if (nr_failed < 0) { 13301cb5d11aSHugh Dickins err = nr_failed; 133172e315f7SHugh Dickins nr_failed = 0; 13321cb5d11aSHugh Dickins } else { 1333f4e9e0e6SLiam R. Howlett vma_iter_init(&vmi, mm, start); 1334f4e9e0e6SLiam R. Howlett prev = vma_prev(&vmi); 1335f4e9e0e6SLiam R. Howlett for_each_vma_range(vmi, vma, end) { 1336f4e9e0e6SLiam R. Howlett err = mbind_range(&vmi, vma, &prev, start, end, new); 1337f4e9e0e6SLiam R. Howlett if (err) 1338f4e9e0e6SLiam R. Howlett break; 1339f4e9e0e6SLiam R. 
Howlett } 1340cf608ac1SMinchan Kim } 13416ce3c4c0SChristoph Lameter 134272e315f7SHugh Dickins if (!err && !list_empty(&pagelist)) { 134372e315f7SHugh Dickins /* Convert MPOL_DEFAULT's NULL to task or default policy */ 134472e315f7SHugh Dickins if (!new) { 134572e315f7SHugh Dickins new = get_task_policy(current); 134672e315f7SHugh Dickins mpol_get(new); 13471cb5d11aSHugh Dickins } 134888c91dc5SHugh Dickins mmpol.pol = new; 134988c91dc5SHugh Dickins mmpol.ilx = 0; 135088c91dc5SHugh Dickins 135188c91dc5SHugh Dickins /* 135288c91dc5SHugh Dickins * In the interleaved case, attempt to allocate on exactly the 135388c91dc5SHugh Dickins * targeted nodes, for the first VMA to be migrated; for later 135488c91dc5SHugh Dickins * VMAs, the nodes will still be interleaved from the targeted 135588c91dc5SHugh Dickins * nodemask, but one by one may be selected differently. 135688c91dc5SHugh Dickins */ 1357fa3bea4eSGregory Price if (new->mode == MPOL_INTERLEAVE || 1358fa3bea4eSGregory Price new->mode == MPOL_WEIGHTED_INTERLEAVE) { 1359*f1cce6f7SMatthew Wilcox (Oracle) struct folio *folio; 136088c91dc5SHugh Dickins unsigned int order; 136188c91dc5SHugh Dickins unsigned long addr = -EFAULT; 136288c91dc5SHugh Dickins 1363*f1cce6f7SMatthew Wilcox (Oracle) list_for_each_entry(folio, &pagelist, lru) { 1364*f1cce6f7SMatthew Wilcox (Oracle) if (!folio_test_ksm(folio)) 136588c91dc5SHugh Dickins break; 136688c91dc5SHugh Dickins } 1367*f1cce6f7SMatthew Wilcox (Oracle) if (!list_entry_is_head(folio, &pagelist, lru)) { 136888c91dc5SHugh Dickins vma_iter_init(&vmi, mm, start); 136988c91dc5SHugh Dickins for_each_vma_range(vmi, vma, end) { 1370*f1cce6f7SMatthew Wilcox (Oracle) addr = page_address_in_vma( 1371*f1cce6f7SMatthew Wilcox (Oracle) folio_page(folio, 0), vma); 137288c91dc5SHugh Dickins if (addr != -EFAULT) 137388c91dc5SHugh Dickins break; 137488c91dc5SHugh Dickins } 137588c91dc5SHugh Dickins } 137688c91dc5SHugh Dickins if (addr != -EFAULT) { 1377*f1cce6f7SMatthew Wilcox (Oracle) order = 
folio_order(folio); 137888c91dc5SHugh Dickins /* We already know the pol, but not the ilx */ 137988c91dc5SHugh Dickins mpol_cond_put(get_vma_policy(vma, addr, order, 138088c91dc5SHugh Dickins &mmpol.ilx)); 138188c91dc5SHugh Dickins /* Set base from which to increment by index */ 1382*f1cce6f7SMatthew Wilcox (Oracle) mmpol.ilx -= folio->index >> order; 138388c91dc5SHugh Dickins } 138488c91dc5SHugh Dickins } 1385a85dfc30SYang Shi } 1386a85dfc30SYang Shi 1387d8ed45c5SMichel Lespinasse mmap_write_unlock(mm); 138888c91dc5SHugh Dickins 138988c91dc5SHugh Dickins if (!err && !list_empty(&pagelist)) { 139072e315f7SHugh Dickins nr_failed |= migrate_pages(&pagelist, 139172e315f7SHugh Dickins alloc_migration_target_by_mpol, NULL, 139288c91dc5SHugh Dickins (unsigned long)&mmpol, MIGRATE_SYNC, 139372e315f7SHugh Dickins MR_MEMPOLICY_MBIND, NULL); 139472e315f7SHugh Dickins } 139572e315f7SHugh Dickins 13961cb5d11aSHugh Dickins if (nr_failed && (flags & MPOL_MF_STRICT)) 13971cb5d11aSHugh Dickins err = -EIO; 13986ce3c4c0SChristoph Lameter if (!list_empty(&pagelist)) 1399b05ca738SKOSAKI Motohiro putback_movable_pages(&pagelist); 14006ce3c4c0SChristoph Lameter mpol_out: 1401f0be3d32SLee Schermerhorn mpol_put(new); 1402d479960eSMinchan Kim if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1403361a2a22SMinchan Kim lru_cache_enable(); 14046ce3c4c0SChristoph Lameter return err; 14056ce3c4c0SChristoph Lameter } 14066ce3c4c0SChristoph Lameter 140739743889SChristoph Lameter /* 14088bccd85fSChristoph Lameter * User space interface with variable sized bitmaps for nodelists. 
 */

/*
 * Copy @maxnode bits of a user nodemask into @mask, honouring compat
 * syscall bitmap layout, and clear any bits past @maxnode in the last word.
 * Returns 0 or -EFAULT.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	/* Mask off any stray bits beyond maxnode in the final word */
	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	/* maxnode from userspace counts one past the highest bit */
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/* Partial word: only bits >= MAX_NUMNODES must be zero */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* User buffer is larger than the kernel mask: zero the tail */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	/* Split the MPOL_F_* mode flags out of the mode argument */
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	/* STATIC_NODES and RELATIVE_NODES are mutually exclusive */
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		/* NUMA_BALANCING is only accepted together with MPOL_BIND */
		if (*mode != MPOL_BIND)
			return -EINVAL;
		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
	}
	return 0;
}

/*
 * mbind() entry helper: validate mode/flags, copy the nodemask in from
 * user space, then hand off to do_mbind().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

/*
 * set_mempolicy_home_node() - set the preferred "home" node on existing
 * MPOL_BIND / MPOL_PREFERRED_MANY VMA policies within [start, start+len).
 * Returns -EOPNOTSUPP for any other policy mode in the range; VMAs already
 * updated before the failure keep their new home node.
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* Work on a duplicate so a later failure leaves @old intact */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * migrate_pages(2) backend: move pages of task @pid (or the current task
 * when @pid == 0) from the @old_nodes set to the @new_nodes set, subject
 * to ptrace, cpuset and LSM permission checks.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	/* Pin the task so it survives after rcu_read_unlock() below */
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* Restrict the destination set to the caller's own cpuset nodes */
	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	/* The user buffer must be able to hold every known node id */
	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval,
			       policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

/*
 * vma_migratable() - can pages in this VMA be migrated between nodes?
 * Excludes I/O and PFN mappings, DAX, non-migratable hugetlb, and file
 * mappings whose allowed zone is below policy_zone.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	if (is_vm_hugetlb_page(vma) &&
		!hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone. If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}

/*
 * Raw VMA policy lookup: the vm_ops->get_policy() result for shared
 * mappings, else the VMA's own policy (which may be NULL).  @ilx is
 * zeroed; a non-trivial index is filled in only by get_policy() ops.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	return (vma->vm_ops && vma->vm_ops->get_policy) ?
		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task. It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
				 unsigned long addr, int order, pgoff_t *ilx)
{
	struct mempolicy *pol;

	pol = __get_vma_policy(vma, addr, ilx);
	if (!pol)
		pol = get_task_policy(current);
	if (pol->mode == MPOL_INTERLEAVE ||
	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
		/* Derive the interleave index from file offset + VMA offset */
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	}
	return pol;
}

/*
 * vma_policy_mof() - does the effective policy for @vma request
 * migrate-on-fault (MPOL_F_MOF)?
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;
		pgoff_t ilx;		/* ignored here */

		/* get_policy() may take a reference; drop it when done */
		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum
 zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->nodes is intersect with node_states[N_MEMORY].
	 * so if the following test fails, it implies
	 * policy->nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}

/*
 * Pick the next node for MPOL_WEIGHTED_INTERLEAVE: stay on il_prev while
 * its remaining weight (current->il_weight) is non-zero, then advance to
 * the next node in policy->nodes and reload its weight.  May return
 * MAX_NUMNODES when the nodemask is empty.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}

/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* nid == MAX_NUMNODES means an empty nodemask: don't advance */
	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
1918dc85da15SChristoph Lameter */ 19192a389610SDavid Rientjes unsigned int mempolicy_slab_node(void) 1920dc85da15SChristoph Lameter { 1921e7b691b0SAndi Kleen struct mempolicy *policy; 19222a389610SDavid Rientjes int node = numa_mem_id(); 1923e7b691b0SAndi Kleen 192438b031ddSVasily Averin if (!in_task()) 19252a389610SDavid Rientjes return node; 1926e7b691b0SAndi Kleen 1927e7b691b0SAndi Kleen policy = current->mempolicy; 19287858d7bcSFeng Tang if (!policy) 19292a389610SDavid Rientjes return node; 1930765c4507SChristoph Lameter 1931bea904d5SLee Schermerhorn switch (policy->mode) { 1932bea904d5SLee Schermerhorn case MPOL_PREFERRED: 1933269fbe72SBen Widawsky return first_node(policy->nodes); 1934bea904d5SLee Schermerhorn 1935dc85da15SChristoph Lameter case MPOL_INTERLEAVE: 1936dc85da15SChristoph Lameter return interleave_nodes(policy); 1937dc85da15SChristoph Lameter 1938fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE: 1939fa3bea4eSGregory Price return weighted_interleave_nodes(policy); 1940fa3bea4eSGregory Price 1941b27abaccSDave Hansen case MPOL_BIND: 1942b27abaccSDave Hansen case MPOL_PREFERRED_MANY: 1943b27abaccSDave Hansen { 1944c33d6c06SMel Gorman struct zoneref *z; 1945c33d6c06SMel Gorman 1946dc85da15SChristoph Lameter /* 1947dc85da15SChristoph Lameter * Follow bind policy behavior and start allocation at the 1948dc85da15SChristoph Lameter * first node. 1949dc85da15SChristoph Lameter */ 195019770b32SMel Gorman struct zonelist *zonelist; 195119770b32SMel Gorman enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1952c9634cf0SAneesh Kumar K.V zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; 1953c33d6c06SMel Gorman z = first_zones_zonelist(zonelist, highest_zoneidx, 1954269fbe72SBen Widawsky &policy->nodes); 1955c1093b74SPavel Tatashin return z->zone ? 
zone_to_nid(z->zone) : node; 1956dd1a239fSMel Gorman } 19577858d7bcSFeng Tang case MPOL_LOCAL: 19587858d7bcSFeng Tang return node; 1959dc85da15SChristoph Lameter 1960dc85da15SChristoph Lameter default: 1961bea904d5SLee Schermerhorn BUG(); 1962dc85da15SChristoph Lameter } 1963dc85da15SChristoph Lameter } 1964dc85da15SChristoph Lameter 19659685e6e3SGregory Price static unsigned int read_once_policy_nodemask(struct mempolicy *pol, 19669685e6e3SGregory Price nodemask_t *mask) 19679685e6e3SGregory Price { 19689685e6e3SGregory Price /* 19699685e6e3SGregory Price * barrier stabilizes the nodemask locally so that it can be iterated 19709685e6e3SGregory Price * over safely without concern for changes. Allocators validate node 19719685e6e3SGregory Price * selection does not violate mems_allowed, so this is safe. 19729685e6e3SGregory Price */ 19739685e6e3SGregory Price barrier(); 19749685e6e3SGregory Price memcpy(mask, &pol->nodes, sizeof(nodemask_t)); 19759685e6e3SGregory Price barrier(); 19769685e6e3SGregory Price return nodes_weight(*mask); 19779685e6e3SGregory Price } 19789685e6e3SGregory Price 1979fa3bea4eSGregory Price static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) 1980fa3bea4eSGregory Price { 1981fa3bea4eSGregory Price nodemask_t nodemask; 1982fa3bea4eSGregory Price unsigned int target, nr_nodes; 1983fa3bea4eSGregory Price u8 *table; 1984fa3bea4eSGregory Price unsigned int weight_total = 0; 1985fa3bea4eSGregory Price u8 weight; 1986fa3bea4eSGregory Price int nid; 1987fa3bea4eSGregory Price 1988fa3bea4eSGregory Price nr_nodes = read_once_policy_nodemask(pol, &nodemask); 1989fa3bea4eSGregory Price if (!nr_nodes) 1990fa3bea4eSGregory Price return numa_node_id(); 1991fa3bea4eSGregory Price 1992fa3bea4eSGregory Price rcu_read_lock(); 1993fa3bea4eSGregory Price table = rcu_dereference(iw_table); 1994fa3bea4eSGregory Price /* calculate the total weight */ 1995fa3bea4eSGregory Price for_each_node_mask(nid, nodemask) { 1996fa3bea4eSGregory Price 
/* detect system default usage */ 1997fa3bea4eSGregory Price weight = table ? table[nid] : 1; 1998fa3bea4eSGregory Price weight = weight ? weight : 1; 1999fa3bea4eSGregory Price weight_total += weight; 2000fa3bea4eSGregory Price } 2001fa3bea4eSGregory Price 2002fa3bea4eSGregory Price /* Calculate the node offset based on totals */ 2003fa3bea4eSGregory Price target = ilx % weight_total; 2004fa3bea4eSGregory Price nid = first_node(nodemask); 2005fa3bea4eSGregory Price while (target) { 2006fa3bea4eSGregory Price /* detect system default usage */ 2007fa3bea4eSGregory Price weight = table ? table[nid] : 1; 2008fa3bea4eSGregory Price weight = weight ? weight : 1; 2009fa3bea4eSGregory Price if (target < weight) 2010fa3bea4eSGregory Price break; 2011fa3bea4eSGregory Price target -= weight; 2012fa3bea4eSGregory Price nid = next_node_in(nid, nodemask); 2013fa3bea4eSGregory Price } 2014fa3bea4eSGregory Price rcu_read_unlock(); 2015fa3bea4eSGregory Price return nid; 2016fa3bea4eSGregory Price } 2017fa3bea4eSGregory Price 2018fee83b3aSAndrew Morton /* 2019ddc1a5cbSHugh Dickins * Do static interleaving for interleave index @ilx. Returns the ilx'th 2020ddc1a5cbSHugh Dickins * node in pol->nodes (starting from ilx=0), wrapping around if ilx 2021ddc1a5cbSHugh Dickins * exceeds the number of present nodes. 
2022fee83b3aSAndrew Morton */ 2023ddc1a5cbSHugh Dickins static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 20241da177e4SLinus Torvalds { 20259685e6e3SGregory Price nodemask_t nodemask; 2026276aeee1Syanghui unsigned int target, nnodes; 2027fee83b3aSAndrew Morton int i; 2028fee83b3aSAndrew Morton int nid; 20291da177e4SLinus Torvalds 20309685e6e3SGregory Price nnodes = read_once_policy_nodemask(pol, &nodemask); 2031f5b087b5SDavid Rientjes if (!nnodes) 2032f5b087b5SDavid Rientjes return numa_node_id(); 2033ddc1a5cbSHugh Dickins target = ilx % nnodes; 2034276aeee1Syanghui nid = first_node(nodemask); 2035fee83b3aSAndrew Morton for (i = 0; i < target; i++) 2036276aeee1Syanghui nid = next_node(nid, nodemask); 20371da177e4SLinus Torvalds return nid; 20381da177e4SLinus Torvalds } 20391da177e4SLinus Torvalds 20403b98b087SNishanth Aravamudan /* 2041ddc1a5cbSHugh Dickins * Return a nodemask representing a mempolicy for filtering nodes for 2042ddc1a5cbSHugh Dickins * page allocation, together with preferred node id (or the input node id). 
20433b98b087SNishanth Aravamudan */ 2044ddc1a5cbSHugh Dickins static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 2045ddc1a5cbSHugh Dickins pgoff_t ilx, int *nid) 2046ddc1a5cbSHugh Dickins { 2047ddc1a5cbSHugh Dickins nodemask_t *nodemask = NULL; 2048ddc1a5cbSHugh Dickins 2049ddc1a5cbSHugh Dickins switch (pol->mode) { 2050ddc1a5cbSHugh Dickins case MPOL_PREFERRED: 2051ddc1a5cbSHugh Dickins /* Override input node id */ 2052ddc1a5cbSHugh Dickins *nid = first_node(pol->nodes); 2053ddc1a5cbSHugh Dickins break; 2054ddc1a5cbSHugh Dickins case MPOL_PREFERRED_MANY: 2055ddc1a5cbSHugh Dickins nodemask = &pol->nodes; 2056ddc1a5cbSHugh Dickins if (pol->home_node != NUMA_NO_NODE) 2057ddc1a5cbSHugh Dickins *nid = pol->home_node; 2058ddc1a5cbSHugh Dickins break; 2059ddc1a5cbSHugh Dickins case MPOL_BIND: 2060ddc1a5cbSHugh Dickins /* Restrict to nodemask (but not on lower zones) */ 2061ddc1a5cbSHugh Dickins if (apply_policy_zone(pol, gfp_zone(gfp)) && 2062ddc1a5cbSHugh Dickins cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 2063ddc1a5cbSHugh Dickins nodemask = &pol->nodes; 2064ddc1a5cbSHugh Dickins if (pol->home_node != NUMA_NO_NODE) 2065ddc1a5cbSHugh Dickins *nid = pol->home_node; 2066ddc1a5cbSHugh Dickins /* 2067ddc1a5cbSHugh Dickins * __GFP_THISNODE shouldn't even be used with the bind policy 2068ddc1a5cbSHugh Dickins * because we might easily break the expectation to stay on the 2069ddc1a5cbSHugh Dickins * requested node and not break the policy. 2070ddc1a5cbSHugh Dickins */ 2071ddc1a5cbSHugh Dickins WARN_ON_ONCE(gfp & __GFP_THISNODE); 2072ddc1a5cbSHugh Dickins break; 2073ddc1a5cbSHugh Dickins case MPOL_INTERLEAVE: 2074ddc1a5cbSHugh Dickins /* Override input node id */ 2075ddc1a5cbSHugh Dickins *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2076ddc1a5cbSHugh Dickins interleave_nodes(pol) : interleave_nid(pol, ilx); 2077ddc1a5cbSHugh Dickins break; 2078fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE: 2079fa3bea4eSGregory Price *nid = (ilx == NO_INTERLEAVE_INDEX) ? 
2080fa3bea4eSGregory Price weighted_interleave_nodes(pol) : 2081fa3bea4eSGregory Price weighted_interleave_nid(pol, ilx); 2082fa3bea4eSGregory Price break; 2083ddc1a5cbSHugh Dickins } 2084ddc1a5cbSHugh Dickins 2085ddc1a5cbSHugh Dickins return nodemask; 20865da7ca86SChristoph Lameter } 20875da7ca86SChristoph Lameter 208800ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS 2089480eccf9SLee Schermerhorn /* 209004ec6264SVlastimil Babka * huge_node(@vma, @addr, @gfp_flags, @mpol) 2091b46e14acSFabian Frederick * @vma: virtual memory area whose policy is sought 2092b46e14acSFabian Frederick * @addr: address in @vma for shared policy lookup and interleave policy 2093b46e14acSFabian Frederick * @gfp_flags: for requested zone 2094b46e14acSFabian Frederick * @mpol: pointer to mempolicy pointer for reference counted mempolicy 2095b27abaccSDave Hansen * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 2096480eccf9SLee Schermerhorn * 209704ec6264SVlastimil Babka * Returns a nid suitable for a huge page allocation and a pointer 209852cd3b07SLee Schermerhorn * to the struct mempolicy for conditional unref after allocation. 2099b27abaccSDave Hansen * If the effective policy is 'bind' or 'prefer-many', returns a pointer 2100b27abaccSDave Hansen * to the mempolicy's @nodemask for filtering the zonelist. 
2101480eccf9SLee Schermerhorn */ 210204ec6264SVlastimil Babka int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 210304ec6264SVlastimil Babka struct mempolicy **mpol, nodemask_t **nodemask) 21045da7ca86SChristoph Lameter { 2105ddc1a5cbSHugh Dickins pgoff_t ilx; 210604ec6264SVlastimil Babka int nid; 21075da7ca86SChristoph Lameter 2108ddc1a5cbSHugh Dickins nid = numa_node_id(); 2109ddc1a5cbSHugh Dickins *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 2110ddc1a5cbSHugh Dickins *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 211104ec6264SVlastimil Babka return nid; 21125da7ca86SChristoph Lameter } 211306808b08SLee Schermerhorn 211406808b08SLee Schermerhorn /* 211506808b08SLee Schermerhorn * init_nodemask_of_mempolicy 211606808b08SLee Schermerhorn * 211706808b08SLee Schermerhorn * If the current task's mempolicy is "default" [NULL], return 'false' 211806808b08SLee Schermerhorn * to indicate default policy. Otherwise, extract the policy nodemask 211906808b08SLee Schermerhorn * for 'bind' or 'interleave' policy into the argument nodemask, or 212006808b08SLee Schermerhorn * initialize the argument nodemask to contain the single node for 212106808b08SLee Schermerhorn * 'preferred' or 'local' policy and return 'true' to indicate presence 212206808b08SLee Schermerhorn * of non-default mempolicy. 212306808b08SLee Schermerhorn * 212406808b08SLee Schermerhorn * We don't bother with reference counting the mempolicy [mpol_get/put] 212506808b08SLee Schermerhorn * because the current task is examining it's own mempolicy and a task's 212606808b08SLee Schermerhorn * mempolicy is only ever changed by the task itself. 212706808b08SLee Schermerhorn * 212806808b08SLee Schermerhorn * N.B., it is the caller's responsibility to free a returned nodemask. 
212906808b08SLee Schermerhorn */ 213006808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask) 213106808b08SLee Schermerhorn { 213206808b08SLee Schermerhorn struct mempolicy *mempolicy; 213306808b08SLee Schermerhorn 213406808b08SLee Schermerhorn if (!(mask && current->mempolicy)) 213506808b08SLee Schermerhorn return false; 213606808b08SLee Schermerhorn 2137c0ff7453SMiao Xie task_lock(current); 213806808b08SLee Schermerhorn mempolicy = current->mempolicy; 213906808b08SLee Schermerhorn switch (mempolicy->mode) { 214006808b08SLee Schermerhorn case MPOL_PREFERRED: 2141b27abaccSDave Hansen case MPOL_PREFERRED_MANY: 214206808b08SLee Schermerhorn case MPOL_BIND: 214306808b08SLee Schermerhorn case MPOL_INTERLEAVE: 2144fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE: 2145269fbe72SBen Widawsky *mask = mempolicy->nodes; 214606808b08SLee Schermerhorn break; 214706808b08SLee Schermerhorn 21487858d7bcSFeng Tang case MPOL_LOCAL: 2149269fbe72SBen Widawsky init_nodemask_of_node(mask, numa_node_id()); 21507858d7bcSFeng Tang break; 21517858d7bcSFeng Tang 215206808b08SLee Schermerhorn default: 215306808b08SLee Schermerhorn BUG(); 215406808b08SLee Schermerhorn } 2155c0ff7453SMiao Xie task_unlock(current); 215606808b08SLee Schermerhorn 215706808b08SLee Schermerhorn return true; 215806808b08SLee Schermerhorn } 215900ac59adSChen, Kenneth W #endif 21605da7ca86SChristoph Lameter 21616f48d0ebSDavid Rientjes /* 2162b26e517aSFeng Tang * mempolicy_in_oom_domain 21636f48d0ebSDavid Rientjes * 2164b26e517aSFeng Tang * If tsk's mempolicy is "bind", check for intersection between mask and 2165b26e517aSFeng Tang * the policy nodemask. Otherwise, return true for all other policies 2166b26e517aSFeng Tang * including "interleave", as a tsk with "interleave" policy may have 2167b26e517aSFeng Tang * memory allocated from all nodes in system. 21686f48d0ebSDavid Rientjes * 21696f48d0ebSDavid Rientjes * Takes task_lock(tsk) to prevent freeing of its mempolicy. 
21706f48d0ebSDavid Rientjes */ 2171b26e517aSFeng Tang bool mempolicy_in_oom_domain(struct task_struct *tsk, 21726f48d0ebSDavid Rientjes const nodemask_t *mask) 21736f48d0ebSDavid Rientjes { 21746f48d0ebSDavid Rientjes struct mempolicy *mempolicy; 21756f48d0ebSDavid Rientjes bool ret = true; 21766f48d0ebSDavid Rientjes 21776f48d0ebSDavid Rientjes if (!mask) 21786f48d0ebSDavid Rientjes return ret; 2179b26e517aSFeng Tang 21806f48d0ebSDavid Rientjes task_lock(tsk); 21816f48d0ebSDavid Rientjes mempolicy = tsk->mempolicy; 2182b26e517aSFeng Tang if (mempolicy && mempolicy->mode == MPOL_BIND) 2183269fbe72SBen Widawsky ret = nodes_intersects(mempolicy->nodes, *mask); 21846f48d0ebSDavid Rientjes task_unlock(tsk); 2185b26e517aSFeng Tang 21866f48d0ebSDavid Rientjes return ret; 21876f48d0ebSDavid Rientjes } 21886f48d0ebSDavid Rientjes 21894c54d949SFeng Tang static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2190ddc1a5cbSHugh Dickins int nid, nodemask_t *nodemask) 21914c54d949SFeng Tang { 21924c54d949SFeng Tang struct page *page; 21934c54d949SFeng Tang gfp_t preferred_gfp; 21944c54d949SFeng Tang 21954c54d949SFeng Tang /* 21964c54d949SFeng Tang * This is a two pass approach. The first pass will only try the 21974c54d949SFeng Tang * preferred nodes but skip the direct reclaim and allow the 21984c54d949SFeng Tang * allocation to fail, while the second pass will try all the 21994c54d949SFeng Tang * nodes in system. 
22004c54d949SFeng Tang */ 22014c54d949SFeng Tang preferred_gfp = gfp | __GFP_NOWARN; 22024c54d949SFeng Tang preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2203ddc1a5cbSHugh Dickins page = __alloc_pages(preferred_gfp, order, nid, nodemask); 22044c54d949SFeng Tang if (!page) 2205c0455116SAneesh Kumar K.V page = __alloc_pages(gfp, order, nid, NULL); 22064c54d949SFeng Tang 22074c54d949SFeng Tang return page; 22084c54d949SFeng Tang } 22094c54d949SFeng Tang 22101da177e4SLinus Torvalds /** 2211ddc1a5cbSHugh Dickins * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2212eb350739SMatthew Wilcox (Oracle) * @gfp: GFP flags. 2213ddc1a5cbSHugh Dickins * @order: Order of the page allocation. 2214ddc1a5cbSHugh Dickins * @pol: Pointer to the NUMA mempolicy. 2215ddc1a5cbSHugh Dickins * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2216ddc1a5cbSHugh Dickins * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 22171da177e4SLinus Torvalds * 2218ddc1a5cbSHugh Dickins * Return: The page on success or NULL if allocation fails. 
22191da177e4SLinus Torvalds */ 2220ddc1a5cbSHugh Dickins struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2221ddc1a5cbSHugh Dickins struct mempolicy *pol, pgoff_t ilx, int nid) 22221da177e4SLinus Torvalds { 2223ddc1a5cbSHugh Dickins nodemask_t *nodemask; 2224adf88aa8SMatthew Wilcox (Oracle) struct page *page; 2225adf88aa8SMatthew Wilcox (Oracle) 2226ddc1a5cbSHugh Dickins nodemask = policy_nodemask(gfp, pol, ilx, &nid); 22274c54d949SFeng Tang 2228ddc1a5cbSHugh Dickins if (pol->mode == MPOL_PREFERRED_MANY) 2229ddc1a5cbSHugh Dickins return alloc_pages_preferred_many(gfp, order, nid, nodemask); 223019deb769SDavid Rientjes 2231ddc1a5cbSHugh Dickins if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2232ddc1a5cbSHugh Dickins /* filter "hugepage" allocation, unless from alloc_pages() */ 2233ddc1a5cbSHugh Dickins order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 223419deb769SDavid Rientjes /* 223519deb769SDavid Rientjes * For hugepage allocation and non-interleave policy which 223619deb769SDavid Rientjes * allows the current node (or other explicitly preferred 223719deb769SDavid Rientjes * node) we only try to allocate from the current/preferred 223819deb769SDavid Rientjes * node and don't fall back to other nodes, as the cost of 223919deb769SDavid Rientjes * remote accesses would likely offset THP benefits. 224019deb769SDavid Rientjes * 2241b27abaccSDave Hansen * If the policy is interleave or does not allow the current 224219deb769SDavid Rientjes * node in its nodemask, we allocate the standard way. 224319deb769SDavid Rientjes */ 2244ddc1a5cbSHugh Dickins if (pol->mode != MPOL_INTERLEAVE && 2245fa3bea4eSGregory Price pol->mode != MPOL_WEIGHTED_INTERLEAVE && 2246ddc1a5cbSHugh Dickins (!nodemask || node_isset(nid, *nodemask))) { 2247cc638f32SVlastimil Babka /* 2248cc638f32SVlastimil Babka * First, try to allocate THP only on local node, but 2249cc638f32SVlastimil Babka * don't reclaim unnecessarily, just compact. 
2250cc638f32SVlastimil Babka */ 2251ddc1a5cbSHugh Dickins page = __alloc_pages_node(nid, 2252ddc1a5cbSHugh Dickins gfp | __GFP_THISNODE | __GFP_NORETRY, order); 2253ddc1a5cbSHugh Dickins if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2254ddc1a5cbSHugh Dickins return page; 225576e654ccSDavid Rientjes /* 225676e654ccSDavid Rientjes * If hugepage allocations are configured to always 225776e654ccSDavid Rientjes * synchronous compact or the vma has been madvised 225876e654ccSDavid Rientjes * to prefer hugepage backing, retry allowing remote 2259cc638f32SVlastimil Babka * memory with both reclaim and compact as well. 226076e654ccSDavid Rientjes */ 226119deb769SDavid Rientjes } 226219deb769SDavid Rientjes } 226319deb769SDavid Rientjes 2264ddc1a5cbSHugh Dickins page = __alloc_pages(gfp, order, nid, nodemask); 2265ddc1a5cbSHugh Dickins 2266ddc1a5cbSHugh Dickins if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { 2267ddc1a5cbSHugh Dickins /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2268ddc1a5cbSHugh Dickins if (static_branch_likely(&vm_numa_stat_key) && 2269ddc1a5cbSHugh Dickins page_to_nid(page) == nid) { 2270ddc1a5cbSHugh Dickins preempt_disable(); 2271ddc1a5cbSHugh Dickins __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2272ddc1a5cbSHugh Dickins preempt_enable(); 2273ddc1a5cbSHugh Dickins } 2274ddc1a5cbSHugh Dickins } 2275ddc1a5cbSHugh Dickins 2276ddc1a5cbSHugh Dickins return page; 2277ddc1a5cbSHugh Dickins } 2278ddc1a5cbSHugh Dickins 2279ddc1a5cbSHugh Dickins /** 2280ddc1a5cbSHugh Dickins * vma_alloc_folio - Allocate a folio for a VMA. 2281ddc1a5cbSHugh Dickins * @gfp: GFP flags. 2282ddc1a5cbSHugh Dickins * @order: Order of the folio. 2283ddc1a5cbSHugh Dickins * @vma: Pointer to VMA. 2284ddc1a5cbSHugh Dickins * @addr: Virtual address of the allocation. Must be inside @vma. 2285ddc1a5cbSHugh Dickins * @hugepage: Unused (was: For hugepages try only preferred node if possible). 
2286ddc1a5cbSHugh Dickins * 2287ddc1a5cbSHugh Dickins * Allocate a folio for a specific address in @vma, using the appropriate 2288ddc1a5cbSHugh Dickins * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2289ddc1a5cbSHugh Dickins * VMA to prevent it from going away. Should be used for all allocations 2290ddc1a5cbSHugh Dickins * for folios that will be mapped into user space, excepting hugetlbfs, and 2291ddc1a5cbSHugh Dickins * excepting where direct use of alloc_pages_mpol() is more appropriate. 2292ddc1a5cbSHugh Dickins * 2293ddc1a5cbSHugh Dickins * Return: The folio on success or NULL if allocation fails. 2294ddc1a5cbSHugh Dickins */ 2295ddc1a5cbSHugh Dickins struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, 2296ddc1a5cbSHugh Dickins unsigned long addr, bool hugepage) 2297ddc1a5cbSHugh Dickins { 2298ddc1a5cbSHugh Dickins struct mempolicy *pol; 2299ddc1a5cbSHugh Dickins pgoff_t ilx; 2300ddc1a5cbSHugh Dickins struct page *page; 2301ddc1a5cbSHugh Dickins 2302ddc1a5cbSHugh Dickins pol = get_vma_policy(vma, addr, order, &ilx); 2303ddc1a5cbSHugh Dickins page = alloc_pages_mpol(gfp | __GFP_COMP, order, 2304ddc1a5cbSHugh Dickins pol, ilx, numa_node_id()); 2305d51e9894SVlastimil Babka mpol_cond_put(pol); 2306ddc1a5cbSHugh Dickins return page_rmappable_folio(page); 2307f584b680SMatthew Wilcox (Oracle) } 2308adf88aa8SMatthew Wilcox (Oracle) EXPORT_SYMBOL(vma_alloc_folio); 2309f584b680SMatthew Wilcox (Oracle) 23101da177e4SLinus Torvalds /** 2311d7f946d0SMatthew Wilcox (Oracle) * alloc_pages - Allocate pages. 23126421ec76SMatthew Wilcox (Oracle) * @gfp: GFP flags. 23136421ec76SMatthew Wilcox (Oracle) * @order: Power of two of number of pages to allocate. 23141da177e4SLinus Torvalds * 23156421ec76SMatthew Wilcox (Oracle) * Allocate 1 << @order contiguous pages. 
The physical address of the 23166421ec76SMatthew Wilcox (Oracle) * first page is naturally aligned (eg an order-3 allocation will be aligned 23176421ec76SMatthew Wilcox (Oracle) * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current 23186421ec76SMatthew Wilcox (Oracle) * process is honoured when in process context. 23191da177e4SLinus Torvalds * 23206421ec76SMatthew Wilcox (Oracle) * Context: Can be called from any context, providing the appropriate GFP 23216421ec76SMatthew Wilcox (Oracle) * flags are used. 23226421ec76SMatthew Wilcox (Oracle) * Return: The page on success or NULL if allocation fails. 23231da177e4SLinus Torvalds */ 2324ddc1a5cbSHugh Dickins struct page *alloc_pages(gfp_t gfp, unsigned int order) 23251da177e4SLinus Torvalds { 23268d90274bSOleg Nesterov struct mempolicy *pol = &default_policy; 232752cd3b07SLee Schermerhorn 232852cd3b07SLee Schermerhorn /* 232952cd3b07SLee Schermerhorn * No reference counting needed for current->mempolicy 233052cd3b07SLee Schermerhorn * nor system default_policy 233152cd3b07SLee Schermerhorn */ 2332ddc1a5cbSHugh Dickins if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2333ddc1a5cbSHugh Dickins pol = get_task_policy(current); 2334cc9a6c87SMel Gorman 2335ddc1a5cbSHugh Dickins return alloc_pages_mpol(gfp, order, 2336ddc1a5cbSHugh Dickins pol, NO_INTERLEAVE_INDEX, numa_node_id()); 23371da177e4SLinus Torvalds } 2338d7f946d0SMatthew Wilcox (Oracle) EXPORT_SYMBOL(alloc_pages); 23391da177e4SLinus Torvalds 2340ddc1a5cbSHugh Dickins struct folio *folio_alloc(gfp_t gfp, unsigned int order) 2341cc09cb13SMatthew Wilcox (Oracle) { 234223e48832SHugh Dickins return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); 2343cc09cb13SMatthew Wilcox (Oracle) } 2344cc09cb13SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_alloc); 2345cc09cb13SMatthew Wilcox (Oracle) 2346c00b6b96SChen Wandun static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, 2347c00b6b96SChen Wandun struct mempolicy *pol, unsigned long 
nr_pages, 2348c00b6b96SChen Wandun struct page **page_array) 2349c00b6b96SChen Wandun { 2350c00b6b96SChen Wandun int nodes; 2351c00b6b96SChen Wandun unsigned long nr_pages_per_node; 2352c00b6b96SChen Wandun int delta; 2353c00b6b96SChen Wandun int i; 2354c00b6b96SChen Wandun unsigned long nr_allocated; 2355c00b6b96SChen Wandun unsigned long total_allocated = 0; 2356c00b6b96SChen Wandun 2357c00b6b96SChen Wandun nodes = nodes_weight(pol->nodes); 2358c00b6b96SChen Wandun nr_pages_per_node = nr_pages / nodes; 2359c00b6b96SChen Wandun delta = nr_pages - nodes * nr_pages_per_node; 2360c00b6b96SChen Wandun 2361c00b6b96SChen Wandun for (i = 0; i < nodes; i++) { 2362c00b6b96SChen Wandun if (delta) { 2363c00b6b96SChen Wandun nr_allocated = __alloc_pages_bulk(gfp, 2364c00b6b96SChen Wandun interleave_nodes(pol), NULL, 2365c00b6b96SChen Wandun nr_pages_per_node + 1, NULL, 2366c00b6b96SChen Wandun page_array); 2367c00b6b96SChen Wandun delta--; 2368c00b6b96SChen Wandun } else { 2369c00b6b96SChen Wandun nr_allocated = __alloc_pages_bulk(gfp, 2370c00b6b96SChen Wandun interleave_nodes(pol), NULL, 2371c00b6b96SChen Wandun nr_pages_per_node, NULL, page_array); 2372c00b6b96SChen Wandun } 2373c00b6b96SChen Wandun 2374c00b6b96SChen Wandun page_array += nr_allocated; 2375c00b6b96SChen Wandun total_allocated += nr_allocated; 2376c00b6b96SChen Wandun } 2377c00b6b96SChen Wandun 2378c00b6b96SChen Wandun return total_allocated; 2379c00b6b96SChen Wandun } 2380c00b6b96SChen Wandun 2381fa3bea4eSGregory Price static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, 2382fa3bea4eSGregory Price struct mempolicy *pol, unsigned long nr_pages, 2383fa3bea4eSGregory Price struct page **page_array) 2384fa3bea4eSGregory Price { 2385fa3bea4eSGregory Price struct task_struct *me = current; 2386274519edSGregory Price unsigned int cpuset_mems_cookie; 2387fa3bea4eSGregory Price unsigned long total_allocated = 0; 2388fa3bea4eSGregory Price unsigned long nr_allocated = 0; 2389fa3bea4eSGregory 
Price unsigned long rounds; 2390fa3bea4eSGregory Price unsigned long node_pages, delta; 2391fa3bea4eSGregory Price u8 *table, *weights, weight; 2392fa3bea4eSGregory Price unsigned int weight_total = 0; 2393fa3bea4eSGregory Price unsigned long rem_pages = nr_pages; 2394fa3bea4eSGregory Price nodemask_t nodes; 2395fa3bea4eSGregory Price int nnodes, node; 2396fa3bea4eSGregory Price int resume_node = MAX_NUMNODES - 1; 2397fa3bea4eSGregory Price u8 resume_weight = 0; 2398fa3bea4eSGregory Price int prev_node; 2399fa3bea4eSGregory Price int i; 2400fa3bea4eSGregory Price 2401fa3bea4eSGregory Price if (!nr_pages) 2402fa3bea4eSGregory Price return 0; 2403fa3bea4eSGregory Price 2404274519edSGregory Price /* read the nodes onto the stack, retry if done during rebind */ 2405274519edSGregory Price do { 2406274519edSGregory Price cpuset_mems_cookie = read_mems_allowed_begin(); 2407fa3bea4eSGregory Price nnodes = read_once_policy_nodemask(pol, &nodes); 2408274519edSGregory Price } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2409274519edSGregory Price 2410274519edSGregory Price /* if the nodemask has become invalid, we cannot do anything */ 2411fa3bea4eSGregory Price if (!nnodes) 2412fa3bea4eSGregory Price return 0; 2413fa3bea4eSGregory Price 2414fa3bea4eSGregory Price /* Continue allocating from most recent node and adjust the nr_pages */ 2415fa3bea4eSGregory Price node = me->il_prev; 2416fa3bea4eSGregory Price weight = me->il_weight; 2417fa3bea4eSGregory Price if (weight && node_isset(node, nodes)) { 2418fa3bea4eSGregory Price node_pages = min(rem_pages, weight); 2419fa3bea4eSGregory Price nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2420fa3bea4eSGregory Price NULL, page_array); 2421fa3bea4eSGregory Price page_array += nr_allocated; 2422fa3bea4eSGregory Price total_allocated += nr_allocated; 2423fa3bea4eSGregory Price /* if that's all the pages, no need to interleave */ 2424fa3bea4eSGregory Price if (rem_pages <= weight) { 2425fa3bea4eSGregory Price 
me->il_weight -= rem_pages; 2426fa3bea4eSGregory Price return total_allocated; 2427fa3bea4eSGregory Price } 2428fa3bea4eSGregory Price /* Otherwise we adjust remaining pages, continue from there */ 2429fa3bea4eSGregory Price rem_pages -= weight; 2430fa3bea4eSGregory Price } 2431fa3bea4eSGregory Price /* clear active weight in case of an allocation failure */ 2432fa3bea4eSGregory Price me->il_weight = 0; 2433fa3bea4eSGregory Price prev_node = node; 2434fa3bea4eSGregory Price 2435fa3bea4eSGregory Price /* create a local copy of node weights to operate on outside rcu */ 2436fa3bea4eSGregory Price weights = kzalloc(nr_node_ids, GFP_KERNEL); 2437fa3bea4eSGregory Price if (!weights) 2438fa3bea4eSGregory Price return total_allocated; 2439fa3bea4eSGregory Price 2440fa3bea4eSGregory Price rcu_read_lock(); 2441fa3bea4eSGregory Price table = rcu_dereference(iw_table); 2442fa3bea4eSGregory Price if (table) 2443fa3bea4eSGregory Price memcpy(weights, table, nr_node_ids); 2444fa3bea4eSGregory Price rcu_read_unlock(); 2445fa3bea4eSGregory Price 2446fa3bea4eSGregory Price /* calculate total, detect system default usage */ 2447fa3bea4eSGregory Price for_each_node_mask(node, nodes) { 2448fa3bea4eSGregory Price if (!weights[node]) 2449fa3bea4eSGregory Price weights[node] = 1; 2450fa3bea4eSGregory Price weight_total += weights[node]; 2451fa3bea4eSGregory Price } 2452fa3bea4eSGregory Price 2453fa3bea4eSGregory Price /* 2454fa3bea4eSGregory Price * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 2455fa3bea4eSGregory Price * Track which node weighted interleave should resume from. 2456fa3bea4eSGregory Price * 2457fa3bea4eSGregory Price * if (rounds > 0) and (delta == 0), resume_node will always be 2458fa3bea4eSGregory Price * the node following prev_node and its weight. 
2459fa3bea4eSGregory Price */ 2460fa3bea4eSGregory Price rounds = rem_pages / weight_total; 2461fa3bea4eSGregory Price delta = rem_pages % weight_total; 2462fa3bea4eSGregory Price resume_node = next_node_in(prev_node, nodes); 2463fa3bea4eSGregory Price resume_weight = weights[resume_node]; 2464fa3bea4eSGregory Price for (i = 0; i < nnodes; i++) { 2465fa3bea4eSGregory Price node = next_node_in(prev_node, nodes); 2466fa3bea4eSGregory Price weight = weights[node]; 2467fa3bea4eSGregory Price node_pages = weight * rounds; 2468fa3bea4eSGregory Price /* If a delta exists, add this node's portion of the delta */ 2469fa3bea4eSGregory Price if (delta > weight) { 2470fa3bea4eSGregory Price node_pages += weight; 2471fa3bea4eSGregory Price delta -= weight; 2472fa3bea4eSGregory Price } else if (delta) { 2473fa3bea4eSGregory Price /* when delta is depleted, resume from that node */ 2474fa3bea4eSGregory Price node_pages += delta; 2475fa3bea4eSGregory Price resume_node = node; 2476fa3bea4eSGregory Price resume_weight = weight - delta; 2477fa3bea4eSGregory Price delta = 0; 2478fa3bea4eSGregory Price } 2479fa3bea4eSGregory Price /* node_pages can be 0 if an allocation fails and rounds == 0 */ 2480fa3bea4eSGregory Price if (!node_pages) 2481fa3bea4eSGregory Price break; 2482fa3bea4eSGregory Price nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2483fa3bea4eSGregory Price NULL, page_array); 2484fa3bea4eSGregory Price page_array += nr_allocated; 2485fa3bea4eSGregory Price total_allocated += nr_allocated; 2486fa3bea4eSGregory Price if (total_allocated == nr_pages) 2487fa3bea4eSGregory Price break; 2488fa3bea4eSGregory Price prev_node = node; 2489fa3bea4eSGregory Price } 2490fa3bea4eSGregory Price me->il_prev = resume_node; 2491fa3bea4eSGregory Price me->il_weight = resume_weight; 2492fa3bea4eSGregory Price kfree(weights); 2493fa3bea4eSGregory Price return total_allocated; 2494fa3bea4eSGregory Price } 2495fa3bea4eSGregory Price 2496c00b6b96SChen Wandun static unsigned long 
alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, 2497c00b6b96SChen Wandun struct mempolicy *pol, unsigned long nr_pages, 2498c00b6b96SChen Wandun struct page **page_array) 2499c00b6b96SChen Wandun { 2500c00b6b96SChen Wandun gfp_t preferred_gfp; 2501c00b6b96SChen Wandun unsigned long nr_allocated = 0; 2502c00b6b96SChen Wandun 2503c00b6b96SChen Wandun /* First pass over the preferred nodes is best-effort only: no warnings, no direct reclaim, no nofail */ preferred_gfp = gfp | __GFP_NOWARN; 2504c00b6b96SChen Wandun preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2505c00b6b96SChen Wandun 2506c00b6b96SChen Wandun nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, 2507c00b6b96SChen Wandun nr_pages, NULL, page_array); 2508c00b6b96SChen Wandun 2509c00b6b96SChen Wandun /* Fall back to the local node, with the caller's original gfp, for whatever is still missing */ if (nr_allocated < nr_pages) 2510c00b6b96SChen Wandun nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, 2511c00b6b96SChen Wandun nr_pages - nr_allocated, NULL, 2512c00b6b96SChen Wandun page_array + nr_allocated); 2513c00b6b96SChen Wandun return nr_allocated; 2514c00b6b96SChen Wandun } 2515c00b6b96SChen Wandun 2516c00b6b96SChen Wandun /* Bulk page allocation honouring the task mempolicy: some callers, 2517c00b6b96SChen Wandun * such as vmalloc, need bulk allocation and the mempolicy considered 2518c00b6b96SChen Wandun * at the same time. 2519c00b6b96SChen Wandun * It can accelerate memory allocation, especially for interleaved 2520c00b6b96SChen Wandun * allocations.
2521c00b6b96SChen Wandun */ 2522c00b6b96SChen Wandun unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, 2523c00b6b96SChen Wandun unsigned long nr_pages, struct page **page_array) 2524c00b6b96SChen Wandun { 2525c00b6b96SChen Wandun struct mempolicy *pol = &default_policy; 2526ddc1a5cbSHugh Dickins nodemask_t *nodemask; 2527ddc1a5cbSHugh Dickins int nid; 2528c00b6b96SChen Wandun 2529c00b6b96SChen Wandun if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2530c00b6b96SChen Wandun pol = get_task_policy(current); 2531c00b6b96SChen Wandun 2532c00b6b96SChen Wandun if (pol->mode == MPOL_INTERLEAVE) 2533c00b6b96SChen Wandun return alloc_pages_bulk_array_interleave(gfp, pol, 2534c00b6b96SChen Wandun nr_pages, page_array); 2535c00b6b96SChen Wandun 2536fa3bea4eSGregory Price if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) 2537fa3bea4eSGregory Price return alloc_pages_bulk_array_weighted_interleave( 2538fa3bea4eSGregory Price gfp, pol, nr_pages, page_array); 2539fa3bea4eSGregory Price 2540c00b6b96SChen Wandun if (pol->mode == MPOL_PREFERRED_MANY) 2541c00b6b96SChen Wandun return alloc_pages_bulk_array_preferred_many(gfp, 2542c00b6b96SChen Wandun numa_node_id(), pol, nr_pages, page_array); 2543c00b6b96SChen Wandun 2544ddc1a5cbSHugh Dickins nid = numa_node_id(); 2545ddc1a5cbSHugh Dickins nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2546ddc1a5cbSHugh Dickins return __alloc_pages_bulk(gfp, nid, nodemask, 2547ddc1a5cbSHugh Dickins nr_pages, NULL, page_array); 2548c00b6b96SChen Wandun } 2549c00b6b96SChen Wandun 2550ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2551ef0855d3SOleg Nesterov { 2552c36f6e6dSHugh Dickins struct mempolicy *pol = mpol_dup(src->vm_policy); 2553ef0855d3SOleg Nesterov 2554ef0855d3SOleg Nesterov if (IS_ERR(pol)) 2555ef0855d3SOleg Nesterov return PTR_ERR(pol); 2556ef0855d3SOleg Nesterov dst->vm_policy = pol; 2557ef0855d3SOleg Nesterov return 0; 2558ef0855d3SOleg Nesterov } 2559ef0855d3SOleg 
Nesterov 25604225399aSPaul Jackson /* 2561846a16bfSLee Schermerhorn * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 25624225399aSPaul Jackson * rebinds the mempolicy its copying by calling mpol_rebind_policy() 25634225399aSPaul Jackson * with the mems_allowed returned by cpuset_mems_allowed(). This 25644225399aSPaul Jackson * keeps mempolicies cpuset relative after its cpuset moves. See 25654225399aSPaul Jackson * further kernel/cpuset.c update_nodemask(). 2566708c1bbcSMiao Xie * 2567708c1bbcSMiao Xie * current's mempolicy may be rebinded by the other task(the task that changes 2568708c1bbcSMiao Xie * cpuset's mems), so we needn't do rebind work for current task. 25694225399aSPaul Jackson */ 25704225399aSPaul Jackson 2571846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */ 2572846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old) 25731da177e4SLinus Torvalds { 25741da177e4SLinus Torvalds struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 25751da177e4SLinus Torvalds 25761da177e4SLinus Torvalds if (!new) 25771da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 2578708c1bbcSMiao Xie 2579708c1bbcSMiao Xie /* task's mempolicy is protected by alloc_lock */ 2580708c1bbcSMiao Xie if (old == current->mempolicy) { 2581708c1bbcSMiao Xie task_lock(current); 2582708c1bbcSMiao Xie *new = *old; 2583708c1bbcSMiao Xie task_unlock(current); 2584708c1bbcSMiao Xie } else 2585708c1bbcSMiao Xie *new = *old; 2586708c1bbcSMiao Xie 25874225399aSPaul Jackson if (current_cpuset_is_being_rebound()) { 25884225399aSPaul Jackson nodemask_t mems = cpuset_mems_allowed(current); 2589213980c0SVlastimil Babka mpol_rebind_policy(new, &mems); 25904225399aSPaul Jackson } 25911da177e4SLinus Torvalds atomic_set(&new->refcnt, 1); 25921da177e4SLinus Torvalds return new; 25931da177e4SLinus Torvalds } 25941da177e4SLinus Torvalds 25951da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */ 2596fcfb4dccSKOSAKI Motohiro bool 
__mpol_equal(struct mempolicy *a, struct mempolicy *b) 25971da177e4SLinus Torvalds { 25981da177e4SLinus Torvalds if (!a || !b) 2599fcfb4dccSKOSAKI Motohiro return false; 260045c4745aSLee Schermerhorn if (a->mode != b->mode) 2601fcfb4dccSKOSAKI Motohiro return false; 260219800502SBob Liu if (a->flags != b->flags) 2603fcfb4dccSKOSAKI Motohiro return false; 2604c6018b4bSAneesh Kumar K.V if (a->home_node != b->home_node) 2605c6018b4bSAneesh Kumar K.V return false; 260619800502SBob Liu if (mpol_store_user_nodemask(a)) 260719800502SBob Liu if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2608fcfb4dccSKOSAKI Motohiro return false; 260919800502SBob Liu 261045c4745aSLee Schermerhorn switch (a->mode) { 261119770b32SMel Gorman case MPOL_BIND: 26121da177e4SLinus Torvalds case MPOL_INTERLEAVE: 26131da177e4SLinus Torvalds case MPOL_PREFERRED: 2614b27abaccSDave Hansen case MPOL_PREFERRED_MANY: 2615fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE: 2616269fbe72SBen Widawsky return !!nodes_equal(a->nodes, b->nodes); 26177858d7bcSFeng Tang case MPOL_LOCAL: 26187858d7bcSFeng Tang return true; 26191da177e4SLinus Torvalds default: 26201da177e4SLinus Torvalds BUG(); 2621fcfb4dccSKOSAKI Motohiro return false; 26221da177e4SLinus Torvalds } 26231da177e4SLinus Torvalds } 26241da177e4SLinus Torvalds 26251da177e4SLinus Torvalds /* 26261da177e4SLinus Torvalds * Shared memory backing store policy support. 26271da177e4SLinus Torvalds * 26281da177e4SLinus Torvalds * Remember policies even when nobody has shared memory mapped. 26291da177e4SLinus Torvalds * The policies are kept in Red-Black tree linked from the inode. 26304a8c7bb5SNathan Zimmer * They are protected by the sp->lock rwlock, which should be held 26311da177e4SLinus Torvalds * for any accesses to the tree. 26321da177e4SLinus Torvalds */ 26331da177e4SLinus Torvalds 26344a8c7bb5SNathan Zimmer /* 26354a8c7bb5SNathan Zimmer * lookup first element intersecting start-end. 
Caller holds sp->lock for 26364a8c7bb5SNathan Zimmer * reading or for writing 26374a8c7bb5SNathan Zimmer */ 263893397c3bSHugh Dickins static struct sp_node *sp_lookup(struct shared_policy *sp, 263993397c3bSHugh Dickins pgoff_t start, pgoff_t end) 26401da177e4SLinus Torvalds { 26411da177e4SLinus Torvalds struct rb_node *n = sp->root.rb_node; 26421da177e4SLinus Torvalds 26431da177e4SLinus Torvalds /* Binary search for any node whose [start, end) range overlaps the query */ while (n) { 26441da177e4SLinus Torvalds struct sp_node *p = rb_entry(n, struct sp_node, nd); 26451da177e4SLinus Torvalds 26461da177e4SLinus Torvalds if (start >= p->end) 26471da177e4SLinus Torvalds n = n->rb_right; 26481da177e4SLinus Torvalds else if (end <= p->start) 26491da177e4SLinus Torvalds n = n->rb_left; 26501da177e4SLinus Torvalds else 26511da177e4SLinus Torvalds break; 26521da177e4SLinus Torvalds } 26531da177e4SLinus Torvalds if (!n) 26541da177e4SLinus Torvalds return NULL; 26551da177e4SLinus Torvalds /* Walk back through predecessors to the first (lowest-offset) node that still intersects start */ for (;;) { 26561da177e4SLinus Torvalds struct sp_node *w = NULL; 26571da177e4SLinus Torvalds struct rb_node *prev = rb_prev(n); 26581da177e4SLinus Torvalds if (!prev) 26591da177e4SLinus Torvalds break; 26601da177e4SLinus Torvalds w = rb_entry(prev, struct sp_node, nd); 26611da177e4SLinus Torvalds if (w->end <= start) 26621da177e4SLinus Torvalds break; 26631da177e4SLinus Torvalds n = prev; 26641da177e4SLinus Torvalds } 26651da177e4SLinus Torvalds return rb_entry(n, struct sp_node, nd); 26661da177e4SLinus Torvalds } 26671da177e4SLinus Torvalds 26684a8c7bb5SNathan Zimmer /* 26694a8c7bb5SNathan Zimmer * Insert a new shared policy into the list. Caller holds sp->lock for 26704a8c7bb5SNathan Zimmer * writing.
26714a8c7bb5SNathan Zimmer */ 26721da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new) 26731da177e4SLinus Torvalds { 26741da177e4SLinus Torvalds struct rb_node **p = &sp->root.rb_node; 26751da177e4SLinus Torvalds struct rb_node *parent = NULL; 26761da177e4SLinus Torvalds struct sp_node *nd; 26771da177e4SLinus Torvalds 26781da177e4SLinus Torvalds while (*p) { 26791da177e4SLinus Torvalds parent = *p; 26801da177e4SLinus Torvalds nd = rb_entry(parent, struct sp_node, nd); 26811da177e4SLinus Torvalds if (new->start < nd->start) 26821da177e4SLinus Torvalds p = &(*p)->rb_left; 26831da177e4SLinus Torvalds else if (new->end > nd->end) 26841da177e4SLinus Torvalds p = &(*p)->rb_right; 26851da177e4SLinus Torvalds else 26861da177e4SLinus Torvalds /* ranges in the tree must never overlap */ BUG(); 26871da177e4SLinus Torvalds } 26881da177e4SLinus Torvalds rb_link_node(&new->nd, parent, p); 26891da177e4SLinus Torvalds rb_insert_color(&new->nd, &sp->root); 26901da177e4SLinus Torvalds } 26911da177e4SLinus Torvalds 26921da177e4SLinus Torvalds /* Find shared policy intersecting idx; takes a reference on the returned policy, which the caller must drop with mpol_put() */ 269393397c3bSHugh Dickins struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 269493397c3bSHugh Dickins pgoff_t idx) 26951da177e4SLinus Torvalds { 26961da177e4SLinus Torvalds struct mempolicy *pol = NULL; 26971da177e4SLinus Torvalds struct sp_node *sn; 26981da177e4SLinus Torvalds 26991da177e4SLinus Torvalds if (!sp->root.rb_node) 27001da177e4SLinus Torvalds return NULL; 27014a8c7bb5SNathan Zimmer read_lock(&sp->lock); 27021da177e4SLinus Torvalds sn = sp_lookup(sp, idx, idx+1); 27031da177e4SLinus Torvalds if (sn) { 27041da177e4SLinus Torvalds mpol_get(sn->policy); 27051da177e4SLinus Torvalds pol = sn->policy; 27061da177e4SLinus Torvalds } 27074a8c7bb5SNathan Zimmer read_unlock(&sp->lock); 27081da177e4SLinus Torvalds return pol; 27091da177e4SLinus Torvalds } 27101da177e4SLinus Torvalds 271163f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n) 271263f74ca2SKOSAKI Motohiro {
271363f74ca2SKOSAKI Motohiro mpol_put(n->policy); 271463f74ca2SKOSAKI Motohiro kmem_cache_free(sn_cache, n); 271563f74ca2SKOSAKI Motohiro } 271663f74ca2SKOSAKI Motohiro 2717771fb4d8SLee Schermerhorn /** 271875c70128SKefeng Wang * mpol_misplaced - check whether current folio node is valid in policy 2719771fb4d8SLee Schermerhorn * 272075c70128SKefeng Wang * @folio: folio to be checked 272175c70128SKefeng Wang * @vma: vm area where folio mapped 272275c70128SKefeng Wang * @addr: virtual address in @vma for shared policy lookup and interleave policy 2723771fb4d8SLee Schermerhorn * 272475c70128SKefeng Wang * Lookup current policy node id for vma,addr and "compare to" folio's 27255f076944SMatthew Wilcox (Oracle) * node id. Policy determination "mimics" alloc_page_vma(). 2726771fb4d8SLee Schermerhorn * Called from fault path where we know the vma and faulting address. 27275f076944SMatthew Wilcox (Oracle) * 2728062db293SBaolin Wang * Return: NUMA_NO_NODE if the page is in a node that is valid for this 272975c70128SKefeng Wang * policy, or a suitable node ID to allocate a replacement folio from. 
2730771fb4d8SLee Schermerhorn */ 273175c70128SKefeng Wang int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, 273275c70128SKefeng Wang unsigned long addr) 2733771fb4d8SLee Schermerhorn { 2734771fb4d8SLee Schermerhorn struct mempolicy *pol; 2735ddc1a5cbSHugh Dickins pgoff_t ilx; 2736c33d6c06SMel Gorman struct zoneref *z; 273775c70128SKefeng Wang int curnid = folio_nid(folio); 273890572890SPeter Zijlstra int thiscpu = raw_smp_processor_id(); 273990572890SPeter Zijlstra int thisnid = cpu_to_node(thiscpu); 274098fa15f3SAnshuman Khandual int polnid = NUMA_NO_NODE; 2741062db293SBaolin Wang int ret = NUMA_NO_NODE; 2742771fb4d8SLee Schermerhorn 2743ddc1a5cbSHugh Dickins pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2744771fb4d8SLee Schermerhorn if (!(pol->flags & MPOL_F_MOF)) 2745771fb4d8SLee Schermerhorn goto out; 2746771fb4d8SLee Schermerhorn 2747771fb4d8SLee Schermerhorn switch (pol->mode) { 2748771fb4d8SLee Schermerhorn case MPOL_INTERLEAVE: 2749ddc1a5cbSHugh Dickins polnid = interleave_nid(pol, ilx); 2750771fb4d8SLee Schermerhorn break; 2751771fb4d8SLee Schermerhorn 2752fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE: 2753fa3bea4eSGregory Price polnid = weighted_interleave_nid(pol, ilx); 2754fa3bea4eSGregory Price break; 2755fa3bea4eSGregory Price 2756771fb4d8SLee Schermerhorn case MPOL_PREFERRED: 2757b27abaccSDave Hansen if (node_isset(curnid, pol->nodes)) 2758b27abaccSDave Hansen goto out; 2759269fbe72SBen Widawsky polnid = first_node(pol->nodes); 2760771fb4d8SLee Schermerhorn break; 2761771fb4d8SLee Schermerhorn 27627858d7bcSFeng Tang case MPOL_LOCAL: 27637858d7bcSFeng Tang polnid = numa_node_id(); 27647858d7bcSFeng Tang break; 27657858d7bcSFeng Tang 2766771fb4d8SLee Schermerhorn case MPOL_BIND: 2767bda420b9SHuang Ying /* Optimize placement among multiple nodes via NUMA balancing */ 2768bda420b9SHuang Ying if (pol->flags & MPOL_F_MORON) { 2769269fbe72SBen Widawsky if (node_isset(thisnid, pol->nodes)) 2770bda420b9SHuang Ying break; 
2771bda420b9SHuang Ying goto out; 2772bda420b9SHuang Ying } 2773b27abaccSDave Hansen fallthrough; 2774c33d6c06SMel Gorman 2775b27abaccSDave Hansen case MPOL_PREFERRED_MANY: 2776771fb4d8SLee Schermerhorn /* 2777771fb4d8SLee Schermerhorn * use current page if in policy nodemask, 2778771fb4d8SLee Schermerhorn * else select nearest allowed node, if any. 2779771fb4d8SLee Schermerhorn * If no allowed nodes, use current [!misplaced]. 2780771fb4d8SLee Schermerhorn */ 2781269fbe72SBen Widawsky if (node_isset(curnid, pol->nodes)) 2782771fb4d8SLee Schermerhorn goto out; 2783c33d6c06SMel Gorman z = first_zones_zonelist( 2784771fb4d8SLee Schermerhorn node_zonelist(numa_node_id(), GFP_HIGHUSER), 2785771fb4d8SLee Schermerhorn gfp_zone(GFP_HIGHUSER), 2786269fbe72SBen Widawsky &pol->nodes); 2787c1093b74SPavel Tatashin polnid = zone_to_nid(z->zone); 2788771fb4d8SLee Schermerhorn break; 2789771fb4d8SLee Schermerhorn 2790771fb4d8SLee Schermerhorn default: 2791771fb4d8SLee Schermerhorn BUG(); 2792771fb4d8SLee Schermerhorn } 27935606e387SMel Gorman 279475c70128SKefeng Wang /* Migrate the folio towards the node whose CPU is referencing it */ 2795e42c8ff2SMel Gorman if (pol->flags & MPOL_F_MORON) { 279690572890SPeter Zijlstra polnid = thisnid; 27975606e387SMel Gorman 27988c9ae56dSKefeng Wang if (!should_numa_migrate_memory(current, folio, curnid, 279975c70128SKefeng Wang thiscpu)) 2800de1c9ce6SRik van Riel goto out; 2801de1c9ce6SRik van Riel } 2802e42c8ff2SMel Gorman 2803771fb4d8SLee Schermerhorn if (curnid != polnid) 2804771fb4d8SLee Schermerhorn ret = polnid; 2805771fb4d8SLee Schermerhorn out: 2806771fb4d8SLee Schermerhorn mpol_cond_put(pol); 2807771fb4d8SLee Schermerhorn 2808771fb4d8SLee Schermerhorn return ret; 2809771fb4d8SLee Schermerhorn } 2810771fb4d8SLee Schermerhorn 2811c11600e4SDavid Rientjes /* 2812c11600e4SDavid Rientjes * Drop the (possibly final) reference to task->mempolicy. 
It needs to be 2813c11600e4SDavid Rientjes * dropped after task->mempolicy is set to NULL so that any allocation done as 2814c11600e4SDavid Rientjes * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed 2815c11600e4SDavid Rientjes * policy. 2816c11600e4SDavid Rientjes */ 2817c11600e4SDavid Rientjes void mpol_put_task_policy(struct task_struct *task) 2818c11600e4SDavid Rientjes { 2819c11600e4SDavid Rientjes struct mempolicy *pol; 2820c11600e4SDavid Rientjes 2821c11600e4SDavid Rientjes /* detach under task_lock, drop the reference outside the lock */ task_lock(task); 2822c11600e4SDavid Rientjes pol = task->mempolicy; 2823c11600e4SDavid Rientjes task->mempolicy = NULL; 2824c11600e4SDavid Rientjes task_unlock(task); 2825c11600e4SDavid Rientjes mpol_put(pol); 2826c11600e4SDavid Rientjes } 2827c11600e4SDavid Rientjes 28281da177e4SLinus Torvalds /* Unlink @n from the tree and free it, dropping its policy reference */ static void sp_delete(struct shared_policy *sp, struct sp_node *n) 28291da177e4SLinus Torvalds { 28301da177e4SLinus Torvalds rb_erase(&n->nd, &sp->root); 283163f74ca2SKOSAKI Motohiro sp_free(n); 28321da177e4SLinus Torvalds } 28331da177e4SLinus Torvalds 283442288fe3SMel Gorman /* Initialize an sp_node to cover [start, end) with @pol */ static void sp_node_init(struct sp_node *node, unsigned long start, 283542288fe3SMel Gorman unsigned long end, struct mempolicy *pol) 283642288fe3SMel Gorman { 283742288fe3SMel Gorman node->start = start; 283842288fe3SMel Gorman node->end = end; 283942288fe3SMel Gorman node->policy = pol; 284042288fe3SMel Gorman } 284142288fe3SMel Gorman 2842dbcb0f19SAdrian Bunk /* Allocate an sp_node for [start, end) carrying a duplicate of @pol; returns NULL on allocation failure */ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2843dbcb0f19SAdrian Bunk struct mempolicy *pol) 28441da177e4SLinus Torvalds { 2845869833f2SKOSAKI Motohiro struct sp_node *n; 2846869833f2SKOSAKI Motohiro struct mempolicy *newpol; 28471da177e4SLinus Torvalds 2848869833f2SKOSAKI Motohiro n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 28491da177e4SLinus Torvalds if (!n) 28501da177e4SLinus Torvalds return NULL; 2851869833f2SKOSAKI Motohiro 2852869833f2SKOSAKI Motohiro newpol = mpol_dup(pol); 2853869833f2SKOSAKI Motohiro if
(IS_ERR(newpol)) { 2854869833f2SKOSAKI Motohiro kmem_cache_free(sn_cache, n); 2855869833f2SKOSAKI Motohiro return NULL; 2856869833f2SKOSAKI Motohiro } 2857869833f2SKOSAKI Motohiro newpol->flags |= MPOL_F_SHARED; 285842288fe3SMel Gorman sp_node_init(n, start, end, newpol); 2859869833f2SKOSAKI Motohiro 28601da177e4SLinus Torvalds return n; 28611da177e4SLinus Torvalds } 28621da177e4SLinus Torvalds 28631da177e4SLinus Torvalds /* Replace a policy range. */ 286493397c3bSHugh Dickins static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, 286593397c3bSHugh Dickins pgoff_t end, struct sp_node *new) 28661da177e4SLinus Torvalds { 2867b22d127aSMel Gorman struct sp_node *n; 286842288fe3SMel Gorman struct sp_node *n_new = NULL; 286942288fe3SMel Gorman struct mempolicy *mpol_new = NULL; 2870b22d127aSMel Gorman int ret = 0; 28711da177e4SLinus Torvalds 287242288fe3SMel Gorman restart: 28734a8c7bb5SNathan Zimmer write_lock(&sp->lock); 28741da177e4SLinus Torvalds n = sp_lookup(sp, start, end); 28751da177e4SLinus Torvalds /* Take care of old policies in the same range. */ 28761da177e4SLinus Torvalds while (n && n->start < end) { 28771da177e4SLinus Torvalds struct rb_node *next = rb_next(&n->nd); 28781da177e4SLinus Torvalds if (n->start >= start) { 28791da177e4SLinus Torvalds if (n->end <= end) 28801da177e4SLinus Torvalds sp_delete(sp, n); 28811da177e4SLinus Torvalds else 28821da177e4SLinus Torvalds n->start = end; 28831da177e4SLinus Torvalds } else { 28841da177e4SLinus Torvalds /* Old policy spanning whole new range. 
*/ 28851da177e4SLinus Torvalds if (n->end > end) { 288642288fe3SMel Gorman if (!n_new) 288742288fe3SMel Gorman goto alloc_new; 288842288fe3SMel Gorman 288942288fe3SMel Gorman *mpol_new = *n->policy; 289042288fe3SMel Gorman atomic_set(&mpol_new->refcnt, 1); 28917880639cSKOSAKI Motohiro sp_node_init(n_new, end, n->end, mpol_new); 28921da177e4SLinus Torvalds n->end = start; 28935ca39575SHillf Danton sp_insert(sp, n_new); 289442288fe3SMel Gorman n_new = NULL; 289542288fe3SMel Gorman mpol_new = NULL; 28961da177e4SLinus Torvalds break; 28971da177e4SLinus Torvalds } else 28981da177e4SLinus Torvalds n->end = start; 28991da177e4SLinus Torvalds } 29001da177e4SLinus Torvalds if (!next) 29011da177e4SLinus Torvalds break; 29021da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd); 29031da177e4SLinus Torvalds } 29041da177e4SLinus Torvalds if (new) 29051da177e4SLinus Torvalds sp_insert(sp, new); 29064a8c7bb5SNathan Zimmer write_unlock(&sp->lock); 290742288fe3SMel Gorman ret = 0; 290842288fe3SMel Gorman 290942288fe3SMel Gorman err_out: 291042288fe3SMel Gorman if (mpol_new) 291142288fe3SMel Gorman mpol_put(mpol_new); 291242288fe3SMel Gorman if (n_new) 291342288fe3SMel Gorman kmem_cache_free(sn_cache, n_new); 291442288fe3SMel Gorman 2915b22d127aSMel Gorman return ret; 291642288fe3SMel Gorman 291742288fe3SMel Gorman alloc_new: 29184a8c7bb5SNathan Zimmer write_unlock(&sp->lock); 291942288fe3SMel Gorman ret = -ENOMEM; 292042288fe3SMel Gorman n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 292142288fe3SMel Gorman if (!n_new) 292242288fe3SMel Gorman goto err_out; 292342288fe3SMel Gorman mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 292442288fe3SMel Gorman if (!mpol_new) 292542288fe3SMel Gorman goto err_out; 29264ad09955SMiaohe Lin atomic_set(&mpol_new->refcnt, 1); 292742288fe3SMel Gorman goto restart; 29281da177e4SLinus Torvalds } 29291da177e4SLinus Torvalds 293071fe804bSLee Schermerhorn /** 293171fe804bSLee Schermerhorn * mpol_shared_policy_init - initialize shared 
policy for inode 293271fe804bSLee Schermerhorn * @sp: pointer to inode shared policy 293371fe804bSLee Schermerhorn * @mpol: struct mempolicy to install 293471fe804bSLee Schermerhorn * 293571fe804bSLee Schermerhorn * Install non-NULL @mpol in inode's shared policy rb-tree. 293671fe804bSLee Schermerhorn * On entry, the current task has a reference on a non-NULL @mpol. 293771fe804bSLee Schermerhorn * This must be released on exit. 29384bfc4495SKAMEZAWA Hiroyuki * This is called at get_inode() calls and we can use GFP_KERNEL. 293971fe804bSLee Schermerhorn */ 294071fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 29417339ff83SRobin Holt { 294258568d2aSMiao Xie int ret; 294358568d2aSMiao Xie 294471fe804bSLee Schermerhorn sp->root = RB_ROOT; /* empty tree == default mempolicy */ 29454a8c7bb5SNathan Zimmer rwlock_init(&sp->lock); 29467339ff83SRobin Holt 294771fe804bSLee Schermerhorn if (mpol) { 294835ec8fa0SHugh Dickins struct sp_node *sn; 294935ec8fa0SHugh Dickins struct mempolicy *npol; 29504bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch); 29517339ff83SRobin Holt 29524bfc4495SKAMEZAWA Hiroyuki if (!scratch) 29535c0c1654SLee Schermerhorn goto put_mpol; 295435ec8fa0SHugh Dickins 295535ec8fa0SHugh Dickins /* contextualize the tmpfs mount point mempolicy to this file */ 295635ec8fa0SHugh Dickins npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 295735ec8fa0SHugh Dickins if (IS_ERR(npol)) 29580cae3457SDan Carpenter goto free_scratch; /* no valid nodemask intersection */ 295958568d2aSMiao Xie 296058568d2aSMiao Xie task_lock(current); 296135ec8fa0SHugh Dickins ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 296258568d2aSMiao Xie task_unlock(current); 296315d77835SLee Schermerhorn if (ret) 296435ec8fa0SHugh Dickins goto put_npol; 296571fe804bSLee Schermerhorn 296635ec8fa0SHugh Dickins /* alloc node covering entire file; adds ref to file's npol */ 296735ec8fa0SHugh Dickins sn = sp_alloc(0, 
MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 296835ec8fa0SHugh Dickins if (sn) 296935ec8fa0SHugh Dickins sp_insert(sp, sn); 297035ec8fa0SHugh Dickins put_npol: 297135ec8fa0SHugh Dickins mpol_put(npol); /* drop initial ref on file's npol */ 29720cae3457SDan Carpenter free_scratch: 29734bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch); 29745c0c1654SLee Schermerhorn put_mpol: 29755c0c1654SLee Schermerhorn mpol_put(mpol); /* drop our incoming ref on sb mpol */ 29767339ff83SRobin Holt } 29777339ff83SRobin Holt } 29787339ff83SRobin Holt 2979c36f6e6dSHugh Dickins int mpol_set_shared_policy(struct shared_policy *sp, 2980c36f6e6dSHugh Dickins struct vm_area_struct *vma, struct mempolicy *pol) 29811da177e4SLinus Torvalds { 29821da177e4SLinus Torvalds int err; 29831da177e4SLinus Torvalds struct sp_node *new = NULL; 29841da177e4SLinus Torvalds unsigned long sz = vma_pages(vma); 29851da177e4SLinus Torvalds 2986c36f6e6dSHugh Dickins if (pol) { 2987c36f6e6dSHugh Dickins new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 29881da177e4SLinus Torvalds if (!new) 29891da177e4SLinus Torvalds return -ENOMEM; 29901da177e4SLinus Torvalds } 2991c36f6e6dSHugh Dickins err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 29921da177e4SLinus Torvalds if (err && new) 299363f74ca2SKOSAKI Motohiro sp_free(new); 29941da177e4SLinus Torvalds return err; 29951da177e4SLinus Torvalds } 29961da177e4SLinus Torvalds 29971da177e4SLinus Torvalds /* Free a backing policy store on inode delete. 
*/ 2998c36f6e6dSHugh Dickins void mpol_free_shared_policy(struct shared_policy *sp) 29991da177e4SLinus Torvalds { 30001da177e4SLinus Torvalds struct sp_node *n; 30011da177e4SLinus Torvalds struct rb_node *next; 30021da177e4SLinus Torvalds 3003c36f6e6dSHugh Dickins if (!sp->root.rb_node) 30041da177e4SLinus Torvalds return; 3005c36f6e6dSHugh Dickins write_lock(&sp->lock); 3006c36f6e6dSHugh Dickins next = rb_first(&sp->root); 30071da177e4SLinus Torvalds while (next) { 30081da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd); 30091da177e4SLinus Torvalds next = rb_next(&n->nd); 3010c36f6e6dSHugh Dickins sp_delete(sp, n); 30111da177e4SLinus Torvalds } 3012c36f6e6dSHugh Dickins write_unlock(&sp->lock); 30131da177e4SLinus Torvalds } 30141da177e4SLinus Torvalds 30151a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING 3016c297663cSMel Gorman static int __initdata numabalancing_override; 30171a687c2eSMel Gorman 30181a687c2eSMel Gorman static void __init check_numabalancing_enable(void) 30191a687c2eSMel Gorman { 30201a687c2eSMel Gorman bool numabalancing_default = false; 30211a687c2eSMel Gorman 30221a687c2eSMel Gorman if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 30231a687c2eSMel Gorman numabalancing_default = true; 30241a687c2eSMel Gorman 3025c297663cSMel Gorman /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 3026c297663cSMel Gorman if (numabalancing_override) 3027c297663cSMel Gorman set_numabalancing_state(numabalancing_override == 1); 3028c297663cSMel Gorman 3029b0dc2b9bSMel Gorman if (num_online_nodes() > 1 && !numabalancing_override) { 3030756a025fSJoe Perches pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", 3031c297663cSMel Gorman numabalancing_default ? 
"Enabling" : "Disabling"); 30321a687c2eSMel Gorman set_numabalancing_state(numabalancing_default); 30331a687c2eSMel Gorman } 30341a687c2eSMel Gorman } 30351a687c2eSMel Gorman 30361a687c2eSMel Gorman static int __init setup_numabalancing(char *str) 30371a687c2eSMel Gorman { 30381a687c2eSMel Gorman int ret = 0; 30391a687c2eSMel Gorman if (!str) 30401a687c2eSMel Gorman goto out; 30411a687c2eSMel Gorman 30421a687c2eSMel Gorman if (!strcmp(str, "enable")) { 3043c297663cSMel Gorman numabalancing_override = 1; 30441a687c2eSMel Gorman ret = 1; 30451a687c2eSMel Gorman } else if (!strcmp(str, "disable")) { 3046c297663cSMel Gorman numabalancing_override = -1; 30471a687c2eSMel Gorman ret = 1; 30481a687c2eSMel Gorman } 30491a687c2eSMel Gorman out: 30501a687c2eSMel Gorman if (!ret) 30514a404beaSAndrew Morton pr_warn("Unable to parse numa_balancing=\n"); 30521a687c2eSMel Gorman 30531a687c2eSMel Gorman return ret; 30541a687c2eSMel Gorman } 30551a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing); 30561a687c2eSMel Gorman #else 30571a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void) 30581a687c2eSMel Gorman { 30591a687c2eSMel Gorman } 30601a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */ 30611a687c2eSMel Gorman 30621da177e4SLinus Torvalds void __init numa_policy_init(void) 30631da177e4SLinus Torvalds { 3064b71636e2SPaul Mundt nodemask_t interleave_nodes; 3065b71636e2SPaul Mundt unsigned long largest = 0; 3066b71636e2SPaul Mundt int nid, prefer = 0; 3067b71636e2SPaul Mundt 30681da177e4SLinus Torvalds policy_cache = kmem_cache_create("numa_policy", 30691da177e4SLinus Torvalds sizeof(struct mempolicy), 307020c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 30711da177e4SLinus Torvalds 30721da177e4SLinus Torvalds sn_cache = kmem_cache_create("shared_policy_node", 30731da177e4SLinus Torvalds sizeof(struct sp_node), 307420c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 30751da177e4SLinus Torvalds 30765606e387SMel Gorman for_each_node(nid) { 30775606e387SMel 
Gorman preferred_node_policy[nid] = (struct mempolicy) { 30785606e387SMel Gorman .refcnt = ATOMIC_INIT(1), 30795606e387SMel Gorman .mode = MPOL_PREFERRED, 30805606e387SMel Gorman .flags = MPOL_F_MOF | MPOL_F_MORON, 3081269fbe72SBen Widawsky .nodes = nodemask_of_node(nid), 30825606e387SMel Gorman }; 30835606e387SMel Gorman } 30845606e387SMel Gorman 3085b71636e2SPaul Mundt /* 3086b71636e2SPaul Mundt * Set interleaving policy for system init. Interleaving is only 3087b71636e2SPaul Mundt * enabled across suitably sized nodes (default is >= 16MB), or 3088b71636e2SPaul Mundt * fall back to the largest node if they're all smaller. 3089b71636e2SPaul Mundt */ 3090b71636e2SPaul Mundt nodes_clear(interleave_nodes); 309101f13bd6SLai Jiangshan for_each_node_state(nid, N_MEMORY) { 3092b71636e2SPaul Mundt unsigned long total_pages = node_present_pages(nid); 30931da177e4SLinus Torvalds 3094b71636e2SPaul Mundt /* Preserve the largest node */ 3095b71636e2SPaul Mundt if (largest < total_pages) { 3096b71636e2SPaul Mundt largest = total_pages; 3097b71636e2SPaul Mundt prefer = nid; 3098b71636e2SPaul Mundt } 3099b71636e2SPaul Mundt 3100b71636e2SPaul Mundt /* Interleave this node? 
*/ 3101b71636e2SPaul Mundt if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 3102b71636e2SPaul Mundt node_set(nid, interleave_nodes); 3103b71636e2SPaul Mundt } 3104b71636e2SPaul Mundt 3105b71636e2SPaul Mundt /* All too small, use the largest */ 3106b71636e2SPaul Mundt if (unlikely(nodes_empty(interleave_nodes))) 3107b71636e2SPaul Mundt node_set(prefer, interleave_nodes); 3108b71636e2SPaul Mundt 3109028fec41SDavid Rientjes if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 3110b1de0d13SMitchel Humpherys pr_err("%s: interleaving failed\n", __func__); 31111a687c2eSMel Gorman 31121a687c2eSMel Gorman check_numabalancing_enable(); 31131da177e4SLinus Torvalds } 31141da177e4SLinus Torvalds 31158bccd85fSChristoph Lameter /* Reset policy of current process to default */ 31161da177e4SLinus Torvalds void numa_default_policy(void) 31171da177e4SLinus Torvalds { 3118028fec41SDavid Rientjes do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 31191da177e4SLinus Torvalds } 312068860ec1SPaul Jackson 31214225399aSPaul Jackson /* 3122095f1fc4SLee Schermerhorn * Parse and format mempolicy from/to strings 3123095f1fc4SLee Schermerhorn */ 3124345ace9cSLee Schermerhorn static const char * const policy_modes[] = 3125345ace9cSLee Schermerhorn { 3126345ace9cSLee Schermerhorn [MPOL_DEFAULT] = "default", 3127345ace9cSLee Schermerhorn [MPOL_PREFERRED] = "prefer", 3128345ace9cSLee Schermerhorn [MPOL_BIND] = "bind", 3129345ace9cSLee Schermerhorn [MPOL_INTERLEAVE] = "interleave", 3130fa3bea4eSGregory Price [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", 3131d3a71033SLee Schermerhorn [MPOL_LOCAL] = "local", 3132b27abaccSDave Hansen [MPOL_PREFERRED_MANY] = "prefer (many)", 3133345ace9cSLee Schermerhorn }; 31341a75a6c8SChristoph Lameter 3135095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS 3136095f1fc4SLee Schermerhorn /** 3137f2a07f40SHugh Dickins * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 
 * @str:  string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * NOTE: @str is temporarily mutated in place ('=' and ':' are overwritten
 * with NUL to split it into mode/flags/nodelist substrings) and restored
 * verbatim at the "out:" label before returning, so callers can still use
 * it in error messages.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		/* every requested node must currently have memory */
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	/* map the mode name to its MPOL_* value via the shared table */
	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	/* per-mode validation of whether a nodelist is required/forbidden */
	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on a empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		/* MPOL_PREFERRED keeps exactly one node */
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" with no nodelist degrades to local allocation */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	/*
	 * MPOL_F_MORON policies are kernel-internal (NUMA balancing);
	 * display them as "default" rather than leaking the real mode.
	 */
	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_LOCAL:
		break;
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		nodes = pol->nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	/* p advances past each emitted piece; remaining space is
	 * buffer + maxlen - p for all subsequent writes. */
	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}

#ifdef CONFIG_SYSFS
/*
 * Per-node sysfs attribute for the weighted interleave weight, e.g.
 * /sys/kernel/mm/mempolicy/weighted_interleave/nodeN.
 */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;	/* node this attribute controls */
};

/* Show the current interleave weight of this attribute's node. */
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	struct iw_node_attr *node_attr;
	u8 weight;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	weight = get_il_weight(node_attr->nid);
	return sysfs_emit(buf, "%d\n", weight);
}

/*
 * Store a new interleave weight for this node.  An empty write resets the
 * weight to 0.  The global iw_table is replaced RCU-style: a fresh copy is
 * built, published with rcu_assign_pointer(), and the old table freed after
 * synchronize_rcu() (continues in the allocation error path below).
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct iw_node_attr *node_attr;
	u8 *new;
	u8 *old;
	u8 weight = 0;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	if (count == 0 || sysfs_streq(buf, ""))
		weight = 0;
	else if (kstrtou8(buf, 0, &weight))
		return -EINVAL;

	new = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!new)
		return
-ENOMEM; 3363dce41f5aSRakie Kim 3364dce41f5aSRakie Kim mutex_lock(&iw_table_lock); 3365dce41f5aSRakie Kim old = rcu_dereference_protected(iw_table, 3366dce41f5aSRakie Kim lockdep_is_held(&iw_table_lock)); 3367dce41f5aSRakie Kim if (old) 3368dce41f5aSRakie Kim memcpy(new, old, nr_node_ids); 3369dce41f5aSRakie Kim new[node_attr->nid] = weight; 3370dce41f5aSRakie Kim rcu_assign_pointer(iw_table, new); 3371dce41f5aSRakie Kim mutex_unlock(&iw_table_lock); 3372dce41f5aSRakie Kim synchronize_rcu(); 3373dce41f5aSRakie Kim kfree(old); 3374dce41f5aSRakie Kim return count; 3375dce41f5aSRakie Kim } 3376dce41f5aSRakie Kim 3377dce41f5aSRakie Kim static struct iw_node_attr **node_attrs; 3378dce41f5aSRakie Kim 3379dce41f5aSRakie Kim static void sysfs_wi_node_release(struct iw_node_attr *node_attr, 3380dce41f5aSRakie Kim struct kobject *parent) 3381dce41f5aSRakie Kim { 3382dce41f5aSRakie Kim if (!node_attr) 3383dce41f5aSRakie Kim return; 3384dce41f5aSRakie Kim sysfs_remove_file(parent, &node_attr->kobj_attr.attr); 3385dce41f5aSRakie Kim kfree(node_attr->kobj_attr.attr.name); 3386dce41f5aSRakie Kim kfree(node_attr); 3387dce41f5aSRakie Kim } 3388dce41f5aSRakie Kim 3389dce41f5aSRakie Kim static void sysfs_wi_release(struct kobject *wi_kobj) 3390dce41f5aSRakie Kim { 3391dce41f5aSRakie Kim int i; 3392dce41f5aSRakie Kim 3393dce41f5aSRakie Kim for (i = 0; i < nr_node_ids; i++) 3394dce41f5aSRakie Kim sysfs_wi_node_release(node_attrs[i], wi_kobj); 3395dce41f5aSRakie Kim kobject_put(wi_kobj); 3396dce41f5aSRakie Kim } 3397dce41f5aSRakie Kim 3398dce41f5aSRakie Kim static const struct kobj_type wi_ktype = { 3399dce41f5aSRakie Kim .sysfs_ops = &kobj_sysfs_ops, 3400dce41f5aSRakie Kim .release = sysfs_wi_release, 3401dce41f5aSRakie Kim }; 3402dce41f5aSRakie Kim 3403dce41f5aSRakie Kim static int add_weight_node(int nid, struct kobject *wi_kobj) 3404dce41f5aSRakie Kim { 3405dce41f5aSRakie Kim struct iw_node_attr *node_attr; 3406dce41f5aSRakie Kim char *name; 3407dce41f5aSRakie Kim 3408dce41f5aSRakie 
Kim node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); 3409dce41f5aSRakie Kim if (!node_attr) 3410dce41f5aSRakie Kim return -ENOMEM; 3411dce41f5aSRakie Kim 3412dce41f5aSRakie Kim name = kasprintf(GFP_KERNEL, "node%d", nid); 3413dce41f5aSRakie Kim if (!name) { 3414dce41f5aSRakie Kim kfree(node_attr); 3415dce41f5aSRakie Kim return -ENOMEM; 3416dce41f5aSRakie Kim } 3417dce41f5aSRakie Kim 3418dce41f5aSRakie Kim sysfs_attr_init(&node_attr->kobj_attr.attr); 3419dce41f5aSRakie Kim node_attr->kobj_attr.attr.name = name; 3420dce41f5aSRakie Kim node_attr->kobj_attr.attr.mode = 0644; 3421dce41f5aSRakie Kim node_attr->kobj_attr.show = node_show; 3422dce41f5aSRakie Kim node_attr->kobj_attr.store = node_store; 3423dce41f5aSRakie Kim node_attr->nid = nid; 3424dce41f5aSRakie Kim 3425dce41f5aSRakie Kim if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { 3426dce41f5aSRakie Kim kfree(node_attr->kobj_attr.attr.name); 3427dce41f5aSRakie Kim kfree(node_attr); 3428dce41f5aSRakie Kim pr_err("failed to add attribute to weighted_interleave\n"); 3429dce41f5aSRakie Kim return -ENOMEM; 3430dce41f5aSRakie Kim } 3431dce41f5aSRakie Kim 3432dce41f5aSRakie Kim node_attrs[nid] = node_attr; 3433dce41f5aSRakie Kim return 0; 3434dce41f5aSRakie Kim } 3435dce41f5aSRakie Kim 3436dce41f5aSRakie Kim static int add_weighted_interleave_group(struct kobject *root_kobj) 3437dce41f5aSRakie Kim { 3438dce41f5aSRakie Kim struct kobject *wi_kobj; 3439dce41f5aSRakie Kim int nid, err; 3440dce41f5aSRakie Kim 3441dce41f5aSRakie Kim wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); 3442dce41f5aSRakie Kim if (!wi_kobj) 3443dce41f5aSRakie Kim return -ENOMEM; 3444dce41f5aSRakie Kim 3445dce41f5aSRakie Kim err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, 3446dce41f5aSRakie Kim "weighted_interleave"); 3447dce41f5aSRakie Kim if (err) { 3448dce41f5aSRakie Kim kfree(wi_kobj); 3449dce41f5aSRakie Kim return err; 3450dce41f5aSRakie Kim } 3451dce41f5aSRakie Kim 3452dce41f5aSRakie Kim for_each_node_state(nid, 
N_POSSIBLE) { 3453dce41f5aSRakie Kim err = add_weight_node(nid, wi_kobj); 3454dce41f5aSRakie Kim if (err) { 3455dce41f5aSRakie Kim pr_err("failed to add sysfs [node%d]\n", nid); 3456dce41f5aSRakie Kim break; 3457dce41f5aSRakie Kim } 3458dce41f5aSRakie Kim } 3459dce41f5aSRakie Kim if (err) 3460dce41f5aSRakie Kim kobject_put(wi_kobj); 3461dce41f5aSRakie Kim return 0; 3462dce41f5aSRakie Kim } 3463dce41f5aSRakie Kim 3464dce41f5aSRakie Kim static void mempolicy_kobj_release(struct kobject *kobj) 3465dce41f5aSRakie Kim { 3466dce41f5aSRakie Kim u8 *old; 3467dce41f5aSRakie Kim 3468dce41f5aSRakie Kim mutex_lock(&iw_table_lock); 3469dce41f5aSRakie Kim old = rcu_dereference_protected(iw_table, 3470dce41f5aSRakie Kim lockdep_is_held(&iw_table_lock)); 3471dce41f5aSRakie Kim rcu_assign_pointer(iw_table, NULL); 3472dce41f5aSRakie Kim mutex_unlock(&iw_table_lock); 3473dce41f5aSRakie Kim synchronize_rcu(); 3474dce41f5aSRakie Kim kfree(old); 3475dce41f5aSRakie Kim kfree(node_attrs); 3476dce41f5aSRakie Kim kfree(kobj); 3477dce41f5aSRakie Kim } 3478dce41f5aSRakie Kim 3479dce41f5aSRakie Kim static const struct kobj_type mempolicy_ktype = { 3480dce41f5aSRakie Kim .release = mempolicy_kobj_release 3481dce41f5aSRakie Kim }; 3482dce41f5aSRakie Kim 3483dce41f5aSRakie Kim static int __init mempolicy_sysfs_init(void) 3484dce41f5aSRakie Kim { 3485dce41f5aSRakie Kim int err; 3486dce41f5aSRakie Kim static struct kobject *mempolicy_kobj; 3487dce41f5aSRakie Kim 3488dce41f5aSRakie Kim mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); 3489dce41f5aSRakie Kim if (!mempolicy_kobj) { 3490dce41f5aSRakie Kim err = -ENOMEM; 3491dce41f5aSRakie Kim goto err_out; 3492dce41f5aSRakie Kim } 3493dce41f5aSRakie Kim 3494dce41f5aSRakie Kim node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), 3495dce41f5aSRakie Kim GFP_KERNEL); 3496dce41f5aSRakie Kim if (!node_attrs) { 3497dce41f5aSRakie Kim err = -ENOMEM; 3498dce41f5aSRakie Kim goto mempol_out; 3499dce41f5aSRakie Kim } 3500dce41f5aSRakie Kim 
	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
				   "mempolicy");
	if (err)
		goto node_out;

	err = add_weighted_interleave_group(mempolicy_kobj);
	if (err) {
		pr_err("mempolicy sysfs structure failed to initialize\n");
		/* release callback frees node_attrs and the kobject itself */
		kobject_put(mempolicy_kobj);
		return err;
	}

	return err;	/* 0: everything registered */
node_out:
	kfree(node_attrs);
mempol_out:
	/*
	 * kobject_init_and_add() failed, so the release callback will not
	 * run; free the kobject memory directly.
	 */
	kfree(mempolicy_kobj);
err_out:
	pr_err("failed to add mempolicy kobject to the system\n");
	return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */