11da177e4SLinus Torvalds /* 21da177e4SLinus Torvalds * Simple NUMA memory policy for the Linux kernel. 31da177e4SLinus Torvalds * 41da177e4SLinus Torvalds * Copyright 2003,2004 Andi Kleen, SuSE Labs. 58bccd85fSChristoph Lameter * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 61da177e4SLinus Torvalds * Subject to the GNU Public License, version 2. 71da177e4SLinus Torvalds * 81da177e4SLinus Torvalds * NUMA policy allows the user to give hints in which node(s) memory should 91da177e4SLinus Torvalds * be allocated. 101da177e4SLinus Torvalds * 111da177e4SLinus Torvalds * Support four policies per VMA and per process: 121da177e4SLinus Torvalds * 131da177e4SLinus Torvalds * The VMA policy has priority over the process policy for a page fault. 141da177e4SLinus Torvalds * 151da177e4SLinus Torvalds * interleave Allocate memory interleaved over a set of nodes, 161da177e4SLinus Torvalds * with normal fallback if it fails. 171da177e4SLinus Torvalds * For VMA based allocations this interleaves based on the 181da177e4SLinus Torvalds * offset into the backing object or offset into the mapping 191da177e4SLinus Torvalds * for anonymous memory. For process policy an process counter 201da177e4SLinus Torvalds * is used. 218bccd85fSChristoph Lameter * 221da177e4SLinus Torvalds * bind Only allocate memory on a specific set of nodes, 231da177e4SLinus Torvalds * no fallback. 248bccd85fSChristoph Lameter * FIXME: memory is allocated starting with the first node 258bccd85fSChristoph Lameter * to the last. It would be better if bind would truly restrict 268bccd85fSChristoph Lameter * the allocation to memory nodes instead 278bccd85fSChristoph Lameter * 281da177e4SLinus Torvalds * preferred Try a specific node first before normal fallback. 291da177e4SLinus Torvalds * As a special case node -1 here means do the allocation 301da177e4SLinus Torvalds * on the local CPU. 
This is normally identical to default, 311da177e4SLinus Torvalds * but useful to set in a VMA when you have a non default 321da177e4SLinus Torvalds * process policy. 338bccd85fSChristoph Lameter * 341da177e4SLinus Torvalds * default Allocate on the local node first, or when on a VMA 351da177e4SLinus Torvalds * use the process policy. This is what Linux always did 361da177e4SLinus Torvalds * in a NUMA aware kernel and still does by, ahem, default. 371da177e4SLinus Torvalds * 381da177e4SLinus Torvalds * The process policy is applied for most non interrupt memory allocations 391da177e4SLinus Torvalds * in that process' context. Interrupts ignore the policies and always 401da177e4SLinus Torvalds * try to allocate on the local CPU. The VMA policy is only applied for memory 411da177e4SLinus Torvalds * allocations for a VMA in the VM. 421da177e4SLinus Torvalds * 431da177e4SLinus Torvalds * Currently there are a few corner cases in swapping where the policy 441da177e4SLinus Torvalds * is not applied, but the majority should be handled. When process policy 451da177e4SLinus Torvalds * is used it is not remembered over swap outs/swap ins. 461da177e4SLinus Torvalds * 471da177e4SLinus Torvalds * Only the highest zone in the zone hierarchy gets policied. Allocations 481da177e4SLinus Torvalds * requesting a lower zone just use default policy. This implies that 491da177e4SLinus Torvalds * on systems with highmem kernel lowmem allocation don't get policied. 501da177e4SLinus Torvalds * Same with GFP_DMA allocations. 511da177e4SLinus Torvalds * 521da177e4SLinus Torvalds * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 531da177e4SLinus Torvalds * all users and remembered even when nobody has memory mapped. 
541da177e4SLinus Torvalds */ 551da177e4SLinus Torvalds 561da177e4SLinus Torvalds /* Notebook: 571da177e4SLinus Torvalds fix mmap readahead to honour policy and enable policy for any page cache 581da177e4SLinus Torvalds object 591da177e4SLinus Torvalds statistics for bigpages 601da177e4SLinus Torvalds global policy for page cache? currently it uses process policy. Requires 611da177e4SLinus Torvalds first item above. 621da177e4SLinus Torvalds handle mremap for shared memory (currently ignored for the policy) 631da177e4SLinus Torvalds grows down? 641da177e4SLinus Torvalds make bind policy root only? It can trigger oom much faster and the 651da177e4SLinus Torvalds kernel is not always grateful with that. 661da177e4SLinus Torvalds */ 671da177e4SLinus Torvalds 681da177e4SLinus Torvalds #include <linux/mempolicy.h> 691da177e4SLinus Torvalds #include <linux/mm.h> 701da177e4SLinus Torvalds #include <linux/highmem.h> 711da177e4SLinus Torvalds #include <linux/hugetlb.h> 721da177e4SLinus Torvalds #include <linux/kernel.h> 731da177e4SLinus Torvalds #include <linux/sched.h> 741da177e4SLinus Torvalds #include <linux/nodemask.h> 751da177e4SLinus Torvalds #include <linux/cpuset.h> 761da177e4SLinus Torvalds #include <linux/gfp.h> 771da177e4SLinus Torvalds #include <linux/slab.h> 781da177e4SLinus Torvalds #include <linux/string.h> 791da177e4SLinus Torvalds #include <linux/module.h> 80b488893aSPavel Emelyanov #include <linux/nsproxy.h> 811da177e4SLinus Torvalds #include <linux/interrupt.h> 821da177e4SLinus Torvalds #include <linux/init.h> 831da177e4SLinus Torvalds #include <linux/compat.h> 84dc9aa5b9SChristoph Lameter #include <linux/swap.h> 851a75a6c8SChristoph Lameter #include <linux/seq_file.h> 861a75a6c8SChristoph Lameter #include <linux/proc_fs.h> 87b20a3503SChristoph Lameter #include <linux/migrate.h> 8895a402c3SChristoph Lameter #include <linux/rmap.h> 8986c3a764SDavid Quigley #include <linux/security.h> 90dbcb0f19SAdrian Bunk #include <linux/syscalls.h> 91dc9aa5b9SChristoph 
Lameter 921da177e4SLinus Torvalds #include <asm/tlbflush.h> 931da177e4SLinus Torvalds #include <asm/uaccess.h> 941da177e4SLinus Torvalds 9538e35860SChristoph Lameter /* Internal flags */ 96dc9aa5b9SChristoph Lameter #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 9738e35860SChristoph Lameter #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 981a75a6c8SChristoph Lameter #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 99dc9aa5b9SChristoph Lameter 100fcc234f8SPekka Enberg static struct kmem_cache *policy_cache; 101fcc234f8SPekka Enberg static struct kmem_cache *sn_cache; 1021da177e4SLinus Torvalds 1031da177e4SLinus Torvalds /* Highest zone. An specific allocation for a zone below that is not 1041da177e4SLinus Torvalds policied. */ 1056267276fSChristoph Lameter enum zone_type policy_zone = 0; 1061da177e4SLinus Torvalds 107d42c6997SAndi Kleen struct mempolicy default_policy = { 1081da177e4SLinus Torvalds .refcnt = ATOMIC_INIT(1), /* never free it */ 1091da177e4SLinus Torvalds .policy = MPOL_DEFAULT, 1101da177e4SLinus Torvalds }; 1111da177e4SLinus Torvalds 11237012946SDavid Rientjes static const struct mempolicy_operations { 11337012946SDavid Rientjes int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 11437012946SDavid Rientjes void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 11537012946SDavid Rientjes } mpol_ops[MPOL_MAX]; 11637012946SDavid Rientjes 11719770b32SMel Gorman /* Check that the nodemask contains at least one populated zone */ 11837012946SDavid Rientjes static int is_valid_nodemask(const nodemask_t *nodemask) 1191da177e4SLinus Torvalds { 12019770b32SMel Gorman int nd, k; 1211da177e4SLinus Torvalds 12219770b32SMel Gorman /* Check that there is something useful in this mask */ 12319770b32SMel Gorman k = policy_zone; 12419770b32SMel Gorman 12519770b32SMel Gorman for_each_node_mask(nd, *nodemask) { 12619770b32SMel Gorman struct zone *z; 
12719770b32SMel Gorman 12819770b32SMel Gorman for (k = 0; k <= policy_zone; k++) { 12919770b32SMel Gorman z = &NODE_DATA(nd)->node_zones[k]; 130dd942ae3SAndi Kleen if (z->present_pages > 0) 13119770b32SMel Gorman return 1; 132dd942ae3SAndi Kleen } 133dd942ae3SAndi Kleen } 13419770b32SMel Gorman 13519770b32SMel Gorman return 0; 1361da177e4SLinus Torvalds } 1371da177e4SLinus Torvalds 138f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 139f5b087b5SDavid Rientjes { 1404c50bc01SDavid Rientjes return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); 1414c50bc01SDavid Rientjes } 1424c50bc01SDavid Rientjes 1434c50bc01SDavid Rientjes static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 1444c50bc01SDavid Rientjes const nodemask_t *rel) 1454c50bc01SDavid Rientjes { 1464c50bc01SDavid Rientjes nodemask_t tmp; 1474c50bc01SDavid Rientjes nodes_fold(tmp, *orig, nodes_weight(*rel)); 1484c50bc01SDavid Rientjes nodes_onto(*ret, tmp, *rel); 149f5b087b5SDavid Rientjes } 150f5b087b5SDavid Rientjes 15137012946SDavid Rientjes static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) 15237012946SDavid Rientjes { 15337012946SDavid Rientjes if (nodes_empty(*nodes)) 15437012946SDavid Rientjes return -EINVAL; 15537012946SDavid Rientjes pol->v.nodes = *nodes; 15637012946SDavid Rientjes return 0; 15737012946SDavid Rientjes } 15837012946SDavid Rientjes 15937012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 16037012946SDavid Rientjes { 16137012946SDavid Rientjes if (!nodes) 16237012946SDavid Rientjes pol->v.preferred_node = -1; /* local allocation */ 16337012946SDavid Rientjes else if (nodes_empty(*nodes)) 16437012946SDavid Rientjes return -EINVAL; /* no allowed nodes */ 16537012946SDavid Rientjes else 16637012946SDavid Rientjes pol->v.preferred_node = first_node(*nodes); 16737012946SDavid Rientjes return 0; 16837012946SDavid Rientjes } 
16937012946SDavid Rientjes 17037012946SDavid Rientjes static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) 17137012946SDavid Rientjes { 17237012946SDavid Rientjes if (!is_valid_nodemask(nodes)) 17337012946SDavid Rientjes return -EINVAL; 17437012946SDavid Rientjes pol->v.nodes = *nodes; 17537012946SDavid Rientjes return 0; 17637012946SDavid Rientjes } 17737012946SDavid Rientjes 1781da177e4SLinus Torvalds /* Create a new policy */ 179028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 180028fec41SDavid Rientjes nodemask_t *nodes) 1811da177e4SLinus Torvalds { 1821da177e4SLinus Torvalds struct mempolicy *policy; 183f5b087b5SDavid Rientjes nodemask_t cpuset_context_nmask; 18437012946SDavid Rientjes int ret; 1851da177e4SLinus Torvalds 186028fec41SDavid Rientjes pr_debug("setting mode %d flags %d nodes[0] %lx\n", 187028fec41SDavid Rientjes mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 188140d5a49SPaul Mundt 1893e1f0645SDavid Rientjes if (mode == MPOL_DEFAULT) { 1903e1f0645SDavid Rientjes if (nodes && !nodes_empty(*nodes)) 19137012946SDavid Rientjes return ERR_PTR(-EINVAL); 1923e1f0645SDavid Rientjes return NULL; 19337012946SDavid Rientjes } 1943e1f0645SDavid Rientjes VM_BUG_ON(!nodes); 1953e1f0645SDavid Rientjes 1963e1f0645SDavid Rientjes /* 1973e1f0645SDavid Rientjes * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 1983e1f0645SDavid Rientjes * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 1993e1f0645SDavid Rientjes * All other modes require a valid pointer to a non-empty nodemask. 
2003e1f0645SDavid Rientjes */ 2013e1f0645SDavid Rientjes if (mode == MPOL_PREFERRED) { 2023e1f0645SDavid Rientjes if (nodes_empty(*nodes)) { 2033e1f0645SDavid Rientjes if (((flags & MPOL_F_STATIC_NODES) || 2043e1f0645SDavid Rientjes (flags & MPOL_F_RELATIVE_NODES))) 2053e1f0645SDavid Rientjes return ERR_PTR(-EINVAL); 2063e1f0645SDavid Rientjes nodes = NULL; /* flag local alloc */ 2073e1f0645SDavid Rientjes } 2083e1f0645SDavid Rientjes } else if (nodes_empty(*nodes)) 2093e1f0645SDavid Rientjes return ERR_PTR(-EINVAL); 2101da177e4SLinus Torvalds policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2111da177e4SLinus Torvalds if (!policy) 2121da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 2131da177e4SLinus Torvalds atomic_set(&policy->refcnt, 1); 21437012946SDavid Rientjes policy->policy = mode; 21537012946SDavid Rientjes policy->flags = flags; 2163e1f0645SDavid Rientjes 2173e1f0645SDavid Rientjes if (nodes) { 2183e1f0645SDavid Rientjes /* 2193e1f0645SDavid Rientjes * cpuset related setup doesn't apply to local allocation 2203e1f0645SDavid Rientjes */ 221f5b087b5SDavid Rientjes cpuset_update_task_memory_state(); 2224c50bc01SDavid Rientjes if (flags & MPOL_F_RELATIVE_NODES) 2234c50bc01SDavid Rientjes mpol_relative_nodemask(&cpuset_context_nmask, nodes, 2244c50bc01SDavid Rientjes &cpuset_current_mems_allowed); 2254c50bc01SDavid Rientjes else 2264c50bc01SDavid Rientjes nodes_and(cpuset_context_nmask, *nodes, 2274c50bc01SDavid Rientjes cpuset_current_mems_allowed); 228f5b087b5SDavid Rientjes if (mpol_store_user_nodemask(policy)) 229f5b087b5SDavid Rientjes policy->w.user_nodemask = *nodes; 230f5b087b5SDavid Rientjes else 23137012946SDavid Rientjes policy->w.cpuset_mems_allowed = 23237012946SDavid Rientjes cpuset_mems_allowed(current); 2331da177e4SLinus Torvalds } 2341da177e4SLinus Torvalds 23537012946SDavid Rientjes ret = mpol_ops[mode].create(policy, 2363e1f0645SDavid Rientjes nodes ? 
&cpuset_context_nmask : NULL); 23737012946SDavid Rientjes if (ret < 0) { 23837012946SDavid Rientjes kmem_cache_free(policy_cache, policy); 23937012946SDavid Rientjes return ERR_PTR(ret); 24037012946SDavid Rientjes } 24137012946SDavid Rientjes return policy; 24237012946SDavid Rientjes } 24337012946SDavid Rientjes 24437012946SDavid Rientjes static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 24537012946SDavid Rientjes { 24637012946SDavid Rientjes } 24737012946SDavid Rientjes 24837012946SDavid Rientjes static void mpol_rebind_nodemask(struct mempolicy *pol, 24937012946SDavid Rientjes const nodemask_t *nodes) 2501d0d2680SDavid Rientjes { 2511d0d2680SDavid Rientjes nodemask_t tmp; 2521d0d2680SDavid Rientjes 25337012946SDavid Rientjes if (pol->flags & MPOL_F_STATIC_NODES) 25437012946SDavid Rientjes nodes_and(tmp, pol->w.user_nodemask, *nodes); 25537012946SDavid Rientjes else if (pol->flags & MPOL_F_RELATIVE_NODES) 25637012946SDavid Rientjes mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 2571d0d2680SDavid Rientjes else { 25837012946SDavid Rientjes nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, 25937012946SDavid Rientjes *nodes); 26037012946SDavid Rientjes pol->w.cpuset_mems_allowed = *nodes; 2611d0d2680SDavid Rientjes } 26237012946SDavid Rientjes 2631d0d2680SDavid Rientjes pol->v.nodes = tmp; 2641d0d2680SDavid Rientjes if (!node_isset(current->il_next, tmp)) { 2651d0d2680SDavid Rientjes current->il_next = next_node(current->il_next, tmp); 2661d0d2680SDavid Rientjes if (current->il_next >= MAX_NUMNODES) 2671d0d2680SDavid Rientjes current->il_next = first_node(tmp); 2681d0d2680SDavid Rientjes if (current->il_next >= MAX_NUMNODES) 2691d0d2680SDavid Rientjes current->il_next = numa_node_id(); 2701d0d2680SDavid Rientjes } 27137012946SDavid Rientjes } 27237012946SDavid Rientjes 27337012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol, 27437012946SDavid Rientjes const nodemask_t *nodes) 
27537012946SDavid Rientjes { 27637012946SDavid Rientjes nodemask_t tmp; 27737012946SDavid Rientjes 27837012946SDavid Rientjes if (pol->flags & MPOL_F_STATIC_NODES) { 2791d0d2680SDavid Rientjes int node = first_node(pol->w.user_nodemask); 2801d0d2680SDavid Rientjes 28137012946SDavid Rientjes if (node_isset(node, *nodes)) 2821d0d2680SDavid Rientjes pol->v.preferred_node = node; 2831d0d2680SDavid Rientjes else 2841d0d2680SDavid Rientjes pol->v.preferred_node = -1; 28537012946SDavid Rientjes } else if (pol->flags & MPOL_F_RELATIVE_NODES) { 28637012946SDavid Rientjes mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 2871d0d2680SDavid Rientjes pol->v.preferred_node = first_node(tmp); 2883e1f0645SDavid Rientjes } else if (pol->v.preferred_node != -1) { 2891d0d2680SDavid Rientjes pol->v.preferred_node = node_remap(pol->v.preferred_node, 29037012946SDavid Rientjes pol->w.cpuset_mems_allowed, 29137012946SDavid Rientjes *nodes); 29237012946SDavid Rientjes pol->w.cpuset_mems_allowed = *nodes; 2931d0d2680SDavid Rientjes } 2941d0d2680SDavid Rientjes } 29537012946SDavid Rientjes 29637012946SDavid Rientjes /* Migrate a policy to a different set of nodes */ 29737012946SDavid Rientjes static void mpol_rebind_policy(struct mempolicy *pol, 29837012946SDavid Rientjes const nodemask_t *newmask) 29937012946SDavid Rientjes { 30037012946SDavid Rientjes if (!pol) 30137012946SDavid Rientjes return; 30237012946SDavid Rientjes if (!mpol_store_user_nodemask(pol) && 30337012946SDavid Rientjes nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 30437012946SDavid Rientjes return; 30537012946SDavid Rientjes mpol_ops[pol->policy].rebind(pol, newmask); 3061d0d2680SDavid Rientjes } 3071d0d2680SDavid Rientjes 3081d0d2680SDavid Rientjes /* 3091d0d2680SDavid Rientjes * Wrapper for mpol_rebind_policy() that just requires task 3101d0d2680SDavid Rientjes * pointer, and updates task mempolicy. 
3111d0d2680SDavid Rientjes */ 3121d0d2680SDavid Rientjes 3131d0d2680SDavid Rientjes void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 3141d0d2680SDavid Rientjes { 3151d0d2680SDavid Rientjes mpol_rebind_policy(tsk->mempolicy, new); 3161d0d2680SDavid Rientjes } 3171d0d2680SDavid Rientjes 3181d0d2680SDavid Rientjes /* 3191d0d2680SDavid Rientjes * Rebind each vma in mm to new nodemask. 3201d0d2680SDavid Rientjes * 3211d0d2680SDavid Rientjes * Call holding a reference to mm. Takes mm->mmap_sem during call. 3221d0d2680SDavid Rientjes */ 3231d0d2680SDavid Rientjes 3241d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 3251d0d2680SDavid Rientjes { 3261d0d2680SDavid Rientjes struct vm_area_struct *vma; 3271d0d2680SDavid Rientjes 3281d0d2680SDavid Rientjes down_write(&mm->mmap_sem); 3291d0d2680SDavid Rientjes for (vma = mm->mmap; vma; vma = vma->vm_next) 3301d0d2680SDavid Rientjes mpol_rebind_policy(vma->vm_policy, new); 3311d0d2680SDavid Rientjes up_write(&mm->mmap_sem); 3321d0d2680SDavid Rientjes } 3331d0d2680SDavid Rientjes 33437012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 33537012946SDavid Rientjes [MPOL_DEFAULT] = { 33637012946SDavid Rientjes .rebind = mpol_rebind_default, 33737012946SDavid Rientjes }, 33837012946SDavid Rientjes [MPOL_INTERLEAVE] = { 33937012946SDavid Rientjes .create = mpol_new_interleave, 34037012946SDavid Rientjes .rebind = mpol_rebind_nodemask, 34137012946SDavid Rientjes }, 34237012946SDavid Rientjes [MPOL_PREFERRED] = { 34337012946SDavid Rientjes .create = mpol_new_preferred, 34437012946SDavid Rientjes .rebind = mpol_rebind_preferred, 34537012946SDavid Rientjes }, 34637012946SDavid Rientjes [MPOL_BIND] = { 34737012946SDavid Rientjes .create = mpol_new_bind, 34837012946SDavid Rientjes .rebind = mpol_rebind_nodemask, 34937012946SDavid Rientjes }, 35037012946SDavid Rientjes }; 35137012946SDavid Rientjes 352397874dfSChristoph Lameter static void 
gather_stats(struct page *, void *, int pte_dirty); 353fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist, 354fc301289SChristoph Lameter unsigned long flags); 3551a75a6c8SChristoph Lameter 35638e35860SChristoph Lameter /* Scan through pages checking if pages follow certain conditions. */ 357b5810039SNick Piggin static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 358dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 359dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 36038e35860SChristoph Lameter void *private) 3611da177e4SLinus Torvalds { 36291612e0dSHugh Dickins pte_t *orig_pte; 36391612e0dSHugh Dickins pte_t *pte; 364705e87c0SHugh Dickins spinlock_t *ptl; 365941150a3SHugh Dickins 366705e87c0SHugh Dickins orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 36791612e0dSHugh Dickins do { 3686aab341eSLinus Torvalds struct page *page; 36925ba77c1SAndy Whitcroft int nid; 37091612e0dSHugh Dickins 37191612e0dSHugh Dickins if (!pte_present(*pte)) 37291612e0dSHugh Dickins continue; 3736aab341eSLinus Torvalds page = vm_normal_page(vma, addr, *pte); 3746aab341eSLinus Torvalds if (!page) 37591612e0dSHugh Dickins continue; 376053837fcSNick Piggin /* 377053837fcSNick Piggin * The check for PageReserved here is important to avoid 378053837fcSNick Piggin * handling zero pages and other pages that may have been 379053837fcSNick Piggin * marked special by the system. 380053837fcSNick Piggin * 381053837fcSNick Piggin * If the PageReserved would not be checked here then f.e. 382053837fcSNick Piggin * the location of the zero page could have an influence 383053837fcSNick Piggin * on MPOL_MF_STRICT, zero pages would be counted for 384053837fcSNick Piggin * the per node stats, and there would be useless attempts 385053837fcSNick Piggin * to put zero pages on the migration list. 
386053837fcSNick Piggin */ 387f4598c8bSChristoph Lameter if (PageReserved(page)) 388f4598c8bSChristoph Lameter continue; 3896aab341eSLinus Torvalds nid = page_to_nid(page); 39038e35860SChristoph Lameter if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 39138e35860SChristoph Lameter continue; 39238e35860SChristoph Lameter 3931a75a6c8SChristoph Lameter if (flags & MPOL_MF_STATS) 394397874dfSChristoph Lameter gather_stats(page, private, pte_dirty(*pte)); 395053837fcSNick Piggin else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 396fc301289SChristoph Lameter migrate_page_add(page, private, flags); 397dc9aa5b9SChristoph Lameter else 3981da177e4SLinus Torvalds break; 39991612e0dSHugh Dickins } while (pte++, addr += PAGE_SIZE, addr != end); 400705e87c0SHugh Dickins pte_unmap_unlock(orig_pte, ptl); 40191612e0dSHugh Dickins return addr != end; 40291612e0dSHugh Dickins } 40391612e0dSHugh Dickins 404b5810039SNick Piggin static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 405dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 406dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 40738e35860SChristoph Lameter void *private) 40891612e0dSHugh Dickins { 40991612e0dSHugh Dickins pmd_t *pmd; 41091612e0dSHugh Dickins unsigned long next; 41191612e0dSHugh Dickins 41291612e0dSHugh Dickins pmd = pmd_offset(pud, addr); 41391612e0dSHugh Dickins do { 41491612e0dSHugh Dickins next = pmd_addr_end(addr, end); 41591612e0dSHugh Dickins if (pmd_none_or_clear_bad(pmd)) 41691612e0dSHugh Dickins continue; 417dc9aa5b9SChristoph Lameter if (check_pte_range(vma, pmd, addr, next, nodes, 41838e35860SChristoph Lameter flags, private)) 41991612e0dSHugh Dickins return -EIO; 42091612e0dSHugh Dickins } while (pmd++, addr = next, addr != end); 42191612e0dSHugh Dickins return 0; 42291612e0dSHugh Dickins } 42391612e0dSHugh Dickins 424b5810039SNick Piggin static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 
425dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 426dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 42738e35860SChristoph Lameter void *private) 42891612e0dSHugh Dickins { 42991612e0dSHugh Dickins pud_t *pud; 43091612e0dSHugh Dickins unsigned long next; 43191612e0dSHugh Dickins 43291612e0dSHugh Dickins pud = pud_offset(pgd, addr); 43391612e0dSHugh Dickins do { 43491612e0dSHugh Dickins next = pud_addr_end(addr, end); 43591612e0dSHugh Dickins if (pud_none_or_clear_bad(pud)) 43691612e0dSHugh Dickins continue; 437dc9aa5b9SChristoph Lameter if (check_pmd_range(vma, pud, addr, next, nodes, 43838e35860SChristoph Lameter flags, private)) 43991612e0dSHugh Dickins return -EIO; 44091612e0dSHugh Dickins } while (pud++, addr = next, addr != end); 44191612e0dSHugh Dickins return 0; 44291612e0dSHugh Dickins } 44391612e0dSHugh Dickins 444b5810039SNick Piggin static inline int check_pgd_range(struct vm_area_struct *vma, 445dc9aa5b9SChristoph Lameter unsigned long addr, unsigned long end, 446dc9aa5b9SChristoph Lameter const nodemask_t *nodes, unsigned long flags, 44738e35860SChristoph Lameter void *private) 44891612e0dSHugh Dickins { 44991612e0dSHugh Dickins pgd_t *pgd; 45091612e0dSHugh Dickins unsigned long next; 45191612e0dSHugh Dickins 452b5810039SNick Piggin pgd = pgd_offset(vma->vm_mm, addr); 45391612e0dSHugh Dickins do { 45491612e0dSHugh Dickins next = pgd_addr_end(addr, end); 45591612e0dSHugh Dickins if (pgd_none_or_clear_bad(pgd)) 45691612e0dSHugh Dickins continue; 457dc9aa5b9SChristoph Lameter if (check_pud_range(vma, pgd, addr, next, nodes, 45838e35860SChristoph Lameter flags, private)) 45991612e0dSHugh Dickins return -EIO; 46091612e0dSHugh Dickins } while (pgd++, addr = next, addr != end); 46191612e0dSHugh Dickins return 0; 4621da177e4SLinus Torvalds } 4631da177e4SLinus Torvalds 464dc9aa5b9SChristoph Lameter /* 465dc9aa5b9SChristoph Lameter * Check if all pages in a range are on a set of nodes. 
466dc9aa5b9SChristoph Lameter * If pagelist != NULL then isolate pages from the LRU and 467dc9aa5b9SChristoph Lameter * put them on the pagelist. 468dc9aa5b9SChristoph Lameter */ 4691da177e4SLinus Torvalds static struct vm_area_struct * 4701da177e4SLinus Torvalds check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 47138e35860SChristoph Lameter const nodemask_t *nodes, unsigned long flags, void *private) 4721da177e4SLinus Torvalds { 4731da177e4SLinus Torvalds int err; 4741da177e4SLinus Torvalds struct vm_area_struct *first, *vma, *prev; 4751da177e4SLinus Torvalds 47690036ee5SChristoph Lameter if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 47790036ee5SChristoph Lameter 478b20a3503SChristoph Lameter err = migrate_prep(); 479b20a3503SChristoph Lameter if (err) 480b20a3503SChristoph Lameter return ERR_PTR(err); 48190036ee5SChristoph Lameter } 482053837fcSNick Piggin 4831da177e4SLinus Torvalds first = find_vma(mm, start); 4841da177e4SLinus Torvalds if (!first) 4851da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 4861da177e4SLinus Torvalds prev = NULL; 4871da177e4SLinus Torvalds for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 488dc9aa5b9SChristoph Lameter if (!(flags & MPOL_MF_DISCONTIG_OK)) { 4891da177e4SLinus Torvalds if (!vma->vm_next && vma->vm_end < end) 4901da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 4911da177e4SLinus Torvalds if (prev && prev->vm_end < vma->vm_start) 4921da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 493dc9aa5b9SChristoph Lameter } 494dc9aa5b9SChristoph Lameter if (!is_vm_hugetlb_page(vma) && 495dc9aa5b9SChristoph Lameter ((flags & MPOL_MF_STRICT) || 496dc9aa5b9SChristoph Lameter ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 497dc9aa5b9SChristoph Lameter vma_migratable(vma)))) { 4985b952b3cSAndi Kleen unsigned long endvma = vma->vm_end; 499dc9aa5b9SChristoph Lameter 5005b952b3cSAndi Kleen if (endvma > end) 5015b952b3cSAndi Kleen endvma = end; 5025b952b3cSAndi Kleen if (vma->vm_start > start) 
5035b952b3cSAndi Kleen start = vma->vm_start; 504dc9aa5b9SChristoph Lameter err = check_pgd_range(vma, start, endvma, nodes, 50538e35860SChristoph Lameter flags, private); 5061da177e4SLinus Torvalds if (err) { 5071da177e4SLinus Torvalds first = ERR_PTR(err); 5081da177e4SLinus Torvalds break; 5091da177e4SLinus Torvalds } 5101da177e4SLinus Torvalds } 5111da177e4SLinus Torvalds prev = vma; 5121da177e4SLinus Torvalds } 5131da177e4SLinus Torvalds return first; 5141da177e4SLinus Torvalds } 5151da177e4SLinus Torvalds 5161da177e4SLinus Torvalds /* Apply policy to a single VMA */ 5171da177e4SLinus Torvalds static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) 5181da177e4SLinus Torvalds { 5191da177e4SLinus Torvalds int err = 0; 5201da177e4SLinus Torvalds struct mempolicy *old = vma->vm_policy; 5211da177e4SLinus Torvalds 522140d5a49SPaul Mundt pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 5231da177e4SLinus Torvalds vma->vm_start, vma->vm_end, vma->vm_pgoff, 5241da177e4SLinus Torvalds vma->vm_ops, vma->vm_file, 5251da177e4SLinus Torvalds vma->vm_ops ? vma->vm_ops->set_policy : NULL); 5261da177e4SLinus Torvalds 5271da177e4SLinus Torvalds if (vma->vm_ops && vma->vm_ops->set_policy) 5281da177e4SLinus Torvalds err = vma->vm_ops->set_policy(vma, new); 5291da177e4SLinus Torvalds if (!err) { 5301da177e4SLinus Torvalds mpol_get(new); 5311da177e4SLinus Torvalds vma->vm_policy = new; 532f0be3d32SLee Schermerhorn mpol_put(old); 5331da177e4SLinus Torvalds } 5341da177e4SLinus Torvalds return err; 5351da177e4SLinus Torvalds } 5361da177e4SLinus Torvalds 5371da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. 
*/ 5381da177e4SLinus Torvalds static int mbind_range(struct vm_area_struct *vma, unsigned long start, 5391da177e4SLinus Torvalds unsigned long end, struct mempolicy *new) 5401da177e4SLinus Torvalds { 5411da177e4SLinus Torvalds struct vm_area_struct *next; 5421da177e4SLinus Torvalds int err; 5431da177e4SLinus Torvalds 5441da177e4SLinus Torvalds err = 0; 5451da177e4SLinus Torvalds for (; vma && vma->vm_start < end; vma = next) { 5461da177e4SLinus Torvalds next = vma->vm_next; 5471da177e4SLinus Torvalds if (vma->vm_start < start) 5481da177e4SLinus Torvalds err = split_vma(vma->vm_mm, vma, start, 1); 5491da177e4SLinus Torvalds if (!err && vma->vm_end > end) 5501da177e4SLinus Torvalds err = split_vma(vma->vm_mm, vma, end, 0); 5511da177e4SLinus Torvalds if (!err) 5521da177e4SLinus Torvalds err = policy_vma(vma, new); 5531da177e4SLinus Torvalds if (err) 5541da177e4SLinus Torvalds break; 5551da177e4SLinus Torvalds } 5561da177e4SLinus Torvalds return err; 5571da177e4SLinus Torvalds } 5581da177e4SLinus Torvalds 559c61afb18SPaul Jackson /* 560c61afb18SPaul Jackson * Update task->flags PF_MEMPOLICY bit: set iff non-default 561c61afb18SPaul Jackson * mempolicy. Allows more rapid checking of this (combined perhaps 562c61afb18SPaul Jackson * with other PF_* flag bits) on memory allocation hot code paths. 563c61afb18SPaul Jackson * 564c61afb18SPaul Jackson * If called from outside this file, the task 'p' should -only- be 565c61afb18SPaul Jackson * a newly forked child not yet visible on the task list, because 566c61afb18SPaul Jackson * manipulating the task flags of a visible task is not safe. 567c61afb18SPaul Jackson * 568c61afb18SPaul Jackson * The above limitation is why this routine has the funny name 569c61afb18SPaul Jackson * mpol_fix_fork_child_flag(). 
570c61afb18SPaul Jackson * 571c61afb18SPaul Jackson * It is also safe to call this with a task pointer of current, 572c61afb18SPaul Jackson * which the static wrapper mpol_set_task_struct_flag() does, 573c61afb18SPaul Jackson * for use within this file. 574c61afb18SPaul Jackson */ 575c61afb18SPaul Jackson 576c61afb18SPaul Jackson void mpol_fix_fork_child_flag(struct task_struct *p) 577c61afb18SPaul Jackson { 578c61afb18SPaul Jackson if (p->mempolicy) 579c61afb18SPaul Jackson p->flags |= PF_MEMPOLICY; 580c61afb18SPaul Jackson else 581c61afb18SPaul Jackson p->flags &= ~PF_MEMPOLICY; 582c61afb18SPaul Jackson } 583c61afb18SPaul Jackson 584c61afb18SPaul Jackson static void mpol_set_task_struct_flag(void) 585c61afb18SPaul Jackson { 586c61afb18SPaul Jackson mpol_fix_fork_child_flag(current); 587c61afb18SPaul Jackson } 588c61afb18SPaul Jackson 5891da177e4SLinus Torvalds /* Set the process memory policy */ 590028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags, 591028fec41SDavid Rientjes nodemask_t *nodes) 5921da177e4SLinus Torvalds { 5931da177e4SLinus Torvalds struct mempolicy *new; 594*f4e53d91SLee Schermerhorn struct mm_struct *mm = current->mm; 5951da177e4SLinus Torvalds 596028fec41SDavid Rientjes new = mpol_new(mode, flags, nodes); 5971da177e4SLinus Torvalds if (IS_ERR(new)) 5981da177e4SLinus Torvalds return PTR_ERR(new); 599*f4e53d91SLee Schermerhorn 600*f4e53d91SLee Schermerhorn /* 601*f4e53d91SLee Schermerhorn * prevent changing our mempolicy while show_numa_maps() 602*f4e53d91SLee Schermerhorn * is using it. 603*f4e53d91SLee Schermerhorn * Note: do_set_mempolicy() can be called at init time 604*f4e53d91SLee Schermerhorn * with no 'mm'. 
605*f4e53d91SLee Schermerhorn */ 606*f4e53d91SLee Schermerhorn if (mm) 607*f4e53d91SLee Schermerhorn down_write(&mm->mmap_sem); 608f0be3d32SLee Schermerhorn mpol_put(current->mempolicy); 6091da177e4SLinus Torvalds current->mempolicy = new; 610c61afb18SPaul Jackson mpol_set_task_struct_flag(); 611f5b087b5SDavid Rientjes if (new && new->policy == MPOL_INTERLEAVE && 612f5b087b5SDavid Rientjes nodes_weight(new->v.nodes)) 613dfcd3c0dSAndi Kleen current->il_next = first_node(new->v.nodes); 614*f4e53d91SLee Schermerhorn if (mm) 615*f4e53d91SLee Schermerhorn up_write(&mm->mmap_sem); 616*f4e53d91SLee Schermerhorn 6171da177e4SLinus Torvalds return 0; 6181da177e4SLinus Torvalds } 6191da177e4SLinus Torvalds 6201da177e4SLinus Torvalds /* Fill a zone bitmap for a policy */ 621dfcd3c0dSAndi Kleen static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) 6221da177e4SLinus Torvalds { 623dfcd3c0dSAndi Kleen nodes_clear(*nodes); 6241da177e4SLinus Torvalds switch (p->policy) { 6251da177e4SLinus Torvalds case MPOL_DEFAULT: 6261da177e4SLinus Torvalds break; 62719770b32SMel Gorman case MPOL_BIND: 62819770b32SMel Gorman /* Fall through */ 6291da177e4SLinus Torvalds case MPOL_INTERLEAVE: 630dfcd3c0dSAndi Kleen *nodes = p->v.nodes; 6311da177e4SLinus Torvalds break; 6321da177e4SLinus Torvalds case MPOL_PREFERRED: 63356bbd65dSChristoph Lameter /* or use current node instead of memory_map? 
*/ 6341da177e4SLinus Torvalds if (p->v.preferred_node < 0) 63556bbd65dSChristoph Lameter *nodes = node_states[N_HIGH_MEMORY]; 6361da177e4SLinus Torvalds else 637dfcd3c0dSAndi Kleen node_set(p->v.preferred_node, *nodes); 6381da177e4SLinus Torvalds break; 6391da177e4SLinus Torvalds default: 6401da177e4SLinus Torvalds BUG(); 6411da177e4SLinus Torvalds } 6421da177e4SLinus Torvalds } 6431da177e4SLinus Torvalds 6441da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr) 6451da177e4SLinus Torvalds { 6461da177e4SLinus Torvalds struct page *p; 6471da177e4SLinus Torvalds int err; 6481da177e4SLinus Torvalds 6491da177e4SLinus Torvalds err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 6501da177e4SLinus Torvalds if (err >= 0) { 6511da177e4SLinus Torvalds err = page_to_nid(p); 6521da177e4SLinus Torvalds put_page(p); 6531da177e4SLinus Torvalds } 6541da177e4SLinus Torvalds return err; 6551da177e4SLinus Torvalds } 6561da177e4SLinus Torvalds 6571da177e4SLinus Torvalds /* Retrieve NUMA policy */ 658dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask, 6591da177e4SLinus Torvalds unsigned long addr, unsigned long flags) 6601da177e4SLinus Torvalds { 6618bccd85fSChristoph Lameter int err; 6621da177e4SLinus Torvalds struct mm_struct *mm = current->mm; 6631da177e4SLinus Torvalds struct vm_area_struct *vma = NULL; 6641da177e4SLinus Torvalds struct mempolicy *pol = current->mempolicy; 6651da177e4SLinus Torvalds 666cf2a473cSPaul Jackson cpuset_update_task_memory_state(); 667754af6f5SLee Schermerhorn if (flags & 668754af6f5SLee Schermerhorn ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 6691da177e4SLinus Torvalds return -EINVAL; 670754af6f5SLee Schermerhorn 671754af6f5SLee Schermerhorn if (flags & MPOL_F_MEMS_ALLOWED) { 672754af6f5SLee Schermerhorn if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 673754af6f5SLee Schermerhorn return -EINVAL; 674754af6f5SLee Schermerhorn *policy = 0; /* just so it's 
initialized */ 675754af6f5SLee Schermerhorn *nmask = cpuset_current_mems_allowed; 676754af6f5SLee Schermerhorn return 0; 677754af6f5SLee Schermerhorn } 678754af6f5SLee Schermerhorn 6791da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) { 6801da177e4SLinus Torvalds down_read(&mm->mmap_sem); 6811da177e4SLinus Torvalds vma = find_vma_intersection(mm, addr, addr+1); 6821da177e4SLinus Torvalds if (!vma) { 6831da177e4SLinus Torvalds up_read(&mm->mmap_sem); 6841da177e4SLinus Torvalds return -EFAULT; 6851da177e4SLinus Torvalds } 6861da177e4SLinus Torvalds if (vma->vm_ops && vma->vm_ops->get_policy) 6871da177e4SLinus Torvalds pol = vma->vm_ops->get_policy(vma, addr); 6881da177e4SLinus Torvalds else 6891da177e4SLinus Torvalds pol = vma->vm_policy; 6901da177e4SLinus Torvalds } else if (addr) 6911da177e4SLinus Torvalds return -EINVAL; 6921da177e4SLinus Torvalds 6931da177e4SLinus Torvalds if (!pol) 6941da177e4SLinus Torvalds pol = &default_policy; 6951da177e4SLinus Torvalds 6961da177e4SLinus Torvalds if (flags & MPOL_F_NODE) { 6971da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) { 6981da177e4SLinus Torvalds err = lookup_node(mm, addr); 6991da177e4SLinus Torvalds if (err < 0) 7001da177e4SLinus Torvalds goto out; 7018bccd85fSChristoph Lameter *policy = err; 7021da177e4SLinus Torvalds } else if (pol == current->mempolicy && 7031da177e4SLinus Torvalds pol->policy == MPOL_INTERLEAVE) { 7048bccd85fSChristoph Lameter *policy = current->il_next; 7051da177e4SLinus Torvalds } else { 7061da177e4SLinus Torvalds err = -EINVAL; 7071da177e4SLinus Torvalds goto out; 7081da177e4SLinus Torvalds } 7091da177e4SLinus Torvalds } else 710028fec41SDavid Rientjes *policy = pol->policy | pol->flags; 7111da177e4SLinus Torvalds 7121da177e4SLinus Torvalds if (vma) { 7131da177e4SLinus Torvalds up_read(¤t->mm->mmap_sem); 7141da177e4SLinus Torvalds vma = NULL; 7151da177e4SLinus Torvalds } 7161da177e4SLinus Torvalds 7171da177e4SLinus Torvalds err = 0; 7188bccd85fSChristoph Lameter if (nmask) 7198bccd85fSChristoph 
Lameter get_zonemask(pol, nmask); 7201da177e4SLinus Torvalds 7211da177e4SLinus Torvalds out: 7221da177e4SLinus Torvalds if (vma) 7231da177e4SLinus Torvalds up_read(¤t->mm->mmap_sem); 7241da177e4SLinus Torvalds return err; 7251da177e4SLinus Torvalds } 7261da177e4SLinus Torvalds 727b20a3503SChristoph Lameter #ifdef CONFIG_MIGRATION 7288bccd85fSChristoph Lameter /* 7296ce3c4c0SChristoph Lameter * page migration 7306ce3c4c0SChristoph Lameter */ 731fc301289SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist, 732fc301289SChristoph Lameter unsigned long flags) 7336ce3c4c0SChristoph Lameter { 7346ce3c4c0SChristoph Lameter /* 735fc301289SChristoph Lameter * Avoid migrating a page that is shared with others. 7366ce3c4c0SChristoph Lameter */ 737b20a3503SChristoph Lameter if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) 738b20a3503SChristoph Lameter isolate_lru_page(page, pagelist); 7396ce3c4c0SChristoph Lameter } 7406ce3c4c0SChristoph Lameter 741742755a1SChristoph Lameter static struct page *new_node_page(struct page *page, unsigned long node, int **x) 74295a402c3SChristoph Lameter { 743769848c0SMel Gorman return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); 74495a402c3SChristoph Lameter } 74595a402c3SChristoph Lameter 7466ce3c4c0SChristoph Lameter /* 7477e2ab150SChristoph Lameter * Migrate pages from one node to a target node. 7487e2ab150SChristoph Lameter * Returns error or the number of pages not migrated. 
7497e2ab150SChristoph Lameter  */
750dbcb0f19SAdrian Bunk static int migrate_to_node(struct mm_struct *mm, int source, int dest,
751dbcb0f19SAdrian Bunk 			   int flags)
7527e2ab150SChristoph Lameter {
7537e2ab150SChristoph Lameter 	nodemask_t nmask;
7547e2ab150SChristoph Lameter 	LIST_HEAD(pagelist);
7557e2ab150SChristoph Lameter 	int err = 0;
7567e2ab150SChristoph Lameter 
7577e2ab150SChristoph Lameter 	nodes_clear(nmask);
7587e2ab150SChristoph Lameter 	node_set(source, nmask);
7597e2ab150SChristoph Lameter 
	/*
	 * NOTE(review): check_range()'s return value is ignored here;
	 * presumably only the collected pagelist matters on this path --
	 * confirm against do_mbind()'s use of the same helper.
	 */
7607e2ab150SChristoph Lameter 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
7617e2ab150SChristoph Lameter 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
7627e2ab150SChristoph Lameter 
7637e2ab150SChristoph Lameter 	if (!list_empty(&pagelist))
76495a402c3SChristoph Lameter 		err = migrate_pages(&pagelist, new_node_page, dest);
76595a402c3SChristoph Lameter 
7667e2ab150SChristoph Lameter 	return err;
7677e2ab150SChristoph Lameter }
7687e2ab150SChristoph Lameter 
7697e2ab150SChristoph Lameter /*
7707e2ab150SChristoph Lameter  * Move pages between the two nodesets so as to preserve the physical
7717e2ab150SChristoph Lameter  * layout as much as possible.
77239743889SChristoph Lameter  *
77339743889SChristoph Lameter  * Returns the number of pages that could not be moved.
77439743889SChristoph Lameter  */
77539743889SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
77639743889SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
77739743889SChristoph Lameter {
77839743889SChristoph Lameter 	LIST_HEAD(pagelist);
7797e2ab150SChristoph Lameter 	int busy = 0;
7807e2ab150SChristoph Lameter 	int err = 0;
7817e2ab150SChristoph Lameter 	nodemask_t tmp;
78239743889SChristoph Lameter 
78339743889SChristoph Lameter 	down_read(&mm->mmap_sem);
784d4984711SChristoph Lameter 
7857b2259b3SChristoph Lameter 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
7867b2259b3SChristoph Lameter 	if (err)
7877b2259b3SChristoph Lameter 		goto out;
7887b2259b3SChristoph Lameter 
7897e2ab150SChristoph Lameter /*
7907e2ab150SChristoph Lameter  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
7917e2ab150SChristoph Lameter  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
7927e2ab150SChristoph Lameter  * bit in 'tmp', and return that <source, dest> pair for migration.
7937e2ab150SChristoph Lameter  * The pair of nodemasks 'to' and 'from' define the map.
7947e2ab150SChristoph Lameter  *
7957e2ab150SChristoph Lameter  * If no pair of bits is found that way, fallback to picking some
7967e2ab150SChristoph Lameter  * pair of 'source' and 'dest' bits that are not the same.  If the
7977e2ab150SChristoph Lameter  * 'source' and 'dest' bits are the same, this represents a node
7987e2ab150SChristoph Lameter  * that will be migrating to itself, so no pages need move.
7997e2ab150SChristoph Lameter  *
8007e2ab150SChristoph Lameter  * If no bits are left in 'tmp', or if all remaining bits left
8017e2ab150SChristoph Lameter  * in 'tmp' correspond to the same bit in 'to', return false
8027e2ab150SChristoph Lameter  * (nothing left to migrate).
8037e2ab150SChristoph Lameter  *
8047e2ab150SChristoph Lameter  * This lets us pick a pair of nodes to migrate between, such that
8057e2ab150SChristoph Lameter  * if possible the dest node is not already occupied by some other
8067e2ab150SChristoph Lameter  * source node, minimizing the risk of overloading the memory on a
8077e2ab150SChristoph Lameter  * node that would happen if we migrated incoming memory to a node
8087e2ab150SChristoph Lameter  * before migrating outgoing memory source that same node.
8097e2ab150SChristoph Lameter  *
8107e2ab150SChristoph Lameter  * A single scan of tmp is sufficient.  As we go, we remember the
8117e2ab150SChristoph Lameter  * most recent <s, d> pair that moved (s != d).  If we find a pair
8127e2ab150SChristoph Lameter  * that not only moved, but what's better, moved to an empty slot
8137e2ab150SChristoph Lameter  * (d is not set in tmp), then we break out then, with that pair.
8147e2ab150SChristoph Lameter  * Otherwise when we finish scanning from_tmp, we at least have the
8157e2ab150SChristoph Lameter  * most recent <s, d> pair that moved.  If we get all the way through
8167e2ab150SChristoph Lameter  * the scan of tmp without finding any node that moved, much less
8177e2ab150SChristoph Lameter  * moved to an empty node, then there is nothing left worth migrating.
8187e2ab150SChristoph Lameter  */
8197e2ab150SChristoph Lameter 
8207e2ab150SChristoph Lameter 	tmp = *from_nodes;
8217e2ab150SChristoph Lameter 	while (!nodes_empty(tmp)) {
8227e2ab150SChristoph Lameter 		int s,d;
8237e2ab150SChristoph Lameter 		int source = -1;
8247e2ab150SChristoph Lameter 		int dest = 0;
8257e2ab150SChristoph Lameter 
8267e2ab150SChristoph Lameter 		for_each_node_mask(s, tmp) {
8277e2ab150SChristoph Lameter 			d = node_remap(s, *from_nodes, *to_nodes);
8287e2ab150SChristoph Lameter 			if (s == d)
8297e2ab150SChristoph Lameter 				continue;
8307e2ab150SChristoph Lameter 
8317e2ab150SChristoph Lameter 			source = s;	/* Node moved. Memorize */
8327e2ab150SChristoph Lameter 			dest = d;
8337e2ab150SChristoph Lameter 
8347e2ab150SChristoph Lameter 			/* dest not in remaining from nodes? */
8357e2ab150SChristoph Lameter 			if (!node_isset(dest, tmp))
8367e2ab150SChristoph Lameter 				break;
8377e2ab150SChristoph Lameter 		}
8387e2ab150SChristoph Lameter 		if (source == -1)
8397e2ab150SChristoph Lameter 			break;
8407e2ab150SChristoph Lameter 
8417e2ab150SChristoph Lameter 		node_clear(source, tmp);
8427e2ab150SChristoph Lameter 		err = migrate_to_node(mm, source, dest, flags);
8437e2ab150SChristoph Lameter 		if (err > 0)
8447e2ab150SChristoph Lameter 			busy += err;
8457e2ab150SChristoph Lameter 		if (err < 0)
8467e2ab150SChristoph Lameter 			break;
84739743889SChristoph Lameter 	}
8487b2259b3SChristoph Lameter out:
84939743889SChristoph Lameter 	up_read(&mm->mmap_sem);
8507e2ab150SChristoph Lameter 	if (err < 0)
8517e2ab150SChristoph Lameter 		return err;
8527e2ab150SChristoph Lameter 	return busy;
853b20a3503SChristoph Lameter 
85439743889SChristoph Lameter }
85539743889SChristoph Lameter 
8563ad33b24SLee Schermerhorn /*
8573ad33b24SLee Schermerhorn  * Allocate a new page for page migration based on vma policy.
8583ad33b24SLee Schermerhorn  * Start assuming that page is mapped by vma pointed to by @private.
8593ad33b24SLee Schermerhorn  * Search forward from there, if not.  N.B., this assumes that the
8603ad33b24SLee Schermerhorn  * list of pages handed to migrate_pages()--which is how we get here--
8613ad33b24SLee Schermerhorn  * is in virtual address order.
8623ad33b24SLee Schermerhorn  */
863742755a1SChristoph Lameter static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
86495a402c3SChristoph Lameter {
86595a402c3SChristoph Lameter 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
8663ad33b24SLee Schermerhorn 	unsigned long uninitialized_var(address);
86795a402c3SChristoph Lameter 
8683ad33b24SLee Schermerhorn 	while (vma) {
8693ad33b24SLee Schermerhorn 		address = page_address_in_vma(page, vma);
8703ad33b24SLee Schermerhorn 		if (address != -EFAULT)
8713ad33b24SLee Schermerhorn 			break;
8723ad33b24SLee Schermerhorn 		vma = vma->vm_next;
8733ad33b24SLee Schermerhorn 	}
8743ad33b24SLee Schermerhorn 
8753ad33b24SLee Schermerhorn 	/*
8763ad33b24SLee Schermerhorn 	 * if !vma, alloc_page_vma() will use task or system default policy
8773ad33b24SLee Schermerhorn 	 */
8783ad33b24SLee Schermerhorn 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
87995a402c3SChristoph Lameter }
880b20a3503SChristoph Lameter #else
881b20a3503SChristoph Lameter 
/* !CONFIG_MIGRATION stubs: migration requests become no-ops/-ENOSYS. */
882b20a3503SChristoph Lameter static void migrate_page_add(struct page *page, struct list_head *pagelist,
883b20a3503SChristoph Lameter 				unsigned long flags)
884b20a3503SChristoph Lameter {
885b20a3503SChristoph Lameter }
886b20a3503SChristoph Lameter 
887b20a3503SChristoph Lameter int do_migrate_pages(struct mm_struct *mm,
888b20a3503SChristoph Lameter 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
889b20a3503SChristoph Lameter {
890b20a3503SChristoph Lameter 	return -ENOSYS;
891b20a3503SChristoph Lameter }
89295a402c3SChristoph Lameter 
89369939749SKeith Owens static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
89495a402c3SChristoph Lameter {
89595a402c3SChristoph Lameter 	return NULL;
89695a402c3SChristoph Lameter }
897b20a3503SChristoph Lameter #endif
898b20a3503SChristoph Lameter 
/*
 * Backend for sys_mbind(): validate flags and the [start, start+len)
 * range, build a policy from (mode, mode_flags, nmask), apply it with
 * mbind_range() under mmap_sem held for writing, and migrate any pages
 * collected by check_range() when MPOL_MF_MOVE* was requested.
 * Returns 0 or a negative errno (-EIO when MPOL_MF_STRICT and some
 * pages could not be migrated).
 */
899dbcb0f19SAdrian Bunk static long do_mbind(unsigned long start, unsigned long len,
900028fec41SDavid Rientjes 		     unsigned short mode, unsigned short mode_flags,
901028fec41SDavid Rientjes 		     nodemask_t *nmask, unsigned long flags)
9026ce3c4c0SChristoph Lameter {
9036ce3c4c0SChristoph Lameter 	struct vm_area_struct *vma;
9046ce3c4c0SChristoph Lameter 	struct mm_struct *mm = current->mm;
9056ce3c4c0SChristoph Lameter 	struct mempolicy *new;
9066ce3c4c0SChristoph Lameter 	unsigned long end;
9076ce3c4c0SChristoph Lameter 	int err;
9086ce3c4c0SChristoph Lameter 	LIST_HEAD(pagelist);
9096ce3c4c0SChristoph Lameter 
910a3b51e01SDavid Rientjes 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
9116ce3c4c0SChristoph Lameter 				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
9126ce3c4c0SChristoph Lameter 		return -EINVAL;
91374c00241SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
9146ce3c4c0SChristoph Lameter 		return -EPERM;
9156ce3c4c0SChristoph Lameter 
9166ce3c4c0SChristoph Lameter 	if (start & ~PAGE_MASK)
9176ce3c4c0SChristoph Lameter 		return -EINVAL;
9186ce3c4c0SChristoph Lameter 
9196ce3c4c0SChristoph Lameter 	if (mode == MPOL_DEFAULT)
9206ce3c4c0SChristoph Lameter 		flags &= ~MPOL_MF_STRICT;
9216ce3c4c0SChristoph Lameter 
9226ce3c4c0SChristoph Lameter 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
9236ce3c4c0SChristoph Lameter 	end = start + len;
9246ce3c4c0SChristoph Lameter 
9256ce3c4c0SChristoph Lameter 	if (end < start)
9266ce3c4c0SChristoph Lameter 		return -EINVAL;
9276ce3c4c0SChristoph Lameter 	if (end == start)
9286ce3c4c0SChristoph Lameter 		return 0;
9296ce3c4c0SChristoph Lameter 
930028fec41SDavid Rientjes 	new = mpol_new(mode, mode_flags, nmask);
9316ce3c4c0SChristoph Lameter 	if (IS_ERR(new))
9326ce3c4c0SChristoph Lameter 		return PTR_ERR(new);
9336ce3c4c0SChristoph Lameter 
9346ce3c4c0SChristoph Lameter 	/*
9356ce3c4c0SChristoph Lameter 	 * If we are using the default policy then operation
9366ce3c4c0SChristoph Lameter 	 * on discontinuous address spaces is okay after all
9376ce3c4c0SChristoph Lameter 	 */
9386ce3c4c0SChristoph Lameter 	if (!new)
9396ce3c4c0SChristoph Lameter 		flags |= MPOL_MF_DISCONTIG_OK;
9406ce3c4c0SChristoph Lameter 
941028fec41SDavid Rientjes 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
942028fec41SDavid Rientjes 		 start, start + len, mode, mode_flags,
943028fec41SDavid Rientjes 		 nmask ? nodes_addr(*nmask)[0] : -1);
9446ce3c4c0SChristoph Lameter 
9456ce3c4c0SChristoph Lameter 	down_write(&mm->mmap_sem);
9466ce3c4c0SChristoph Lameter 	vma = check_range(mm, start, end, nmask,
9476ce3c4c0SChristoph Lameter 			  flags | MPOL_MF_INVERT, &pagelist);
9486ce3c4c0SChristoph Lameter 
9496ce3c4c0SChristoph Lameter 	err = PTR_ERR(vma);
9506ce3c4c0SChristoph Lameter 	if (!IS_ERR(vma)) {
9516ce3c4c0SChristoph Lameter 		int nr_failed = 0;
9526ce3c4c0SChristoph Lameter 
9536ce3c4c0SChristoph Lameter 		err = mbind_range(vma, start, end, new);
9547e2ab150SChristoph Lameter 
9556ce3c4c0SChristoph Lameter 		if (!list_empty(&pagelist))
95695a402c3SChristoph Lameter 			nr_failed = migrate_pages(&pagelist, new_vma_page,
95795a402c3SChristoph Lameter 						(unsigned long)vma);
9586ce3c4c0SChristoph Lameter 
9596ce3c4c0SChristoph Lameter 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
9606ce3c4c0SChristoph Lameter 			err = -EIO;
9616ce3c4c0SChristoph Lameter 	}
962b20a3503SChristoph Lameter 
9636ce3c4c0SChristoph Lameter 	up_write(&mm->mmap_sem);
964f0be3d32SLee Schermerhorn 	mpol_put(new);
9656ce3c4c0SChristoph Lameter 	return err;
9666ce3c4c0SChristoph Lameter }
9676ce3c4c0SChristoph Lameter 
96839743889SChristoph Lameter /*
9698bccd85fSChristoph Lameter  * User space interface with variable sized bitmaps for nodelists.
9708bccd85fSChristoph Lameter  */
9718bccd85fSChristoph Lameter 
9728bccd85fSChristoph Lameter /* Copy a node mask from user space.
 */
/*
 * Returns 0 on success; -EINVAL if the mask is too large or has bits
 * set beyond MAX_NUMNODES; -EFAULT on a bad user pointer.
 */
97339743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
9748bccd85fSChristoph Lameter 		     unsigned long maxnode)
9758bccd85fSChristoph Lameter {
9768bccd85fSChristoph Lameter 	unsigned long k;
9778bccd85fSChristoph Lameter 	unsigned long nlongs;
9788bccd85fSChristoph Lameter 	unsigned long endmask;
9798bccd85fSChristoph Lameter 
9808bccd85fSChristoph Lameter 	--maxnode;
9818bccd85fSChristoph Lameter 	nodes_clear(*nodes);
9828bccd85fSChristoph Lameter 	if (maxnode == 0 || !nmask)
9838bccd85fSChristoph Lameter 		return 0;
984a9c930baSAndi Kleen 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
985636f13c1SChris Wright 		return -EINVAL;
9868bccd85fSChristoph Lameter 
9878bccd85fSChristoph Lameter 	nlongs = BITS_TO_LONGS(maxnode);
	/* endmask keeps only the valid bits of the final long */
9888bccd85fSChristoph Lameter 	if ((maxnode % BITS_PER_LONG) == 0)
9898bccd85fSChristoph Lameter 		endmask = ~0UL;
9908bccd85fSChristoph Lameter 	else
9918bccd85fSChristoph Lameter 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
9928bccd85fSChristoph Lameter 
9938bccd85fSChristoph Lameter 	/* When the user specified more nodes than supported just check
9948bccd85fSChristoph Lameter 	   if the non supported part is all zero. */
9958bccd85fSChristoph Lameter 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
9968bccd85fSChristoph Lameter 		if (nlongs > PAGE_SIZE/sizeof(long))
9978bccd85fSChristoph Lameter 			return -EINVAL;
9988bccd85fSChristoph Lameter 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
9998bccd85fSChristoph Lameter 			unsigned long t;
10008bccd85fSChristoph Lameter 			if (get_user(t, nmask + k))
10018bccd85fSChristoph Lameter 				return -EFAULT;
10028bccd85fSChristoph Lameter 			if (k == nlongs - 1) {
10038bccd85fSChristoph Lameter 				if (t & endmask)
10048bccd85fSChristoph Lameter 					return -EINVAL;
10058bccd85fSChristoph Lameter 			} else if (t)
10068bccd85fSChristoph Lameter 				return -EINVAL;
10078bccd85fSChristoph Lameter 		}
10088bccd85fSChristoph Lameter 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
10098bccd85fSChristoph Lameter 		endmask = ~0UL;
10108bccd85fSChristoph Lameter 	}
10118bccd85fSChristoph Lameter 
10128bccd85fSChristoph Lameter 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
10138bccd85fSChristoph Lameter 		return -EFAULT;
10148bccd85fSChristoph Lameter 	nodes_addr(*nodes)[nlongs-1] &= endmask;
10158bccd85fSChristoph Lameter 	return 0;
10168bccd85fSChristoph Lameter }
10178bccd85fSChristoph Lameter 
10188bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
10198bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
10208bccd85fSChristoph Lameter 			      nodemask_t *nodes)
10218bccd85fSChristoph Lameter {
10228bccd85fSChristoph Lameter 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
10238bccd85fSChristoph Lameter 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
10248bccd85fSChristoph Lameter 
	/* user buffer larger than the kernel mask: zero the excess */
10258bccd85fSChristoph Lameter 	if (copy > nbytes) {
10268bccd85fSChristoph Lameter 		if (copy > PAGE_SIZE)
10278bccd85fSChristoph Lameter 			return -EINVAL;
10288bccd85fSChristoph Lameter 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
10298bccd85fSChristoph Lameter 			return -EFAULT;
10308bccd85fSChristoph Lameter 		copy = nbytes;
10318bccd85fSChristoph Lameter 	}
10328bccd85fSChristoph Lameter 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
10338bccd85fSChristoph Lameter }
10348bccd85fSChristoph Lameter 
/* mbind(2) entry point: split mode/flag bits, copy in the nodemask. */
10358bccd85fSChristoph Lameter asmlinkage long sys_mbind(unsigned long start, unsigned long len,
10368bccd85fSChristoph Lameter 			unsigned long mode,
10378bccd85fSChristoph Lameter 			unsigned long __user *nmask, unsigned long maxnode,
10388bccd85fSChristoph Lameter 			unsigned flags)
10398bccd85fSChristoph Lameter {
10408bccd85fSChristoph Lameter 	nodemask_t nodes;
10418bccd85fSChristoph Lameter 	int err;
1042028fec41SDavid Rientjes 	unsigned short mode_flags;
10438bccd85fSChristoph Lameter 
1044028fec41SDavid Rientjes 	mode_flags = mode & MPOL_MODE_FLAGS;
1045028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1046a3b51e01SDavid Rientjes 	if (mode >= MPOL_MAX)
1047a3b51e01SDavid Rientjes 		return -EINVAL;
	/* STATIC_NODES and RELATIVE_NODES are mutually exclusive */
10484c50bc01SDavid Rientjes 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
10494c50bc01SDavid Rientjes 	    (mode_flags & MPOL_F_RELATIVE_NODES))
10504c50bc01SDavid Rientjes 		return -EINVAL;
10518bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10528bccd85fSChristoph Lameter 	if (err)
10538bccd85fSChristoph Lameter 		return err;
1054028fec41SDavid Rientjes 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
10558bccd85fSChristoph Lameter }
10568bccd85fSChristoph Lameter 
10578bccd85fSChristoph Lameter /* Set the process memory policy */
10588bccd85fSChristoph Lameter asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
10598bccd85fSChristoph Lameter 		unsigned long maxnode)
10608bccd85fSChristoph Lameter {
10618bccd85fSChristoph Lameter 	int err;
10628bccd85fSChristoph Lameter 	nodemask_t nodes;
1063028fec41SDavid Rientjes 	unsigned short flags;
10648bccd85fSChristoph Lameter 
1065028fec41SDavid Rientjes 	flags = mode & MPOL_MODE_FLAGS;
1066028fec41SDavid Rientjes 	mode &= ~MPOL_MODE_FLAGS;
1067028fec41SDavid Rientjes 
	if ((unsigned int)mode >= MPOL_MAX)
10688bccd85fSChristoph Lameter 		return -EINVAL;
10694c50bc01SDavid Rientjes 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
10704c50bc01SDavid Rientjes 		return -EINVAL;
10718bccd85fSChristoph Lameter 	err = get_nodes(&nodes, nmask, maxnode);
10728bccd85fSChristoph Lameter 	if (err)
10738bccd85fSChristoph Lameter 		return err;
1074028fec41SDavid Rientjes 	return do_set_mempolicy(mode, flags, &nodes);
10758bccd85fSChristoph Lameter }
10768bccd85fSChristoph Lameter 
/*
 * migrate_pages(2): move pages of task 'pid' (or current when pid == 0)
 * from old_nodes to new_nodes, subject to permission, cpuset and
 * security checks.  Returns 0/number info from do_migrate_pages() or a
 * negative errno.
 */
107739743889SChristoph Lameter asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
107839743889SChristoph Lameter 		const unsigned long __user *old_nodes,
107939743889SChristoph Lameter 		const unsigned long __user *new_nodes)
108039743889SChristoph Lameter {
108139743889SChristoph Lameter 	struct mm_struct *mm;
108239743889SChristoph Lameter 	struct task_struct *task;
108339743889SChristoph Lameter 	nodemask_t old;
108439743889SChristoph Lameter 	nodemask_t new;
108539743889SChristoph Lameter 	nodemask_t task_nodes;
108639743889SChristoph Lameter 	int err;
108739743889SChristoph Lameter 
108839743889SChristoph Lameter 	err = get_nodes(&old, old_nodes, maxnode);
108939743889SChristoph Lameter 	if (err)
109039743889SChristoph Lameter 		return err;
109139743889SChristoph Lameter 
109239743889SChristoph Lameter 	err = get_nodes(&new, new_nodes, maxnode);
109339743889SChristoph Lameter 	if (err)
109439743889SChristoph Lameter 		return err;
109539743889SChristoph Lameter 
109639743889SChristoph Lameter 	/* Find the mm_struct */
109739743889SChristoph Lameter 	read_lock(&tasklist_lock);
1098228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
109939743889SChristoph Lameter 	if (!task) {
110039743889SChristoph Lameter 		read_unlock(&tasklist_lock);
110139743889SChristoph Lameter 		return -ESRCH;
110239743889SChristoph Lameter 	}
110339743889SChristoph Lameter 	mm = get_task_mm(task);
110439743889SChristoph Lameter 	read_unlock(&tasklist_lock);
	/*
	 * NOTE(review): 'task' is dereferenced below (uids, cpuset,
	 * security hook) after tasklist_lock is dropped and without a
	 * task reference being taken -- verify the lifetime guarantee.
	 */
110539743889SChristoph Lameter 
110639743889SChristoph Lameter 	if (!mm)
110739743889SChristoph Lameter 		return -EINVAL;
110839743889SChristoph Lameter 
110939743889SChristoph Lameter 	/*
111039743889SChristoph Lameter 	 * Check if this process has the right to modify the specified
111139743889SChristoph Lameter 	 * process. The right exists if the process has administrative
11127f927fccSAlexey Dobriyan 	 * capabilities, superuser privileges or the same
111339743889SChristoph Lameter 	 * userid as the target process.
111439743889SChristoph Lameter 	 */
111539743889SChristoph Lameter 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
111639743889SChristoph Lameter 	    (current->uid != task->suid) && (current->uid != task->uid) &&
111774c00241SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
111839743889SChristoph Lameter 		err = -EPERM;
111939743889SChristoph Lameter 		goto out;
112039743889SChristoph Lameter 	}
112139743889SChristoph Lameter 
112239743889SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
112339743889SChristoph Lameter 	/* Is the user allowed to access the target nodes? */
112474c00241SChristoph Lameter 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
112539743889SChristoph Lameter 		err = -EPERM;
112639743889SChristoph Lameter 		goto out;
112739743889SChristoph Lameter 	}
112839743889SChristoph Lameter 
	/* destination nodes must actually have memory */
112937b07e41SLee Schermerhorn 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
11303b42d28bSChristoph Lameter 		err = -EINVAL;
11313b42d28bSChristoph Lameter 		goto out;
11323b42d28bSChristoph Lameter 	}
11333b42d28bSChristoph Lameter 
113486c3a764SDavid Quigley 	err = security_task_movememory(task);
113586c3a764SDavid Quigley 	if (err)
113686c3a764SDavid Quigley 		goto out;
113786c3a764SDavid Quigley 
1138511030bcSChristoph Lameter 	err = do_migrate_pages(mm, &old, &new,
113974c00241SChristoph Lameter 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
114039743889SChristoph Lameter out:
114139743889SChristoph Lameter 	mmput(mm);
114239743889SChristoph Lameter 	return err;
114339743889SChristoph Lameter }
114439743889SChristoph Lameter 
114539743889SChristoph Lameter 
11468bccd85fSChristoph Lameter /* Retrieve NUMA policy */
11478bccd85fSChristoph Lameter asmlinkage long sys_get_mempolicy(int __user *policy,
11488bccd85fSChristoph Lameter 				unsigned long __user *nmask,
11498bccd85fSChristoph Lameter 				unsigned long maxnode,
11508bccd85fSChristoph Lameter 				unsigned long addr, unsigned long flags)
11518bccd85fSChristoph Lameter {
1152dbcb0f19SAdrian Bunk 	int err;
1153dbcb0f19SAdrian Bunk 	int uninitialized_var(pval);
11548bccd85fSChristoph Lameter 	nodemask_t nodes;
11558bccd85fSChristoph Lameter 
11568bccd85fSChristoph Lameter 	if (nmask != NULL && maxnode < MAX_NUMNODES)
11578bccd85fSChristoph Lameter 		return -EINVAL;
11588bccd85fSChristoph Lameter 
11598bccd85fSChristoph Lameter 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
11608bccd85fSChristoph Lameter 
11618bccd85fSChristoph Lameter 	if (err)
11628bccd85fSChristoph Lameter 		return err;
11638bccd85fSChristoph Lameter 
11648bccd85fSChristoph Lameter 	if (policy && put_user(pval,
 policy))
11658bccd85fSChristoph Lameter 		return -EFAULT;
11668bccd85fSChristoph Lameter 
11678bccd85fSChristoph Lameter 	if (nmask)
11688bccd85fSChristoph Lameter 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
11698bccd85fSChristoph Lameter 
11708bccd85fSChristoph Lameter 	return err;
11718bccd85fSChristoph Lameter }
11728bccd85fSChristoph Lameter 
11731da177e4SLinus Torvalds #ifdef CONFIG_COMPAT
11731da177e4SLinus Torvalds 
/*
 * Compat wrapper: widen the 32-bit user nodemask into a native-sized
 * buffer in compat user space, call the native syscall, then translate
 * the result back with compat_put_bitmap().
 */
11751da177e4SLinus Torvalds asmlinkage long compat_sys_get_mempolicy(int __user *policy,
11761da177e4SLinus Torvalds 				     compat_ulong_t __user *nmask,
11771da177e4SLinus Torvalds 				     compat_ulong_t maxnode,
11781da177e4SLinus Torvalds 				     compat_ulong_t addr, compat_ulong_t flags)
11791da177e4SLinus Torvalds {
11801da177e4SLinus Torvalds 	long err;
11811da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
11821da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
11831da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
11841da177e4SLinus Torvalds 
11851da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
11861da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
11871da177e4SLinus Torvalds 
11881da177e4SLinus Torvalds 	if (nmask)
11891da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
11901da177e4SLinus Torvalds 
11911da177e4SLinus Torvalds 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
11921da177e4SLinus Torvalds 
11931da177e4SLinus Torvalds 	if (!err && nmask) {
11941da177e4SLinus Torvalds 		err = copy_from_user(bm, nm, alloc_size);
11951da177e4SLinus Torvalds 		/* ensure entire bitmap is zeroed */
11961da177e4SLinus Torvalds 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
11971da177e4SLinus Torvalds 		err |= compat_put_bitmap(nmask, bm, nr_bits);
11981da177e4SLinus Torvalds 	}
11991da177e4SLinus Torvalds 
12001da177e4SLinus Torvalds 	return err;
12011da177e4SLinus Torvalds }
12021da177e4SLinus Torvalds 
/* Compat wrapper for set_mempolicy(2): bitmap translated on the way in. */
12031da177e4SLinus Torvalds asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
12041da177e4SLinus Torvalds 				     compat_ulong_t maxnode)
12051da177e4SLinus Torvalds {
12061da177e4SLinus Torvalds 	long err = 0;
12071da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
12081da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
12091da177e4SLinus Torvalds 	DECLARE_BITMAP(bm, MAX_NUMNODES);
12101da177e4SLinus Torvalds 
12111da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
12121da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12131da177e4SLinus Torvalds 
12141da177e4SLinus Torvalds 	if (nmask) {
12151da177e4SLinus Torvalds 		err = compat_get_bitmap(bm, nmask, nr_bits);
12161da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
12171da177e4SLinus Torvalds 		err |= copy_to_user(nm, bm, alloc_size);
12181da177e4SLinus Torvalds 	}
12191da177e4SLinus Torvalds 
12201da177e4SLinus Torvalds 	if (err)
12211da177e4SLinus Torvalds 		return -EFAULT;
12221da177e4SLinus Torvalds 
12231da177e4SLinus Torvalds 	return sys_set_mempolicy(mode, nm, nr_bits+1);
12241da177e4SLinus Torvalds }
12251da177e4SLinus Torvalds 
/* Compat wrapper for mbind(2): bitmap translated on the way in. */
12261da177e4SLinus Torvalds asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
12271da177e4SLinus Torvalds 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
12281da177e4SLinus Torvalds 			     compat_ulong_t maxnode, compat_ulong_t flags)
12291da177e4SLinus Torvalds {
12301da177e4SLinus Torvalds 	long err = 0;
12311da177e4SLinus Torvalds 	unsigned long __user *nm = NULL;
12321da177e4SLinus Torvalds 	unsigned long nr_bits, alloc_size;
1233dfcd3c0dSAndi Kleen 	nodemask_t bm;
12341da177e4SLinus Torvalds 
12351da177e4SLinus Torvalds 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
12361da177e4SLinus Torvalds 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
12371da177e4SLinus Torvalds 
12381da177e4SLinus Torvalds 	if (nmask) {
1239dfcd3c0dSAndi Kleen 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
12401da177e4SLinus Torvalds 		nm = compat_alloc_user_space(alloc_size);
1241dfcd3c0dSAndi Kleen 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
12421da177e4SLinus Torvalds 	}
12431da177e4SLinus Torvalds 
12441da177e4SLinus Torvalds 	if (err)
12451da177e4SLinus Torvalds 		return -EFAULT;
12461da177e4SLinus Torvalds 
12471da177e4SLinus Torvalds 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
12481da177e4SLinus Torvalds }
12491da177e4SLinus Torvalds 
12501da177e4SLinus Torvalds #endif
12511da177e4SLinus Torvalds 
1252480eccf9SLee Schermerhorn /*
1253480eccf9SLee Schermerhorn  * get_vma_policy(@task, @vma, @addr)
1254480eccf9SLee Schermerhorn  * @task - task for fallback if vma policy == default
1255480eccf9SLee Schermerhorn  * @vma - virtual memory area whose policy is sought
1256480eccf9SLee Schermerhorn  * @addr - address in @vma for shared policy lookup
1257480eccf9SLee Schermerhorn  *
1258480eccf9SLee Schermerhorn  * Returns effective policy for a VMA at specified address.
1259480eccf9SLee Schermerhorn  * Falls back to @task or system default policy, as necessary.
1260480eccf9SLee Schermerhorn  * Returned policy has extra reference count if shared, vma,
1261480eccf9SLee Schermerhorn  * or some other task's policy [show_numa_maps() can pass
1262480eccf9SLee Schermerhorn  * @task != current].  It is the caller's responsibility to
1263480eccf9SLee Schermerhorn  * free the reference in these cases.
 */
static struct mempolicy * get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;
	int shared_pol = 0;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			/* shared policy: ->get_policy() returned it already referenced */
			pol = vma->vm_ops->get_policy(vma, addr);
			shared_pol = 1;	/* if pol non-NULL, add ref below */
		} else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;	/* never reference counted */
	else if (!shared_pol && pol != current->mempolicy)
		mpol_get(pol);	/* vma or other task's policy: take a ref */
	return pol;
}

/*
 * Return the nodemask that should constrain an allocation under @policy,
 * or NULL when no nodemask filtering applies.
 */
static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->policy == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	/* default, preferred and interleave are handled via the zonelist */
	return NULL;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)	/* node -1 means "local node" */
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and
		 * the current node is not part of the mask, we use the
		 * zonelist for the first node in the mask instead.
		 */
		nd = numa_node_id();
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	/* Round-robin cursor lives in the task; advance it each call */
	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);	/* wrap around */
	if (next < MAX_NUMNODES)	/* guard against an empty nodemask */
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned slab_node(struct mempolicy *policy)
{
	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;

	switch (pol) {
	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		/* node_zonelists[0] is the node-ordered (non-__GFP_THISNODE) list */
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	case MPOL_PREFERRED:
		if (policy->v.preferred_node >= 0)
			return policy->v.preferred_node;
		/* Fall through: preferred_node < 0 means "local" */

	default:
		return numa_node_id();
	}
}

/* Do static interleaving for a VMA with known offset.
 */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	/* Empty interleave set: fall back to the local node */
	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	/* Walk to the target'th set node; next_node(-1, ...) yields the first */
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
	 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		/* No VMA: use the task's dynamic round-robin cursor */
		return interleave_nodes(pol);
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation.
 * If the effective policy is MPOL_BIND, returns pointer to local node's
 * zonelist, and a pointer to the mempolicy's @nodemask for filtering the
 * zonelist.
 * If it is also a policy for which get_vma_policy() returns an extra
 * reference, we must hold that reference until after the allocation.
 * In that case, return policy via @mpol so hugetlb allocation can drop
 * the reference.  For non-MPOL_BIND referenced policies, we can/do drop
 * the reference here, so the caller doesn't need to know about the
 * special case for default and current task policy.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	*mpol = NULL;		/* probably no unref needed */
	*nodemask = NULL;	/* assume !MPOL_BIND */
	if (pol->policy == MPOL_BIND) {
		*nodemask = &pol->v.nodes;
	} else if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		/* drop the extra ref taken by get_vma_policy(), if any */
		if (unlikely(pol != &default_policy &&
				pol != current->mempolicy))
			__mpol_put(pol);	/* finished with pol */
		return node_zonelist(nid, gfp_flags);
	}

	zl = zonelist_policy(GFP_HIGHUSER, pol);
	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
		if (pol->policy != MPOL_BIND)
			__mpol_put(pol);	/* finished with pol */
		else
			*mpol = pol;	/* unref needed after allocation */
	}
	return zl;
}
#endif

/*
 * Allocate a page in interleaved policy.
 * Own path because it needs to do special accounting:
 * NUMA_INTERLEAVE_HIT is only bumped when the page really came from
 * the requested node.
 */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	/* hit only if the page came from the first (target) zone */
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * 	alloc_page_vma	- Allocate a page for a VMA.
 *
 * 	@gfp:
 * 	%GFP_USER    user allocation.
 * 	%GFP_KERNEL  kernel allocations,
 * 	%GFP_HIGHMEM highmem/user allocations,
 * 	%GFP_FS      allocation should not call back into a file system.
 * 	%GFP_ATOMIC  don't sleep.
 *
 * 	@vma:  Pointer to VMA or NULL if not available.
 * 	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * 	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *	mm_struct of the VMA to prevent it from going away. Should be used for
 *	all allocations for pages that will be mapped into
 * 	user space. Returns NULL when no page can be allocated.
 *
 *	Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	cpuset_update_task_memory_state();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		/* drop the extra ref taken by get_vma_policy(), if any */
		if (unlikely(pol != &default_policy &&
				pol != current->mempolicy))
			__mpol_put(pol);	/* finished with pol */
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = zonelist_policy(gfp, pol);
	if (pol != &default_policy && pol != current->mempolicy) {
		/*
		 * slow path: ref counted policy -- shared or vma
		 */
		struct page *page = __alloc_pages_nodemask(gfp, 0,
						zl, nodemask_policy(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path:  default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
}

/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER   user allocation,
 *      	%GFP_KERNEL kernel allocation,
 *      	%GFP_HIGHMEM highmem allocation,
 *      	%GFP_FS     don't call back into a file system.
 *      	%GFP_ATOMIC don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool.  When not in
 *	interrupt context, applies the current process NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_task_memory_state() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_task_memory_state();
	/* interrupts and __GFP_THISNODE ignore the process policy */
	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	/* rebind first so the copy is relative to the new cpuset placement */
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(old, &mems);
	}
	*new = *old;
	atomic_set(&new->refcnt, 1);
	return new;
}

/*
 * Return 1 if the optional mode flags and remembered user nodemasks of
 * @a and @b match, 0 otherwise.  The user nodemask only matters for
 * modes that store it (see mpol_store_user_nodemask()).
 */
static int mpol_match_intent(const struct mempolicy *a,
			     const struct mempolicy *b)
{
	if (a->flags != b->flags)
		return 0;
	if (!mpol_store_user_nodemask(a))
		return 1;
	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
16551da177e4SLinus Torvalds */ 16561da177e4SLinus Torvalds 16571da177e4SLinus Torvalds /* lookup first element intersecting start-end */ 16581da177e4SLinus Torvalds /* Caller holds sp->lock */ 16591da177e4SLinus Torvalds static struct sp_node * 16601da177e4SLinus Torvalds sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 16611da177e4SLinus Torvalds { 16621da177e4SLinus Torvalds struct rb_node *n = sp->root.rb_node; 16631da177e4SLinus Torvalds 16641da177e4SLinus Torvalds while (n) { 16651da177e4SLinus Torvalds struct sp_node *p = rb_entry(n, struct sp_node, nd); 16661da177e4SLinus Torvalds 16671da177e4SLinus Torvalds if (start >= p->end) 16681da177e4SLinus Torvalds n = n->rb_right; 16691da177e4SLinus Torvalds else if (end <= p->start) 16701da177e4SLinus Torvalds n = n->rb_left; 16711da177e4SLinus Torvalds else 16721da177e4SLinus Torvalds break; 16731da177e4SLinus Torvalds } 16741da177e4SLinus Torvalds if (!n) 16751da177e4SLinus Torvalds return NULL; 16761da177e4SLinus Torvalds for (;;) { 16771da177e4SLinus Torvalds struct sp_node *w = NULL; 16781da177e4SLinus Torvalds struct rb_node *prev = rb_prev(n); 16791da177e4SLinus Torvalds if (!prev) 16801da177e4SLinus Torvalds break; 16811da177e4SLinus Torvalds w = rb_entry(prev, struct sp_node, nd); 16821da177e4SLinus Torvalds if (w->end <= start) 16831da177e4SLinus Torvalds break; 16841da177e4SLinus Torvalds n = prev; 16851da177e4SLinus Torvalds } 16861da177e4SLinus Torvalds return rb_entry(n, struct sp_node, nd); 16871da177e4SLinus Torvalds } 16881da177e4SLinus Torvalds 16891da177e4SLinus Torvalds /* Insert a new shared policy into the list. 
*/ 16901da177e4SLinus Torvalds /* Caller holds sp->lock */ 16911da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new) 16921da177e4SLinus Torvalds { 16931da177e4SLinus Torvalds struct rb_node **p = &sp->root.rb_node; 16941da177e4SLinus Torvalds struct rb_node *parent = NULL; 16951da177e4SLinus Torvalds struct sp_node *nd; 16961da177e4SLinus Torvalds 16971da177e4SLinus Torvalds while (*p) { 16981da177e4SLinus Torvalds parent = *p; 16991da177e4SLinus Torvalds nd = rb_entry(parent, struct sp_node, nd); 17001da177e4SLinus Torvalds if (new->start < nd->start) 17011da177e4SLinus Torvalds p = &(*p)->rb_left; 17021da177e4SLinus Torvalds else if (new->end > nd->end) 17031da177e4SLinus Torvalds p = &(*p)->rb_right; 17041da177e4SLinus Torvalds else 17051da177e4SLinus Torvalds BUG(); 17061da177e4SLinus Torvalds } 17071da177e4SLinus Torvalds rb_link_node(&new->nd, parent, p); 17081da177e4SLinus Torvalds rb_insert_color(&new->nd, &sp->root); 1709140d5a49SPaul Mundt pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 17101da177e4SLinus Torvalds new->policy ? 
new->policy->policy : 0); 17111da177e4SLinus Torvalds } 17121da177e4SLinus Torvalds 17131da177e4SLinus Torvalds /* Find shared policy intersecting idx */ 17141da177e4SLinus Torvalds struct mempolicy * 17151da177e4SLinus Torvalds mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 17161da177e4SLinus Torvalds { 17171da177e4SLinus Torvalds struct mempolicy *pol = NULL; 17181da177e4SLinus Torvalds struct sp_node *sn; 17191da177e4SLinus Torvalds 17201da177e4SLinus Torvalds if (!sp->root.rb_node) 17211da177e4SLinus Torvalds return NULL; 17221da177e4SLinus Torvalds spin_lock(&sp->lock); 17231da177e4SLinus Torvalds sn = sp_lookup(sp, idx, idx+1); 17241da177e4SLinus Torvalds if (sn) { 17251da177e4SLinus Torvalds mpol_get(sn->policy); 17261da177e4SLinus Torvalds pol = sn->policy; 17271da177e4SLinus Torvalds } 17281da177e4SLinus Torvalds spin_unlock(&sp->lock); 17291da177e4SLinus Torvalds return pol; 17301da177e4SLinus Torvalds } 17311da177e4SLinus Torvalds 17321da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n) 17331da177e4SLinus Torvalds { 1734140d5a49SPaul Mundt pr_debug("deleting %lx-l%lx\n", n->start, n->end); 17351da177e4SLinus Torvalds rb_erase(&n->nd, &sp->root); 1736f0be3d32SLee Schermerhorn mpol_put(n->policy); 17371da177e4SLinus Torvalds kmem_cache_free(sn_cache, n); 17381da177e4SLinus Torvalds } 17391da177e4SLinus Torvalds 1740dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 1741dbcb0f19SAdrian Bunk struct mempolicy *pol) 17421da177e4SLinus Torvalds { 17431da177e4SLinus Torvalds struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 17441da177e4SLinus Torvalds 17451da177e4SLinus Torvalds if (!n) 17461da177e4SLinus Torvalds return NULL; 17471da177e4SLinus Torvalds n->start = start; 17481da177e4SLinus Torvalds n->end = end; 17491da177e4SLinus Torvalds mpol_get(pol); 17501da177e4SLinus Torvalds n->policy = pol; 17511da177e4SLinus Torvalds return n; 
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);	/* fully covered: drop */
			else
				n->start = end;		/* trim the front */
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Must split the old entry in two.  The
				 * second half needs a fresh node; allocation
				 * can sleep, so drop the lock, allocate, and
				 * restart the lookup from scratch.
				 */
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;		/* trim the tail */
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	/* new2 was allocated but not needed after the restart: free it */
	if (new2) {
		mpol_put(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

/*
 * Install the initial shared policy for an inode (shmem/tmpfs/hugetlbfs).
 * Any error from mpol_new() silently falls back to MPOL_DEFAULT.
 */
void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
			unsigned short flags, nodemask_t *policy_nodes)
{
	info->root = RB_ROOT;
	spin_lock_init(&info->lock);

	if (policy != MPOL_DEFAULT) {
		struct mempolicy *newpol;

		/* Falls back to MPOL_DEFAULT on any error */
		newpol = mpol_new(policy, flags, policy_nodes);
		if (!IS_ERR(newpol)) {
			/* Create pseudo-vma that contains just the policy */
			struct vm_area_struct pvma;

			memset(&pvma, 0, sizeof(struct vm_area_struct));
			/* Policy covers entire file */
			pvma.vm_end = TASK_SIZE;
			mpol_set_shared_policy(info, &pvma, newpol);
			mpol_put(newpol);	/* sp_alloc() took its own ref */
		}
	}
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	/* npol == NULL means "clear the range back to default" */
	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete.
*/ 18511da177e4SLinus Torvalds void mpol_free_shared_policy(struct shared_policy *p) 18521da177e4SLinus Torvalds { 18531da177e4SLinus Torvalds struct sp_node *n; 18541da177e4SLinus Torvalds struct rb_node *next; 18551da177e4SLinus Torvalds 18561da177e4SLinus Torvalds if (!p->root.rb_node) 18571da177e4SLinus Torvalds return; 18581da177e4SLinus Torvalds spin_lock(&p->lock); 18591da177e4SLinus Torvalds next = rb_first(&p->root); 18601da177e4SLinus Torvalds while (next) { 18611da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd); 18621da177e4SLinus Torvalds next = rb_next(&n->nd); 186390c5029eSAndi Kleen rb_erase(&n->nd, &p->root); 1864f0be3d32SLee Schermerhorn mpol_put(n->policy); 18651da177e4SLinus Torvalds kmem_cache_free(sn_cache, n); 18661da177e4SLinus Torvalds } 18671da177e4SLinus Torvalds spin_unlock(&p->lock); 18681da177e4SLinus Torvalds } 18691da177e4SLinus Torvalds 18701da177e4SLinus Torvalds /* assumes fs == KERNEL_DS */ 18711da177e4SLinus Torvalds void __init numa_policy_init(void) 18721da177e4SLinus Torvalds { 1873b71636e2SPaul Mundt nodemask_t interleave_nodes; 1874b71636e2SPaul Mundt unsigned long largest = 0; 1875b71636e2SPaul Mundt int nid, prefer = 0; 1876b71636e2SPaul Mundt 18771da177e4SLinus Torvalds policy_cache = kmem_cache_create("numa_policy", 18781da177e4SLinus Torvalds sizeof(struct mempolicy), 187920c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 18801da177e4SLinus Torvalds 18811da177e4SLinus Torvalds sn_cache = kmem_cache_create("shared_policy_node", 18821da177e4SLinus Torvalds sizeof(struct sp_node), 188320c2df83SPaul Mundt 0, SLAB_PANIC, NULL); 18841da177e4SLinus Torvalds 1885b71636e2SPaul Mundt /* 1886b71636e2SPaul Mundt * Set interleaving policy for system init. Interleaving is only 1887b71636e2SPaul Mundt * enabled across suitably sized nodes (default is >= 16MB), or 1888b71636e2SPaul Mundt * fall back to the largest node if they're all smaller. 
1889b71636e2SPaul Mundt 	 */
1890b71636e2SPaul Mundt 	nodes_clear(interleave_nodes);
189156bbd65dSChristoph Lameter 	for_each_node_state(nid, N_HIGH_MEMORY) {
1892b71636e2SPaul Mundt 		unsigned long total_pages = node_present_pages(nid);
18931da177e4SLinus Torvalds
1894b71636e2SPaul Mundt 		/* Preserve the largest node */
1895b71636e2SPaul Mundt 		if (largest < total_pages) {
1896b71636e2SPaul Mundt 			largest = total_pages;
1897b71636e2SPaul Mundt 			prefer = nid;
1898b71636e2SPaul Mundt 		}
1899b71636e2SPaul Mundt
1900b71636e2SPaul Mundt 		/* Interleave this node? (node size in bytes >= 16MB) */
1901b71636e2SPaul Mundt 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1902b71636e2SPaul Mundt 			node_set(nid, interleave_nodes);
1903b71636e2SPaul Mundt 	}
1904b71636e2SPaul Mundt
1905b71636e2SPaul Mundt 	/* All too small, use the largest */
1906b71636e2SPaul Mundt 	if (unlikely(nodes_empty(interleave_nodes)))
1907b71636e2SPaul Mundt 		node_set(prefer, interleave_nodes);
1908b71636e2SPaul Mundt
1909028fec41SDavid Rientjes 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
19101da177e4SLinus Torvalds 		printk("numa_policy_init: interleaving failed\n");
19111da177e4SLinus Torvalds }
19121da177e4SLinus Torvalds
19138bccd85fSChristoph Lameter /* Reset policy of current process to default */
19141da177e4SLinus Torvalds void numa_default_policy(void)
19151da177e4SLinus Torvalds {
1916028fec41SDavid Rientjes 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
19171da177e4SLinus Torvalds }
191868860ec1SPaul Jackson
19194225399aSPaul Jackson /*
19201a75a6c8SChristoph Lameter  * Display pages allocated per node and memory policy via /proc.
19211a75a6c8SChristoph Lameter  */
/* Indexed by MPOL_* mode value; must stay in sync with the mode enum. */
192215ad7cdcSHelge Deller static const char * const policy_types[] =
192315ad7cdcSHelge Deller 	{ "default", "prefer", "bind", "interleave" };
19241a75a6c8SChristoph Lameter
19251a75a6c8SChristoph Lameter /*
19261a75a6c8SChristoph Lameter  * Convert a mempolicy into a string.
19271a75a6c8SChristoph Lameter  * Returns the number of characters in buffer (if positive)
19281a75a6c8SChristoph Lameter  * or an error (negative)
 *
 * Format: "<mode>[=<flags>][=<nodelist>]", e.g. "interleave=0-3".
 * A NULL @pol is treated as MPOL_DEFAULT with no flags.
 * Returns -ENOSPC when @buffer (of size @maxlen) is too small.
19291a75a6c8SChristoph Lameter  */
19301a75a6c8SChristoph Lameter static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
19311a75a6c8SChristoph Lameter {
19321a75a6c8SChristoph Lameter 	char *p = buffer;
19331a75a6c8SChristoph Lameter 	int l;
19341a75a6c8SChristoph Lameter 	nodemask_t nodes;
1935a3b51e01SDavid Rientjes 	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1936f5b087b5SDavid Rientjes 	unsigned short flags = pol ? pol->flags : 0;
19371a75a6c8SChristoph Lameter
	/* Collect the nodemask to print; empty mask suppresses the suffix. */
19381a75a6c8SChristoph Lameter 	switch (mode) {
19391a75a6c8SChristoph Lameter 	case MPOL_DEFAULT:
19401a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19411a75a6c8SChristoph Lameter 		break;
19421a75a6c8SChristoph Lameter
19431a75a6c8SChristoph Lameter 	case MPOL_PREFERRED:
19441a75a6c8SChristoph Lameter 		nodes_clear(nodes);
19451a75a6c8SChristoph Lameter 		node_set(pol->v.preferred_node, nodes);
19461a75a6c8SChristoph Lameter 		break;
19471a75a6c8SChristoph Lameter
19481a75a6c8SChristoph Lameter 	case MPOL_BIND:
194919770b32SMel Gorman 		/* Fall through */
19501a75a6c8SChristoph Lameter 	case MPOL_INTERLEAVE:
19511a75a6c8SChristoph Lameter 		nodes = pol->v.nodes;
19521a75a6c8SChristoph Lameter 		break;
19531a75a6c8SChristoph Lameter
19541a75a6c8SChristoph Lameter 	default:
19551a75a6c8SChristoph Lameter 		BUG();
19561a75a6c8SChristoph Lameter 		return -EFAULT;
19571a75a6c8SChristoph Lameter 	}
19581a75a6c8SChristoph Lameter
19591a75a6c8SChristoph Lameter 	l = strlen(policy_types[mode]);
19601a75a6c8SChristoph Lameter 	if (buffer + maxlen < p + l + 1)	/* +1 for the NUL */
19611a75a6c8SChristoph Lameter 		return -ENOSPC;
19621a75a6c8SChristoph Lameter
19631a75a6c8SChristoph Lameter 	strcpy(p, policy_types[mode]);
19641a75a6c8SChristoph Lameter 	p += l;
19651a75a6c8SChristoph Lameter
1966f5b087b5SDavid Rientjes 	if (flags) {
1967f5b087b5SDavid Rientjes 		int need_bar = 0;
1968f5b087b5SDavid Rientjes
1969f5b087b5SDavid Rientjes 		if (buffer + maxlen < p + 2)
1970f5b087b5SDavid Rientjes 			return -ENOSPC;
1971f5b087b5SDavid Rientjes 		*p++ = '=';
1972f5b087b5SDavid Rientjes
		/* need_bar inserts '|' between multiple flag names */
1973f5b087b5SDavid Rientjes 		if (flags & MPOL_F_STATIC_NODES)
1974f5b087b5SDavid Rientjes 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
19754c50bc01SDavid Rientjes 		if (flags & MPOL_F_RELATIVE_NODES)
19764c50bc01SDavid Rientjes 			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1977f5b087b5SDavid Rientjes 	}
1978f5b087b5SDavid Rientjes
19791a75a6c8SChristoph Lameter 	if (!nodes_empty(nodes)) {
19801a75a6c8SChristoph Lameter 		if (buffer + maxlen < p + 2)
19811a75a6c8SChristoph Lameter 			return -ENOSPC;
19821a75a6c8SChristoph Lameter 		*p++ = '=';
19831a75a6c8SChristoph Lameter 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
19841a75a6c8SChristoph Lameter 	}
19851a75a6c8SChristoph Lameter 	return p - buffer;	/* characters written */
19861a75a6c8SChristoph Lameter }
19871a75a6c8SChristoph Lameter
/* Per-vma page statistics accumulated for /proc/<pid>/numa_maps. */
19881a75a6c8SChristoph Lameter struct numa_maps {
19891a75a6c8SChristoph Lameter 	unsigned long pages;
19901a75a6c8SChristoph Lameter 	unsigned long anon;
1991397874dfSChristoph Lameter 	unsigned long active;
1992397874dfSChristoph Lameter 	unsigned long writeback;
19931a75a6c8SChristoph Lameter 	unsigned long mapcount_max;
1994397874dfSChristoph Lameter 	unsigned long dirty;
1995397874dfSChristoph Lameter 	unsigned long swapcache;
19961a75a6c8SChristoph Lameter 	unsigned long node[MAX_NUMNODES];	/* per-node page counts */
19971a75a6c8SChristoph Lameter };
19981a75a6c8SChristoph Lameter
/*
 * gather_stats - accumulate one page's statistics into a struct numa_maps.
 * @private: actually a struct numa_maps * (callback-style signature)
 * @pte_dirty: dirty bit of the mapping pte; ORed with PageDirty below
 */
1999397874dfSChristoph Lameter static void gather_stats(struct page *page, void *private, int pte_dirty)
20001a75a6c8SChristoph Lameter {
20011a75a6c8SChristoph Lameter 	struct numa_maps *md = private;
20021a75a6c8SChristoph Lameter 	int count = page_mapcount(page);
20031a75a6c8SChristoph Lameter
20041a75a6c8SChristoph Lameter 	md->pages++;
2005397874dfSChristoph Lameter 	if (pte_dirty || PageDirty(page))
2006397874dfSChristoph Lameter 		md->dirty++;
2007397874dfSChristoph Lameter
2008397874dfSChristoph Lameter 	if (PageSwapCache(page))
2009397874dfSChristoph Lameter 		md->swapcache++;
2010397874dfSChristoph Lameter
2011397874dfSChristoph Lameter 	if (PageActive(page))
2012397874dfSChristoph Lameter 		md->active++;
2013397874dfSChristoph Lameter
2014397874dfSChristoph Lameter 	if (PageWriteback(page))
2015397874dfSChristoph Lameter 		md->writeback++;
20161a75a6c8SChristoph Lameter
20171a75a6c8SChristoph Lameter 	if (PageAnon(page))
20181a75a6c8SChristoph Lameter 		md->anon++;
20191a75a6c8SChristoph Lameter
	/* Track the most-mapped page seen so far */
2020397874dfSChristoph Lameter 	if (count > md->mapcount_max)
2021397874dfSChristoph Lameter 		md->mapcount_max = count;
2022397874dfSChristoph Lameter
20231a75a6c8SChristoph Lameter 	md->node[page_to_nid(page)]++;
20241a75a6c8SChristoph Lameter }
20251a75a6c8SChristoph Lameter
20267f709ed0SAndrew Morton #ifdef CONFIG_HUGETLB_PAGE
/*
 * check_huge_range - gather_stats() over a hugetlb vma, stepping by
 * HPAGE_SIZE and looking up each huge pte directly (no regular page
 * table walk applies here).
 */
2027397874dfSChristoph Lameter static void check_huge_range(struct vm_area_struct *vma,
2028397874dfSChristoph Lameter 		unsigned long start, unsigned long end,
2029397874dfSChristoph Lameter 		struct numa_maps *md)
2030397874dfSChristoph Lameter {
2031397874dfSChristoph Lameter 	unsigned long addr;
2032397874dfSChristoph Lameter 	struct page *page;
2033397874dfSChristoph Lameter
2034397874dfSChristoph Lameter 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2035397874dfSChristoph Lameter 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2036397874dfSChristoph Lameter 		pte_t pte;
2037397874dfSChristoph Lameter
2038397874dfSChristoph Lameter 		if (!ptep)
2039397874dfSChristoph Lameter 			continue;	/* no pte for this huge page slot */
2040397874dfSChristoph Lameter
2041397874dfSChristoph Lameter 		pte = *ptep;
2042397874dfSChristoph Lameter 		if (pte_none(pte))
2043397874dfSChristoph Lameter 			continue;	/* slot not populated */
2044397874dfSChristoph Lameter
2045397874dfSChristoph Lameter 		page = pte_page(pte);
2046397874dfSChristoph Lameter 		if (!page)
2047397874dfSChristoph Lameter 			continue;
2048397874dfSChristoph Lameter
2049397874dfSChristoph Lameter
		gather_stats(page, md, pte_dirty(*ptep));
2050397874dfSChristoph Lameter 	}
2051397874dfSChristoph Lameter }
20527f709ed0SAndrew Morton #else
/* !CONFIG_HUGETLB_PAGE: no huge pages, nothing to count */
20537f709ed0SAndrew Morton static inline void check_huge_range(struct vm_area_struct *vma,
20547f709ed0SAndrew Morton 		unsigned long start, unsigned long end,
20557f709ed0SAndrew Morton 		struct numa_maps *md)
20567f709ed0SAndrew Morton {
20577f709ed0SAndrew Morton }
20587f709ed0SAndrew Morton #endif
2059397874dfSChristoph Lameter
/*
 * show_numa_map - seq_file show() for one vma in /proc/<pid>/numa_maps.
 *
 * Prints the vma start address and its effective mempolicy, then (in the
 * continuation below) the per-node page statistics gathered by walking
 * the vma.  Returns 0 even on allocation failure so the seq walk
 * continues with the next vma.
 */
20601a75a6c8SChristoph Lameter int show_numa_map(struct seq_file *m, void *v)
20611a75a6c8SChristoph Lameter {
206299f89551SEric W. Biederman 	struct proc_maps_private *priv = m->private;
20631a75a6c8SChristoph Lameter 	struct vm_area_struct *vma = v;
20641a75a6c8SChristoph Lameter 	struct numa_maps *md;
2065397874dfSChristoph Lameter 	struct file *file = vma->vm_file;
2066397874dfSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
2067480eccf9SLee Schermerhorn 	struct mempolicy *pol;
20681a75a6c8SChristoph Lameter 	int n;
20691a75a6c8SChristoph Lameter 	char buffer[50];	/* holds the mpol_to_str() policy string */
20701a75a6c8SChristoph Lameter
2071397874dfSChristoph Lameter 	if (!mm)
20721a75a6c8SChristoph Lameter 		return 0;
20731a75a6c8SChristoph Lameter
20741a75a6c8SChristoph Lameter 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
20751a75a6c8SChristoph Lameter 	if (!md)
20761a75a6c8SChristoph Lameter 		return 0;
20771a75a6c8SChristoph Lameter
2078480eccf9SLee Schermerhorn 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2079480eccf9SLee Schermerhorn 	mpol_to_str(buffer, sizeof(buffer), pol);
2080480eccf9SLee Schermerhorn 	/*
2081480eccf9SLee Schermerhorn 	 * unref shared or other task's mempolicy
2082480eccf9SLee Schermerhorn 	 */
2083480eccf9SLee Schermerhorn 	if (pol != &default_policy && pol != current->mempolicy)
2084f0be3d32SLee Schermerhorn 		__mpol_put(pol);
20851a75a6c8SChristoph Lameter
2086397874dfSChristoph Lameter 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2087397874dfSChristoph Lameter
2088397874dfSChristoph Lameter 	if (file) {
2089397874dfSChristoph Lameter 		seq_printf(m, " file=");
2090c32c2f63SJan Blunck 		seq_path(m, &file->f_path, "\n\t= ");
2091397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2092397874dfSChristoph Lameter 		seq_printf(m, " heap");
2093397874dfSChristoph Lameter 	} else if (vma->vm_start <= mm->start_stack &&
2094397874dfSChristoph Lameter 			vma->vm_end >= mm->start_stack) {
2095397874dfSChristoph Lameter 		seq_printf(m, " stack");
2096397874dfSChristoph Lameter 	}
2097397874dfSChristoph Lameter
	/* Hugetlb vmas need the dedicated walker; others use the pgd walk. */
2098397874dfSChristoph Lameter 	if (is_vm_hugetlb_page(vma)) {
2099397874dfSChristoph Lameter 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2100397874dfSChristoph Lameter 		seq_printf(m, " huge");
2101397874dfSChristoph Lameter 	} else {
2102397874dfSChristoph Lameter 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
210356bbd65dSChristoph Lameter 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2104397874dfSChristoph Lameter 	}
2105397874dfSChristoph Lameter
2106397874dfSChristoph Lameter 	if (!md->pages)
2107397874dfSChristoph Lameter 		goto out;	/* nothing mapped; emit just the header line */
21081a75a6c8SChristoph Lameter
	/* Zero-valued counters are suppressed from the output line. */
21091a75a6c8SChristoph Lameter 	if (md->anon)
21101a75a6c8SChristoph Lameter 		seq_printf(m," anon=%lu",md->anon);
21111a75a6c8SChristoph Lameter
2112397874dfSChristoph Lameter 	if (md->dirty)
2113397874dfSChristoph Lameter 		seq_printf(m," dirty=%lu",md->dirty);
2114397874dfSChristoph Lameter
2115397874dfSChristoph Lameter 	if (md->pages != md->anon && md->pages != md->dirty)
2116397874dfSChristoph Lameter 		seq_printf(m, " mapped=%lu", md->pages);
2117397874dfSChristoph Lameter
2118397874dfSChristoph Lameter 	if (md->mapcount_max > 1)
2119397874dfSChristoph Lameter 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2120397874dfSChristoph Lameter
2121397874dfSChristoph Lameter 	if (md->swapcache)
2122397874dfSChristoph Lameter 		seq_printf(m," swapcache=%lu", md->swapcache);
2123397874dfSChristoph Lameter
2124397874dfSChristoph Lameter 	if (md->active < md->pages &&
!is_vm_hugetlb_page(vma))
2125397874dfSChristoph Lameter 		seq_printf(m," active=%lu", md->active);
2126397874dfSChristoph Lameter
2127397874dfSChristoph Lameter 	if (md->writeback)
2128397874dfSChristoph Lameter 		seq_printf(m," writeback=%lu", md->writeback);
2129397874dfSChristoph Lameter
	/* Per-node page counts, one "N<id>=<count>" entry per populated node */
213056bbd65dSChristoph Lameter 	for_each_node_state(n, N_HIGH_MEMORY)
21311a75a6c8SChristoph Lameter 		if (md->node[n])
21321a75a6c8SChristoph Lameter 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2133397874dfSChristoph Lameter out:
21341a75a6c8SChristoph Lameter 	seq_putc(m, '\n');
21351a75a6c8SChristoph Lameter 	kfree(md);
21361a75a6c8SChristoph Lameter
	/* Record restart position for the seq_file iterator if output fit */
21371a75a6c8SChristoph Lameter 	if (m->count < m->size)
213899f89551SEric W. Biederman 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
21391a75a6c8SChristoph Lameter 	return 0;
21401a75a6c8SChristoph Lameter }
2141