1*1da177e4SLinus Torvalds /* 2*1da177e4SLinus Torvalds * Simple NUMA memory policy for the Linux kernel. 3*1da177e4SLinus Torvalds * 4*1da177e4SLinus Torvalds * Copyright 2003,2004 Andi Kleen, SuSE Labs. 5*1da177e4SLinus Torvalds * Subject to the GNU Public License, version 2. 6*1da177e4SLinus Torvalds * 7*1da177e4SLinus Torvalds * NUMA policy allows the user to give hints in which node(s) memory should 8*1da177e4SLinus Torvalds * be allocated. 9*1da177e4SLinus Torvalds * 10*1da177e4SLinus Torvalds * Support four policies per VMA and per process: 11*1da177e4SLinus Torvalds * 12*1da177e4SLinus Torvalds * The VMA policy has priority over the process policy for a page fault. 13*1da177e4SLinus Torvalds * 14*1da177e4SLinus Torvalds * interleave Allocate memory interleaved over a set of nodes, 15*1da177e4SLinus Torvalds * with normal fallback if it fails. 16*1da177e4SLinus Torvalds * For VMA based allocations this interleaves based on the 17*1da177e4SLinus Torvalds * offset into the backing object or offset into the mapping 18*1da177e4SLinus Torvalds * for anonymous memory. For process policy an process counter 19*1da177e4SLinus Torvalds * is used. 20*1da177e4SLinus Torvalds * bind Only allocate memory on a specific set of nodes, 21*1da177e4SLinus Torvalds * no fallback. 22*1da177e4SLinus Torvalds * preferred Try a specific node first before normal fallback. 23*1da177e4SLinus Torvalds * As a special case node -1 here means do the allocation 24*1da177e4SLinus Torvalds * on the local CPU. This is normally identical to default, 25*1da177e4SLinus Torvalds * but useful to set in a VMA when you have a non default 26*1da177e4SLinus Torvalds * process policy. 27*1da177e4SLinus Torvalds * default Allocate on the local node first, or when on a VMA 28*1da177e4SLinus Torvalds * use the process policy. This is what Linux always did 29*1da177e4SLinus Torvalds * in a NUMA aware kernel and still does by, ahem, default. 
30*1da177e4SLinus Torvalds * 31*1da177e4SLinus Torvalds * The process policy is applied for most non interrupt memory allocations 32*1da177e4SLinus Torvalds * in that process' context. Interrupts ignore the policies and always 33*1da177e4SLinus Torvalds * try to allocate on the local CPU. The VMA policy is only applied for memory 34*1da177e4SLinus Torvalds * allocations for a VMA in the VM. 35*1da177e4SLinus Torvalds * 36*1da177e4SLinus Torvalds * Currently there are a few corner cases in swapping where the policy 37*1da177e4SLinus Torvalds * is not applied, but the majority should be handled. When process policy 38*1da177e4SLinus Torvalds * is used it is not remembered over swap outs/swap ins. 39*1da177e4SLinus Torvalds * 40*1da177e4SLinus Torvalds * Only the highest zone in the zone hierarchy gets policied. Allocations 41*1da177e4SLinus Torvalds * requesting a lower zone just use default policy. This implies that 42*1da177e4SLinus Torvalds * on systems with highmem kernel lowmem allocation don't get policied. 43*1da177e4SLinus Torvalds * Same with GFP_DMA allocations. 44*1da177e4SLinus Torvalds * 45*1da177e4SLinus Torvalds * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 46*1da177e4SLinus Torvalds * all users and remembered even when nobody has memory mapped. 47*1da177e4SLinus Torvalds */ 48*1da177e4SLinus Torvalds 49*1da177e4SLinus Torvalds /* Notebook: 50*1da177e4SLinus Torvalds fix mmap readahead to honour policy and enable policy for any page cache 51*1da177e4SLinus Torvalds object 52*1da177e4SLinus Torvalds statistics for bigpages 53*1da177e4SLinus Torvalds global policy for page cache? currently it uses process policy. Requires 54*1da177e4SLinus Torvalds first item above. 55*1da177e4SLinus Torvalds handle mremap for shared memory (currently ignored for the policy) 56*1da177e4SLinus Torvalds grows down? 57*1da177e4SLinus Torvalds make bind policy root only? 
It can trigger oom much faster and the 58*1da177e4SLinus Torvalds kernel is not always grateful with that. 59*1da177e4SLinus Torvalds could replace all the switch()es with a mempolicy_ops structure. 60*1da177e4SLinus Torvalds */ 61*1da177e4SLinus Torvalds 62*1da177e4SLinus Torvalds #include <linux/mempolicy.h> 63*1da177e4SLinus Torvalds #include <linux/mm.h> 64*1da177e4SLinus Torvalds #include <linux/highmem.h> 65*1da177e4SLinus Torvalds #include <linux/hugetlb.h> 66*1da177e4SLinus Torvalds #include <linux/kernel.h> 67*1da177e4SLinus Torvalds #include <linux/sched.h> 68*1da177e4SLinus Torvalds #include <linux/mm.h> 69*1da177e4SLinus Torvalds #include <linux/nodemask.h> 70*1da177e4SLinus Torvalds #include <linux/cpuset.h> 71*1da177e4SLinus Torvalds #include <linux/gfp.h> 72*1da177e4SLinus Torvalds #include <linux/slab.h> 73*1da177e4SLinus Torvalds #include <linux/string.h> 74*1da177e4SLinus Torvalds #include <linux/module.h> 75*1da177e4SLinus Torvalds #include <linux/interrupt.h> 76*1da177e4SLinus Torvalds #include <linux/init.h> 77*1da177e4SLinus Torvalds #include <linux/compat.h> 78*1da177e4SLinus Torvalds #include <linux/mempolicy.h> 79*1da177e4SLinus Torvalds #include <asm/tlbflush.h> 80*1da177e4SLinus Torvalds #include <asm/uaccess.h> 81*1da177e4SLinus Torvalds 82*1da177e4SLinus Torvalds static kmem_cache_t *policy_cache; 83*1da177e4SLinus Torvalds static kmem_cache_t *sn_cache; 84*1da177e4SLinus Torvalds 85*1da177e4SLinus Torvalds #define PDprintk(fmt...) 86*1da177e4SLinus Torvalds 87*1da177e4SLinus Torvalds /* Highest zone. An specific allocation for a zone below that is not 88*1da177e4SLinus Torvalds policied. 
*/ 89*1da177e4SLinus Torvalds static int policy_zone; 90*1da177e4SLinus Torvalds 91*1da177e4SLinus Torvalds static struct mempolicy default_policy = { 92*1da177e4SLinus Torvalds .refcnt = ATOMIC_INIT(1), /* never free it */ 93*1da177e4SLinus Torvalds .policy = MPOL_DEFAULT, 94*1da177e4SLinus Torvalds }; 95*1da177e4SLinus Torvalds 96*1da177e4SLinus Torvalds /* Check if all specified nodes are online */ 97*1da177e4SLinus Torvalds static int nodes_online(unsigned long *nodes) 98*1da177e4SLinus Torvalds { 99*1da177e4SLinus Torvalds DECLARE_BITMAP(online2, MAX_NUMNODES); 100*1da177e4SLinus Torvalds 101*1da177e4SLinus Torvalds bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); 102*1da177e4SLinus Torvalds if (bitmap_empty(online2, MAX_NUMNODES)) 103*1da177e4SLinus Torvalds set_bit(0, online2); 104*1da177e4SLinus Torvalds if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) 105*1da177e4SLinus Torvalds return -EINVAL; 106*1da177e4SLinus Torvalds return 0; 107*1da177e4SLinus Torvalds } 108*1da177e4SLinus Torvalds 109*1da177e4SLinus Torvalds /* Do sanity checking on a policy */ 110*1da177e4SLinus Torvalds static int mpol_check_policy(int mode, unsigned long *nodes) 111*1da177e4SLinus Torvalds { 112*1da177e4SLinus Torvalds int empty = bitmap_empty(nodes, MAX_NUMNODES); 113*1da177e4SLinus Torvalds 114*1da177e4SLinus Torvalds switch (mode) { 115*1da177e4SLinus Torvalds case MPOL_DEFAULT: 116*1da177e4SLinus Torvalds if (!empty) 117*1da177e4SLinus Torvalds return -EINVAL; 118*1da177e4SLinus Torvalds break; 119*1da177e4SLinus Torvalds case MPOL_BIND: 120*1da177e4SLinus Torvalds case MPOL_INTERLEAVE: 121*1da177e4SLinus Torvalds /* Preferred will only use the first bit, but allow 122*1da177e4SLinus Torvalds more for now. 
*/ 123*1da177e4SLinus Torvalds if (empty) 124*1da177e4SLinus Torvalds return -EINVAL; 125*1da177e4SLinus Torvalds break; 126*1da177e4SLinus Torvalds } 127*1da177e4SLinus Torvalds return nodes_online(nodes); 128*1da177e4SLinus Torvalds } 129*1da177e4SLinus Torvalds 130*1da177e4SLinus Torvalds /* Copy a node mask from user space. */ 131*1da177e4SLinus Torvalds static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, 132*1da177e4SLinus Torvalds unsigned long maxnode, int mode) 133*1da177e4SLinus Torvalds { 134*1da177e4SLinus Torvalds unsigned long k; 135*1da177e4SLinus Torvalds unsigned long nlongs; 136*1da177e4SLinus Torvalds unsigned long endmask; 137*1da177e4SLinus Torvalds 138*1da177e4SLinus Torvalds --maxnode; 139*1da177e4SLinus Torvalds bitmap_zero(nodes, MAX_NUMNODES); 140*1da177e4SLinus Torvalds if (maxnode == 0 || !nmask) 141*1da177e4SLinus Torvalds return 0; 142*1da177e4SLinus Torvalds 143*1da177e4SLinus Torvalds nlongs = BITS_TO_LONGS(maxnode); 144*1da177e4SLinus Torvalds if ((maxnode % BITS_PER_LONG) == 0) 145*1da177e4SLinus Torvalds endmask = ~0UL; 146*1da177e4SLinus Torvalds else 147*1da177e4SLinus Torvalds endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 148*1da177e4SLinus Torvalds 149*1da177e4SLinus Torvalds /* When the user specified more nodes than supported just check 150*1da177e4SLinus Torvalds if the non supported part is all zero. 
*/ 151*1da177e4SLinus Torvalds if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 152*1da177e4SLinus Torvalds if (nlongs > PAGE_SIZE/sizeof(long)) 153*1da177e4SLinus Torvalds return -EINVAL; 154*1da177e4SLinus Torvalds for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 155*1da177e4SLinus Torvalds unsigned long t; 156*1da177e4SLinus Torvalds if (get_user(t, nmask + k)) 157*1da177e4SLinus Torvalds return -EFAULT; 158*1da177e4SLinus Torvalds if (k == nlongs - 1) { 159*1da177e4SLinus Torvalds if (t & endmask) 160*1da177e4SLinus Torvalds return -EINVAL; 161*1da177e4SLinus Torvalds } else if (t) 162*1da177e4SLinus Torvalds return -EINVAL; 163*1da177e4SLinus Torvalds } 164*1da177e4SLinus Torvalds nlongs = BITS_TO_LONGS(MAX_NUMNODES); 165*1da177e4SLinus Torvalds endmask = ~0UL; 166*1da177e4SLinus Torvalds } 167*1da177e4SLinus Torvalds 168*1da177e4SLinus Torvalds if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) 169*1da177e4SLinus Torvalds return -EFAULT; 170*1da177e4SLinus Torvalds nodes[nlongs-1] &= endmask; 171*1da177e4SLinus Torvalds /* Update current mems_allowed */ 172*1da177e4SLinus Torvalds cpuset_update_current_mems_allowed(); 173*1da177e4SLinus Torvalds /* Ignore nodes not set in current->mems_allowed */ 174*1da177e4SLinus Torvalds cpuset_restrict_to_mems_allowed(nodes); 175*1da177e4SLinus Torvalds return mpol_check_policy(mode, nodes); 176*1da177e4SLinus Torvalds } 177*1da177e4SLinus Torvalds 178*1da177e4SLinus Torvalds /* Generate a custom zonelist for the BIND policy. 
*/ 179*1da177e4SLinus Torvalds static struct zonelist *bind_zonelist(unsigned long *nodes) 180*1da177e4SLinus Torvalds { 181*1da177e4SLinus Torvalds struct zonelist *zl; 182*1da177e4SLinus Torvalds int num, max, nd; 183*1da177e4SLinus Torvalds 184*1da177e4SLinus Torvalds max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); 185*1da177e4SLinus Torvalds zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); 186*1da177e4SLinus Torvalds if (!zl) 187*1da177e4SLinus Torvalds return NULL; 188*1da177e4SLinus Torvalds num = 0; 189*1da177e4SLinus Torvalds for (nd = find_first_bit(nodes, MAX_NUMNODES); 190*1da177e4SLinus Torvalds nd < MAX_NUMNODES; 191*1da177e4SLinus Torvalds nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { 192*1da177e4SLinus Torvalds int k; 193*1da177e4SLinus Torvalds for (k = MAX_NR_ZONES-1; k >= 0; k--) { 194*1da177e4SLinus Torvalds struct zone *z = &NODE_DATA(nd)->node_zones[k]; 195*1da177e4SLinus Torvalds if (!z->present_pages) 196*1da177e4SLinus Torvalds continue; 197*1da177e4SLinus Torvalds zl->zones[num++] = z; 198*1da177e4SLinus Torvalds if (k > policy_zone) 199*1da177e4SLinus Torvalds policy_zone = k; 200*1da177e4SLinus Torvalds } 201*1da177e4SLinus Torvalds } 202*1da177e4SLinus Torvalds BUG_ON(num >= max); 203*1da177e4SLinus Torvalds zl->zones[num] = NULL; 204*1da177e4SLinus Torvalds return zl; 205*1da177e4SLinus Torvalds } 206*1da177e4SLinus Torvalds 207*1da177e4SLinus Torvalds /* Create a new policy */ 208*1da177e4SLinus Torvalds static struct mempolicy *mpol_new(int mode, unsigned long *nodes) 209*1da177e4SLinus Torvalds { 210*1da177e4SLinus Torvalds struct mempolicy *policy; 211*1da177e4SLinus Torvalds 212*1da177e4SLinus Torvalds PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); 213*1da177e4SLinus Torvalds if (mode == MPOL_DEFAULT) 214*1da177e4SLinus Torvalds return NULL; 215*1da177e4SLinus Torvalds policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 216*1da177e4SLinus Torvalds if (!policy) 217*1da177e4SLinus Torvalds return 
ERR_PTR(-ENOMEM); 218*1da177e4SLinus Torvalds atomic_set(&policy->refcnt, 1); 219*1da177e4SLinus Torvalds switch (mode) { 220*1da177e4SLinus Torvalds case MPOL_INTERLEAVE: 221*1da177e4SLinus Torvalds bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); 222*1da177e4SLinus Torvalds break; 223*1da177e4SLinus Torvalds case MPOL_PREFERRED: 224*1da177e4SLinus Torvalds policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); 225*1da177e4SLinus Torvalds if (policy->v.preferred_node >= MAX_NUMNODES) 226*1da177e4SLinus Torvalds policy->v.preferred_node = -1; 227*1da177e4SLinus Torvalds break; 228*1da177e4SLinus Torvalds case MPOL_BIND: 229*1da177e4SLinus Torvalds policy->v.zonelist = bind_zonelist(nodes); 230*1da177e4SLinus Torvalds if (policy->v.zonelist == NULL) { 231*1da177e4SLinus Torvalds kmem_cache_free(policy_cache, policy); 232*1da177e4SLinus Torvalds return ERR_PTR(-ENOMEM); 233*1da177e4SLinus Torvalds } 234*1da177e4SLinus Torvalds break; 235*1da177e4SLinus Torvalds } 236*1da177e4SLinus Torvalds policy->policy = mode; 237*1da177e4SLinus Torvalds return policy; 238*1da177e4SLinus Torvalds } 239*1da177e4SLinus Torvalds 240*1da177e4SLinus Torvalds /* Ensure all existing pages follow the policy. 
*/ 241*1da177e4SLinus Torvalds static int 242*1da177e4SLinus Torvalds verify_pages(struct mm_struct *mm, 243*1da177e4SLinus Torvalds unsigned long addr, unsigned long end, unsigned long *nodes) 244*1da177e4SLinus Torvalds { 245*1da177e4SLinus Torvalds while (addr < end) { 246*1da177e4SLinus Torvalds struct page *p; 247*1da177e4SLinus Torvalds pte_t *pte; 248*1da177e4SLinus Torvalds pmd_t *pmd; 249*1da177e4SLinus Torvalds pud_t *pud; 250*1da177e4SLinus Torvalds pgd_t *pgd; 251*1da177e4SLinus Torvalds pgd = pgd_offset(mm, addr); 252*1da177e4SLinus Torvalds if (pgd_none(*pgd)) { 253*1da177e4SLinus Torvalds unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; 254*1da177e4SLinus Torvalds if (next > addr) 255*1da177e4SLinus Torvalds break; 256*1da177e4SLinus Torvalds addr = next; 257*1da177e4SLinus Torvalds continue; 258*1da177e4SLinus Torvalds } 259*1da177e4SLinus Torvalds pud = pud_offset(pgd, addr); 260*1da177e4SLinus Torvalds if (pud_none(*pud)) { 261*1da177e4SLinus Torvalds addr = (addr + PUD_SIZE) & PUD_MASK; 262*1da177e4SLinus Torvalds continue; 263*1da177e4SLinus Torvalds } 264*1da177e4SLinus Torvalds pmd = pmd_offset(pud, addr); 265*1da177e4SLinus Torvalds if (pmd_none(*pmd)) { 266*1da177e4SLinus Torvalds addr = (addr + PMD_SIZE) & PMD_MASK; 267*1da177e4SLinus Torvalds continue; 268*1da177e4SLinus Torvalds } 269*1da177e4SLinus Torvalds p = NULL; 270*1da177e4SLinus Torvalds pte = pte_offset_map(pmd, addr); 271*1da177e4SLinus Torvalds if (pte_present(*pte)) 272*1da177e4SLinus Torvalds p = pte_page(*pte); 273*1da177e4SLinus Torvalds pte_unmap(pte); 274*1da177e4SLinus Torvalds if (p) { 275*1da177e4SLinus Torvalds unsigned nid = page_to_nid(p); 276*1da177e4SLinus Torvalds if (!test_bit(nid, nodes)) 277*1da177e4SLinus Torvalds return -EIO; 278*1da177e4SLinus Torvalds } 279*1da177e4SLinus Torvalds addr += PAGE_SIZE; 280*1da177e4SLinus Torvalds } 281*1da177e4SLinus Torvalds return 0; 282*1da177e4SLinus Torvalds } 283*1da177e4SLinus Torvalds 284*1da177e4SLinus Torvalds 
/* Step 1: check the range */ 285*1da177e4SLinus Torvalds static struct vm_area_struct * 286*1da177e4SLinus Torvalds check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 287*1da177e4SLinus Torvalds unsigned long *nodes, unsigned long flags) 288*1da177e4SLinus Torvalds { 289*1da177e4SLinus Torvalds int err; 290*1da177e4SLinus Torvalds struct vm_area_struct *first, *vma, *prev; 291*1da177e4SLinus Torvalds 292*1da177e4SLinus Torvalds first = find_vma(mm, start); 293*1da177e4SLinus Torvalds if (!first) 294*1da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 295*1da177e4SLinus Torvalds prev = NULL; 296*1da177e4SLinus Torvalds for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 297*1da177e4SLinus Torvalds if (!vma->vm_next && vma->vm_end < end) 298*1da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 299*1da177e4SLinus Torvalds if (prev && prev->vm_end < vma->vm_start) 300*1da177e4SLinus Torvalds return ERR_PTR(-EFAULT); 301*1da177e4SLinus Torvalds if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 302*1da177e4SLinus Torvalds err = verify_pages(vma->vm_mm, 303*1da177e4SLinus Torvalds vma->vm_start, vma->vm_end, nodes); 304*1da177e4SLinus Torvalds if (err) { 305*1da177e4SLinus Torvalds first = ERR_PTR(err); 306*1da177e4SLinus Torvalds break; 307*1da177e4SLinus Torvalds } 308*1da177e4SLinus Torvalds } 309*1da177e4SLinus Torvalds prev = vma; 310*1da177e4SLinus Torvalds } 311*1da177e4SLinus Torvalds return first; 312*1da177e4SLinus Torvalds } 313*1da177e4SLinus Torvalds 314*1da177e4SLinus Torvalds /* Apply policy to a single VMA */ 315*1da177e4SLinus Torvalds static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) 316*1da177e4SLinus Torvalds { 317*1da177e4SLinus Torvalds int err = 0; 318*1da177e4SLinus Torvalds struct mempolicy *old = vma->vm_policy; 319*1da177e4SLinus Torvalds 320*1da177e4SLinus Torvalds PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 321*1da177e4SLinus Torvalds vma->vm_start, 
vma->vm_end, vma->vm_pgoff, 322*1da177e4SLinus Torvalds vma->vm_ops, vma->vm_file, 323*1da177e4SLinus Torvalds vma->vm_ops ? vma->vm_ops->set_policy : NULL); 324*1da177e4SLinus Torvalds 325*1da177e4SLinus Torvalds if (vma->vm_ops && vma->vm_ops->set_policy) 326*1da177e4SLinus Torvalds err = vma->vm_ops->set_policy(vma, new); 327*1da177e4SLinus Torvalds if (!err) { 328*1da177e4SLinus Torvalds mpol_get(new); 329*1da177e4SLinus Torvalds vma->vm_policy = new; 330*1da177e4SLinus Torvalds mpol_free(old); 331*1da177e4SLinus Torvalds } 332*1da177e4SLinus Torvalds return err; 333*1da177e4SLinus Torvalds } 334*1da177e4SLinus Torvalds 335*1da177e4SLinus Torvalds /* Step 2: apply policy to a range and do splits. */ 336*1da177e4SLinus Torvalds static int mbind_range(struct vm_area_struct *vma, unsigned long start, 337*1da177e4SLinus Torvalds unsigned long end, struct mempolicy *new) 338*1da177e4SLinus Torvalds { 339*1da177e4SLinus Torvalds struct vm_area_struct *next; 340*1da177e4SLinus Torvalds int err; 341*1da177e4SLinus Torvalds 342*1da177e4SLinus Torvalds err = 0; 343*1da177e4SLinus Torvalds for (; vma && vma->vm_start < end; vma = next) { 344*1da177e4SLinus Torvalds next = vma->vm_next; 345*1da177e4SLinus Torvalds if (vma->vm_start < start) 346*1da177e4SLinus Torvalds err = split_vma(vma->vm_mm, vma, start, 1); 347*1da177e4SLinus Torvalds if (!err && vma->vm_end > end) 348*1da177e4SLinus Torvalds err = split_vma(vma->vm_mm, vma, end, 0); 349*1da177e4SLinus Torvalds if (!err) 350*1da177e4SLinus Torvalds err = policy_vma(vma, new); 351*1da177e4SLinus Torvalds if (err) 352*1da177e4SLinus Torvalds break; 353*1da177e4SLinus Torvalds } 354*1da177e4SLinus Torvalds return err; 355*1da177e4SLinus Torvalds } 356*1da177e4SLinus Torvalds 357*1da177e4SLinus Torvalds /* Change policy for a memory range */ 358*1da177e4SLinus Torvalds asmlinkage long sys_mbind(unsigned long start, unsigned long len, 359*1da177e4SLinus Torvalds unsigned long mode, 360*1da177e4SLinus Torvalds unsigned long 
__user *nmask, unsigned long maxnode, 361*1da177e4SLinus Torvalds unsigned flags) 362*1da177e4SLinus Torvalds { 363*1da177e4SLinus Torvalds struct vm_area_struct *vma; 364*1da177e4SLinus Torvalds struct mm_struct *mm = current->mm; 365*1da177e4SLinus Torvalds struct mempolicy *new; 366*1da177e4SLinus Torvalds unsigned long end; 367*1da177e4SLinus Torvalds DECLARE_BITMAP(nodes, MAX_NUMNODES); 368*1da177e4SLinus Torvalds int err; 369*1da177e4SLinus Torvalds 370*1da177e4SLinus Torvalds if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) 371*1da177e4SLinus Torvalds return -EINVAL; 372*1da177e4SLinus Torvalds if (start & ~PAGE_MASK) 373*1da177e4SLinus Torvalds return -EINVAL; 374*1da177e4SLinus Torvalds if (mode == MPOL_DEFAULT) 375*1da177e4SLinus Torvalds flags &= ~MPOL_MF_STRICT; 376*1da177e4SLinus Torvalds len = (len + PAGE_SIZE - 1) & PAGE_MASK; 377*1da177e4SLinus Torvalds end = start + len; 378*1da177e4SLinus Torvalds if (end < start) 379*1da177e4SLinus Torvalds return -EINVAL; 380*1da177e4SLinus Torvalds if (end == start) 381*1da177e4SLinus Torvalds return 0; 382*1da177e4SLinus Torvalds 383*1da177e4SLinus Torvalds err = get_nodes(nodes, nmask, maxnode, mode); 384*1da177e4SLinus Torvalds if (err) 385*1da177e4SLinus Torvalds return err; 386*1da177e4SLinus Torvalds 387*1da177e4SLinus Torvalds new = mpol_new(mode, nodes); 388*1da177e4SLinus Torvalds if (IS_ERR(new)) 389*1da177e4SLinus Torvalds return PTR_ERR(new); 390*1da177e4SLinus Torvalds 391*1da177e4SLinus Torvalds PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 392*1da177e4SLinus Torvalds mode,nodes[0]); 393*1da177e4SLinus Torvalds 394*1da177e4SLinus Torvalds down_write(&mm->mmap_sem); 395*1da177e4SLinus Torvalds vma = check_range(mm, start, end, nodes, flags); 396*1da177e4SLinus Torvalds err = PTR_ERR(vma); 397*1da177e4SLinus Torvalds if (!IS_ERR(vma)) 398*1da177e4SLinus Torvalds err = mbind_range(vma, start, end, new); 399*1da177e4SLinus Torvalds up_write(&mm->mmap_sem); 
400*1da177e4SLinus Torvalds mpol_free(new); 401*1da177e4SLinus Torvalds return err; 402*1da177e4SLinus Torvalds } 403*1da177e4SLinus Torvalds 404*1da177e4SLinus Torvalds /* Set the process memory policy */ 405*1da177e4SLinus Torvalds asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 406*1da177e4SLinus Torvalds unsigned long maxnode) 407*1da177e4SLinus Torvalds { 408*1da177e4SLinus Torvalds int err; 409*1da177e4SLinus Torvalds struct mempolicy *new; 410*1da177e4SLinus Torvalds DECLARE_BITMAP(nodes, MAX_NUMNODES); 411*1da177e4SLinus Torvalds 412*1da177e4SLinus Torvalds if (mode > MPOL_MAX) 413*1da177e4SLinus Torvalds return -EINVAL; 414*1da177e4SLinus Torvalds err = get_nodes(nodes, nmask, maxnode, mode); 415*1da177e4SLinus Torvalds if (err) 416*1da177e4SLinus Torvalds return err; 417*1da177e4SLinus Torvalds new = mpol_new(mode, nodes); 418*1da177e4SLinus Torvalds if (IS_ERR(new)) 419*1da177e4SLinus Torvalds return PTR_ERR(new); 420*1da177e4SLinus Torvalds mpol_free(current->mempolicy); 421*1da177e4SLinus Torvalds current->mempolicy = new; 422*1da177e4SLinus Torvalds if (new && new->policy == MPOL_INTERLEAVE) 423*1da177e4SLinus Torvalds current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); 424*1da177e4SLinus Torvalds return 0; 425*1da177e4SLinus Torvalds } 426*1da177e4SLinus Torvalds 427*1da177e4SLinus Torvalds /* Fill a zone bitmap for a policy */ 428*1da177e4SLinus Torvalds static void get_zonemask(struct mempolicy *p, unsigned long *nodes) 429*1da177e4SLinus Torvalds { 430*1da177e4SLinus Torvalds int i; 431*1da177e4SLinus Torvalds 432*1da177e4SLinus Torvalds bitmap_zero(nodes, MAX_NUMNODES); 433*1da177e4SLinus Torvalds switch (p->policy) { 434*1da177e4SLinus Torvalds case MPOL_BIND: 435*1da177e4SLinus Torvalds for (i = 0; p->v.zonelist->zones[i]; i++) 436*1da177e4SLinus Torvalds __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); 437*1da177e4SLinus Torvalds break; 438*1da177e4SLinus Torvalds case MPOL_DEFAULT: 
439*1da177e4SLinus Torvalds break; 440*1da177e4SLinus Torvalds case MPOL_INTERLEAVE: 441*1da177e4SLinus Torvalds bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); 442*1da177e4SLinus Torvalds break; 443*1da177e4SLinus Torvalds case MPOL_PREFERRED: 444*1da177e4SLinus Torvalds /* or use current node instead of online map? */ 445*1da177e4SLinus Torvalds if (p->v.preferred_node < 0) 446*1da177e4SLinus Torvalds bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); 447*1da177e4SLinus Torvalds else 448*1da177e4SLinus Torvalds __set_bit(p->v.preferred_node, nodes); 449*1da177e4SLinus Torvalds break; 450*1da177e4SLinus Torvalds default: 451*1da177e4SLinus Torvalds BUG(); 452*1da177e4SLinus Torvalds } 453*1da177e4SLinus Torvalds } 454*1da177e4SLinus Torvalds 455*1da177e4SLinus Torvalds static int lookup_node(struct mm_struct *mm, unsigned long addr) 456*1da177e4SLinus Torvalds { 457*1da177e4SLinus Torvalds struct page *p; 458*1da177e4SLinus Torvalds int err; 459*1da177e4SLinus Torvalds 460*1da177e4SLinus Torvalds err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 461*1da177e4SLinus Torvalds if (err >= 0) { 462*1da177e4SLinus Torvalds err = page_to_nid(p); 463*1da177e4SLinus Torvalds put_page(p); 464*1da177e4SLinus Torvalds } 465*1da177e4SLinus Torvalds return err; 466*1da177e4SLinus Torvalds } 467*1da177e4SLinus Torvalds 468*1da177e4SLinus Torvalds /* Copy a kernel node mask to user space */ 469*1da177e4SLinus Torvalds static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 470*1da177e4SLinus Torvalds void *nodes, unsigned nbytes) 471*1da177e4SLinus Torvalds { 472*1da177e4SLinus Torvalds unsigned long copy = ALIGN(maxnode-1, 64) / 8; 473*1da177e4SLinus Torvalds 474*1da177e4SLinus Torvalds if (copy > nbytes) { 475*1da177e4SLinus Torvalds if (copy > PAGE_SIZE) 476*1da177e4SLinus Torvalds return -EINVAL; 477*1da177e4SLinus Torvalds if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 478*1da177e4SLinus Torvalds return 
-EFAULT; 479*1da177e4SLinus Torvalds copy = nbytes; 480*1da177e4SLinus Torvalds } 481*1da177e4SLinus Torvalds return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; 482*1da177e4SLinus Torvalds } 483*1da177e4SLinus Torvalds 484*1da177e4SLinus Torvalds /* Retrieve NUMA policy */ 485*1da177e4SLinus Torvalds asmlinkage long sys_get_mempolicy(int __user *policy, 486*1da177e4SLinus Torvalds unsigned long __user *nmask, 487*1da177e4SLinus Torvalds unsigned long maxnode, 488*1da177e4SLinus Torvalds unsigned long addr, unsigned long flags) 489*1da177e4SLinus Torvalds { 490*1da177e4SLinus Torvalds int err, pval; 491*1da177e4SLinus Torvalds struct mm_struct *mm = current->mm; 492*1da177e4SLinus Torvalds struct vm_area_struct *vma = NULL; 493*1da177e4SLinus Torvalds struct mempolicy *pol = current->mempolicy; 494*1da177e4SLinus Torvalds 495*1da177e4SLinus Torvalds if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 496*1da177e4SLinus Torvalds return -EINVAL; 497*1da177e4SLinus Torvalds if (nmask != NULL && maxnode < MAX_NUMNODES) 498*1da177e4SLinus Torvalds return -EINVAL; 499*1da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) { 500*1da177e4SLinus Torvalds down_read(&mm->mmap_sem); 501*1da177e4SLinus Torvalds vma = find_vma_intersection(mm, addr, addr+1); 502*1da177e4SLinus Torvalds if (!vma) { 503*1da177e4SLinus Torvalds up_read(&mm->mmap_sem); 504*1da177e4SLinus Torvalds return -EFAULT; 505*1da177e4SLinus Torvalds } 506*1da177e4SLinus Torvalds if (vma->vm_ops && vma->vm_ops->get_policy) 507*1da177e4SLinus Torvalds pol = vma->vm_ops->get_policy(vma, addr); 508*1da177e4SLinus Torvalds else 509*1da177e4SLinus Torvalds pol = vma->vm_policy; 510*1da177e4SLinus Torvalds } else if (addr) 511*1da177e4SLinus Torvalds return -EINVAL; 512*1da177e4SLinus Torvalds 513*1da177e4SLinus Torvalds if (!pol) 514*1da177e4SLinus Torvalds pol = &default_policy; 515*1da177e4SLinus Torvalds 516*1da177e4SLinus Torvalds if (flags & MPOL_F_NODE) { 517*1da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) { 
518*1da177e4SLinus Torvalds err = lookup_node(mm, addr); 519*1da177e4SLinus Torvalds if (err < 0) 520*1da177e4SLinus Torvalds goto out; 521*1da177e4SLinus Torvalds pval = err; 522*1da177e4SLinus Torvalds } else if (pol == current->mempolicy && 523*1da177e4SLinus Torvalds pol->policy == MPOL_INTERLEAVE) { 524*1da177e4SLinus Torvalds pval = current->il_next; 525*1da177e4SLinus Torvalds } else { 526*1da177e4SLinus Torvalds err = -EINVAL; 527*1da177e4SLinus Torvalds goto out; 528*1da177e4SLinus Torvalds } 529*1da177e4SLinus Torvalds } else 530*1da177e4SLinus Torvalds pval = pol->policy; 531*1da177e4SLinus Torvalds 532*1da177e4SLinus Torvalds if (vma) { 533*1da177e4SLinus Torvalds up_read(¤t->mm->mmap_sem); 534*1da177e4SLinus Torvalds vma = NULL; 535*1da177e4SLinus Torvalds } 536*1da177e4SLinus Torvalds 537*1da177e4SLinus Torvalds if (policy && put_user(pval, policy)) 538*1da177e4SLinus Torvalds return -EFAULT; 539*1da177e4SLinus Torvalds 540*1da177e4SLinus Torvalds err = 0; 541*1da177e4SLinus Torvalds if (nmask) { 542*1da177e4SLinus Torvalds DECLARE_BITMAP(nodes, MAX_NUMNODES); 543*1da177e4SLinus Torvalds get_zonemask(pol, nodes); 544*1da177e4SLinus Torvalds err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); 545*1da177e4SLinus Torvalds } 546*1da177e4SLinus Torvalds 547*1da177e4SLinus Torvalds out: 548*1da177e4SLinus Torvalds if (vma) 549*1da177e4SLinus Torvalds up_read(¤t->mm->mmap_sem); 550*1da177e4SLinus Torvalds return err; 551*1da177e4SLinus Torvalds } 552*1da177e4SLinus Torvalds 553*1da177e4SLinus Torvalds #ifdef CONFIG_COMPAT 554*1da177e4SLinus Torvalds 555*1da177e4SLinus Torvalds asmlinkage long compat_sys_get_mempolicy(int __user *policy, 556*1da177e4SLinus Torvalds compat_ulong_t __user *nmask, 557*1da177e4SLinus Torvalds compat_ulong_t maxnode, 558*1da177e4SLinus Torvalds compat_ulong_t addr, compat_ulong_t flags) 559*1da177e4SLinus Torvalds { 560*1da177e4SLinus Torvalds long err; 561*1da177e4SLinus Torvalds unsigned long __user *nm = NULL; 
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	/* Cap the user-supplied mask at MAX_NUMNODES bits and round the
	   scratch-copy size up to a whole number of native longs. */
	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		/* Read the native-format mask back from the user-space
		   scratch area and convert it to compat_ulong_t layout. */
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

/*
 * 32-bit compat wrapper for set_mempolicy(2): read the compat_ulong_t
 * node bitmap, rewrite it in native long format into a
 * compat_alloc_user_space() scratch area, and forward to
 * sys_set_mempolicy().
 */
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		/* compat user bitmap -> kernel bitmap -> native user
		   scratch.  Any non-zero status from either copy is
		   collapsed to -EFAULT below. */
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

/*
 * 32-bit compat wrapper for mbind(2).  Same bitmap conversion dance as
 * compat_sys_set_mempolicy() above, then forwards to sys_mbind().
 */
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA.
 *
 * Precedence: a vm_ops->get_policy() hook (shared-memory objects) wins,
 * then a non-default per-VMA policy, then the task policy, and finally
 * the global default_policy.  Never returns NULL. */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = current->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy.
 *
 * MPOL_PREFERRED resolves node -1 to the local node; MPOL_BIND uses its
 * private zonelist when the requested zone is high enough and the cpuset
 * still allows it, otherwise it falls back to the local node like
 * MPOL_DEFAULT.  MPOL_INTERLEAVE is handled by the callers' interleave
 * paths and should not reach here. */
static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		/* NOTE(review): compares the raw gfp value against
		   policy_zone -- relies on the zone modifier bits being
		   the low bits of gfp; verify against GFP_ZONEMASK. */
		if (gfp >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}

/* Do dynamic interleaving for a process.
 *
 * Returns the task's current interleave node (il_next) and advances
 * il_next to the next set bit in the policy's node mask, wrapping to
 * the first set bit at the end. */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset.
 *
 * Maps "off mod <number of nodes in the mask>" onto the off'th set bit,
 * so a given page offset always lands on the same node.
 * NOTE(review): if the policy's node mask is empty, nnodes is 0 and the
 * modulo below divides by zero -- presumably set-policy validation
 * guarantees a non-empty interleave mask; verify. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
	return nid;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!node_online(nid));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, order, zl);
	/* Count an interleave hit only when the page actually came from
	   the first (preferred) zone of the target node's zonelist; the
	   get_cpu()/put_cpu() pair pins us while touching the per-cpu
	   pageset counter. */
	if (page && page_zone(page) == zl->zones[0]) {
		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * 	alloc_page_vma	- Allocate a page for a VMA.
 *
 * 	@gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * 	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * 	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *	mm_struct of the VMA to prevent it from going away. Should be used for
 *	all allocations for pages that will be mapped into
 * 	user space. Returns NULL when no page can be allocated.
 *
 *	Should be called with the mm_sem of the vma hold.
 */
struct page *
alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			/* Interleave statically by file/mapping offset so
			   the same page always lands on the same node. */
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	/* Non-interleave policies reduce to picking a zonelist. */
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER   user allocation,
 *      	%GFP_KERNEL kernel allocation,
 *      	%GFP_HIGHMEM highmem allocation,
 *      	%GFP_FS     don't call back into a file system.
 *      	%GFP_ATOMIC don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool.  When not in
 *	interrupt context and apply the current process NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_current_mems_allowed() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	/* Only consult the cpuset when we are allowed to sleep and are
	   allocating on behalf of the current task. */
	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	/* Interrupts ignore NUMA policy (see file header comment). */
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy.
 *
 * Duplicates @old into a fresh cache object with refcount 1; an
 * MPOL_BIND policy also gets a private copy of its kmalloc'd zonelist
 * (sized via ksize() of the original allocation).
 * Returns ERR_PTR(-ENOMEM) on allocation failure. */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path
of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	/* Returns 1 when the two policies are equivalent, 0 otherwise
	   (including when either pointer is NULL). */
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		/* Compare the NULL-terminated zone arrays element-wise;
		   equal only if b ends exactly where a does. */
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor.
 *
 * Drops one reference; frees the policy (and an MPOL_BIND zonelist)
 * only when the refcount reaches zero. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	/* Poison the policy field before returning the object to the
	   cache. */
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy.
 Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		/* First zone of the bind zonelist determines the node. */
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation.
 *
 * Returns 1 if @nid is an acceptable fallback node under the effective
 * policy; only MPOL_BIND actually restricts the node set. */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	/* Standard rb-tree descent: ranges are keyed by [start, end). */
	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/* Walk back to the leftmost node that still intersects the
	   requested range, so callers can iterate forward from it. */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	/* Ranges in the tree must not overlap; hitting an existing range
	   that neither precedes nor follows @new is a bug. */
	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx.
 *
 * Returns the policy with an extra reference taken (caller must
 * mpol_free() it), or NULL when no range covers @idx. */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

/* Unlink @n from the tree and drop its policy reference.
 * Caller holds sp->lock.
 * NOTE(review): the "%lx-l%x" format looks like a typo for "%lx-%lx";
 * debug output would print a literal 'l' -- verify before relying on it. */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-l%x\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

/* Allocate an sp_node covering [start, end) holding a new reference on
 * @pol.  Returns NULL on allocation failure.  May sleep (GFP_KERNEL). */
struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range.
 *
 * Removes or trims every existing range intersecting [start, end) and
 * inserts @new (if non-NULL).  When an old range completely spans the
 * new one it must be split in two; the second half (new2) cannot be
 * allocated under sp->lock, so the lock is dropped, the node allocated,
 * and the whole scan restarted from the top. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					/* Must not allocate while holding
					   the spinlock; drop it and retry. */
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	/* A pre-allocated split node that turned out to be unneeded
	   (the tree changed while the lock was dropped). */
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

/* Install @npol as the shared policy for @vma's page range in @info;
 * a NULL @npol just removes any existing policy for that range.
 * NOTE(review): on the error path the sp_node is freed without dropping
 * the reference sp_alloc() took on @npol -- possible refcount leak;
 * verify against mpol_get/mpol_free pairing. */
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol? npol->policy : -1,
		npol ? npol->v.nodes[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	/* Save the successor before freeing each node: kmem_cache_free
	   invalidates n->nd. */
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
	p->root = RB_ROOT;
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
							MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}