/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
46 * 47 * Only the highest zone in the zone hierarchy gets policied. Allocations 48 * requesting a lower zone just use default policy. This implies that 49 * on systems with highmem kernel lowmem allocation don't get policied. 50 * Same with GFP_DMA allocations. 51 * 52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 53 * all users and remembered even when nobody has memory mapped. 54 */ 55 56 /* Notebook: 57 fix mmap readahead to honour policy and enable policy for any page cache 58 object 59 statistics for bigpages 60 global policy for page cache? currently it uses process policy. Requires 61 first item above. 62 handle mremap for shared memory (currently ignored for the policy) 63 grows down? 64 make bind policy root only? It can trigger oom much faster and the 65 kernel is not always grateful with that. 66 could replace all the switch()es with a mempolicy_ops structure. 67 */ 68 69 #include <linux/mempolicy.h> 70 #include <linux/mm.h> 71 #include <linux/highmem.h> 72 #include <linux/hugetlb.h> 73 #include <linux/kernel.h> 74 #include <linux/sched.h> 75 #include <linux/mm.h> 76 #include <linux/nodemask.h> 77 #include <linux/cpuset.h> 78 #include <linux/gfp.h> 79 #include <linux/slab.h> 80 #include <linux/string.h> 81 #include <linux/module.h> 82 #include <linux/interrupt.h> 83 #include <linux/init.h> 84 #include <linux/compat.h> 85 #include <linux/mempolicy.h> 86 #include <asm/tlbflush.h> 87 #include <asm/uaccess.h> 88 89 static kmem_cache_t *policy_cache; 90 static kmem_cache_t *sn_cache; 91 92 #define PDprintk(fmt...) 93 94 /* Highest zone. An specific allocation for a zone below that is not 95 policied. 
*/
static int policy_zone;

/* Process-wide fallback policy; refcount starts at 1 so it is never freed. */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/*
 * Do sanity checking on a policy: the node mask must be empty for
 * MPOL_DEFAULT, non-empty for MPOL_BIND/MPOL_INTERLEAVE, and in every
 * case a subset of the currently online nodes.
 */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        /* Worst case: every zone of every node, plus the NULL terminator. */
        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for_each_node_mask(nd, *nodes) {
                int k;
                /* Highest zone first, so allocation prefers it. */
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                                continue;
                        zl->zones[num++] = z;
                        /* Remember the highest zone any BIND policy uses. */
                        if (k > policy_zone)
                                policy_zone = k;
                }
        }
        zl->zones[num] = NULL;
        return zl;
}

/*
 * Create a new policy.  Returns NULL for MPOL_DEFAULT (no object needed),
 * an ERR_PTR on allocation failure, or a refcounted mempolicy.
 */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                /* Empty mask: -1 means "allocate on the local node". */
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/*
 * Ensure all existing pages follow the policy.
 * Returns nonzero when a mapped page lies on a node not in *nodes
 * (the scan stops early, leaving addr != end).
 */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pte_t *orig_pte;
        pte_t *pte;
        spinlock_t *ptl;

        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                nid = page_to_nid(page);
                if (!node_isset(nid, *nodes))
                        break; /* policy violation */
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
}

/* Walk the PMDs under one PUD entry; -EIO on a policy violation. */
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

/* Walk the PUDs under one PGD entry; -EIO on a policy violation. */
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}

/* Top of the page-table walk over [addr, end) of one VMA. */
static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}

/*
 * Step 1: check the range.
 * The range must be fully covered by VMAs (no holes before or between
 * them); with MPOL_MF_STRICT, every present page of a non-hugetlb VMA
 * must already be on an allowed node.  Returns the first VMA of the
 * range or an ERR_PTR.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            nodemask_t *nodes, unsigned long flags)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        unsigned long endvma = vma->vm_end;
                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        /* Objects with their own policy store (e.g. shmem) get first say. */
        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits.
*/ 315 static int mbind_range(struct vm_area_struct *vma, unsigned long start, 316 unsigned long end, struct mempolicy *new) 317 { 318 struct vm_area_struct *next; 319 int err; 320 321 err = 0; 322 for (; vma && vma->vm_start < end; vma = next) { 323 next = vma->vm_next; 324 if (vma->vm_start < start) 325 err = split_vma(vma->vm_mm, vma, start, 1); 326 if (!err && vma->vm_end > end) 327 err = split_vma(vma->vm_mm, vma, end, 0); 328 if (!err) 329 err = policy_vma(vma, new); 330 if (err) 331 break; 332 } 333 return err; 334 } 335 336 static int contextualize_policy(int mode, nodemask_t *nodes) 337 { 338 if (!nodes) 339 return 0; 340 341 /* Update current mems_allowed */ 342 cpuset_update_current_mems_allowed(); 343 /* Ignore nodes not set in current->mems_allowed */ 344 cpuset_restrict_to_mems_allowed(nodes->bits); 345 return mpol_check_policy(mode, nodes); 346 } 347 348 long do_mbind(unsigned long start, unsigned long len, 349 unsigned long mode, nodemask_t *nmask, unsigned long flags) 350 { 351 struct vm_area_struct *vma; 352 struct mm_struct *mm = current->mm; 353 struct mempolicy *new; 354 unsigned long end; 355 int err; 356 357 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) 358 return -EINVAL; 359 if (start & ~PAGE_MASK) 360 return -EINVAL; 361 if (mode == MPOL_DEFAULT) 362 flags &= ~MPOL_MF_STRICT; 363 len = (len + PAGE_SIZE - 1) & PAGE_MASK; 364 end = start + len; 365 if (end < start) 366 return -EINVAL; 367 if (end == start) 368 return 0; 369 if (mpol_check_policy(mode, nmask)) 370 return -EINVAL; 371 new = mpol_new(mode, nmask); 372 if (IS_ERR(new)) 373 return PTR_ERR(new); 374 375 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 376 mode,nodes_addr(nodes)[0]); 377 378 down_write(&mm->mmap_sem); 379 vma = check_range(mm, start, end, nmask, flags); 380 err = PTR_ERR(vma); 381 if (!IS_ERR(vma)) 382 err = mbind_range(vma, start, end, new); 383 up_write(&mm->mmap_sem); 384 mpol_free(new); 385 return err; 386 } 387 388 /* Set the 
process memory policy */ 389 long do_set_mempolicy(int mode, nodemask_t *nodes) 390 { 391 struct mempolicy *new; 392 393 if (contextualize_policy(mode, nodes)) 394 return -EINVAL; 395 new = mpol_new(mode, nodes); 396 if (IS_ERR(new)) 397 return PTR_ERR(new); 398 mpol_free(current->mempolicy); 399 current->mempolicy = new; 400 if (new && new->policy == MPOL_INTERLEAVE) 401 current->il_next = first_node(new->v.nodes); 402 return 0; 403 } 404 405 /* Fill a zone bitmap for a policy */ 406 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) 407 { 408 int i; 409 410 nodes_clear(*nodes); 411 switch (p->policy) { 412 case MPOL_BIND: 413 for (i = 0; p->v.zonelist->zones[i]; i++) 414 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, 415 *nodes); 416 break; 417 case MPOL_DEFAULT: 418 break; 419 case MPOL_INTERLEAVE: 420 *nodes = p->v.nodes; 421 break; 422 case MPOL_PREFERRED: 423 /* or use current node instead of online map? */ 424 if (p->v.preferred_node < 0) 425 *nodes = node_online_map; 426 else 427 node_set(p->v.preferred_node, *nodes); 428 break; 429 default: 430 BUG(); 431 } 432 } 433 434 static int lookup_node(struct mm_struct *mm, unsigned long addr) 435 { 436 struct page *p; 437 int err; 438 439 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 440 if (err >= 0) { 441 err = page_to_nid(p); 442 put_page(p); 443 } 444 return err; 445 } 446 447 /* Retrieve NUMA policy */ 448 long do_get_mempolicy(int *policy, nodemask_t *nmask, 449 unsigned long addr, unsigned long flags) 450 { 451 int err; 452 struct mm_struct *mm = current->mm; 453 struct vm_area_struct *vma = NULL; 454 struct mempolicy *pol = current->mempolicy; 455 456 cpuset_update_current_mems_allowed(); 457 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 458 return -EINVAL; 459 if (flags & MPOL_F_ADDR) { 460 down_read(&mm->mmap_sem); 461 vma = find_vma_intersection(mm, addr, addr+1); 462 if (!vma) { 463 up_read(&mm->mmap_sem); 464 return -EFAULT; 465 } 466 if 
(vma->vm_ops && vma->vm_ops->get_policy) 467 pol = vma->vm_ops->get_policy(vma, addr); 468 else 469 pol = vma->vm_policy; 470 } else if (addr) 471 return -EINVAL; 472 473 if (!pol) 474 pol = &default_policy; 475 476 if (flags & MPOL_F_NODE) { 477 if (flags & MPOL_F_ADDR) { 478 err = lookup_node(mm, addr); 479 if (err < 0) 480 goto out; 481 *policy = err; 482 } else if (pol == current->mempolicy && 483 pol->policy == MPOL_INTERLEAVE) { 484 *policy = current->il_next; 485 } else { 486 err = -EINVAL; 487 goto out; 488 } 489 } else 490 *policy = pol->policy; 491 492 if (vma) { 493 up_read(¤t->mm->mmap_sem); 494 vma = NULL; 495 } 496 497 err = 0; 498 if (nmask) 499 get_zonemask(pol, nmask); 500 501 out: 502 if (vma) 503 up_read(¤t->mm->mmap_sem); 504 return err; 505 } 506 507 /* 508 * User space interface with variable sized bitmaps for nodelists. 509 */ 510 511 /* Copy a node mask from user space. */ 512 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 513 unsigned long maxnode) 514 { 515 unsigned long k; 516 unsigned long nlongs; 517 unsigned long endmask; 518 519 --maxnode; 520 nodes_clear(*nodes); 521 if (maxnode == 0 || !nmask) 522 return 0; 523 524 nlongs = BITS_TO_LONGS(maxnode); 525 if ((maxnode % BITS_PER_LONG) == 0) 526 endmask = ~0UL; 527 else 528 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 529 530 /* When the user specified more nodes than supported just check 531 if the non supported part is all zero. 
*/ 532 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 533 if (nlongs > PAGE_SIZE/sizeof(long)) 534 return -EINVAL; 535 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 536 unsigned long t; 537 if (get_user(t, nmask + k)) 538 return -EFAULT; 539 if (k == nlongs - 1) { 540 if (t & endmask) 541 return -EINVAL; 542 } else if (t) 543 return -EINVAL; 544 } 545 nlongs = BITS_TO_LONGS(MAX_NUMNODES); 546 endmask = ~0UL; 547 } 548 549 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) 550 return -EFAULT; 551 nodes_addr(*nodes)[nlongs-1] &= endmask; 552 return 0; 553 } 554 555 /* Copy a kernel node mask to user space */ 556 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 557 nodemask_t *nodes) 558 { 559 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 560 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); 561 562 if (copy > nbytes) { 563 if (copy > PAGE_SIZE) 564 return -EINVAL; 565 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 566 return -EFAULT; 567 copy = nbytes; 568 } 569 return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; 570 } 571 572 asmlinkage long sys_mbind(unsigned long start, unsigned long len, 573 unsigned long mode, 574 unsigned long __user *nmask, unsigned long maxnode, 575 unsigned flags) 576 { 577 nodemask_t nodes; 578 int err; 579 580 err = get_nodes(&nodes, nmask, maxnode); 581 if (err) 582 return err; 583 return do_mbind(start, len, mode, &nodes, flags); 584 } 585 586 /* Set the process memory policy */ 587 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 588 unsigned long maxnode) 589 { 590 int err; 591 nodemask_t nodes; 592 593 if (mode < 0 || mode > MPOL_MAX) 594 return -EINVAL; 595 err = get_nodes(&nodes, nmask, maxnode); 596 if (err) 597 return err; 598 return do_set_mempolicy(mode, &nodes); 599 } 600 601 /* Retrieve NUMA policy */ 602 asmlinkage long sys_get_mempolicy(int __user *policy, 603 unsigned long __user *nmask, 604 unsigned long maxnode, 605 unsigned long addr, unsigned long flags) 606 { 607 int err, pval; 608 nodemask_t nodes; 609 610 if (nmask != NULL && maxnode < MAX_NUMNODES) 611 return -EINVAL; 612 613 err = do_get_mempolicy(&pval, &nodes, addr, flags); 614 615 if (err) 616 return err; 617 618 if (policy && put_user(pval, policy)) 619 return -EFAULT; 620 621 if (nmask) 622 err = copy_nodes_to_user(nmask, maxnode, &nodes); 623 624 return err; 625 } 626 627 #ifdef CONFIG_COMPAT 628 629 asmlinkage long compat_sys_get_mempolicy(int __user *policy, 630 compat_ulong_t __user *nmask, 631 compat_ulong_t maxnode, 632 compat_ulong_t addr, compat_ulong_t flags) 633 { 634 long err; 635 unsigned long __user *nm = NULL; 636 unsigned long nr_bits, alloc_size; 637 DECLARE_BITMAP(bm, MAX_NUMNODES); 638 639 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 640 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 641 642 if (nmask) 643 nm = compat_alloc_user_space(alloc_size); 644 645 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 646 647 if (!err && nmask) { 648 err = copy_from_user(bm, nm, alloc_size); 649 /* 
ensure entire bitmap is zeroed */ 650 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 651 err |= compat_put_bitmap(nmask, bm, nr_bits); 652 } 653 654 return err; 655 } 656 657 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, 658 compat_ulong_t maxnode) 659 { 660 long err = 0; 661 unsigned long __user *nm = NULL; 662 unsigned long nr_bits, alloc_size; 663 DECLARE_BITMAP(bm, MAX_NUMNODES); 664 665 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 666 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 667 668 if (nmask) { 669 err = compat_get_bitmap(bm, nmask, nr_bits); 670 nm = compat_alloc_user_space(alloc_size); 671 err |= copy_to_user(nm, bm, alloc_size); 672 } 673 674 if (err) 675 return -EFAULT; 676 677 return sys_set_mempolicy(mode, nm, nr_bits+1); 678 } 679 680 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, 681 compat_ulong_t mode, compat_ulong_t __user *nmask, 682 compat_ulong_t maxnode, compat_ulong_t flags) 683 { 684 long err = 0; 685 unsigned long __user *nm = NULL; 686 unsigned long nr_bits, alloc_size; 687 nodemask_t bm; 688 689 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 690 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 691 692 if (nmask) { 693 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); 694 nm = compat_alloc_user_space(alloc_size); 695 err |= copy_to_user(nm, nodes_addr(bm), alloc_size); 696 } 697 698 if (err) 699 return -EFAULT; 700 701 return sys_mbind(start, len, mode, nm, nr_bits+1, flags); 702 } 703 704 #endif 705 706 /* Return effective policy for a VMA */ 707 struct mempolicy * 708 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) 709 { 710 struct mempolicy *pol = task->mempolicy; 711 712 if (vma) { 713 if (vma->vm_ops && vma->vm_ops->get_policy) 714 pol = vma->vm_ops->get_policy(vma, addr); 715 else if (vma->vm_policy && 716 vma->vm_policy->policy != MPOL_DEFAULT) 717 pol = vma->vm_policy; 718 } 719 if (!pol) 720 
pol = &default_policy; 721 return pol; 722 } 723 724 /* Return a zonelist representing a mempolicy */ 725 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) 726 { 727 int nd; 728 729 switch (policy->policy) { 730 case MPOL_PREFERRED: 731 nd = policy->v.preferred_node; 732 if (nd < 0) 733 nd = numa_node_id(); 734 break; 735 case MPOL_BIND: 736 /* Lower zones don't get a policy applied */ 737 /* Careful: current->mems_allowed might have moved */ 738 if (gfp_zone(gfp) >= policy_zone) 739 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) 740 return policy->v.zonelist; 741 /*FALL THROUGH*/ 742 case MPOL_INTERLEAVE: /* should not happen */ 743 case MPOL_DEFAULT: 744 nd = numa_node_id(); 745 break; 746 default: 747 nd = 0; 748 BUG(); 749 } 750 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); 751 } 752 753 /* Do dynamic interleaving for a process */ 754 static unsigned interleave_nodes(struct mempolicy *policy) 755 { 756 unsigned nid, next; 757 struct task_struct *me = current; 758 759 nid = me->il_next; 760 next = next_node(nid, policy->v.nodes); 761 if (next >= MAX_NUMNODES) 762 next = first_node(policy->v.nodes); 763 me->il_next = next; 764 return nid; 765 } 766 767 /* Do static interleaving for a VMA with known offset. */ 768 static unsigned offset_il_node(struct mempolicy *pol, 769 struct vm_area_struct *vma, unsigned long off) 770 { 771 unsigned nnodes = nodes_weight(pol->v.nodes); 772 unsigned target = (unsigned)off % nnodes; 773 int c; 774 int nid = -1; 775 776 c = 0; 777 do { 778 nid = next_node(nid, pol->v.nodes); 779 c++; 780 } while (c <= target); 781 return nid; 782 } 783 784 /* Allocate a page in interleaved policy. 785 Own path because it needs to do special accounting. 
*/
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                          unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        /* Count a hit only if the page came from the first/preferred zone. */
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 * %GFP_USER    user allocation.
 * %GFP_KERNEL  kernel allocations,
 * %GFP_HIGHMEM highmem/user allocations,
 * %GFP_FS      allocation should not call back into a file system.
 * %GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
                if (vma) {
                        /* Static interleave keyed off the file/mapping
                           offset so the node is stable per page. */
                        unsigned long off;
                        off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
                } else {
                        /* fall back to process interleaving */
                        nid = interleave_nodes(pol);
                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 * %GFP_USER   user allocation,
 * %GFP_KERNEL kernel allocation,
 * %GFP_HIGHMEM highmem allocation,
 * %GFP_FS     don't call back into a file system.
 * %GFP_ATOMIC don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool, applying the current
 * process NUMA policy when not in interrupt context.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        /* MPOL_BIND owns its zonelist, so it must be deep-copied. */
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                /* Equal iff both NULL-terminated zone arrays match. */
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor.
*/
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        /* NOTE(review): resetting the mode looks defensive against stale
           use after the object returns to the cache — confirm intent. */
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_DEFAULT:
                return numa_node_id();
        case MPOL_BIND:
                /* First zone of the bind list determines the node. */
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
        case MPOL_PREFERRED:
                return pol->v.preferred_node >= 0 ?
                                pol->v.preferred_node : numa_node_id();
        }
        BUG();
        return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_PREFERRED:
        case MPOL_DEFAULT:
        case MPOL_INTERLEAVE:
                /* These policies allow fallback to any node. */
                return 1;
        case MPOL_BIND: {
                /* Only nodes present in the bind zonelist are valid. */
                struct zone **z;
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)
                                return 1;
                return 0;
        }
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
993 */ 994 995 /* lookup first element intersecting start-end */ 996 /* Caller holds sp->lock */ 997 static struct sp_node * 998 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 999 { 1000 struct rb_node *n = sp->root.rb_node; 1001 1002 while (n) { 1003 struct sp_node *p = rb_entry(n, struct sp_node, nd); 1004 1005 if (start >= p->end) 1006 n = n->rb_right; 1007 else if (end <= p->start) 1008 n = n->rb_left; 1009 else 1010 break; 1011 } 1012 if (!n) 1013 return NULL; 1014 for (;;) { 1015 struct sp_node *w = NULL; 1016 struct rb_node *prev = rb_prev(n); 1017 if (!prev) 1018 break; 1019 w = rb_entry(prev, struct sp_node, nd); 1020 if (w->end <= start) 1021 break; 1022 n = prev; 1023 } 1024 return rb_entry(n, struct sp_node, nd); 1025 } 1026 1027 /* Insert a new shared policy into the list. */ 1028 /* Caller holds sp->lock */ 1029 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 1030 { 1031 struct rb_node **p = &sp->root.rb_node; 1032 struct rb_node *parent = NULL; 1033 struct sp_node *nd; 1034 1035 while (*p) { 1036 parent = *p; 1037 nd = rb_entry(parent, struct sp_node, nd); 1038 if (new->start < nd->start) 1039 p = &(*p)->rb_left; 1040 else if (new->end > nd->end) 1041 p = &(*p)->rb_right; 1042 else 1043 BUG(); 1044 } 1045 rb_link_node(&new->nd, parent, p); 1046 rb_insert_color(&new->nd, &sp->root); 1047 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, 1048 new->policy ? 
new->policy->policy : 0); 1049 } 1050 1051 /* Find shared policy intersecting idx */ 1052 struct mempolicy * 1053 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 1054 { 1055 struct mempolicy *pol = NULL; 1056 struct sp_node *sn; 1057 1058 if (!sp->root.rb_node) 1059 return NULL; 1060 spin_lock(&sp->lock); 1061 sn = sp_lookup(sp, idx, idx+1); 1062 if (sn) { 1063 mpol_get(sn->policy); 1064 pol = sn->policy; 1065 } 1066 spin_unlock(&sp->lock); 1067 return pol; 1068 } 1069 1070 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 1071 { 1072 PDprintk("deleting %lx-l%x\n", n->start, n->end); 1073 rb_erase(&n->nd, &sp->root); 1074 mpol_free(n->policy); 1075 kmem_cache_free(sn_cache, n); 1076 } 1077 1078 struct sp_node * 1079 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) 1080 { 1081 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 1082 1083 if (!n) 1084 return NULL; 1085 n->start = start; 1086 n->end = end; 1087 mpol_get(pol); 1088 n->policy = pol; 1089 return n; 1090 } 1091 1092 /* Replace a policy range. */ 1093 static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 1094 unsigned long end, struct sp_node *new) 1095 { 1096 struct sp_node *n, *new2 = NULL; 1097 1098 restart: 1099 spin_lock(&sp->lock); 1100 n = sp_lookup(sp, start, end); 1101 /* Take care of old policies in the same range. */ 1102 while (n && n->start < end) { 1103 struct rb_node *next = rb_next(&n->nd); 1104 if (n->start >= start) { 1105 if (n->end <= end) 1106 sp_delete(sp, n); 1107 else 1108 n->start = end; 1109 } else { 1110 /* Old policy spanning whole new range. 
*/ 1111 if (n->end > end) { 1112 if (!new2) { 1113 spin_unlock(&sp->lock); 1114 new2 = sp_alloc(end, n->end, n->policy); 1115 if (!new2) 1116 return -ENOMEM; 1117 goto restart; 1118 } 1119 n->end = start; 1120 sp_insert(sp, new2); 1121 new2 = NULL; 1122 break; 1123 } else 1124 n->end = start; 1125 } 1126 if (!next) 1127 break; 1128 n = rb_entry(next, struct sp_node, nd); 1129 } 1130 if (new) 1131 sp_insert(sp, new); 1132 spin_unlock(&sp->lock); 1133 if (new2) { 1134 mpol_free(new2->policy); 1135 kmem_cache_free(sn_cache, new2); 1136 } 1137 return 0; 1138 } 1139 1140 int mpol_set_shared_policy(struct shared_policy *info, 1141 struct vm_area_struct *vma, struct mempolicy *npol) 1142 { 1143 int err; 1144 struct sp_node *new = NULL; 1145 unsigned long sz = vma_pages(vma); 1146 1147 PDprintk("set_shared_policy %lx sz %lu %d %lx\n", 1148 vma->vm_pgoff, 1149 sz, npol? npol->policy : -1, 1150 npol ? nodes_addr(npol->v.nodes)[0] : -1); 1151 1152 if (npol) { 1153 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 1154 if (!new) 1155 return -ENOMEM; 1156 } 1157 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 1158 if (err && new) 1159 kmem_cache_free(sn_cache, new); 1160 return err; 1161 } 1162 1163 /* Free a backing policy store on inode delete. 
*/
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                /* Advance before erasing the current node. */
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                          const nodemask_t *new)
{
        nodemask_t tmp;

        if (!pol)
                return;

        switch (pol->policy) {
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                nodes_remap(tmp, pol->v.nodes, *old, *new);
                pol->v.nodes = tmp;
                /* Keep the interleave cursor valid in the new mask. */
                current->il_next = node_remap(current->il_next, *old, *new);
                break;
        case MPOL_PREFERRED:
                pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                                   *old, *new);
                break;
        case MPOL_BIND: {
                nodemask_t nodes;
                struct zone **z;
                struct zonelist *zonelist;

                /* Recover the node mask from the zonelist, remap it,
                   then rebuild a fresh zonelist from the result. */
                nodes_clear(nodes);
                for (z = pol->v.zonelist->zones; *z; z++)
                        node_set((*z)->zone_pgdat->node_id, nodes);
                nodes_remap(tmp, nodes, *old, *new);
                nodes = tmp;

                zonelist = bind_zonelist(&nodes);

                /* If no mem, then zonelist is NULL and we keep old zonelist.
                 * If that old zonelist has no remaining mems_allowed nodes,
                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
                 */

                if (zonelist) {
                        /* Good - got mem - substitute new zonelist */
                        kfree(pol->v.zonelist);
                        pol->v.zonelist = zonelist;
                }
                break;
        }
        default:
                BUG();
                break;
        }
}

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
        rebind_policy(current->mempolicy, old, new);
}