/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto unevictable list.
 *
 * returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
		count++;
	}
	return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
				 unsigned long addr, void *old)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		goto out;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto unlock;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) ||
	    migration_entry_to_page(entry) != old)
		goto unlock;

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, ptep);
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return SWAP_AGAIN;
}
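/*
 * The migration entries that remove_migration_pte() tears down are
 * installed by try_to_unmap(TTU_MIGRATION) in mm/rmap.c; as a rough
 * sketch (not the literal rmap code), each mapping pte is replaced by
 * a swap-format pte that encodes the old page and its writability:
 *
 *	swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 */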
/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	rmap_walk(new, remove_migration_pte, old);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once radix-tree replacement of page migration has started,
	 * page_count *must* be zero. And we don't want to call
	 * wait_on_page_locked() against a page without holding a
	 * reference from get_page().
	 * So we use get_page_unless_zero() here. Even if that fails,
	 * the page fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 *   1 for anonymous pages without a mapping
 *   2 for pages with a mapping
 *   3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + page_has_private(page);
	if (page_count(page) != expected_count ||
			(struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);
	if (PageSwapBacked(page)) {
		__dec_zone_page_state(page, NR_SHMEM);
		__inc_zone_page_state(newpage, NR_SHMEM);
	}
	spin_unlock_irq(&mapping->tree_lock);

	return 0;
}

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else if (TestClearPageUnevictable(page))
		SetPageUnevictable(newpage);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become dirty.
		 * Whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);
	ksm_migrate_page(newpage, page);

	ClearPageSwapCache(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page->mapping = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
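 *
 * A filesystem typically hooks this up through its address_space_operations.
 * As a rough sketch (the aops shown here are hypothetical, but ext2-style
 * block filesystems wire the method up in exactly this way):
 *
 *	static const struct address_space_operations example_aops = {
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 *
 * Mappings whose pages never carry fs-private data can point .migratepage
 * at migrate_page() above instead.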
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page.*/
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (!rc)
		remove_migration_ptes(page, newpage);
	else
		newpage->mapping = NULL;

	unlock_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force, int offlining)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int rcu_locked = 0;
	int charge = 0;
	struct mem_cgroup *mem = NULL;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	/* prepare cgroup just returns 0 or -ENOMEM */
	rc = -EAGAIN;

	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	/*
	 * Only memory hotplug's offline_pages() caller has locked out KSM,
	 * and can safely migrate a KSM page.  The other cases have skipped
	 * PageKsm along with PageReserved - but it is only now when we have
	 * the page lock that we can be certain it will not go KSM beneath us
	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
	 * its pagecount raised, but only here do we take the page lock which
	 * serializes that).
	 */
	if (PageKsm(page) && !offlining) {
		rc = -EBUSY;
		goto unlock;
	}

	/* charge against new page */
	charge = mem_cgroup_prepare_migration(page, &mem);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto unlock;
	}
	BUG_ON(charge);

	if (PageWriteback(page)) {
		if (!force)
			goto uncharge;
		wait_on_page_writeback(page);
	}
	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that the anon_vma is freed while we migrate a page.
	 * This rcu_read_lock() delays freeing the anon_vma pointer until the
	 * end of migration. File cache pages are no problem because their
	 * mappings are kept stable by the page lock during migration, so
	 * only anonymous pages need this care here.
	 */
	if (PageAnon(page)) {
		rcu_read_lock();
		rcu_locked = 1;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read in, it is added to the LRU
	 *    and treated as swapcache but it has no rmap yet.
	 *    Calling try_to_unmap() against a page->mapping==NULL page will
	 *    trigger a BUG. So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 *    fs-private metadata. The page can be picked up due to memory
	 *    offlining. Everywhere else except page reclaim, the page is
	 *    invisible to the vm, so the page can not be migrated. So try to
	 *    free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		if (!PageAnon(page) && page_has_private(page)) {
			/*
			 * Go direct to try_to_free_buffers() here because
			 * a) that's what try_to_release_page() would do anyway
			 * b) we may be under rcu_read_lock() here, so we can't
			 *    use GFP_KERNEL which is what try_to_release_page()
			 *    needs to be effective.
			 */
			try_to_free_buffers(page);
			goto rcu_unlock;
		}
		goto skip_unmap;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page);

	if (rc)
		remove_migration_ptes(page, page);
rcu_unlock:
	if (rcu_locked)
		rcu_read_unlock();
uncharge:
	if (!charge)
		mem_cgroup_end_migration(mem, page, newpage);
unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
	}

move_newpage:

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a function
 * that determines from the page to be migrated and the private data
 * the target of the move and allocates the page.
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because the list has become empty
 * or no retryable pages exist anymore. All pages will be
 * returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
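 *
 * A minimal caller sketch, roughly the pattern used by
 * do_move_page_to_node_array() below (the pagelist and the allocator
 * callback are the caller's own):
 *
 *	LIST_HEAD(pagelist);
 *
 *	migrate_prep();
 *	... isolate_lru_page() each candidate page and add it to pagelist ...
 *	err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, 0);
 *
 * Pages that could not be migrated are put back on the LRU by
 * migrate_pages() itself via putback_lru_pages().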
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, int offlining)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for(pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2, offlining);

			switch(rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_exact_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
				struct page_to_node *pm,
				int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		/* Use PageReserved to check for zero page */
		if (PageReserved(page) || PageKsm(page))
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err) {
			list_add_tail(&page->lru, &pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		}
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm, 0);

	up_read(&mm->mmap_sem);
	return err;
}

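/*
 * From userspace the array interface below is reached through the
 * move_pages(2) system call; libnuma exposes it as move_pages() in
 * <numaif.h>. A minimal sketch of moving one page of the calling
 * process (pid 0) to node 1 -- the address and target node are
 * illustrative only:
 *
 *	void *pages[1] = { addr };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE))
 *		perror("move_pages");
 *
 * On return status[0] holds the node the page now resides on, or a
 * negative errno for that particular page.
 */
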
/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm;
	nodemask_t task_nodes;
	unsigned long chunk_nr_pages;
	unsigned long chunk_start;
	int err;

	task_nodes = cpuset_mems_allowed(task);

	err = -ENOMEM;
	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
	if (!pm)
		goto out;

	migrate_prep();

	/*
	 * Store a chunk of page_to_node array in a page,
	 * but keep the last one as a marker
	 */
	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

	for (chunk_start = 0;
	     chunk_start < nr_pages;
	     chunk_start += chunk_nr_pages) {
		int j;

		if (chunk_start + chunk_nr_pages > nr_pages)
			chunk_nr_pages = nr_pages - chunk_start;

		/* fill the chunk pm with addrs and nodes from user-space */
		for (j = 0; j < chunk_nr_pages; j++) {
			const void __user *p;
			int node;

			err = -EFAULT;
			if (get_user(p, pages + j + chunk_start))
				goto out_pm;
			pm[j].addr = (unsigned long) p;

			if (get_user(node, nodes + j + chunk_start))
				goto out_pm;

			err = -ENODEV;
			if (node < 0 || node >= MAX_NUMNODES)
				goto out_pm;

			if (!node_state(node, N_HIGH_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[j].node = node;
		}

		/* End marker for this chunk */
		pm[chunk_nr_pages].node = MAX_NUMNODES;

		/* Migrate this chunk */
		err = do_move_page_to_node_array(mm, pm,
						 flags & MPOL_MF_MOVE_ALL);
		if (err < 0)
			goto out_pm;

		/* Return status information */
		for (j = 0; j < chunk_nr_pages; j++)
			if (put_user(pm[j].status, status + j + chunk_start)) {
				err = -EFAULT;
				goto out_pm;
			}
	}
	err = 0;

out_pm:
	free_page((unsigned long)pm);
out:
	return err;
}

/*
 * Determine the nodes of an array of pages and store them in an array
 * of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = find_vma(mm, addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page) || PageKsm(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		*status = err;

		pages++;
		status++;
	}

	up_read(&mm->mmap_sem);
}

/*
 * Determine the nodes of a user array of pages and store them in
 * a user array of status.
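 *
 * This implements the "query" mode of the move_pages(2) system call:
 * when the nodes array is NULL, only the current node of each page is
 * reported. A minimal userspace sketch (assuming the move_pages()
 * wrapper from <numaif.h>):
 *
 *	void *pages[1] = { addr };
 *	int status[1];
 *
 *	if (!move_pages(0, 1, pages, NULL, status, 0))
 *		printf("page is on node %d\n", status[0]);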
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr;

		chunk_nr = nr_pages;
		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
			chunk_nr = DO_PAGES_STAT_CHUNK_NR;

		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
			break;

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;
	struct mm_struct *mm;
	int err;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	rcu_read_lock();
	tcred = __task_cred(task);
	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	err = security_task_movememory(task);
	if (err)
		goto out;

	if (nodes) {
		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
				    flags);
	} else {
		err = do_pages_stat(mm, nr_pages, pages, status);
	}

out:
	mmput(mm);
	return err;
}

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. Migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif