// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>

#include "internal.h"

int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
	struct address_space *mapping;

	/*
	 * Avoid burning cycles with pages that are yet under __free_pages(),
	 * or just got freed under us.
	 *
	 * In case we 'win' a race for a movable page being freed under us and
	 * raise its refcount preventing __free_pages() from doing its job,
	 * the put_page() at the end of this block will take care of
	 * releasing this page, thus avoiding a nasty leakage.
	 */
	if (unlikely(!get_page_unless_zero(page)))
		goto out;

	/*
	 * Check PageMovable before holding a PG_lock because page's owner
	 * assumes that nobody touches the PG_lock of a newly allocated page,
	 * so unconditionally grabbing the lock ruins page's owner side.
	 */
	if (unlikely(!__PageMovable(page)))
		goto out_putpage;
	/*
	 * As movable pages are not isolated from LRU lists, concurrent
	 * compaction threads can race against page migration functions
	 * as well as race against the release of a page.
	 *
	 * In order to avoid having an already isolated movable page
	 * being (wrongly) re-isolated while it is under migration,
	 * or to avoid attempting to isolate pages being released,
	 * let's be sure we have the page lock
	 * before proceeding with the movable page isolation steps.
	 */
	if (unlikely(!trylock_page(page)))
		goto out_putpage;

	if (!PageMovable(page) || PageIsolated(page))
		goto out_no_isolated;

	mapping = page_mapping(page);
	VM_BUG_ON_PAGE(!mapping, page);

	if (!mapping->a_ops->isolate_page(page, mode))
		goto out_no_isolated;

	/* Driver shouldn't use PG_isolated bit of page->flags */
	WARN_ON_ONCE(PageIsolated(page));
	__SetPageIsolated(page);
	unlock_page(page);

	return 0;

out_no_isolated:
	unlock_page(page);
out_putpage:
	put_page(page);
out:
	return -EBUSY;
}

static void putback_movable_page(struct page *page)
{
	struct address_space *mapping;

	mapping = page_mapping(page);
	mapping->a_ops->putback_page(page);
	__ClearPageIsolated(page);
}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
 * and isolate_huge_page().
 */
void putback_movable_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;

	list_for_each_entry_safe(page, page2, l, lru) {
		if (unlikely(PageHuge(page))) {
			putback_active_hugepage(page);
			continue;
		}
		list_del(&page->lru);
		/*
		 * We isolated a non-lru movable page, so here we can use
		 * __PageMovable because an LRU page's mapping cannot have
		 * PAGE_MAPPING_MOVABLE.
		 */
		if (unlikely(__PageMovable(page))) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);
			lock_page(page);
			if (PageMovable(page))
				putback_movable_page(page);
			else
				__ClearPageIsolated(page);
			unlock_page(page);
			put_page(page);
		} else {
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -thp_nr_pages(page));
			putback_lru_page(page);
		}
	}
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
				 unsigned long addr, void *old)
{
	struct page_vma_mapped_walk pvmw = {
		.page = old,
		.vma = vma,
		.address = addr,
		.flags = PVMW_SYNC | PVMW_MIGRATION,
	};
	struct page *new;
	pte_t pte;
	swp_entry_t entry;

	VM_BUG_ON_PAGE(PageTail(page), page);
	while (page_vma_mapped_walk(&pvmw)) {
		if (PageKsm(page))
			new = page;
		else
			new = page - pvmw.page->index +
				linear_page_index(vma, pvmw.address);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
			remove_migration_pmd(&pvmw, new);
			continue;
		}
#endif

		get_page(new);
		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
		if (pte_swp_soft_dirty(*pvmw.pte))
			pte = pte_mksoft_dirty(pte);

		/*
		 * Recheck VMA as permissions can change since migration started
		 */
		entry = pte_to_swp_entry(*pvmw.pte);
		if (is_write_migration_entry(entry))
			pte = maybe_mkwrite(pte, vma);
		else if (pte_swp_uffd_wp(*pvmw.pte))
			pte = pte_mkuffd_wp(pte);

		if (unlikely(is_device_private_page(new))) {
			entry = make_device_private_entry(new, pte_write(pte));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*pvmw.pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*pvmw.pte))
				pte = pte_swp_mkuffd_wp(pte);
		}

#ifdef CONFIG_HUGETLB_PAGE
		if (PageHuge(new)) {
			pte = pte_mkhuge(pte);
			pte = arch_make_huge_pte(pte, vma, new, 0);
			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
			if (PageAnon(new))
				hugepage_add_anon_rmap(new, vma, pvmw.address);
			else
				page_dup_rmap(new, true);
		} else
#endif
		{
			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);

			if (PageAnon(new))
				page_add_anon_rmap(new, vma, pvmw.address, false);
			else
				page_add_file_rmap(new, false);
		}
		if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
			mlock_vma_page(new);

		if (PageTransHuge(page) && PageMlocked(page))
			clear_page_mlock(page);

		/* No need to invalidate - it was non-present before */
		update_mmu_cache(vma, pvmw.address, pvmw.pte);
	}

	return true;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
	struct rmap_walk_control rwc = {
		.rmap_one = remove_migration_pte,
		.arg = old,
	};

	if (locked)
		rmap_walk_locked(new, &rwc);
	else
		rmap_walk(new, &rwc);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
				spinlock_t *ptl)
{
	pte_t pte;
	swp_entry_t entry;
	struct page *page;

	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);
	page = compound_head(page);

	/*
	 * Once page cache replacement of page migration started, page_count
	 * is zero; but we must not call put_and_wait_on_page_locked() without
	 * a ref. Use get_page_unless_zero(), and just fault again if it fails.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	spinlock_t *ptl = pte_lockptr(mm, pmd);
	pte_t *ptep = pte_offset_map(pmd, address);
	__migration_entry_wait(mm, ptep, ptl);
}

void migration_entry_wait_huge(struct vm_area_struct *vma,
		struct mm_struct *mm, pte_t *pte)
{
	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
	__migration_entry_wait(mm, pte, ptl);
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;
	struct page *page;

	ptl = pmd_lock(mm, pmd);
	if (!is_pmd_migration_entry(*pmd))
		goto unlock;
	page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
	if (!get_page_unless_zero(page))
		goto unlock;
	spin_unlock(ptl);
	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
	return;
unlock:
	spin_unlock(ptl);
}
#endif

static int expected_page_refs(struct address_space *mapping, struct page *page)
{
	int expected_count = 1;

	/*
	 * Device private pages have an extra refcount as they are
	 * ZONE_DEVICE pages.
	 */
	expected_count += is_device_private_page(page);
	if (mapping)
		expected_count += thp_nr_pages(page) + page_has_private(page);

	return expected_count;
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 *   1 for anonymous pages without a mapping
 *   2 for pages with a mapping
 *   3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page, int extra_count)
{
	XA_STATE(xas, &mapping->i_pages, page_index(page));
	struct zone *oldzone, *newzone;
	int dirty;
	int expected_count = expected_page_refs(mapping, page) + extra_count;
	int nr = thp_nr_pages(page);

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != expected_count)
			return -EAGAIN;

		/* No turning back from here */
		newpage->index = page->index;
		newpage->mapping = page->mapping;
		if (PageSwapBacked(page))
			__SetPageSwapBacked(newpage);

		return MIGRATEPAGE_SUCCESS;
	}

	oldzone = page_zone(page);
	newzone = page_zone(newpage);

	xas_lock_irq(&xas);
	if (page_count(page) != expected_count || xas_load(&xas) != page) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	if (!page_ref_freeze(page, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page:
	 * no turning back from here.
	 */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	page_ref_add(newpage, nr); /* add cache reference */
	if (PageSwapBacked(page)) {
		__SetPageSwapBacked(newpage);
		if (PageSwapCache(page)) {
			SetPageSwapCache(newpage);
			set_page_private(newpage, page_private(page));
		}
	} else {
		VM_BUG_ON_PAGE(PageSwapCache(page), page);
	}

	/* Move dirty while page refs frozen and newpage not yet exposed */
	dirty = PageDirty(page);
	if (dirty) {
		ClearPageDirty(page);
		SetPageDirty(newpage);
	}

	xas_store(&xas, newpage);
	if (PageTransHuge(page)) {
		int i;

		for (i = 1; i < nr; i++) {
			xas_next(&xas);
			xas_store(&xas, newpage);
		}
	}

	/*
	 * Drop cache reference from old page by unfreezing
	 * to one less reference.
	 * We know this isn't the last reference.
	 */
	page_ref_unfreeze(page, expected_count - nr);

	xas_unlock(&xas);
	/* Leave irq disabled to prevent preemption while updating stats */

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
	 * are mapped to swap space.
	 */
	if (newzone != oldzone) {
		struct lruvec *old_lruvec, *new_lruvec;
		struct mem_cgroup *memcg;

		memcg = page_memcg(page);
		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
		if (PageSwapBacked(page) && !PageSwapCache(page)) {
			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
		}
#ifdef CONFIG_SWAP
		if (PageSwapCache(page)) {
			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
		}
#endif
		if (dirty && mapping_can_writeback(mapping)) {
			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
		}
	}
	local_irq_enable();

	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page_move_mapping);

/*
 * The expected number of remaining references is the same as that
 * of migrate_page_move_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct page *newpage, struct page *page)
{
	XA_STATE(xas, &mapping->i_pages, page_index(page));
	int expected_count;

	xas_lock_irq(&xas);
	expected_count = 2 + page_has_private(page);
	if (page_count(page) != expected_count || xas_load(&xas) != page) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	if (!page_ref_freeze(page, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	newpage->index = page->index;
	newpage->mapping = page->mapping;

	get_page(newpage);

	xas_store(&xas, newpage);

	page_ref_unfreeze(page, expected_count - 1);

	xas_unlock_irq(&xas);

	return MIGRATEPAGE_SUCCESS;
}

/*
 * Gigantic pages are so large that we do not guarantee that page++ pointer
 * arithmetic will work across the entire page. We need something more
 * specialized.
 */
static void __copy_gigantic_page(struct page *dst, struct page *src,
				 int nr_pages)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < nr_pages; ) {
		cond_resched();
		copy_highpage(dst, src);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

static void copy_huge_page(struct page *dst, struct page *src)
{
	int i;
	int nr_pages;

	if (PageHuge(src)) {
		/* hugetlbfs page */
		struct hstate *h = page_hstate(src);
		nr_pages = pages_per_huge_page(h);

		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
			__copy_gigantic_page(dst, src, nr_pages);
			return;
		}
	} else {
		/* thp page */
		BUG_ON(!PageTransHuge(src));
		nr_pages = thp_nr_pages(src);
	}

	for (i = 0; i < nr_pages; i++) {
		cond_resched();
		copy_highpage(dst + i, src + i);
	}
}

/*
 * Copy the page to its new location
 */
void migrate_page_states(struct page *newpage, struct page *page)
{
	int cpupid;

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON_PAGE(PageUnevictable(page), page);
		SetPageActive(newpage);
	} else if (TestClearPageUnevictable(page))
		SetPageUnevictable(newpage);
	if (PageWorkingset(page))
		SetPageWorkingset(newpage);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	/* Move dirty on pages not done by migrate_page_move_mapping() */
	if (PageDirty(page))
		SetPageDirty(newpage);

	if (page_is_young(page))
		set_page_young(newpage);
	if (page_is_idle(page))
		set_page_idle(newpage);

	/*
	 * Copy NUMA information to the new page, to prevent over-eager
	 * future migrations of this same page.
	 */
	cpupid = page_cpupid_xchg_last(page, -1);
	page_cpupid_xchg_last(newpage, cpupid);

	ksm_migrate_page(newpage, page);
	/*
	 * Please do not reorder this without considering how mm/ksm.c's
	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
	 */
	if (PageSwapCache(page))
		ClearPageSwapCache(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);

	/*
	 * PG_readahead shares the same bit with PG_reclaim. The above
	 * end_page_writeback() may clear PG_readahead mistakenly, so set the
	 * bit after that.
	 */
	if (PageReadahead(page))
		SetPageReadahead(newpage);

	copy_page_owner(page, newpage);

	if (!PageHuge(page))
		mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);

void migrate_page_copy(struct page *newpage, struct page *page)
{
	if (PageHuge(page) || PageTransHuge(page))
		copy_huge_page(newpage, page);
	else
		copy_highpage(newpage, page);

	migrate_page_states(newpage, page);
}
EXPORT_SYMBOL(migrate_page_copy);

/************************************************************
 *                    Migration functions
 ***********************************************************/

/*
 * Common logic to directly migrate a single LRU page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page, 0);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{
	struct buffer_head *bh = head;

	/* Simple case, sync compaction */
	if (mode != MIGRATE_ASYNC) {
		do {
			lock_buffer(bh);
			bh = bh->b_this_page;

		} while (bh != head);

		return true;
	}

	/* async case, we cannot block on lock_buffer so use trylock_buffer */
	do {
		if (!trylock_buffer(bh)) {
			/*
			 * We failed to lock the buffer and cannot stall in
			 * async migration.
			 * Release the taken locks.
			 */
			struct buffer_head *failed_bh = bh;
			bh = head;
			while (bh != failed_bh) {
				unlock_buffer(bh);
				bh = bh->b_this_page;
			}
			return false;
		}

		bh = bh->b_this_page;
	} while (bh != head);
	return true;
}

static int __buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode,
		bool check_refs)
{
	struct buffer_head *bh, *head;
	int rc;
	int expected_count;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page, mode);

	/* Check whether page does not have extra refs before we do more work */
	expected_count = expected_page_refs(mapping, page);
	if (page_count(page) != expected_count)
		return -EAGAIN;

	head = page_buffers(page);
	if (!buffer_migrate_lock_buffers(head, mode))
		return -EAGAIN;

	if (check_refs) {
		bool busy;
		bool invalidated = false;

recheck_buffers:
		busy = false;
		spin_lock(&mapping->private_lock);
		bh = head;
		do {
			if (atomic_read(&bh->b_count)) {
				busy = true;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (busy) {
			if (invalidated) {
				rc = -EAGAIN;
				goto unlock_buffers;
			}
			spin_unlock(&mapping->private_lock);
			invalidate_bh_lrus();
			invalidated = true;
			goto recheck_buffers;
		}
	}

	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (rc != MIGRATEPAGE_SUCCESS)
		goto unlock_buffers;

	attach_page_private(newpage, detach_page_private(page));

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);

	rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
	if (check_refs)
		spin_unlock(&mapping->private_lock);
	bh = head;
	do {
		unlock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return rc;
}

/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist. For example, attached buffer heads are accessed only under page lock.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);

/*
 * Same as above except that this variant is more careful and checks that there
 * are also no buffer head references. This function is the right one for
 * mappings where buffer heads are directly looked up and referenced (such as
 * block device mappings).
 */
int buffer_migrate_page_norefs(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return __buffer_migrate_page(mapping, newpage, page, mode, true);
}
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page, false);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page, enum migrate_mode mode)
{
	if (PageDirty(page)) {
		/* Only writeback pages in full synchronous migration */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			return -EBUSY;
		}
		return writeout(mapping, page);
	}

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

	return migrate_page(mapping, newpage, page, mode);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
				enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc = -EAGAIN;
	bool is_lru = !__PageMovable(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

	mapping = page_mapping(page);

	if (likely(is_lru)) {
		if (!mapping)
			rc = migrate_page(mapping, newpage, page, mode);
		else if (mapping->a_ops->migratepage)
			/*
			 * Most pages have a mapping and most filesystems
			 * provide a migratepage callback. Anonymous pages
			 * are part of swap space which also has its own
			 * migratepage callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migratepage(mapping, newpage,
							page, mode);
		else
			rc = fallback_migrate_page(mapping, newpage,
							page, mode);
	} else {
		/*
		 * In case of a non-lru page, it could be released after
		 * the isolation step. In that case, we shouldn't try migration.
		 */
		VM_BUG_ON_PAGE(!PageIsolated(page), page);
		if (!PageMovable(page)) {
			rc = MIGRATEPAGE_SUCCESS;
			__ClearPageIsolated(page);
			goto out;
		}

		rc = mapping->a_ops->migratepage(mapping, newpage,
						page, mode);
		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
			!PageIsolated(page));
	}

	/*
	 * When successful, old pagecache page->mapping must be cleared before
	 * page is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__PageMovable(page)) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 */
			__ClearPageIsolated(page);
		}

		/*
		 * Anonymous and movable page->mapping will be cleared by
		 * free_pages_prepare so don't reset it here for keeping
		 * the type to work PageAnon, for example.
		 */
		if (!PageMappingFlags(page))
			page->mapping = NULL;

		if (likely(!is_zone_device_page(newpage)))
			flush_dcache_page(newpage);

	}
out:
	return rc;
}

static int __unmap_and_move(struct page *page, struct page *newpage,
				int force, enum migrate_mode mode)
{
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = !__PageMovable(page);

	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readahead). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		lock_page(page);
	}

	if (PageWriteback(page)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much.
		 */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			rc = -EBUSY;
			goto out_unlock;
		}
		if (!force)
			goto out_unlock;
		wait_on_page_writeback(page);
	}

	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrate a page.
	 * This get_anon_vma() delays freeing the anon_vma pointer until the end
	 * of migration. File cache pages are no problem because of page_lock().
	 * File caches may use write_page() or lock_page() in migration, so
	 * only anon pages need care here.
	 *
	 * Only page_get_anon_vma() understands the subtleties of
	 * getting a hold on an anon_vma from outside one of its mms.
	 * But if we cannot get anon_vma, then we won't need it anyway,
	 * because that implies that the anon page is no longer mapped
	 * (and cannot be remapped so long as we hold the page lock).
	 */
	if (PageAnon(page) && !PageKsm(page))
		anon_vma = page_get_anon_vma(page);

	/*
	 * Block others from accessing the new page when we get around to
	 * establishing additional references. We are usually the only one
	 * holding a reference to newpage at this point. We used to have a BUG
	 * here if trylock_page(newpage) fails, but would like to allow for
	 * cases where there might be a race with the previous use of newpage.
	 * This is much like races on refcount of oldpage: just don't BUG().
	 */
	if (unlikely(!trylock_page(newpage)))
		goto out_unlock;

	if (unlikely(!is_lru)) {
		rc = move_to_new_page(newpage, page, mode);
		goto out_unlock_both;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG. So handle it here.
	 * 2. An orphaned page (see truncate_cleanup_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining. Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated. So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		VM_BUG_ON_PAGE(PageAnon(page), page);
		if (page_has_private(page)) {
			try_to_free_buffers(page);
			goto out_unlock_both;
		}
	} else if (page_mapped(page)) {
		/* Establish migration ptes */
		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				page);
		try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
		page_was_mapped = 1;
	}

	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, mode);

	if (page_was_mapped)
		remove_migration_ptes(page,
			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

out_unlock_both:
	unlock_page(newpage);
out_unlock:
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	unlock_page(page);
out:
	/*
	 * If migration is successful, decrease refcount of the newpage
	 * which will not free the page because new page owner increased
	 * refcounter. As well, if it is LRU page, add the page to LRU
	 * list in here. Use the old state of the isolated source page to
	 * determine if we migrated a LRU page. newpage was already unlocked
	 * and possibly modified by its owner - don't rely on the page
	 * state.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (unlikely(!is_lru))
			put_page(newpage);
		else
			putback_lru_page(newpage);
	}

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page,
				   free_page_t put_new_page,
				   unsigned long private, struct page *page,
				   int force, enum migrate_mode mode,
				   enum migrate_reason reason,
				   struct list_head *ret)
{
	int rc = MIGRATEPAGE_SUCCESS;
	struct page *newpage = NULL;

	if (!thp_migration_supported() && PageTransHuge(page))
		return -ENOSYS;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done.
		 */
		ClearPageActive(page);
		ClearPageUnevictable(page);
		if (unlikely(__PageMovable(page))) {
			lock_page(page);
			if (!PageMovable(page))
				__ClearPageIsolated(page);
			unlock_page(page);
		}
		goto out;
	}

	newpage = get_new_page(page, private);
	if (!newpage)
		return -ENOMEM;

	rc = __unmap_and_move(page, newpage, force, mode);
	if (rc == MIGRATEPAGE_SUCCESS)
		set_page_owner_migrate_reason(newpage, reason);

out:
	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be restored.
		 */
		list_del(&page->lru);
	}

	/*
	 * If migration is successful, releases reference grabbed during
	 * isolation. Otherwise, restore the page to the right list unless
	 * we want to retry.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		/*
		 * Compaction can migrate also non-LRU pages which are
		 * not accounted to NR_ISOLATED_*. They can be recognized
		 * as __PageMovable
		 */
		if (likely(!__PageMovable(page)))
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -thp_nr_pages(page));

		if (reason != MR_MEMORY_FAILURE)
			/*
			 * We release the page in page_handle_poison.
			 */
			put_page(page);
	} else {
		if (rc != -EAGAIN)
			list_add_tail(&page->lru, ret);

		if (put_new_page)
			put_new_page(newpage, private);
		else
			put_page(newpage);
	}

	return rc;
}

/*
 * Counterpart of unmap_and_move_page() for hugepage migration.
 *
 * This function doesn't wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and the writeback status of all subpages is counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate a hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then the pte is replaced with a migration swap entry and direct I/O
 * code will wait in the page fault for migration to complete.
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				free_page_t put_new_page, unsigned long private,
				struct page *hpage, int force,
				enum migrate_mode mode, int reason,
				struct list_head *ret)
{
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct page *new_hpage;
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;

	/*
	 * Migratability of hugepages depends on architectures and their size.
	 * This check is necessary because some callers of hugepage migration
	 * like soft offline and memory hotremove don't walk through page
	 * tables or check whether the hugepage is pmd-based or not before
	 * kicking migration.
	 */
	if (!hugepage_migration_supported(page_hstate(hpage))) {
		list_move_tail(&hpage->lru, ret);
		return -ENOSYS;
	}

	if (page_count(hpage) == 1) {
		/* page was freed from under us. So we are done. */
		putback_active_hugepage(hpage);
		return MIGRATEPAGE_SUCCESS;
	}

	new_hpage = get_new_page(hpage, private);
	if (!new_hpage)
		return -ENOMEM;

	if (!trylock_page(hpage)) {
		if (!force)
			goto out;
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			goto out;
		}
		lock_page(hpage);
	}

	/*
	 * Check for pages which are in the process of being freed. Without
	 * page_mapping() set, hugetlbfs specific move page routine will not
	 * be called and we could leak usage counts for subpools.
	 */
	if (page_private(hpage) && !page_mapping(hpage)) {
		rc = -EBUSY;
		goto out_unlock;
	}

	if (PageAnon(hpage))
		anon_vma = page_get_anon_vma(hpage);

	if (unlikely(!trylock_page(new_hpage)))
		goto put_anon;

	if (page_mapped(hpage)) {
		bool mapping_locked = false;
		enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;

		if (!PageAnon(hpage)) {
			/*
			 * In shared mappings, try_to_unmap could potentially
			 * call huge_pmd_unshare. Because of this, take
			 * semaphore in write mode here and set TTU_RMAP_LOCKED
			 * to let lower levels know we have taken the lock.
			 */
			mapping = hugetlb_page_mapping_lock_write(hpage);
			if (unlikely(!mapping))
				goto unlock_put_anon;

			mapping_locked = true;
			ttu |= TTU_RMAP_LOCKED;
		}

		try_to_unmap(hpage, ttu);
		page_was_mapped = 1;

		if (mapping_locked)
			i_mmap_unlock_write(mapping);
	}

	if (!page_mapped(hpage))
		rc = move_to_new_page(new_hpage, hpage, mode);

	if (page_was_mapped)
		remove_migration_ptes(hpage,
			rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);

unlock_put_anon:
	unlock_page(new_hpage);

put_anon:
	if (anon_vma)
		put_anon_vma(anon_vma);

	if (rc == MIGRATEPAGE_SUCCESS) {
		move_hugetlb_state(hpage, new_hpage, reason);
		put_new_page = NULL;
	}

out_unlock:
	unlock_page(hpage);
out:
	if (rc == MIGRATEPAGE_SUCCESS)
		putback_active_hugepage(hpage);
	else if (rc != -EAGAIN)
		list_move_tail(&hpage->lru, ret);

	/*
	 * If migration was not successful and there's a freeing callback, use
	 * it. Otherwise, put_page() will drop the reference grabbed during
	 * isolation.
	 */
	if (put_new_page)
		put_new_page(new_hpage, private);
	else
		putback_active_hugepage(new_hpage);

	return rc;
}

static inline int try_split_thp(struct page *page, struct page **page2,
				struct list_head *from)
{
	int rc = 0;

	lock_page(page);
	rc = split_huge_page_to_list(page, from);
	unlock_page(page);
	if (!rc)
		list_safe_reset_next(page, *page2, lru);

	return rc;
}

/*
 * migrate_pages - migrate the pages specified in a list, to the free pages
 *		   supplied as the target for the page migration
 *
 * @from:		The list of pages to be migrated.
 * @get_new_page:	The function used to allocate free pages to be used
 *			as the target of the page migration.
 * @put_new_page:	The function used to free target pages if migration
 *			fails, or NULL if no special handling is necessary.
 * @private:		Private data to be passed on to get_new_page()
 * @mode:		The migration mode that specifies the constraints for
 *			page migration, if any.
 * @reason:		The reason for page migration.
 *
 * The function returns after 10 attempts or if no pages are movable any more
 * because the list has become empty or no retryable pages exist any more.
 * It is the caller's responsibility to call putback_movable_pages() to return
 * pages to the LRU or free list only if ret != 0.
 *
 * Returns the number of pages that were not migrated, or an error code.
 */
int migrate_pages(struct list_head *from, new_page_t get_new_page,
		free_page_t put_new_page, unsigned long private,
		enum migrate_mode mode, int reason)
{
	int retry = 1;
	int thp_retry = 1;
	int nr_failed = 0;
	int nr_succeeded = 0;
	int nr_thp_succeeded = 0;
	int nr_thp_failed = 0;
	int nr_thp_split = 0;
	int pass = 0;
	bool is_thp = false;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc, nr_subpages;
	LIST_HEAD(ret_pages);

	trace_mm_migrate_pages_start(mode, reason);

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
		retry = 0;
		thp_retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
retry:
			/*
			 * THP statistics are based on the source huge page.
			 * Capture required information that might get lost
			 * during migration.
			 */
			is_thp = PageTransHuge(page) && !PageHuge(page);
			nr_subpages = thp_nr_pages(page);
			cond_resched();

			if (PageHuge(page))
				rc = unmap_and_move_huge_page(get_new_page,
						put_new_page, private, page,
						pass > 2, mode, reason,
						&ret_pages);
			else
				rc = unmap_and_move(get_new_page, put_new_page,
						private, page, pass > 2, mode,
						reason, &ret_pages);
			/*
			 * The rules are:
			 *	Success: non hugetlb page will be freed, hugetlb
			 *		 page will be put back
			 *	-EAGAIN: stay on the from list
			 *	-ENOMEM: stay on the from list
			 *	Other errno: put on ret_pages list then splice to
			 *		     from list
			 */
			switch(rc) {
			/*
			 * THP migration might be unsupported or the
			 * allocation could've failed so we should
			 * retry on the same page with the THP split
			 * to base pages.
			 *
			 * Head page is retried immediately and tail
			 * pages are added to the tail of the list so
			 * we encounter them after the rest of the list
			 * is processed.
			 */
			case -ENOSYS:
				/* THP migration is unsupported */
				if (is_thp) {
					if (!try_split_thp(page, &page2, from)) {
						nr_thp_split++;
						goto retry;
					}

					nr_thp_failed++;
					nr_failed += nr_subpages;
					break;
				}

				/* Hugetlb migration is unsupported */
				nr_failed++;
				break;
			case -ENOMEM:
				/*
				 * When memory is low, don't bother to try to migrate
				 * other pages, just exit.
				 */
				if (is_thp) {
					if (!try_split_thp(page, &page2, from)) {
						nr_thp_split++;
						goto retry;
					}

					nr_thp_failed++;
					nr_failed += nr_subpages;
					goto out;
				}
				nr_failed++;
				goto out;
			case -EAGAIN:
				if (is_thp) {
					thp_retry++;
					break;
				}
				retry++;
				break;
			case MIGRATEPAGE_SUCCESS:
				if (is_thp) {
					nr_thp_succeeded++;
					nr_succeeded += nr_subpages;
					break;
				}
				nr_succeeded++;
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, etc.):
				 * unlike -EAGAIN case, the failed page is
				 * removed from migration page list and not
				 * retried in the next outer loop.
				 */
				if (is_thp) {
					nr_thp_failed++;
					nr_failed += nr_subpages;
					break;
				}
				nr_failed++;
				break;
			}
		}
	}
	nr_failed += retry + thp_retry;
	nr_thp_failed += thp_retry;
	rc = nr_failed;
out:
	/*
	 * Put the pages that failed permanently back on the migration list;
	 * they will be put back on the right list by the caller.
	 */
	list_splice(&ret_pages, from);

	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
	count_vm_events(PGMIGRATE_FAIL, nr_failed);
	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
	trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
			       nr_thp_failed, nr_thp_split, mode, reason);

	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	return rc;
}

struct page *alloc_migration_target(struct page *page, unsigned long private)
{
	struct migration_target_control *mtc;
	gfp_t gfp_mask;
	unsigned int order = 0;
	struct page *new_page = NULL;
	int nid;
	int zidx;

	mtc = (struct migration_target_control *)private;
	gfp_mask = mtc->gfp_mask;
	nid = mtc->nid;
	if (nid == NUMA_NO_NODE)
		nid = page_to_nid(page);

	if (PageHuge(page)) {
		struct hstate *h = page_hstate(compound_head(page));

		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
	}

	if (PageTransHuge(page)) {
		/*
		 * clear __GFP_RECLAIM to make the migration callback
		 * consistent with regular THP allocations.
		 */
		gfp_mask &= ~__GFP_RECLAIM;
		gfp_mask |= GFP_TRANSHUGE;
		order = HPAGE_PMD_ORDER;
	}
	zidx = zone_idx(page_zone(page));
	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
		gfp_mask |= __GFP_HIGHMEM;

	new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);

	if (new_page && PageTransHuge(new_page))
		prep_transhuge_page(new_page);

	return new_page;
}

#ifdef CONFIG_NUMA

static int store_status(int __user *status, int start, int value, int nr)
{
	while (nr-- > 0) {
		if (put_user(value, status + start))
			return -EFAULT;
		start++;
	}

	return 0;
}

static int do_move_pages_to_node(struct mm_struct *mm,
		struct list_head *pagelist, int node)
{
	int err;
	struct migration_target_control mtc = {
		.nid = node,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	err = migrate_pages(pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
	if (err)
		putback_movable_pages(pagelist);
	return err;
}

/*
 * Resolves the given address to a struct page, isolates it from the LRU and
 * puts it to the given pagelist.
 * Returns:
 *     errno - if the page cannot be found/isolated
 *     0 - when it doesn't have to be migrated because it is already on the
 *         target node
 *     1 - when it has been queued
 */
static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
		int node, struct list_head *pagelist, bool migrate_all)
{
	struct vm_area_struct *vma;
	struct page *page;
	unsigned int follflags;
	int err;

	mmap_read_lock(mm);
	err = -EFAULT;
	vma = find_vma(mm, addr);
	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
		goto out;

	/* FOLL_DUMP to ignore special (like zero) pages */
	follflags = FOLL_GET | FOLL_DUMP;
	page = follow_page(vma, addr, follflags);

	err = PTR_ERR(page);
	if (IS_ERR(page))
		goto out;

	err = -ENOENT;
	if (!page)
		goto out;

	err = 0;
	if (page_to_nid(page) == node)
		goto out_putpage;

	err = -EACCES;
	if (page_mapcount(page) > 1 && !migrate_all)
		goto out_putpage;

	if (PageHuge(page)) {
		if (PageHead(page)) {
			isolate_huge_page(page, pagelist);
			err = 1;
		}
	} else {
		struct page *head;

		head = compound_head(page);
		err = isolate_lru_page(head);
		if (err)
			goto out_putpage;

		err = 1;
		list_add_tail(&head->lru, pagelist);
		mod_node_page_state(page_pgdat(head),
			NR_ISOLATED_ANON + page_is_file_lru(head),
			thp_nr_pages(head));
	}
out_putpage:
	/*
	 * Either remove the duplicate refcount from
	 * isolate_lru_page() or drop the page ref if it was
	 * not isolated.
	 */
	put_page(page);
out:
	mmap_read_unlock(mm);
	return err;
}

static int move_pages_and_store_status(struct mm_struct *mm, int node,
		struct list_head *pagelist, int __user *status,
		int start, int i, unsigned long nr_pages)
{
	int err;

	if (list_empty(pagelist))
		return 0;

	err = do_move_pages_to_node(mm, pagelist, node);
	if (err) {
		/*
		 * Positive err means the number of failed
		 * pages to migrate.
		 * Since we are going to
		 * abort and return the number of non-migrated
		 * pages, we need to include the rest of the
		 * nr_pages that have not been attempted as well.
		 */
		if (err > 0)
			err += nr_pages - i - 1;
		return err;
	}
	return store_status(status, start, node, i - start);
}

/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	int current_node = NUMA_NO_NODE;
	LIST_HEAD(pagelist);
	int start, i;
	int err = 0, err1;

	lru_cache_disable();

	for (i = start = 0; i < nr_pages; i++) {
		const void __user *p;
		unsigned long addr;
		int node;

		err = -EFAULT;
		if (get_user(p, pages + i))
			goto out_flush;
		if (get_user(node, nodes + i))
			goto out_flush;
		addr = (unsigned long)untagged_addr(p);

		err = -ENODEV;
		if (node < 0 || node >= MAX_NUMNODES)
			goto out_flush;
		if (!node_state(node, N_MEMORY))
			goto out_flush;

		err = -EACCES;
		if (!node_isset(node, task_nodes))
			goto out_flush;

		if (current_node == NUMA_NO_NODE) {
			current_node = node;
			start = i;
		} else if (node != current_node) {
			err = move_pages_and_store_status(mm, current_node,
					&pagelist, status, start, i, nr_pages);
			if (err)
				goto out;
			start = i;
			current_node = node;
		}

		/*
		 * Errors in the page lookup or isolation are not fatal and we simply
		 * report them via status
		 */
		err = add_page_for_migration(mm, addr, current_node,
				&pagelist, flags & MPOL_MF_MOVE_ALL);

		if (err > 0) {
			/* The page is successfully queued for migration */
			continue;
		}

		/*
		 * If the page is already on the target node (!err), store the
		 * node, otherwise, store the err.
		 */
		err = store_status(status, i, err ? : current_node, 1);
		if (err)
			goto out_flush;

		err = move_pages_and_store_status(mm, current_node, &pagelist,
				status, start, i, nr_pages);
		if (err)
			goto out;
		current_node = NUMA_NO_NODE;
	}
out_flush:
	/* Make sure we do not overwrite the existing error */
	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
				status, start, i, nr_pages);
	if (err >= 0)
		err = err1;
out:
	lru_cache_enable();
	return err;
}

/*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	mmap_read_lock(mm);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = vma_lookup(mm, addr);
		if (!vma)
			goto set_status;

		/* FOLL_DUMP to ignore special (like zero) pages */
		page = follow_page(vma, addr, FOLL_DUMP);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = page ?
			page_to_nid(page) : -ENOENT;
set_status:
		*status = err;

		pages++;
		status++;
	}

	mmap_read_unlock(mm);
}

/*
 * Determine the nodes of a user array of pages and store it in
 * a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr;

		chunk_nr = nr_pages;
		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
			chunk_nr = DO_PAGES_STAT_CHUNK_NR;

		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
			break;

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}

static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
	struct task_struct *task;
	struct mm_struct *mm;

	/*
	 * There is no need to check if the current process has the right to
	 * modify the specified process when they are the same.
	 */
	if (!pid) {
		mmget(current->mm);
		*mem_nodes = cpuset_mems_allowed(current);
		return current->mm;
	}

	/* Find the mm_struct */
	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (!task) {
		rcu_read_unlock();
		return ERR_PTR(-ESRCH);
	}
	get_task_struct(task);

	/*
	 * Check if this process has the right to modify the specified
	 * process. Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		mm = ERR_PTR(-EPERM);
		goto out;
	}
	rcu_read_unlock();

	mm = ERR_PTR(security_task_movememory(task));
	if (IS_ERR(mm))
		goto out;
	*mem_nodes = cpuset_mems_allowed(task);
	mm = get_task_mm(task);
out:
	put_task_struct(task);
	if (!mm)
		mm = ERR_PTR(-EINVAL);
	return mm;
}

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
			     const void __user * __user *pages,
			     const int __user *nodes,
			     int __user *status, int flags)
{
	struct mm_struct *mm;
	int err;
	nodemask_t task_nodes;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	mm = find_mm_struct(pid, &task_nodes);
	if (IS_ERR(mm))
		return PTR_ERR(mm);

	if (nodes)
		err = do_pages_move(mm, task_nodes, nr_pages, pages,
				    nodes, status, flags);
	else
		err = do_pages_stat(mm, nr_pages, pages, status);

	mmput(mm);
	return err;
}

SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
		       compat_uptr_t __user *, pages32,
		       const int __user *, nodes,
		       int __user *, status,
		       int, flags)
{
	const void __user * __user *pages;
	int i;

	pages = compat_alloc_user_space(nr_pages * sizeof(void *));
	for (i = 0; i < nr_pages; i++) {
		compat_uptr_t p;

		if (get_user(p, pages32 + i) ||
			put_user(compat_ptr(p), pages + i))
			return -EFAULT;
	}
	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
#endif /* CONFIG_COMPAT */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks, which is a crude check.
 */
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
				   unsigned long nr_migrate_pages)
{
	int z;

	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
		if (!zone_watermark_ok(zone, 0,
				       high_wmark_pages(zone) +
				       nr_migrate_pages,
				       ZONE_MOVABLE, 0))
			continue;
		return true;
	}
	return false;
}

static struct page *alloc_misplaced_dst_page(struct page *page,
					   unsigned long data)
{
	int nid = (int) data;
	struct page *newpage;

	newpage = __alloc_pages_node(nid,
					 (GFP_HIGHUSER_MOVABLE |
					  __GFP_THISNODE | __GFP_NOMEMALLOC |
					  __GFP_NORETRY | __GFP_NOWARN) &
					 ~__GFP_RECLAIM, 0);

	return newpage;
}

static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
	int page_lru;

	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);

	/* Avoid migrating to a node that is nearly full */
	if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
		return 0;

	if (isolate_lru_page(page))
		return 0;

	/*
	 * migrate_misplaced_transhuge_page() skips page migration's usual
	 * check on page_count(), so we must do it here, now that the page
	 * has been isolated: a GUP pin, or any other pin, prevents migration.
	 * The expected page count is 3: 1 for page's mapcount and 1 for the
	 * caller's pin and 1 for the reference taken by isolate_lru_page().
2065 */ 2066 if (PageTransHuge(page) && page_count(page) != 3) { 2067 putback_lru_page(page); 2068 return 0; 2069 } 2070 2071 page_lru = page_is_file_lru(page); 2072 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, 2073 thp_nr_pages(page)); 2074 2075 /* 2076 * Isolating the page has taken another reference, so the 2077 * caller's reference can be safely dropped without the page 2078 * disappearing underneath us during migration. 2079 */ 2080 put_page(page); 2081 return 1; 2082 } 2083 2084 bool pmd_trans_migrating(pmd_t pmd) 2085 { 2086 struct page *page = pmd_page(pmd); 2087 return PageLocked(page); 2088 } 2089 2090 /* 2091 * Attempt to migrate a misplaced page to the specified destination 2092 * node. Caller is expected to have an elevated reference count on 2093 * the page that will be dropped by this function before returning. 2094 */ 2095 int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, 2096 int node) 2097 { 2098 pg_data_t *pgdat = NODE_DATA(node); 2099 int isolated; 2100 int nr_remaining; 2101 LIST_HEAD(migratepages); 2102 2103 /* 2104 * Don't migrate file pages that are mapped in multiple processes 2105 * with execute permissions as they are probably shared libraries. 2106 */ 2107 if (page_mapcount(page) != 1 && page_is_file_lru(page) && 2108 (vma->vm_flags & VM_EXEC)) 2109 goto out; 2110 2111 /* 2112 * Also do not migrate dirty pages as not all filesystems can move 2113 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 2114 */ 2115 if (page_is_file_lru(page) && PageDirty(page)) 2116 goto out; 2117 2118 isolated = numamigrate_isolate_page(pgdat, page); 2119 if (!isolated) 2120 goto out; 2121 2122 list_add(&page->lru, &migratepages); 2123 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 2124 NULL, node, MIGRATE_ASYNC, 2125 MR_NUMA_MISPLACED); 2126 if (nr_remaining) { 2127 if (!list_empty(&migratepages)) { 2128 list_del(&page->lru); 2129 dec_node_page_state(page, NR_ISOLATED_ANON + 2130 page_is_file_lru(page)); 2131 putback_lru_page(page); 2132 } 2133 isolated = 0; 2134 } else 2135 count_vm_numa_event(NUMA_PAGE_MIGRATE); 2136 BUG_ON(!list_empty(&migratepages)); 2137 return isolated; 2138 2139 out: 2140 put_page(page); 2141 return 0; 2142 } 2143 #endif /* CONFIG_NUMA_BALANCING */ 2144 2145 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 2146 /* 2147 * Migrates a THP to a given target node. page must be locked and is unlocked 2148 * before returning. 
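 *
 * Returns 1 if the huge page was migrated to the target node, 0 otherwise;
 * either way the caller's reference on the page is consumed.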
2149 */ 2150 int migrate_misplaced_transhuge_page(struct mm_struct *mm, 2151 struct vm_area_struct *vma, 2152 pmd_t *pmd, pmd_t entry, 2153 unsigned long address, 2154 struct page *page, int node) 2155 { 2156 spinlock_t *ptl; 2157 pg_data_t *pgdat = NODE_DATA(node); 2158 int isolated = 0; 2159 struct page *new_page = NULL; 2160 int page_lru = page_is_file_lru(page); 2161 unsigned long start = address & HPAGE_PMD_MASK; 2162 2163 new_page = alloc_pages_node(node, 2164 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), 2165 HPAGE_PMD_ORDER); 2166 if (!new_page) 2167 goto out_fail; 2168 prep_transhuge_page(new_page); 2169 2170 isolated = numamigrate_isolate_page(pgdat, page); 2171 if (!isolated) { 2172 put_page(new_page); 2173 goto out_fail; 2174 } 2175 2176 /* Prepare a page as a migration target */ 2177 __SetPageLocked(new_page); 2178 if (PageSwapBacked(page)) 2179 __SetPageSwapBacked(new_page); 2180 2181 /* anon mapping, we can simply copy page->mapping to the new page: */ 2182 new_page->mapping = page->mapping; 2183 new_page->index = page->index; 2184 /* flush the cache before copying using the kernel virtual address */ 2185 flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); 2186 migrate_page_copy(new_page, page); 2187 WARN_ON(PageLRU(new_page)); 2188 2189 /* Recheck the target PMD */ 2190 ptl = pmd_lock(mm, pmd); 2191 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { 2192 spin_unlock(ptl); 2193 2194 /* Reverse changes made by migrate_page_copy() */ 2195 if (TestClearPageActive(new_page)) 2196 SetPageActive(page); 2197 if (TestClearPageUnevictable(new_page)) 2198 SetPageUnevictable(page); 2199 2200 unlock_page(new_page); 2201 put_page(new_page); /* Free it */ 2202 2203 /* Retake the callers reference and putback on LRU */ 2204 get_page(page); 2205 putback_lru_page(page); 2206 mod_node_page_state(page_pgdat(page), 2207 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); 2208 2209 goto out_unlock; 2210 } 2211 2212 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 2213 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 2214 2215 /* 2216 * Overwrite the old entry under pagetable lock and establish 2217 * the new PTE. Any parallel GUP will either observe the old 2218 * page blocking on the page lock, block on the page table 2219 * lock or observe the new page. The SetPageUptodate on the 2220 * new page and page_add_new_anon_rmap guarantee the copy is 2221 * visible before the pagetable update. 2222 */ 2223 page_add_anon_rmap(new_page, vma, start, true); 2224 /* 2225 * At this point the pmd is numa/protnone (i.e. non present) and the TLB 2226 * has already been flushed globally. So no TLB can be currently 2227 * caching this non present pmd mapping. There's no need to clear the 2228 * pmd before doing set_pmd_at(), nor to flush the TLB after 2229 * set_pmd_at(). Clearing the pmd here would introduce a race 2230 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the 2231 * mmap_lock for reading. If the pmd is set to NULL at any given time, 2232 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this 2233 * pmd. 2234 */ 2235 set_pmd_at(mm, start, pmd, entry); 2236 update_mmu_cache_pmd(vma, address, &entry); 2237 2238 page_ref_unfreeze(page, 2); 2239 mlock_migrate_page(new_page, page); 2240 page_remove_rmap(page, true); 2241 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2242 2243 spin_unlock(ptl); 2244 2245 /* Take an "isolate" reference and put new page on the LRU. 
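	 * putback_lru_page() drops a page reference, so the get_page() below
	 * keeps new_page alive for the unlock_page() that follows it.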
*/ 2246 get_page(new_page); 2247 putback_lru_page(new_page); 2248 2249 unlock_page(new_page); 2250 unlock_page(page); 2251 put_page(page); /* Drop the rmap reference */ 2252 put_page(page); /* Drop the LRU isolation reference */ 2253 2254 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 2255 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 2256 2257 mod_node_page_state(page_pgdat(page), 2258 NR_ISOLATED_ANON + page_lru, 2259 -HPAGE_PMD_NR); 2260 return isolated; 2261 2262 out_fail: 2263 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 2264 ptl = pmd_lock(mm, pmd); 2265 if (pmd_same(*pmd, entry)) { 2266 entry = pmd_modify(entry, vma->vm_page_prot); 2267 set_pmd_at(mm, start, pmd, entry); 2268 update_mmu_cache_pmd(vma, address, &entry); 2269 } 2270 spin_unlock(ptl); 2271 2272 out_unlock: 2273 unlock_page(page); 2274 put_page(page); 2275 return 0; 2276 } 2277 #endif /* CONFIG_NUMA_BALANCING */ 2278 2279 #endif /* CONFIG_NUMA */ 2280 2281 #ifdef CONFIG_DEVICE_PRIVATE 2282 static int migrate_vma_collect_skip(unsigned long start, 2283 unsigned long end, 2284 struct mm_walk *walk) 2285 { 2286 struct migrate_vma *migrate = walk->private; 2287 unsigned long addr; 2288 2289 for (addr = start; addr < end; addr += PAGE_SIZE) { 2290 migrate->dst[migrate->npages] = 0; 2291 migrate->src[migrate->npages++] = 0; 2292 } 2293 2294 return 0; 2295 } 2296 2297 static int migrate_vma_collect_hole(unsigned long start, 2298 unsigned long end, 2299 __always_unused int depth, 2300 struct mm_walk *walk) 2301 { 2302 struct migrate_vma *migrate = walk->private; 2303 unsigned long addr; 2304 2305 /* Only allow populating anonymous memory. */ 2306 if (!vma_is_anonymous(walk->vma)) 2307 return migrate_vma_collect_skip(start, end, walk); 2308 2309 for (addr = start; addr < end; addr += PAGE_SIZE) { 2310 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 2311 migrate->dst[migrate->npages] = 0; 2312 migrate->npages++; 2313 migrate->cpages++; 2314 } 2315 2316 return 0; 2317 } 2318 2319 static int migrate_vma_collect_pmd(pmd_t *pmdp, 2320 unsigned long start, 2321 unsigned long end, 2322 struct mm_walk *walk) 2323 { 2324 struct migrate_vma *migrate = walk->private; 2325 struct vm_area_struct *vma = walk->vma; 2326 struct mm_struct *mm = vma->vm_mm; 2327 unsigned long addr = start, unmapped = 0; 2328 spinlock_t *ptl; 2329 pte_t *ptep; 2330 2331 again: 2332 if (pmd_none(*pmdp)) 2333 return migrate_vma_collect_hole(start, end, -1, walk); 2334 2335 if (pmd_trans_huge(*pmdp)) { 2336 struct page *page; 2337 2338 ptl = pmd_lock(mm, pmdp); 2339 if (unlikely(!pmd_trans_huge(*pmdp))) { 2340 spin_unlock(ptl); 2341 goto again; 2342 } 2343 2344 page = pmd_page(*pmdp); 2345 if (is_huge_zero_page(page)) { 2346 spin_unlock(ptl); 2347 split_huge_pmd(vma, pmdp, addr); 2348 if (pmd_trans_unstable(pmdp)) 2349 return migrate_vma_collect_skip(start, end, 2350 walk); 2351 } else { 2352 int ret; 2353 2354 get_page(page); 2355 spin_unlock(ptl); 2356 if (unlikely(!trylock_page(page))) 2357 return migrate_vma_collect_skip(start, end, 2358 walk); 2359 ret = split_huge_page(page); 2360 unlock_page(page); 2361 put_page(page); 2362 if (ret) 2363 return migrate_vma_collect_skip(start, end, 2364 walk); 2365 if (pmd_none(*pmdp)) 2366 return migrate_vma_collect_hole(start, end, -1, 2367 walk); 2368 } 2369 } 2370 2371 if (unlikely(pmd_bad(*pmdp))) 2372 return migrate_vma_collect_skip(start, end, walk); 2373 2374 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2375 arch_enter_lazy_mmu_mode(); 2376 2377 for (; addr < end; addr += PAGE_SIZE, ptep++) { 2378 
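		/*
		 * Inspect each pte in the range: device private entries and
		 * system pages selected by migrate->flags are recorded in the
		 * src array and, when the page can be locked straight away,
		 * the pte is replaced with a migration entry on the spot.
		 */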
unsigned long mpfn = 0, pfn; 2379 struct page *page; 2380 swp_entry_t entry; 2381 pte_t pte; 2382 2383 pte = *ptep; 2384 2385 if (pte_none(pte)) { 2386 if (vma_is_anonymous(vma)) { 2387 mpfn = MIGRATE_PFN_MIGRATE; 2388 migrate->cpages++; 2389 } 2390 goto next; 2391 } 2392 2393 if (!pte_present(pte)) { 2394 /* 2395 * Only care about unaddressable device page special 2396 * page table entry. Other special swap entries are not 2397 * migratable, and we ignore regular swapped page. 2398 */ 2399 entry = pte_to_swp_entry(pte); 2400 if (!is_device_private_entry(entry)) 2401 goto next; 2402 2403 page = device_private_entry_to_page(entry); 2404 if (!(migrate->flags & 2405 MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || 2406 page->pgmap->owner != migrate->pgmap_owner) 2407 goto next; 2408 2409 mpfn = migrate_pfn(page_to_pfn(page)) | 2410 MIGRATE_PFN_MIGRATE; 2411 if (is_write_device_private_entry(entry)) 2412 mpfn |= MIGRATE_PFN_WRITE; 2413 } else { 2414 if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) 2415 goto next; 2416 pfn = pte_pfn(pte); 2417 if (is_zero_pfn(pfn)) { 2418 mpfn = MIGRATE_PFN_MIGRATE; 2419 migrate->cpages++; 2420 goto next; 2421 } 2422 page = vm_normal_page(migrate->vma, addr, pte); 2423 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 2424 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 2425 } 2426 2427 /* FIXME support THP */ 2428 if (!page || !page->mapping || PageTransCompound(page)) { 2429 mpfn = 0; 2430 goto next; 2431 } 2432 2433 /* 2434 * By getting a reference on the page we pin it and that blocks 2435 * any kind of migration. Side effect is that it "freezes" the 2436 * pte. 2437 * 2438 * We drop this reference after isolating the page from the lru 2439 * for non device page (device page are not on the lru and thus 2440 * can't be dropped from it). 2441 */ 2442 get_page(page); 2443 migrate->cpages++; 2444 2445 /* 2446 * Optimize for the common case where page is only mapped once 2447 * in one process. If we can lock the page, then we can safely 2448 * set up a special migration page table entry now. 2449 */ 2450 if (trylock_page(page)) { 2451 pte_t swp_pte; 2452 2453 mpfn |= MIGRATE_PFN_LOCKED; 2454 ptep_get_and_clear(mm, addr, ptep); 2455 2456 /* Setup special migration page table entry */ 2457 entry = make_migration_entry(page, mpfn & 2458 MIGRATE_PFN_WRITE); 2459 swp_pte = swp_entry_to_pte(entry); 2460 if (pte_present(pte)) { 2461 if (pte_soft_dirty(pte)) 2462 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2463 if (pte_uffd_wp(pte)) 2464 swp_pte = pte_swp_mkuffd_wp(swp_pte); 2465 } else { 2466 if (pte_swp_soft_dirty(pte)) 2467 swp_pte = pte_swp_mksoft_dirty(swp_pte); 2468 if (pte_swp_uffd_wp(pte)) 2469 swp_pte = pte_swp_mkuffd_wp(swp_pte); 2470 } 2471 set_pte_at(mm, addr, ptep, swp_pte); 2472 2473 /* 2474 * This is like regular unmap: we remove the rmap and 2475 * drop page refcount. Page won't be freed, as we took 2476 * a reference just above. 
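			 *
			 * Note that only present ptes bump "unmapped", which
			 * is what decides whether the TLB range flush after
			 * the loop is needed at all.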
2477 */ 2478 page_remove_rmap(page, false); 2479 put_page(page); 2480 2481 if (pte_present(pte)) 2482 unmapped++; 2483 } 2484 2485 next: 2486 migrate->dst[migrate->npages] = 0; 2487 migrate->src[migrate->npages++] = mpfn; 2488 } 2489 arch_leave_lazy_mmu_mode(); 2490 pte_unmap_unlock(ptep - 1, ptl); 2491 2492 /* Only flush the TLB if we actually modified any entries */ 2493 if (unmapped) 2494 flush_tlb_range(walk->vma, start, end); 2495 2496 return 0; 2497 } 2498 2499 static const struct mm_walk_ops migrate_vma_walk_ops = { 2500 .pmd_entry = migrate_vma_collect_pmd, 2501 .pte_hole = migrate_vma_collect_hole, 2502 }; 2503 2504 /* 2505 * migrate_vma_collect() - collect pages over a range of virtual addresses 2506 * @migrate: migrate struct containing all migration information 2507 * 2508 * This will walk the CPU page table. For each virtual address backed by a 2509 * valid page, it updates the src array and takes a reference on the page, in 2510 * order to pin the page until we lock it and unmap it. 2511 */ 2512 static void migrate_vma_collect(struct migrate_vma *migrate) 2513 { 2514 struct mmu_notifier_range range; 2515 2516 /* 2517 * Note that the pgmap_owner is passed to the mmu notifier callback so 2518 * that the registered device driver can skip invalidating device 2519 * private page mappings that won't be migrated. 2520 */ 2521 mmu_notifier_range_init_migrate(&range, 0, migrate->vma, 2522 migrate->vma->vm_mm, migrate->start, migrate->end, 2523 migrate->pgmap_owner); 2524 mmu_notifier_invalidate_range_start(&range); 2525 2526 walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, 2527 &migrate_vma_walk_ops, migrate); 2528 2529 mmu_notifier_invalidate_range_end(&range); 2530 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2531 } 2532 2533 /* 2534 * migrate_vma_check_page() - check if page is pinned or not 2535 * @page: struct page to check 2536 * 2537 * Pinned pages cannot be migrated. This is the same test as in 2538 * migrate_page_move_mapping(), except that here we allow migration of a 2539 * ZONE_DEVICE page. 2540 */ 2541 static bool migrate_vma_check_page(struct page *page) 2542 { 2543 /* 2544 * One extra ref because caller holds an extra reference, either from 2545 * isolate_lru_page() for a regular page, or migrate_vma_collect() for 2546 * a device page. 2547 */ 2548 int extra = 1; 2549 2550 /* 2551 * FIXME support THP (transparent huge page), it is bit more complex to 2552 * check them than regular pages, because they can be mapped with a pmd 2553 * or with a pte (split pte mapping). 2554 */ 2555 if (PageCompound(page)) 2556 return false; 2557 2558 /* Page from ZONE_DEVICE have one extra reference */ 2559 if (is_zone_device_page(page)) { 2560 /* 2561 * Private page can never be pin as they have no valid pte and 2562 * GUP will fail for those. Yet if there is a pending migration 2563 * a thread might try to wait on the pte migration entry and 2564 * will bump the page reference count. Sadly there is no way to 2565 * differentiate a regular pin from migration wait. Hence to 2566 * avoid 2 racing thread trying to migrate back to CPU to enter 2567 * infinite loop (one stopping migration because the other is 2568 * waiting on pte migration entry). We always return true here. 2569 * 2570 * FIXME proper solution is to rework migration_entry_wait() so 2571 * it does not need to take a reference on page. 
2572 */ 2573 return is_device_private_page(page); 2574 } 2575 2576 /* For file back page */ 2577 if (page_mapping(page)) 2578 extra += 1 + page_has_private(page); 2579 2580 if ((page_count(page) - extra) > page_mapcount(page)) 2581 return false; 2582 2583 return true; 2584 } 2585 2586 /* 2587 * migrate_vma_prepare() - lock pages and isolate them from the lru 2588 * @migrate: migrate struct containing all migration information 2589 * 2590 * This locks pages that have been collected by migrate_vma_collect(). Once each 2591 * page is locked it is isolated from the lru (for non-device pages). Finally, 2592 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be 2593 * migrated by concurrent kernel threads. 2594 */ 2595 static void migrate_vma_prepare(struct migrate_vma *migrate) 2596 { 2597 const unsigned long npages = migrate->npages; 2598 const unsigned long start = migrate->start; 2599 unsigned long addr, i, restore = 0; 2600 bool allow_drain = true; 2601 2602 lru_add_drain(); 2603 2604 for (i = 0; (i < npages) && migrate->cpages; i++) { 2605 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2606 bool remap = true; 2607 2608 if (!page) 2609 continue; 2610 2611 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { 2612 /* 2613 * Because we are migrating several pages there can be 2614 * a deadlock between 2 concurrent migration where each 2615 * are waiting on each other page lock. 2616 * 2617 * Make migrate_vma() a best effort thing and backoff 2618 * for any page we can not lock right away. 2619 */ 2620 if (!trylock_page(page)) { 2621 migrate->src[i] = 0; 2622 migrate->cpages--; 2623 put_page(page); 2624 continue; 2625 } 2626 remap = false; 2627 migrate->src[i] |= MIGRATE_PFN_LOCKED; 2628 } 2629 2630 /* ZONE_DEVICE pages are not on LRU */ 2631 if (!is_zone_device_page(page)) { 2632 if (!PageLRU(page) && allow_drain) { 2633 /* Drain CPU's pagevec */ 2634 lru_add_drain_all(); 2635 allow_drain = false; 2636 } 2637 2638 if (isolate_lru_page(page)) { 2639 if (remap) { 2640 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2641 migrate->cpages--; 2642 restore++; 2643 } else { 2644 migrate->src[i] = 0; 2645 unlock_page(page); 2646 migrate->cpages--; 2647 put_page(page); 2648 } 2649 continue; 2650 } 2651 2652 /* Drop the reference we took in collect */ 2653 put_page(page); 2654 } 2655 2656 if (!migrate_vma_check_page(page)) { 2657 if (remap) { 2658 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2659 migrate->cpages--; 2660 restore++; 2661 2662 if (!is_zone_device_page(page)) { 2663 get_page(page); 2664 putback_lru_page(page); 2665 } 2666 } else { 2667 migrate->src[i] = 0; 2668 unlock_page(page); 2669 migrate->cpages--; 2670 2671 if (!is_zone_device_page(page)) 2672 putback_lru_page(page); 2673 else 2674 put_page(page); 2675 } 2676 } 2677 } 2678 2679 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { 2680 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2681 2682 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2683 continue; 2684 2685 remove_migration_pte(page, migrate->vma, addr, page); 2686 2687 migrate->src[i] = 0; 2688 unlock_page(page); 2689 put_page(page); 2690 restore--; 2691 } 2692 } 2693 2694 /* 2695 * migrate_vma_unmap() - replace page mapping with special migration pte entry 2696 * @migrate: migrate struct containing all migration information 2697 * 2698 * Replace page mapping (CPU page table pte) with a special migration pte entry 2699 * and check again if it has been pinned. Pinned pages are restored because we 2700 * cannot migrate them. 
2701 * 2702 * This is the last step before we call the device driver callback to allocate 2703 * destination memory and copy contents of original page over to new page. 2704 */ 2705 static void migrate_vma_unmap(struct migrate_vma *migrate) 2706 { 2707 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; 2708 const unsigned long npages = migrate->npages; 2709 const unsigned long start = migrate->start; 2710 unsigned long addr, i, restore = 0; 2711 2712 for (i = 0; i < npages; i++) { 2713 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2714 2715 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2716 continue; 2717 2718 if (page_mapped(page)) { 2719 try_to_unmap(page, flags); 2720 if (page_mapped(page)) 2721 goto restore; 2722 } 2723 2724 if (migrate_vma_check_page(page)) 2725 continue; 2726 2727 restore: 2728 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2729 migrate->cpages--; 2730 restore++; 2731 } 2732 2733 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { 2734 struct page *page = migrate_pfn_to_page(migrate->src[i]); 2735 2736 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2737 continue; 2738 2739 remove_migration_ptes(page, page, false); 2740 2741 migrate->src[i] = 0; 2742 unlock_page(page); 2743 restore--; 2744 2745 if (is_zone_device_page(page)) 2746 put_page(page); 2747 else 2748 putback_lru_page(page); 2749 } 2750 } 2751 2752 /** 2753 * migrate_vma_setup() - prepare to migrate a range of memory 2754 * @args: contains the vma, start, and pfns arrays for the migration 2755 * 2756 * Returns: negative errno on failures, 0 when 0 or more pages were migrated 2757 * without an error. 2758 * 2759 * Prepare to migrate a range of memory virtual address range by collecting all 2760 * the pages backing each virtual address in the range, saving them inside the 2761 * src array. Then lock those pages and unmap them. Once the pages are locked 2762 * and unmapped, check whether each page is pinned or not. Pages that aren't 2763 * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the 2764 * corresponding src array entry. Then restores any pages that are pinned, by 2765 * remapping and unlocking those pages. 2766 * 2767 * The caller should then allocate destination memory and copy source memory to 2768 * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE 2769 * flag set). Once these are allocated and copied, the caller must update each 2770 * corresponding entry in the dst array with the pfn value of the destination 2771 * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set 2772 * (destination pages must have their struct pages locked, via lock_page()). 2773 * 2774 * Note that the caller does not have to migrate all the pages that are marked 2775 * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from 2776 * device memory to system memory. If the caller cannot migrate a device page 2777 * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe 2778 * consequences for the userspace process, so it must be avoided if at all 2779 * possible. 2780 * 2781 * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we 2782 * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus 2783 * allowing the caller to allocate device memory for those unbacked virtual 2784 * addresses. For this the caller simply has to allocate device memory and 2785 * properly set the destination entry like for regular migration. 
Note that 2786 * this can still fail, and thus inside the device driver you must check if the 2787 * migration was successful for those entries after calling migrate_vma_pages(), 2788 * just like for regular migration. 2789 * 2790 * After that, the callers must call migrate_vma_pages() to go over each entry 2791 * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag 2792 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, 2793 * then migrate_vma_pages() to migrate struct page information from the source 2794 * struct page to the destination struct page. If it fails to migrate the 2795 * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the 2796 * src array. 2797 * 2798 * At this point all successfully migrated pages have an entry in the src 2799 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 2800 * array entry with MIGRATE_PFN_VALID flag set. 2801 * 2802 * Once migrate_vma_pages() returns the caller may inspect which pages were 2803 * successfully migrated, and which were not. Successfully migrated pages will 2804 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. 2805 * 2806 * It is safe to update device page table after migrate_vma_pages() because 2807 * both destination and source page are still locked, and the mmap_lock is held 2808 * in read mode (hence no one can unmap the range being migrated). 2809 * 2810 * Once the caller is done cleaning up things and updating its page table (if it 2811 * chose to do so, this is not an obligation) it finally calls 2812 * migrate_vma_finalize() to update the CPU page table to point to new pages 2813 * for successfully migrated pages or otherwise restore the CPU page table to 2814 * point to the original source pages. 2815 */ 2816 int migrate_vma_setup(struct migrate_vma *args) 2817 { 2818 long nr_pages = (args->end - args->start) >> PAGE_SHIFT; 2819 2820 args->start &= PAGE_MASK; 2821 args->end &= PAGE_MASK; 2822 if (!args->vma || is_vm_hugetlb_page(args->vma) || 2823 (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) 2824 return -EINVAL; 2825 if (nr_pages <= 0) 2826 return -EINVAL; 2827 if (args->start < args->vma->vm_start || 2828 args->start >= args->vma->vm_end) 2829 return -EINVAL; 2830 if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) 2831 return -EINVAL; 2832 if (!args->src || !args->dst) 2833 return -EINVAL; 2834 2835 memset(args->src, 0, sizeof(*args->src) * nr_pages); 2836 args->cpages = 0; 2837 args->npages = 0; 2838 2839 migrate_vma_collect(args); 2840 2841 if (args->cpages) 2842 migrate_vma_prepare(args); 2843 if (args->cpages) 2844 migrate_vma_unmap(args); 2845 2846 /* 2847 * At this point pages are locked and unmapped, and thus they have 2848 * stable content and can safely be copied to destination memory that 2849 * is allocated by the drivers. 2850 */ 2851 return 0; 2852 2853 } 2854 EXPORT_SYMBOL(migrate_vma_setup); 2855 2856 /* 2857 * This code closely matches the code in: 2858 * __handle_mm_fault() 2859 * handle_pte_fault() 2860 * do_anonymous_page() 2861 * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE 2862 * private page. 
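 *
 * On any failure the MIGRATE_PFN_MIGRATE bit is cleared in *src, so the
 * caller can tell that this address was not populated.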
2863 */ 2864 static void migrate_vma_insert_page(struct migrate_vma *migrate, 2865 unsigned long addr, 2866 struct page *page, 2867 unsigned long *src) 2868 { 2869 struct vm_area_struct *vma = migrate->vma; 2870 struct mm_struct *mm = vma->vm_mm; 2871 bool flush = false; 2872 spinlock_t *ptl; 2873 pte_t entry; 2874 pgd_t *pgdp; 2875 p4d_t *p4dp; 2876 pud_t *pudp; 2877 pmd_t *pmdp; 2878 pte_t *ptep; 2879 2880 /* Only allow populating anonymous memory */ 2881 if (!vma_is_anonymous(vma)) 2882 goto abort; 2883 2884 pgdp = pgd_offset(mm, addr); 2885 p4dp = p4d_alloc(mm, pgdp, addr); 2886 if (!p4dp) 2887 goto abort; 2888 pudp = pud_alloc(mm, p4dp, addr); 2889 if (!pudp) 2890 goto abort; 2891 pmdp = pmd_alloc(mm, pudp, addr); 2892 if (!pmdp) 2893 goto abort; 2894 2895 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 2896 goto abort; 2897 2898 /* 2899 * Use pte_alloc() instead of pte_alloc_map(). We can't run 2900 * pte_offset_map() on pmds where a huge pmd might be created 2901 * from a different thread. 2902 * 2903 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when 2904 * parallel threads are excluded by other means. 2905 * 2906 * Here we only have mmap_read_lock(mm). 2907 */ 2908 if (pte_alloc(mm, pmdp)) 2909 goto abort; 2910 2911 /* See the comment in pte_alloc_one_map() */ 2912 if (unlikely(pmd_trans_unstable(pmdp))) 2913 goto abort; 2914 2915 if (unlikely(anon_vma_prepare(vma))) 2916 goto abort; 2917 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) 2918 goto abort; 2919 2920 /* 2921 * The memory barrier inside __SetPageUptodate makes sure that 2922 * preceding stores to the page contents become visible before 2923 * the set_pte_at() write. 2924 */ 2925 __SetPageUptodate(page); 2926 2927 if (is_zone_device_page(page)) { 2928 if (is_device_private_page(page)) { 2929 swp_entry_t swp_entry; 2930 2931 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); 2932 entry = swp_entry_to_pte(swp_entry); 2933 } else { 2934 /* 2935 * For now we only support migrating to un-addressable 2936 * device memory. 2937 */ 2938 pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); 2939 goto abort; 2940 } 2941 } else { 2942 entry = mk_pte(page, vma->vm_page_prot); 2943 if (vma->vm_flags & VM_WRITE) 2944 entry = pte_mkwrite(pte_mkdirty(entry)); 2945 } 2946 2947 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2948 2949 if (check_stable_address_space(mm)) 2950 goto unlock_abort; 2951 2952 if (pte_present(*ptep)) { 2953 unsigned long pfn = pte_pfn(*ptep); 2954 2955 if (!is_zero_pfn(pfn)) 2956 goto unlock_abort; 2957 flush = true; 2958 } else if (!pte_none(*ptep)) 2959 goto unlock_abort; 2960 2961 /* 2962 * Check for userfaultfd but do not deliver the fault. Instead, 2963 * just back off. 
2964 */ 2965 if (userfaultfd_missing(vma)) 2966 goto unlock_abort; 2967 2968 inc_mm_counter(mm, MM_ANONPAGES); 2969 page_add_new_anon_rmap(page, vma, addr, false); 2970 if (!is_zone_device_page(page)) 2971 lru_cache_add_inactive_or_unevictable(page, vma); 2972 get_page(page); 2973 2974 if (flush) { 2975 flush_cache_page(vma, addr, pte_pfn(*ptep)); 2976 ptep_clear_flush_notify(vma, addr, ptep); 2977 set_pte_at_notify(mm, addr, ptep, entry); 2978 update_mmu_cache(vma, addr, ptep); 2979 } else { 2980 /* No need to invalidate - it was non-present before */ 2981 set_pte_at(mm, addr, ptep, entry); 2982 update_mmu_cache(vma, addr, ptep); 2983 } 2984 2985 pte_unmap_unlock(ptep, ptl); 2986 *src = MIGRATE_PFN_MIGRATE; 2987 return; 2988 2989 unlock_abort: 2990 pte_unmap_unlock(ptep, ptl); 2991 abort: 2992 *src &= ~MIGRATE_PFN_MIGRATE; 2993 } 2994 2995 /** 2996 * migrate_vma_pages() - migrate meta-data from src page to dst page 2997 * @migrate: migrate struct containing all migration information 2998 * 2999 * This migrates struct page meta-data from source struct page to destination 3000 * struct page. This effectively finishes the migration from source page to the 3001 * destination page. 3002 */ 3003 void migrate_vma_pages(struct migrate_vma *migrate) 3004 { 3005 const unsigned long npages = migrate->npages; 3006 const unsigned long start = migrate->start; 3007 struct mmu_notifier_range range; 3008 unsigned long addr, i; 3009 bool notified = false; 3010 3011 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 3012 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 3013 struct page *page = migrate_pfn_to_page(migrate->src[i]); 3014 struct address_space *mapping; 3015 int r; 3016 3017 if (!newpage) { 3018 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 3019 continue; 3020 } 3021 3022 if (!page) { 3023 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 3024 continue; 3025 if (!notified) { 3026 notified = true; 3027 3028 mmu_notifier_range_init_migrate(&range, 0, 3029 migrate->vma, migrate->vma->vm_mm, 3030 addr, migrate->end, 3031 migrate->pgmap_owner); 3032 mmu_notifier_invalidate_range_start(&range); 3033 } 3034 migrate_vma_insert_page(migrate, addr, newpage, 3035 &migrate->src[i]); 3036 continue; 3037 } 3038 3039 mapping = page_mapping(page); 3040 3041 if (is_zone_device_page(newpage)) { 3042 if (is_device_private_page(newpage)) { 3043 /* 3044 * For now only support private anonymous when 3045 * migrating to un-addressable device memory. 3046 */ 3047 if (mapping) { 3048 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 3049 continue; 3050 } 3051 } else { 3052 /* 3053 * Other types of ZONE_DEVICE page are not 3054 * supported. 3055 */ 3056 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 3057 continue; 3058 } 3059 } 3060 3061 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 3062 if (r != MIGRATEPAGE_SUCCESS) 3063 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 3064 } 3065 3066 /* 3067 * No need to double call mmu_notifier->invalidate_range() callback as 3068 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 3069 * did already call it. 
3070 */ 3071 if (notified) 3072 mmu_notifier_invalidate_range_only_end(&range); 3073 } 3074 EXPORT_SYMBOL(migrate_vma_pages); 3075 3076 /** 3077 * migrate_vma_finalize() - restore CPU page table entry 3078 * @migrate: migrate struct containing all migration information 3079 * 3080 * This replaces the special migration pte entry with either a mapping to the 3081 * new page if migration was successful for that page, or to the original page 3082 * otherwise. 3083 * 3084 * This also unlocks the pages and puts them back on the lru, or drops the extra 3085 * refcount, for device pages. 3086 */ 3087 void migrate_vma_finalize(struct migrate_vma *migrate) 3088 { 3089 const unsigned long npages = migrate->npages; 3090 unsigned long i; 3091 3092 for (i = 0; i < npages; i++) { 3093 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 3094 struct page *page = migrate_pfn_to_page(migrate->src[i]); 3095 3096 if (!page) { 3097 if (newpage) { 3098 unlock_page(newpage); 3099 put_page(newpage); 3100 } 3101 continue; 3102 } 3103 3104 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 3105 if (newpage) { 3106 unlock_page(newpage); 3107 put_page(newpage); 3108 } 3109 newpage = page; 3110 } 3111 3112 remove_migration_ptes(page, newpage, false); 3113 unlock_page(page); 3114 3115 if (is_zone_device_page(page)) 3116 put_page(page); 3117 else 3118 putback_lru_page(page); 3119 3120 if (newpage != page) { 3121 unlock_page(newpage); 3122 if (is_zone_device_page(newpage)) 3123 put_page(newpage); 3124 else 3125 putback_lru_page(newpage); 3126 } 3127 } 3128 } 3129 EXPORT_SYMBOL(migrate_vma_finalize); 3130 #endif /* CONFIG_DEVICE_PRIVATE */ 3131