// SPDX-License-Identifier: GPL-2.0
/*
 * Device Memory Migration functionality.
 *
 * Originally written by Jérôme Glisse.
 */
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include "internal.h"

static int migrate_vma_collect_skip(unsigned long start,
				    unsigned long end,
				    struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = 0;
	}

	return 0;
}

static int migrate_vma_collect_hole(unsigned long start,
				    unsigned long end,
				    __always_unused int depth,
				    struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	unsigned long addr;

	/* Only allow populating anonymous memory. */
	if (!vma_is_anonymous(walk->vma))
		return migrate_vma_collect_skip(start, end, walk);

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
		migrate->dst[migrate->npages] = 0;
		migrate->npages++;
		migrate->cpages++;
	}

	return 0;
}

static int migrate_vma_collect_pmd(pmd_t *pmdp,
				   unsigned long start,
				   unsigned long end,
				   struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	struct folio *fault_folio = migrate->fault_page ?
		page_folio(migrate->fault_page) : NULL;
	struct vm_area_struct *vma = walk->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = start, unmapped = 0;
	spinlock_t *ptl;
	pte_t *ptep;

again:
	if (pmd_none(*pmdp))
		return migrate_vma_collect_hole(start, end, -1, walk);

	if (pmd_trans_huge(*pmdp)) {
		struct folio *folio;

		ptl = pmd_lock(mm, pmdp);
		if (unlikely(!pmd_trans_huge(*pmdp))) {
			spin_unlock(ptl);
			goto again;
		}

		folio = pmd_folio(*pmdp);
		if (is_huge_zero_folio(folio)) {
			spin_unlock(ptl);
			split_huge_pmd(vma, pmdp, addr);
		} else {
			int ret;

			folio_get(folio);
			spin_unlock(ptl);
			/* FIXME: we don't expect THP for fault_folio */
			if (WARN_ON_ONCE(fault_folio == folio))
				return migrate_vma_collect_skip(start, end,
								walk);
			if (unlikely(!folio_trylock(folio)))
				return migrate_vma_collect_skip(start, end,
								walk);
			ret = split_folio(folio);
			if (fault_folio != folio)
				folio_unlock(folio);
			folio_put(folio);
			if (ret)
				return migrate_vma_collect_skip(start, end,
								walk);
		}
	}

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	arch_enter_lazy_mmu_mode();

	for (; addr < end; addr += PAGE_SIZE, ptep++) {
		unsigned long mpfn = 0, pfn;
		struct folio *folio;
		struct page *page;
		swp_entry_t entry;
		pte_t pte;

		pte = ptep_get(ptep);

		if (pte_none(pte)) {
			if (vma_is_anonymous(vma)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
			}
			goto next;
		}

		if (!pte_present(pte)) {
			/*
			 * Only care about unaddressable device page special
			 * page table entries. Other special swap entries are
			 * not migratable, and we ignore regular swapped pages.
			 */
			entry = pte_to_swp_entry(pte);
			if (!is_device_private_entry(entry))
				goto next;

			page = pfn_swap_entry_to_page(entry);
			if (!(migrate->flags &
			      MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
			    page->pgmap->owner != migrate->pgmap_owner)
				goto next;

			mpfn = migrate_pfn(page_to_pfn(page)) |
					MIGRATE_PFN_MIGRATE;
			if (is_writable_device_private_entry(entry))
				mpfn |= MIGRATE_PFN_WRITE;
		} else {
			pfn = pte_pfn(pte);
			if (is_zero_pfn(pfn) &&
			    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
				goto next;
			}
			page = vm_normal_page(migrate->vma, addr, pte);
			if (page && !is_zone_device_page(page) &&
			    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
				goto next;
			else if (page && is_device_coherent_page(page) &&
			    (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
			     page->pgmap->owner != migrate->pgmap_owner))
				goto next;
			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
		}

		/* FIXME support THP */
		if (!page || !page->mapping || PageTransCompound(page)) {
			mpfn = 0;
			goto next;
		}

		/*
		 * By getting a reference on the folio we pin it and that blocks
		 * any kind of migration. A side effect is that it "freezes" the
		 * pte.
		 *
		 * We drop this reference after isolating the folio from the lru
		 * for non-device folios (device folios are not on the lru and
		 * thus can't be dropped from it).
		 */
		folio = page_folio(page);
		folio_get(folio);

		/*
		 * We rely on folio_trylock() to avoid deadlock between
		 * concurrent migrations where each is waiting on the other's
		 * folio lock. If we can't immediately lock the folio we fail
		 * this migration as it is only best effort anyway.
		 *
		 * If we can lock the folio it's safe to set up a migration
		 * entry now. In the common case where the folio is mapped once
		 * in a single process, setting up the migration entry now is an
		 * optimisation to avoid walking the rmap later with
		 * try_to_migrate().
		 */
		if (fault_folio == folio || folio_trylock(folio)) {
			bool anon_exclusive;
			pte_t swp_pte;

			flush_cache_page(vma, addr, pte_pfn(pte));
			anon_exclusive = folio_test_anon(folio) &&
					 PageAnonExclusive(page);
			if (anon_exclusive) {
				pte = ptep_clear_flush(vma, addr, ptep);

				if (folio_try_share_anon_rmap_pte(folio, page)) {
					set_pte_at(mm, addr, ptep, pte);
					if (fault_folio != folio)
						folio_unlock(folio);
					folio_put(folio);
					mpfn = 0;
					goto next;
				}
			} else {
				pte = ptep_get_and_clear(mm, addr, ptep);
			}

			migrate->cpages++;

			/* Set the dirty flag on the folio now the pte is gone. */
			if (pte_dirty(pte))
				folio_mark_dirty(folio);

			/* Setup special migration page table entry */
			if (mpfn & MIGRATE_PFN_WRITE)
				entry = make_writable_migration_entry(
							page_to_pfn(page));
			else if (anon_exclusive)
				entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page));
			else
				entry = make_readable_migration_entry(
							page_to_pfn(page));
			if (pte_present(pte)) {
				if (pte_young(pte))
					entry = make_migration_entry_young(entry);
				if (pte_dirty(pte))
					entry = make_migration_entry_dirty(entry);
			}
			swp_pte = swp_entry_to_pte(entry);
			if (pte_present(pte)) {
				if (pte_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				if (pte_swp_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			set_pte_at(mm, addr, ptep, swp_pte);

			/*
			 * This is like regular unmap: we remove the rmap and
			 * drop the folio refcount. The folio won't be freed, as
			 * we took a reference just above.
			 */
			folio_remove_rmap_pte(folio, page, vma);
			folio_put(folio);

			if (pte_present(pte))
				unmapped++;
		} else {
			folio_put(folio);
			mpfn = 0;
		}

next:
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = mpfn;
	}

	/* Only flush the TLB if we actually modified any entries */
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(ptep - 1, ptl);

	return 0;
}

static const struct mm_walk_ops migrate_vma_walk_ops = {
	.pmd_entry = migrate_vma_collect_pmd,
	.pte_hole = migrate_vma_collect_hole,
	.walk_lock = PGWALK_RDLOCK,
};

/*
 * migrate_vma_collect() - collect pages over a range of virtual addresses
 * @migrate: migrate struct containing all migration information
 *
 * This will walk the CPU page table. For each virtual address backed by a
 * valid page, it updates the src array and takes a reference on the page, in
 * order to pin the page until we lock it and unmap it.
 */
static void migrate_vma_collect(struct migrate_vma *migrate)
{
	struct mmu_notifier_range range;

	/*
	 * Note that the pgmap_owner is passed to the mmu notifier callback so
	 * that the registered device driver can skip invalidating device
	 * private page mappings that won't be migrated.
	 */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
		migrate->vma->vm_mm, migrate->start, migrate->end,
		migrate->pgmap_owner);
	mmu_notifier_invalidate_range_start(&range);

	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
			&migrate_vma_walk_ops, migrate);

	mmu_notifier_invalidate_range_end(&range);
	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
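
/*
 * Illustrative note (comment only, not compiled): every non-zero entry that
 * the collection step stores in the src array is a pfn packed by migrate_pfn()
 * (which also sets MIGRATE_PFN_VALID), together with MIGRATE_PFN_MIGRATE and,
 * for writable mappings, MIGRATE_PFN_WRITE. A caller typically unpacks the
 * entries roughly as sketched below, where "args" is the struct migrate_vma
 * passed to migrate_vma_setup() further down; entries without
 * MIGRATE_PFN_VALID (empty ptes that may still carry MIGRATE_PFN_MIGRATE)
 * yield a NULL page, and the elided body is the driver-specific allocation of
 * args->dst[i]:
 *
 *	for (i = 0; i < args->npages; i++) {
 *		struct page *page = migrate_pfn_to_page(args->src[i]);
 *
 *		if (!(args->src[i] & MIGRATE_PFN_MIGRATE))
 *			continue;
 *		... allocate and fill args->dst[i] here ...
 *	}
 */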

/*
 * migrate_vma_check_page() - check if page is pinned or not
 * @page: struct page to check
 *
 * Pinned pages cannot be migrated. This is the same test as in
 * folio_migrate_mapping(), except that here we allow migration of a
 * ZONE_DEVICE page.
 */
static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
	struct folio *folio = page_folio(page);

	/*
	 * One extra ref because the caller holds an extra reference, either
	 * from folio_isolate_lru() for a regular folio, or
	 * migrate_vma_collect() for a device folio.
	 */
	int extra = 1 + (page == fault_page);

	/*
	 * FIXME support THP (transparent huge page), it is a bit more complex
	 * to check them than regular pages, because they can be mapped with a
	 * pmd or with a pte (split pte mapping).
	 */
	if (folio_test_large(folio))
		return false;

	/* Pages from ZONE_DEVICE have one extra reference */
	if (folio_is_zone_device(folio))
		extra++;

	/* For file-backed pages */
	if (folio_mapping(folio))
		extra += 1 + folio_has_private(folio);

	if ((folio_ref_count(folio) - extra) > folio_mapcount(folio))
		return false;

	return true;
}

/*
 * Unmaps pages for migration. Returns number of source pfns marked as
 * migrating.
 */
static unsigned long migrate_device_unmap(unsigned long *src_pfns,
					  unsigned long npages,
					  struct page *fault_page)
{
	struct folio *fault_folio = fault_page ?
		page_folio(fault_page) : NULL;
	unsigned long i, restore = 0;
	bool allow_drain = true;
	unsigned long unmapped = 0;

	lru_add_drain();

	for (i = 0; i < npages; i++) {
		struct page *page = migrate_pfn_to_page(src_pfns[i]);
		struct folio *folio;

		if (!page) {
			if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
				unmapped++;
			continue;
		}

		folio = page_folio(page);
		/* ZONE_DEVICE folios are not on LRU */
		if (!folio_is_zone_device(folio)) {
			if (!folio_test_lru(folio) && allow_drain) {
				/* Drain CPU's lru cache */
				lru_add_drain_all();
				allow_drain = false;
			}

			if (!folio_isolate_lru(folio)) {
				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
				restore++;
				continue;
			}

			/* Drop the reference we took in collect */
			folio_put(folio);
		}

		if (folio_mapped(folio))
			try_to_migrate(folio, 0);

		if (folio_mapped(folio) ||
		    !migrate_vma_check_page(page, fault_page)) {
			if (!folio_is_zone_device(folio)) {
				folio_get(folio);
				folio_putback_lru(folio);
			}

			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
			restore++;
			continue;
		}

		unmapped++;
	}

	for (i = 0; i < npages && restore; i++) {
		struct page *page = migrate_pfn_to_page(src_pfns[i]);
		struct folio *folio;

		if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
			continue;

		folio = page_folio(page);
		remove_migration_ptes(folio, folio, 0);

		src_pfns[i] = 0;
		if (fault_folio != folio)
			folio_unlock(folio);
		folio_put(folio);
		restore--;
	}

	return unmapped;
}

/*
 * migrate_vma_unmap() - replace page mapping with special migration pte entry
 * @migrate: migrate struct containing all migration information
 *
 * Isolate pages from the LRU and replace their mappings (CPU page table ptes)
 * with special migration pte entries, then check whether the pages have been
 * pinned. Pinned pages are restored because we cannot migrate them.
 *
 * This is the last step before we call the device driver callback to allocate
 * destination memory and copy the contents of the original page over to the
 * new page.
 */
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
	migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
					migrate->fault_page);
}

/**
 * migrate_vma_setup() - prepare to migrate a range of memory
 * @args: contains the vma, start, and pfns arrays for the migration
 *
 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
 * without an error.
 *
 * Prepare to migrate a range of virtual addresses by collecting all the pages
 * backing each virtual address in the range, saving them inside the src array.
 * Then lock those pages and unmap them. Once the pages are locked and
 * unmapped, check whether each page is pinned or not. Pages that aren't pinned
 * have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
 * corresponding src array entry. Any pages that are pinned are then restored
 * by remapping and unlocking them.
 *
 * The caller should then allocate destination memory and copy source memory to
 * it for all those entries (i.e. the ones with both the MIGRATE_PFN_VALID and
 * MIGRATE_PFN_MIGRATE flags set). Once these are allocated and copied, the
 * caller must update each corresponding entry in the dst array with the pfn
 * value of the destination page and with MIGRATE_PFN_VALID. Destination pages
 * must be locked via lock_page().
 *
 * Note that the caller does not have to migrate all the pages that are marked
 * with the MIGRATE_PFN_MIGRATE flag in the src array unless this is a
 * migration from device memory to system memory. If the caller cannot migrate
 * a device page back to system memory, then it must return VM_FAULT_SIGBUS,
 * which has severe consequences for the userspace process, so it must be
 * avoided if at all possible.
 *
 * For empty entries in the CPU page table (i.e. pte_none() or pmd_none() is
 * true) we do set the MIGRATE_PFN_MIGRATE flag in the corresponding src array
 * entry, thus allowing the caller to allocate device memory for those unbacked
 * virtual addresses. For this the caller simply has to allocate device memory
 * and properly set the destination entry like for regular migration. Note that
 * this can still fail, and thus inside the device driver you must check if the
 * migration was successful for those entries after calling
 * migrate_vma_pages(), just like for regular migration.
 *
 * After that, the caller must call migrate_vma_pages(), which goes over each
 * entry in the src array that has the MIGRATE_PFN_VALID and
 * MIGRATE_PFN_MIGRATE flags set. If the corresponding entry in the dst array
 * has the MIGRATE_PFN_VALID flag set, migrate_vma_pages() migrates the struct
 * page information from the source struct page to the destination struct
 * page. If it fails to migrate the struct page information, it clears the
 * MIGRATE_PFN_MIGRATE flag in the src array entry.
 *
 * At this point all successfully migrated pages have an entry in the src
 * array with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags set and the
 * dst array entry with the MIGRATE_PFN_VALID flag set.
 *
 * Once migrate_vma_pages() returns the caller may inspect which pages were
 * successfully migrated, and which were not. Successfully migrated pages will
 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
 *
 * It is safe to update the device page table after migrate_vma_pages() because
 * both the destination and source pages are still locked, and the mmap_lock is
 * held in read mode (hence no one can unmap the range being migrated).
 *
 * Once the caller is done cleaning up things and updating its page table (if
 * it chose to do so; this is not an obligation), it finally calls
 * migrate_vma_finalize() to update the CPU page table to point to the new
 * pages for successfully migrated pages, or otherwise restore the CPU page
 * table to point to the original source pages.
 */
int migrate_vma_setup(struct migrate_vma *args)
{
	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;

	args->start &= PAGE_MASK;
	args->end &= PAGE_MASK;
	if (!args->vma || is_vm_hugetlb_page(args->vma) ||
	    (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
		return -EINVAL;
	if (nr_pages <= 0)
		return -EINVAL;
	if (args->start < args->vma->vm_start ||
	    args->start >= args->vma->vm_end)
		return -EINVAL;
	if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
		return -EINVAL;
	if (!args->src || !args->dst)
		return -EINVAL;
	if (args->fault_page && !is_device_private_page(args->fault_page))
		return -EINVAL;
	if (args->fault_page && !PageLocked(args->fault_page))
		return -EINVAL;

	memset(args->src, 0, sizeof(*args->src) * nr_pages);
	args->cpages = 0;
	args->npages = 0;

	migrate_vma_collect(args);

	if (args->cpages)
		migrate_vma_unmap(args);

	/*
	 * At this point pages are locked and unmapped, and thus they have
	 * stable content and can safely be copied to destination memory that
	 * is allocated by the drivers.
	 */
	return 0;

}
EXPORT_SYMBOL(migrate_vma_setup);
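
/*
 * Illustrative sketch (comment only, not compiled): the usual driver-side
 * sequence built around migrate_vma_setup(), migrate_vma_pages() and
 * migrate_vma_finalize(), shown for a single page migrating from system to
 * device memory. "vma" and "addr" come from the surrounding fault or ioctl
 * context, "dev_private_owner" stands for the driver's dev_pagemap owner
 * value, and dev_alloc_page()/dev_copy_to_device() are hypothetical driver
 * helpers. Error handling and the unbacked (hole) case, where a src entry has
 * MIGRATE_PFN_MIGRATE set but no source page, are omitted. After
 * migrate_vma_pages() the driver can test src & MIGRATE_PFN_MIGRATE to learn
 * whether the page really migrated before calling migrate_vma_finalize().
 *
 *	unsigned long src = 0, dst = 0;
 *	struct page *spage, *dpage;
 *	struct migrate_vma args = {
 *		.vma		= vma,
 *		.start		= addr,
 *		.end		= addr + PAGE_SIZE,
 *		.src		= &src,
 *		.dst		= &dst,
 *		.pgmap_owner	= dev_private_owner,
 *		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
 *	};
 *
 *	if (migrate_vma_setup(&args))
 *		return -EINVAL;
 *
 *	spage = migrate_pfn_to_page(src);
 *	if (args.cpages && spage && (src & MIGRATE_PFN_MIGRATE)) {
 *		dpage = dev_alloc_page();
 *		if (dpage) {
 *			lock_page(dpage);
 *			dev_copy_to_device(dpage, spage);
 *			dst = migrate_pfn(page_to_pfn(dpage));
 *		}
 *	}
 *
 *	migrate_vma_pages(&args);
 *	migrate_vma_finalize(&args);
 */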

/*
 * This code closely matches the code in:
 *   __handle_mm_fault()
 *     handle_pte_fault()
 *       do_anonymous_page()
 * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
 * private or coherent page.
 */
static void migrate_vma_insert_page(struct migrate_vma *migrate,
				    unsigned long addr,
				    struct page *page,
				    unsigned long *src)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = migrate->vma;
	struct mm_struct *mm = vma->vm_mm;
	bool flush = false;
	spinlock_t *ptl;
	pte_t entry;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	pte_t orig_pte;

	/* Only allow populating anonymous memory */
	if (!vma_is_anonymous(vma))
		goto abort;

	pgdp = pgd_offset(mm, addr);
	p4dp = p4d_alloc(mm, pgdp, addr);
	if (!p4dp)
		goto abort;
	pudp = pud_alloc(mm, p4dp, addr);
	if (!pudp)
		goto abort;
	pmdp = pmd_alloc(mm, pudp, addr);
	if (!pmdp)
		goto abort;
	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
		goto abort;
	if (pte_alloc(mm, pmdp))
		goto abort;
	if (unlikely(anon_vma_prepare(vma)))
		goto abort;
	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
		goto abort;

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the folio contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	if (folio_is_device_private(folio)) {
		swp_entry_t swp_entry;

		if (vma->vm_flags & VM_WRITE)
			swp_entry = make_writable_device_private_entry(
						page_to_pfn(page));
		else
			swp_entry = make_readable_device_private_entry(
						page_to_pfn(page));
		entry = swp_entry_to_pte(swp_entry);
	} else {
		if (folio_is_zone_device(folio) &&
		    !folio_is_device_coherent(folio)) {
			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
			goto abort;
		}
		entry = mk_pte(page, vma->vm_page_prot);
		if (vma->vm_flags & VM_WRITE)
			entry = pte_mkwrite(pte_mkdirty(entry), vma);
	}

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto abort;
	orig_pte = ptep_get(ptep);

	if (check_stable_address_space(mm))
		goto unlock_abort;

	if (pte_present(orig_pte)) {
		unsigned long pfn = pte_pfn(orig_pte);

		if (!is_zero_pfn(pfn))
			goto unlock_abort;
		flush = true;
	} else if (!pte_none(orig_pte))
		goto unlock_abort;

	/*
	 * Check for userfaultfd but do not deliver the fault. Instead,
	 * just back off.
	 */
	if (userfaultfd_missing(vma))
		goto unlock_abort;

	inc_mm_counter(mm, MM_ANONPAGES);
	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
	if (!folio_is_zone_device(folio))
		folio_add_lru_vma(folio, vma);
	folio_get(folio);

	if (flush) {
		flush_cache_page(vma, addr, pte_pfn(orig_pte));
		ptep_clear_flush(vma, addr, ptep);
	}
	set_pte_at(mm, addr, ptep, entry);
	update_mmu_cache(vma, addr, ptep);

	pte_unmap_unlock(ptep, ptl);
	*src = MIGRATE_PFN_MIGRATE;
	return;

unlock_abort:
	pte_unmap_unlock(ptep, ptl);
abort:
	*src &= ~MIGRATE_PFN_MIGRATE;
}

static void __migrate_device_pages(unsigned long *src_pfns,
				unsigned long *dst_pfns, unsigned long npages,
				struct migrate_vma *migrate)
{
	struct mmu_notifier_range range;
	unsigned long i;
	bool notified = false;

	for (i = 0; i < npages; i++) {
		struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
		struct page *page = migrate_pfn_to_page(src_pfns[i]);
		struct address_space *mapping;
		struct folio *newfolio, *folio;
		int r, extra_cnt = 0;

		if (!newpage) {
			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}

		if (!page) {
			unsigned long addr;

			if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
				continue;

			/*
			 * The only time there is no vma is when called from
			 * migrate_device_coherent_folio(). However this isn't
			 * called if the page could not be unmapped.
			 */
			VM_BUG_ON(!migrate);
			addr = migrate->start + i*PAGE_SIZE;
			if (!notified) {
				notified = true;

				mmu_notifier_range_init_owner(&range,
					MMU_NOTIFY_MIGRATE, 0,
					migrate->vma->vm_mm, addr, migrate->end,
					migrate->pgmap_owner);
				mmu_notifier_invalidate_range_start(&range);
			}
			migrate_vma_insert_page(migrate, addr, newpage,
						&src_pfns[i]);
			continue;
		}

		newfolio = page_folio(newpage);
		folio = page_folio(page);
		mapping = folio_mapping(folio);

		if (folio_is_device_private(newfolio) ||
		    folio_is_device_coherent(newfolio)) {
			if (mapping) {
				/*
				 * For now only support anonymous memory migrating to
				 * device private or coherent memory.
				 *
				 * Try to get rid of swap cache if possible.
				 */
				if (!folio_test_anon(folio) ||
				    !folio_free_swap(folio)) {
					src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
					continue;
				}
			}
		} else if (folio_is_zone_device(newfolio)) {
			/*
			 * Other types of ZONE_DEVICE page are not supported.
			 */
			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}

		BUG_ON(folio_test_writeback(folio));

		if (migrate && migrate->fault_page == page)
			extra_cnt = 1;
		r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
		if (r != MIGRATEPAGE_SUCCESS)
			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
		else
			folio_migrate_flags(newfolio, folio);
	}

	if (notified)
		mmu_notifier_invalidate_range_end(&range);
}

/**
 * migrate_device_pages() - migrate meta-data from src page to dst page
 * @src_pfns: src_pfns returned from migrate_device_range()
 * @dst_pfns: array of pfns allocated by the driver to migrate memory to
 * @npages: number of pages in the range
 *
 * Equivalent to migrate_vma_pages(). This is called to migrate struct page
 * meta-data from source struct page to destination.
 */
void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
			unsigned long npages)
{
	__migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
}
EXPORT_SYMBOL(migrate_device_pages);

/**
 * migrate_vma_pages() - migrate meta-data from src page to dst page
 * @migrate: migrate struct containing all migration information
 *
 * This migrates struct page meta-data from source struct page to destination
 * struct page. This effectively finishes the migration from source page to the
 * destination page.
 */
void migrate_vma_pages(struct migrate_vma *migrate)
{
	__migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
}
EXPORT_SYMBOL(migrate_vma_pages);

static void __migrate_device_finalize(unsigned long *src_pfns,
				      unsigned long *dst_pfns,
				      unsigned long npages,
				      struct page *fault_page)
{
	struct folio *fault_folio = fault_page ?
		page_folio(fault_page) : NULL;
	unsigned long i;

	for (i = 0; i < npages; i++) {
		struct folio *dst = NULL, *src = NULL;
		struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
		struct page *page = migrate_pfn_to_page(src_pfns[i]);

		if (newpage)
			dst = page_folio(newpage);

		if (!page) {
			if (dst) {
				WARN_ON_ONCE(fault_folio == dst);
				folio_unlock(dst);
				folio_put(dst);
			}
			continue;
		}

		src = page_folio(page);

		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
			if (dst) {
				WARN_ON_ONCE(fault_folio == dst);
				folio_unlock(dst);
				folio_put(dst);
			}
			dst = src;
		}

		if (!folio_is_zone_device(dst))
			folio_add_lru(dst);
		remove_migration_ptes(src, dst, 0);
		if (fault_folio != src)
			folio_unlock(src);
		folio_put(src);

		if (dst != src) {
			WARN_ON_ONCE(fault_folio == dst);
			folio_unlock(dst);
			folio_put(dst);
		}
	}
}

/*
 * migrate_device_finalize() - complete page migration
 * @src_pfns: src_pfns returned from migrate_device_range()
 * @dst_pfns: array of pfns allocated by the driver to migrate memory to
 * @npages: number of pages in the range
 *
 * Completes migration of the page by removing special migration entries.
 * Drivers must ensure copying of page data is complete and visible to the CPU
 * before calling this.
 */
void migrate_device_finalize(unsigned long *src_pfns,
			unsigned long *dst_pfns, unsigned long npages)
{
	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
}
EXPORT_SYMBOL(migrate_device_finalize);

/**
 * migrate_vma_finalize() - restore CPU page table entry
 * @migrate: migrate struct containing all migration information
 *
 * This replaces the special migration pte entry with either a mapping to the
 * new page if migration was successful for that page, or to the original page
 * otherwise.
 *
 * This also unlocks the pages and puts them back on the lru, or drops the extra
 * refcount, for device pages.
 */
void migrate_vma_finalize(struct migrate_vma *migrate)
{
	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
				  migrate->fault_page);
}
EXPORT_SYMBOL(migrate_vma_finalize);

static unsigned long migrate_device_pfn_lock(unsigned long pfn)
{
	struct folio *folio;

	folio = folio_get_nontail_page(pfn_to_page(pfn));
	if (!folio)
		return 0;

	if (!folio_trylock(folio)) {
		folio_put(folio);
		return 0;
	}

	return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
}

/**
 * migrate_device_range() - migrate device private pfns to normal memory.
 * @src_pfns: array large enough to hold migrating source device private pfns.
 * @start: starting pfn in the range to migrate.
 * @npages: number of pages to migrate.
 *
 * This is similar in concept to migrate_vma_setup(), except that instead of
 * looking up pages based on virtual address mappings it operates on a range of
 * device pfns that should be migrated to system memory.
 *
 * This is useful when a driver needs to free device memory but doesn't know the
 * virtual mappings of every page that may be in device memory. For example this
 * is often the case when a driver is being unloaded or unbound from a device.
 *
 * Like migrate_vma_setup(), this function will take a reference and lock any
 * migrating pages that aren't free before unmapping them. Drivers may then
 * allocate destination pages and start copying data from the device to CPU
 * memory before calling migrate_device_pages().
 */
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
			unsigned long npages)
{
	unsigned long i, pfn;

	for (pfn = start, i = 0; i < npages; pfn++, i++)
		src_pfns[i] = migrate_device_pfn_lock(pfn);

	migrate_device_unmap(src_pfns, npages, NULL);

	return 0;
}
EXPORT_SYMBOL(migrate_device_range);

/**
 * migrate_device_pfns() - migrate device private pfns to normal memory.
 * @src_pfns: pre-populated array of source device private pfns to migrate.
 * @npages: number of pages to migrate.
 *
 * Similar to migrate_device_range() but supports a non-contiguous,
 * pre-populated array of device pages to migrate.
 */
int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; i++)
		src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);

	migrate_device_unmap(src_pfns, npages, NULL);

	return 0;
}
EXPORT_SYMBOL(migrate_device_pfns);
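
/*
 * Illustrative sketch (comment only, not compiled): a condensed version of how
 * a driver might evict a physically contiguous chunk of its device private
 * memory back to system memory with migrate_device_range(). Here
 * "chunk_first_pfn" and "npages" describe the device memory being freed,
 * dev_copy_from_device() is a hypothetical driver helper for the device-to-CPU
 * copy, and all error handling is omitted.
 *
 *	unsigned long *src, *dst;
 *	unsigned long i;
 *
 *	src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
 *	dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
 *
 *	migrate_device_range(src, chunk_first_pfn, npages);
 *	for (i = 0; i < npages; i++) {
 *		struct page *dpage;
 *
 *		if (!(src[i] & MIGRATE_PFN_MIGRATE))
 *			continue;
 *		dpage = alloc_page(GFP_HIGHUSER);
 *		if (!dpage)
 *			continue;
 *		lock_page(dpage);
 *		dst[i] = migrate_pfn(page_to_pfn(dpage));
 *		dev_copy_from_device(dpage, migrate_pfn_to_page(src[i]));
 *	}
 *	migrate_device_pages(src, dst, npages);
 *	migrate_device_finalize(src, dst, npages);
 *	kfree(dst);
 *	kfree(src);
 */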

/*
 * Migrate a device coherent folio back to normal memory. The caller should
 * have a reference on the folio, which will be copied to the new folio if
 * migration is successful, or dropped on failure.
 */
int migrate_device_coherent_folio(struct folio *folio)
{
	unsigned long src_pfn, dst_pfn = 0;
	struct folio *dfolio;

	WARN_ON_ONCE(folio_test_large(folio));

	folio_lock(folio);
	src_pfn = migrate_pfn(folio_pfn(folio)) | MIGRATE_PFN_MIGRATE;

	/*
	 * We don't have a VMA and don't need to walk the page tables to find
	 * the source folio. So call migrate_device_unmap() directly to unmap
	 * the folio, as migrate_vma_setup() will fail if args.vma == NULL.
	 */
	migrate_device_unmap(&src_pfn, 1, NULL);
	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
		return -EBUSY;

	dfolio = folio_alloc(GFP_USER | __GFP_NOWARN, 0);
	if (dfolio) {
		folio_lock(dfolio);
		dst_pfn = migrate_pfn(folio_pfn(dfolio));
	}

	migrate_device_pages(&src_pfn, &dst_pfn, 1);
	if (src_pfn & MIGRATE_PFN_MIGRATE)
		folio_copy(dfolio, folio);
	migrate_device_finalize(&src_pfn, &dst_pfn, 1);

	if (src_pfn & MIGRATE_PFN_MIGRATE)
		return 0;
	return -EBUSY;
}