1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/mm.h> 5 #include <linux/sched.h> 6 #include <linux/sched/mm.h> 7 #include <linux/mmu_notifier.h> 8 #include <linux/rmap.h> 9 #include <linux/swap.h> 10 #include <linux/mm_inline.h> 11 #include <linux/kthread.h> 12 #include <linux/khugepaged.h> 13 #include <linux/freezer.h> 14 #include <linux/mman.h> 15 #include <linux/hashtable.h> 16 #include <linux/userfaultfd_k.h> 17 #include <linux/page_idle.h> 18 #include <linux/page_table_check.h> 19 #include <linux/rcupdate_wait.h> 20 #include <linux/swapops.h> 21 #include <linux/shmem_fs.h> 22 #include <linux/dax.h> 23 #include <linux/ksm.h> 24 25 #include <asm/tlb.h> 26 #include <asm/pgalloc.h> 27 #include "internal.h" 28 #include "mm_slot.h" 29 30 enum scan_result { 31 SCAN_FAIL, 32 SCAN_SUCCEED, 33 SCAN_PMD_NULL, 34 SCAN_PMD_NONE, 35 SCAN_PMD_MAPPED, 36 SCAN_EXCEED_NONE_PTE, 37 SCAN_EXCEED_SWAP_PTE, 38 SCAN_EXCEED_SHARED_PTE, 39 SCAN_PTE_NON_PRESENT, 40 SCAN_PTE_UFFD_WP, 41 SCAN_PTE_MAPPED_HUGEPAGE, 42 SCAN_LACK_REFERENCED_PAGE, 43 SCAN_PAGE_NULL, 44 SCAN_SCAN_ABORT, 45 SCAN_PAGE_COUNT, 46 SCAN_PAGE_LRU, 47 SCAN_PAGE_LOCK, 48 SCAN_PAGE_ANON, 49 SCAN_PAGE_COMPOUND, 50 SCAN_ANY_PROCESS, 51 SCAN_VMA_NULL, 52 SCAN_VMA_CHECK, 53 SCAN_ADDRESS_RANGE, 54 SCAN_DEL_PAGE_LRU, 55 SCAN_ALLOC_HUGE_PAGE_FAIL, 56 SCAN_CGROUP_CHARGE_FAIL, 57 SCAN_TRUNCATED, 58 SCAN_PAGE_HAS_PRIVATE, 59 SCAN_STORE_FAILED, 60 SCAN_COPY_MC, 61 SCAN_PAGE_FILLED, 62 }; 63 64 #define CREATE_TRACE_POINTS 65 #include <trace/events/huge_memory.h> 66 67 static struct task_struct *khugepaged_thread __read_mostly; 68 static DEFINE_MUTEX(khugepaged_mutex); 69 70 /* default scan 8*512 pte (or vmas) every 30 second */ 71 static unsigned int khugepaged_pages_to_scan __read_mostly; 72 static unsigned int khugepaged_pages_collapsed; 73 static unsigned int khugepaged_full_scans; 74 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 75 /* during fragmentation poll the hugepage allocator once every minute */ 76 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 77 static unsigned long khugepaged_sleep_expire; 78 static DEFINE_SPINLOCK(khugepaged_mm_lock); 79 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 80 /* 81 * default collapse hugepages if there is at least one pte mapped like 82 * it would have happened if the vma was large enough during page 83 * fault. 84 * 85 * Note that these are only respected if collapse was initiated by khugepaged. 86 */ 87 unsigned int khugepaged_max_ptes_none __read_mostly; 88 static unsigned int khugepaged_max_ptes_swap __read_mostly; 89 static unsigned int khugepaged_max_ptes_shared __read_mostly; 90 91 #define MM_SLOTS_HASH_BITS 10 92 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 93 94 static struct kmem_cache *mm_slot_cache __ro_after_init; 95 96 struct collapse_control { 97 bool is_khugepaged; 98 99 /* Num pages scanned per node */ 100 u32 node_load[MAX_NUMNODES]; 101 102 /* nodemask for allocation fallback */ 103 nodemask_t alloc_nmask; 104 }; 105 106 /** 107 * struct khugepaged_scan - cursor for scanning 108 * @mm_head: the head of the mm list to scan 109 * @mm_slot: the current mm_slot we are scanning 110 * @address: the next address inside that to be scanned 111 * 112 * There is only the one khugepaged_scan instance of this cursor structure. 113 */ 114 struct khugepaged_scan { 115 struct list_head mm_head; 116 struct mm_slot *mm_slot; 117 unsigned long address; 118 }; 119 120 static struct khugepaged_scan khugepaged_scan = { 121 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 122 }; 123 124 #ifdef CONFIG_SYSFS 125 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 126 struct kobj_attribute *attr, 127 char *buf) 128 { 129 return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); 130 } 131 132 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 133 struct kobj_attribute *attr, 134 const char *buf, size_t count) 135 { 136 unsigned int msecs; 137 int err; 138 139 err = kstrtouint(buf, 10, &msecs); 140 if (err) 141 return -EINVAL; 142 143 khugepaged_scan_sleep_millisecs = msecs; 144 khugepaged_sleep_expire = 0; 145 wake_up_interruptible(&khugepaged_wait); 146 147 return count; 148 } 149 static struct kobj_attribute scan_sleep_millisecs_attr = 150 __ATTR_RW(scan_sleep_millisecs); 151 152 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 153 struct kobj_attribute *attr, 154 char *buf) 155 { 156 return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 157 } 158 159 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 160 struct kobj_attribute *attr, 161 const char *buf, size_t count) 162 { 163 unsigned int msecs; 164 int err; 165 166 err = kstrtouint(buf, 10, &msecs); 167 if (err) 168 return -EINVAL; 169 170 khugepaged_alloc_sleep_millisecs = msecs; 171 khugepaged_sleep_expire = 0; 172 wake_up_interruptible(&khugepaged_wait); 173 174 return count; 175 } 176 static struct kobj_attribute alloc_sleep_millisecs_attr = 177 __ATTR_RW(alloc_sleep_millisecs); 178 179 static ssize_t pages_to_scan_show(struct kobject *kobj, 180 struct kobj_attribute *attr, 181 char *buf) 182 { 183 return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); 184 } 185 static ssize_t pages_to_scan_store(struct kobject *kobj, 186 struct kobj_attribute *attr, 187 const char *buf, size_t count) 188 { 189 unsigned int pages; 190 int err; 191 192 err = kstrtouint(buf, 10, &pages); 193 if (err || !pages) 194 return -EINVAL; 195 196 khugepaged_pages_to_scan = pages; 197 198 return count; 199 } 200 static struct kobj_attribute pages_to_scan_attr = 201 __ATTR_RW(pages_to_scan); 202 203 static ssize_t pages_collapsed_show(struct kobject *kobj, 204 struct kobj_attribute *attr, 205 char *buf) 206 { 207 return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); 208 } 209 static struct kobj_attribute pages_collapsed_attr = 210 __ATTR_RO(pages_collapsed); 211 212 static ssize_t full_scans_show(struct kobject *kobj, 213 struct kobj_attribute *attr, 214 char *buf) 215 { 216 return sysfs_emit(buf, "%u\n", khugepaged_full_scans); 217 } 218 static struct kobj_attribute full_scans_attr = 219 __ATTR_RO(full_scans); 220 221 static ssize_t defrag_show(struct kobject *kobj, 222 struct kobj_attribute *attr, char *buf) 223 { 224 return single_hugepage_flag_show(kobj, attr, buf, 225 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 226 } 227 static ssize_t defrag_store(struct kobject *kobj, 228 struct kobj_attribute *attr, 229 const char *buf, size_t count) 230 { 231 return single_hugepage_flag_store(kobj, attr, buf, count, 232 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 233 } 234 static struct kobj_attribute khugepaged_defrag_attr = 235 __ATTR_RW(defrag); 236 237 /* 238 * max_ptes_none controls if khugepaged should collapse hugepages over 239 * any unmapped ptes in turn potentially increasing the memory 240 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 241 * reduce the available free memory in the system as it 242 * runs. Increasing max_ptes_none will instead potentially reduce the 243 * free memory in the system during the khugepaged scan. 244 */ 245 static ssize_t max_ptes_none_show(struct kobject *kobj, 246 struct kobj_attribute *attr, 247 char *buf) 248 { 249 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); 250 } 251 static ssize_t max_ptes_none_store(struct kobject *kobj, 252 struct kobj_attribute *attr, 253 const char *buf, size_t count) 254 { 255 int err; 256 unsigned long max_ptes_none; 257 258 err = kstrtoul(buf, 10, &max_ptes_none); 259 if (err || max_ptes_none > HPAGE_PMD_NR - 1) 260 return -EINVAL; 261 262 khugepaged_max_ptes_none = max_ptes_none; 263 264 return count; 265 } 266 static struct kobj_attribute khugepaged_max_ptes_none_attr = 267 __ATTR_RW(max_ptes_none); 268 269 static ssize_t max_ptes_swap_show(struct kobject *kobj, 270 struct kobj_attribute *attr, 271 char *buf) 272 { 273 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); 274 } 275 276 static ssize_t max_ptes_swap_store(struct kobject *kobj, 277 struct kobj_attribute *attr, 278 const char *buf, size_t count) 279 { 280 int err; 281 unsigned long max_ptes_swap; 282 283 err = kstrtoul(buf, 10, &max_ptes_swap); 284 if (err || max_ptes_swap > HPAGE_PMD_NR - 1) 285 return -EINVAL; 286 287 khugepaged_max_ptes_swap = max_ptes_swap; 288 289 return count; 290 } 291 292 static struct kobj_attribute khugepaged_max_ptes_swap_attr = 293 __ATTR_RW(max_ptes_swap); 294 295 static ssize_t max_ptes_shared_show(struct kobject *kobj, 296 struct kobj_attribute *attr, 297 char *buf) 298 { 299 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); 300 } 301 302 static ssize_t max_ptes_shared_store(struct kobject *kobj, 303 struct kobj_attribute *attr, 304 const char *buf, size_t count) 305 { 306 int err; 307 unsigned long max_ptes_shared; 308 309 err = kstrtoul(buf, 10, &max_ptes_shared); 310 if (err || max_ptes_shared > HPAGE_PMD_NR - 1) 311 return -EINVAL; 312 313 khugepaged_max_ptes_shared = max_ptes_shared; 314 315 return count; 316 } 317 318 static struct kobj_attribute khugepaged_max_ptes_shared_attr = 319 __ATTR_RW(max_ptes_shared); 320 321 static struct attribute *khugepaged_attr[] = { 322 &khugepaged_defrag_attr.attr, 323 &khugepaged_max_ptes_none_attr.attr, 324 &khugepaged_max_ptes_swap_attr.attr, 325 &khugepaged_max_ptes_shared_attr.attr, 326 &pages_to_scan_attr.attr, 327 &pages_collapsed_attr.attr, 328 &full_scans_attr.attr, 329 &scan_sleep_millisecs_attr.attr, 330 &alloc_sleep_millisecs_attr.attr, 331 NULL, 332 }; 333 334 struct attribute_group khugepaged_attr_group = { 335 .attrs = khugepaged_attr, 336 .name = "khugepaged", 337 }; 338 #endif /* CONFIG_SYSFS */ 339 340 int hugepage_madvise(struct vm_area_struct *vma, 341 vm_flags_t *vm_flags, int advice) 342 { 343 switch (advice) { 344 case MADV_HUGEPAGE: 345 #ifdef CONFIG_S390 346 /* 347 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 348 * can't handle this properly after s390_enable_sie, so we simply 349 * ignore the madvise to prevent qemu from causing a SIGSEGV. 350 */ 351 if (mm_has_pgste(vma->vm_mm)) 352 return 0; 353 #endif 354 *vm_flags &= ~VM_NOHUGEPAGE; 355 *vm_flags |= VM_HUGEPAGE; 356 /* 357 * If the vma become good for khugepaged to scan, 358 * register it here without waiting a page fault that 359 * may not happen any time soon. 360 */ 361 khugepaged_enter_vma(vma, *vm_flags); 362 break; 363 case MADV_NOHUGEPAGE: 364 *vm_flags &= ~VM_HUGEPAGE; 365 *vm_flags |= VM_NOHUGEPAGE; 366 /* 367 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 368 * this vma even if we leave the mm registered in khugepaged if 369 * it got registered before VM_NOHUGEPAGE was set. 370 */ 371 break; 372 } 373 374 return 0; 375 } 376 377 int __init khugepaged_init(void) 378 { 379 mm_slot_cache = KMEM_CACHE(mm_slot, 0); 380 if (!mm_slot_cache) 381 return -ENOMEM; 382 383 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; 384 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; 385 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; 386 khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; 387 388 return 0; 389 } 390 391 void __init khugepaged_destroy(void) 392 { 393 kmem_cache_destroy(mm_slot_cache); 394 } 395 396 static inline int hpage_collapse_test_exit(struct mm_struct *mm) 397 { 398 return atomic_read(&mm->mm_users) == 0; 399 } 400 401 static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) 402 { 403 return hpage_collapse_test_exit(mm) || 404 mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); 405 } 406 407 static bool hugepage_pmd_enabled(void) 408 { 409 /* 410 * We cover the anon, shmem and the file-backed case here; file-backed 411 * hugepages, when configured in, are determined by the global control. 412 * Anon pmd-sized hugepages are determined by the pmd-size control. 413 * Shmem pmd-sized hugepages are also determined by its pmd-size control, 414 * except when the global shmem_huge is set to SHMEM_HUGE_DENY. 415 */ 416 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 417 hugepage_global_enabled()) 418 return true; 419 if (test_bit(PMD_ORDER, &huge_anon_orders_always)) 420 return true; 421 if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) 422 return true; 423 if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && 424 hugepage_global_enabled()) 425 return true; 426 if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) 427 return true; 428 return false; 429 } 430 431 void __khugepaged_enter(struct mm_struct *mm) 432 { 433 struct mm_slot *slot; 434 int wakeup; 435 436 /* __khugepaged_exit() must not run from under us */ 437 VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); 438 if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) 439 return; 440 441 slot = mm_slot_alloc(mm_slot_cache); 442 if (!slot) 443 return; 444 445 spin_lock(&khugepaged_mm_lock); 446 mm_slot_insert(mm_slots_hash, mm, slot); 447 /* 448 * Insert just behind the scanning cursor, to let the area settle 449 * down a little. 450 */ 451 wakeup = list_empty(&khugepaged_scan.mm_head); 452 list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); 453 spin_unlock(&khugepaged_mm_lock); 454 455 mmgrab(mm); 456 if (wakeup) 457 wake_up_interruptible(&khugepaged_wait); 458 } 459 460 void khugepaged_enter_vma(struct vm_area_struct *vma, 461 vm_flags_t vm_flags) 462 { 463 if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && 464 hugepage_pmd_enabled()) { 465 if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) 466 __khugepaged_enter(vma->vm_mm); 467 } 468 } 469 470 void __khugepaged_exit(struct mm_struct *mm) 471 { 472 struct mm_slot *slot; 473 int free = 0; 474 475 spin_lock(&khugepaged_mm_lock); 476 slot = mm_slot_lookup(mm_slots_hash, mm); 477 if (slot && khugepaged_scan.mm_slot != slot) { 478 hash_del(&slot->hash); 479 list_del(&slot->mm_node); 480 free = 1; 481 } 482 spin_unlock(&khugepaged_mm_lock); 483 484 if (free) { 485 mm_flags_clear(MMF_VM_HUGEPAGE, mm); 486 mm_slot_free(mm_slot_cache, slot); 487 mmdrop(mm); 488 } else if (slot) { 489 /* 490 * This is required to serialize against 491 * hpage_collapse_test_exit() (which is guaranteed to run 492 * under mmap sem read mode). Stop here (after we return all 493 * pagetables will be destroyed) until khugepaged has finished 494 * working on the pagetables under the mmap_lock. 495 */ 496 mmap_write_lock(mm); 497 mmap_write_unlock(mm); 498 } 499 } 500 501 static void release_pte_folio(struct folio *folio) 502 { 503 node_stat_mod_folio(folio, 504 NR_ISOLATED_ANON + folio_is_file_lru(folio), 505 -folio_nr_pages(folio)); 506 folio_unlock(folio); 507 folio_putback_lru(folio); 508 } 509 510 static void release_pte_pages(pte_t *pte, pte_t *_pte, 511 struct list_head *compound_pagelist) 512 { 513 struct folio *folio, *tmp; 514 515 while (--_pte >= pte) { 516 pte_t pteval = ptep_get(_pte); 517 unsigned long pfn; 518 519 if (pte_none(pteval)) 520 continue; 521 pfn = pte_pfn(pteval); 522 if (is_zero_pfn(pfn)) 523 continue; 524 folio = pfn_folio(pfn); 525 if (folio_test_large(folio)) 526 continue; 527 release_pte_folio(folio); 528 } 529 530 list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { 531 list_del(&folio->lru); 532 release_pte_folio(folio); 533 } 534 } 535 536 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 537 unsigned long start_addr, 538 pte_t *pte, 539 struct collapse_control *cc, 540 struct list_head *compound_pagelist) 541 { 542 struct page *page = NULL; 543 struct folio *folio = NULL; 544 unsigned long addr = start_addr; 545 pte_t *_pte; 546 int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; 547 548 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; 549 _pte++, addr += PAGE_SIZE) { 550 pte_t pteval = ptep_get(_pte); 551 if (pte_none(pteval) || (pte_present(pteval) && 552 is_zero_pfn(pte_pfn(pteval)))) { 553 ++none_or_zero; 554 if (!userfaultfd_armed(vma) && 555 (!cc->is_khugepaged || 556 none_or_zero <= khugepaged_max_ptes_none)) { 557 continue; 558 } else { 559 result = SCAN_EXCEED_NONE_PTE; 560 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 561 goto out; 562 } 563 } 564 if (!pte_present(pteval)) { 565 result = SCAN_PTE_NON_PRESENT; 566 goto out; 567 } 568 if (pte_uffd_wp(pteval)) { 569 result = SCAN_PTE_UFFD_WP; 570 goto out; 571 } 572 page = vm_normal_page(vma, addr, pteval); 573 if (unlikely(!page) || unlikely(is_zone_device_page(page))) { 574 result = SCAN_PAGE_NULL; 575 goto out; 576 } 577 578 folio = page_folio(page); 579 VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); 580 581 /* See hpage_collapse_scan_pmd(). */ 582 if (folio_maybe_mapped_shared(folio)) { 583 ++shared; 584 if (cc->is_khugepaged && 585 shared > khugepaged_max_ptes_shared) { 586 result = SCAN_EXCEED_SHARED_PTE; 587 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); 588 goto out; 589 } 590 } 591 592 if (folio_test_large(folio)) { 593 struct folio *f; 594 595 /* 596 * Check if we have dealt with the compound page 597 * already 598 */ 599 list_for_each_entry(f, compound_pagelist, lru) { 600 if (folio == f) 601 goto next; 602 } 603 } 604 605 /* 606 * We can do it before folio_isolate_lru because the 607 * folio can't be freed from under us. NOTE: PG_lock 608 * is needed to serialize against split_huge_page 609 * when invoked from the VM. 610 */ 611 if (!folio_trylock(folio)) { 612 result = SCAN_PAGE_LOCK; 613 goto out; 614 } 615 616 /* 617 * Check if the page has any GUP (or other external) pins. 618 * 619 * The page table that maps the page has been already unlinked 620 * from the page table tree and this process cannot get 621 * an additional pin on the page. 622 * 623 * New pins can come later if the page is shared across fork, 624 * but not from this process. The other process cannot write to 625 * the page, only trigger CoW. 626 */ 627 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { 628 folio_unlock(folio); 629 result = SCAN_PAGE_COUNT; 630 goto out; 631 } 632 633 /* 634 * Isolate the page to avoid collapsing an hugepage 635 * currently in use by the VM. 636 */ 637 if (!folio_isolate_lru(folio)) { 638 folio_unlock(folio); 639 result = SCAN_DEL_PAGE_LRU; 640 goto out; 641 } 642 node_stat_mod_folio(folio, 643 NR_ISOLATED_ANON + folio_is_file_lru(folio), 644 folio_nr_pages(folio)); 645 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 646 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 647 648 if (folio_test_large(folio)) 649 list_add_tail(&folio->lru, compound_pagelist); 650 next: 651 /* 652 * If collapse was initiated by khugepaged, check that there is 653 * enough young pte to justify collapsing the page 654 */ 655 if (cc->is_khugepaged && 656 (pte_young(pteval) || folio_test_young(folio) || 657 folio_test_referenced(folio) || 658 mmu_notifier_test_young(vma->vm_mm, addr))) 659 referenced++; 660 } 661 662 if (unlikely(cc->is_khugepaged && !referenced)) { 663 result = SCAN_LACK_REFERENCED_PAGE; 664 } else { 665 result = SCAN_SUCCEED; 666 trace_mm_collapse_huge_page_isolate(folio, none_or_zero, 667 referenced, result); 668 return result; 669 } 670 out: 671 release_pte_pages(pte, _pte, compound_pagelist); 672 trace_mm_collapse_huge_page_isolate(folio, none_or_zero, 673 referenced, result); 674 return result; 675 } 676 677 static void __collapse_huge_page_copy_succeeded(pte_t *pte, 678 struct vm_area_struct *vma, 679 unsigned long address, 680 spinlock_t *ptl, 681 struct list_head *compound_pagelist) 682 { 683 unsigned long end = address + HPAGE_PMD_SIZE; 684 struct folio *src, *tmp; 685 pte_t pteval; 686 pte_t *_pte; 687 unsigned int nr_ptes; 688 689 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, 690 address += nr_ptes * PAGE_SIZE) { 691 nr_ptes = 1; 692 pteval = ptep_get(_pte); 693 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 694 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 695 if (is_zero_pfn(pte_pfn(pteval))) { 696 /* 697 * ptl mostly unnecessary. 698 */ 699 spin_lock(ptl); 700 ptep_clear(vma->vm_mm, address, _pte); 701 spin_unlock(ptl); 702 ksm_might_unmap_zero_page(vma->vm_mm, pteval); 703 } 704 } else { 705 struct page *src_page = pte_page(pteval); 706 707 src = page_folio(src_page); 708 709 if (folio_test_large(src)) { 710 unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; 711 712 nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); 713 } else { 714 release_pte_folio(src); 715 } 716 717 /* 718 * ptl mostly unnecessary, but preempt has to 719 * be disabled to update the per-cpu stats 720 * inside folio_remove_rmap_pte(). 721 */ 722 spin_lock(ptl); 723 clear_ptes(vma->vm_mm, address, _pte, nr_ptes); 724 folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); 725 spin_unlock(ptl); 726 free_swap_cache(src); 727 folio_put_refs(src, nr_ptes); 728 } 729 } 730 731 list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { 732 list_del(&src->lru); 733 node_stat_sub_folio(src, NR_ISOLATED_ANON + 734 folio_is_file_lru(src)); 735 folio_unlock(src); 736 free_swap_cache(src); 737 folio_putback_lru(src); 738 } 739 } 740 741 static void __collapse_huge_page_copy_failed(pte_t *pte, 742 pmd_t *pmd, 743 pmd_t orig_pmd, 744 struct vm_area_struct *vma, 745 struct list_head *compound_pagelist) 746 { 747 spinlock_t *pmd_ptl; 748 749 /* 750 * Re-establish the PMD to point to the original page table 751 * entry. Restoring PMD needs to be done prior to releasing 752 * pages. Since pages are still isolated and locked here, 753 * acquiring anon_vma_lock_write is unnecessary. 754 */ 755 pmd_ptl = pmd_lock(vma->vm_mm, pmd); 756 pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); 757 spin_unlock(pmd_ptl); 758 /* 759 * Release both raw and compound pages isolated 760 * in __collapse_huge_page_isolate. 761 */ 762 release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); 763 } 764 765 /* 766 * __collapse_huge_page_copy - attempts to copy memory contents from raw 767 * pages to a hugepage. Cleans up the raw pages if copying succeeds; 768 * otherwise restores the original page table and releases isolated raw pages. 769 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 770 * 771 * @pte: starting of the PTEs to copy from 772 * @folio: the new hugepage to copy contents to 773 * @pmd: pointer to the new hugepage's PMD 774 * @orig_pmd: the original raw pages' PMD 775 * @vma: the original raw pages' virtual memory area 776 * @address: starting address to copy 777 * @ptl: lock on raw pages' PTEs 778 * @compound_pagelist: list that stores compound pages 779 */ 780 static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, 781 pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, 782 unsigned long address, spinlock_t *ptl, 783 struct list_head *compound_pagelist) 784 { 785 unsigned int i; 786 int result = SCAN_SUCCEED; 787 788 /* 789 * Copying pages' contents is subject to memory poison at any iteration. 790 */ 791 for (i = 0; i < HPAGE_PMD_NR; i++) { 792 pte_t pteval = ptep_get(pte + i); 793 struct page *page = folio_page(folio, i); 794 unsigned long src_addr = address + i * PAGE_SIZE; 795 struct page *src_page; 796 797 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 798 clear_user_highpage(page, src_addr); 799 continue; 800 } 801 src_page = pte_page(pteval); 802 if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { 803 result = SCAN_COPY_MC; 804 break; 805 } 806 } 807 808 if (likely(result == SCAN_SUCCEED)) 809 __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, 810 compound_pagelist); 811 else 812 __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, 813 compound_pagelist); 814 815 return result; 816 } 817 818 static void khugepaged_alloc_sleep(void) 819 { 820 DEFINE_WAIT(wait); 821 822 add_wait_queue(&khugepaged_wait, &wait); 823 __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 824 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 825 remove_wait_queue(&khugepaged_wait, &wait); 826 } 827 828 struct collapse_control khugepaged_collapse_control = { 829 .is_khugepaged = true, 830 }; 831 832 static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) 833 { 834 int i; 835 836 /* 837 * If node_reclaim_mode is disabled, then no extra effort is made to 838 * allocate memory locally. 839 */ 840 if (!node_reclaim_enabled()) 841 return false; 842 843 /* If there is a count for this node already, it must be acceptable */ 844 if (cc->node_load[nid]) 845 return false; 846 847 for (i = 0; i < MAX_NUMNODES; i++) { 848 if (!cc->node_load[i]) 849 continue; 850 if (node_distance(nid, i) > node_reclaim_distance) 851 return true; 852 } 853 return false; 854 } 855 856 #define khugepaged_defrag() \ 857 (transparent_hugepage_flags & \ 858 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) 859 860 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ 861 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) 862 { 863 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; 864 } 865 866 #ifdef CONFIG_NUMA 867 static int hpage_collapse_find_target_node(struct collapse_control *cc) 868 { 869 int nid, target_node = 0, max_value = 0; 870 871 /* find first node with max normal pages hit */ 872 for (nid = 0; nid < MAX_NUMNODES; nid++) 873 if (cc->node_load[nid] > max_value) { 874 max_value = cc->node_load[nid]; 875 target_node = nid; 876 } 877 878 for_each_online_node(nid) { 879 if (max_value == cc->node_load[nid]) 880 node_set(nid, cc->alloc_nmask); 881 } 882 883 return target_node; 884 } 885 #else 886 static int hpage_collapse_find_target_node(struct collapse_control *cc) 887 { 888 return 0; 889 } 890 #endif 891 892 /* 893 * If mmap_lock temporarily dropped, revalidate vma 894 * before taking mmap_lock. 895 * Returns enum scan_result value. 896 */ 897 898 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, 899 bool expect_anon, 900 struct vm_area_struct **vmap, 901 struct collapse_control *cc) 902 { 903 struct vm_area_struct *vma; 904 enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : 905 TVA_FORCED_COLLAPSE; 906 907 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) 908 return SCAN_ANY_PROCESS; 909 910 *vmap = vma = find_vma(mm, address); 911 if (!vma) 912 return SCAN_VMA_NULL; 913 914 if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) 915 return SCAN_ADDRESS_RANGE; 916 if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) 917 return SCAN_VMA_CHECK; 918 /* 919 * Anon VMA expected, the address may be unmapped then 920 * remapped to file after khugepaged reaquired the mmap_lock. 921 * 922 * thp_vma_allowable_order may return true for qualified file 923 * vmas. 924 */ 925 if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) 926 return SCAN_PAGE_ANON; 927 return SCAN_SUCCEED; 928 } 929 930 static inline int check_pmd_state(pmd_t *pmd) 931 { 932 pmd_t pmde = pmdp_get_lockless(pmd); 933 934 if (pmd_none(pmde)) 935 return SCAN_PMD_NONE; 936 937 /* 938 * The folio may be under migration when khugepaged is trying to 939 * collapse it. Migration success or failure will eventually end 940 * up with a present PMD mapping a folio again. 941 */ 942 if (is_pmd_migration_entry(pmde)) 943 return SCAN_PMD_MAPPED; 944 if (!pmd_present(pmde)) 945 return SCAN_PMD_NULL; 946 if (pmd_trans_huge(pmde)) 947 return SCAN_PMD_MAPPED; 948 if (pmd_bad(pmde)) 949 return SCAN_PMD_NULL; 950 return SCAN_SUCCEED; 951 } 952 953 static int find_pmd_or_thp_or_none(struct mm_struct *mm, 954 unsigned long address, 955 pmd_t **pmd) 956 { 957 *pmd = mm_find_pmd(mm, address); 958 if (!*pmd) 959 return SCAN_PMD_NULL; 960 961 return check_pmd_state(*pmd); 962 } 963 964 static int check_pmd_still_valid(struct mm_struct *mm, 965 unsigned long address, 966 pmd_t *pmd) 967 { 968 pmd_t *new_pmd; 969 int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); 970 971 if (result != SCAN_SUCCEED) 972 return result; 973 if (new_pmd != pmd) 974 return SCAN_FAIL; 975 return SCAN_SUCCEED; 976 } 977 978 /* 979 * Bring missing pages in from swap, to complete THP collapse. 980 * Only done if hpage_collapse_scan_pmd believes it is worthwhile. 981 * 982 * Called and returns without pte mapped or spinlocks held. 983 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 984 */ 985 static int __collapse_huge_page_swapin(struct mm_struct *mm, 986 struct vm_area_struct *vma, 987 unsigned long start_addr, pmd_t *pmd, 988 int referenced) 989 { 990 int swapped_in = 0; 991 vm_fault_t ret = 0; 992 unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); 993 int result; 994 pte_t *pte = NULL; 995 spinlock_t *ptl; 996 997 for (addr = start_addr; addr < end; addr += PAGE_SIZE) { 998 struct vm_fault vmf = { 999 .vma = vma, 1000 .address = addr, 1001 .pgoff = linear_page_index(vma, addr), 1002 .flags = FAULT_FLAG_ALLOW_RETRY, 1003 .pmd = pmd, 1004 }; 1005 1006 if (!pte++) { 1007 /* 1008 * Here the ptl is only used to check pte_same() in 1009 * do_swap_page(), so readonly version is enough. 1010 */ 1011 pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); 1012 if (!pte) { 1013 mmap_read_unlock(mm); 1014 result = SCAN_PMD_NULL; 1015 goto out; 1016 } 1017 } 1018 1019 vmf.orig_pte = ptep_get_lockless(pte); 1020 if (!is_swap_pte(vmf.orig_pte)) 1021 continue; 1022 1023 vmf.pte = pte; 1024 vmf.ptl = ptl; 1025 ret = do_swap_page(&vmf); 1026 /* Which unmaps pte (after perhaps re-checking the entry) */ 1027 pte = NULL; 1028 1029 /* 1030 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. 1031 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because 1032 * we do not retry here and swap entry will remain in pagetable 1033 * resulting in later failure. 1034 */ 1035 if (ret & VM_FAULT_RETRY) { 1036 /* Likely, but not guaranteed, that page lock failed */ 1037 result = SCAN_PAGE_LOCK; 1038 goto out; 1039 } 1040 if (ret & VM_FAULT_ERROR) { 1041 mmap_read_unlock(mm); 1042 result = SCAN_FAIL; 1043 goto out; 1044 } 1045 swapped_in++; 1046 } 1047 1048 if (pte) 1049 pte_unmap(pte); 1050 1051 /* Drain LRU cache to remove extra pin on the swapped in pages */ 1052 if (swapped_in) 1053 lru_add_drain(); 1054 1055 result = SCAN_SUCCEED; 1056 out: 1057 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); 1058 return result; 1059 } 1060 1061 static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, 1062 struct collapse_control *cc) 1063 { 1064 gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : 1065 GFP_TRANSHUGE); 1066 int node = hpage_collapse_find_target_node(cc); 1067 struct folio *folio; 1068 1069 folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); 1070 if (!folio) { 1071 *foliop = NULL; 1072 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1073 return SCAN_ALLOC_HUGE_PAGE_FAIL; 1074 } 1075 1076 count_vm_event(THP_COLLAPSE_ALLOC); 1077 if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { 1078 folio_put(folio); 1079 *foliop = NULL; 1080 return SCAN_CGROUP_CHARGE_FAIL; 1081 } 1082 1083 count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); 1084 1085 *foliop = folio; 1086 return SCAN_SUCCEED; 1087 } 1088 1089 static int collapse_huge_page(struct mm_struct *mm, unsigned long address, 1090 int referenced, int unmapped, 1091 struct collapse_control *cc) 1092 { 1093 LIST_HEAD(compound_pagelist); 1094 pmd_t *pmd, _pmd; 1095 pte_t *pte; 1096 pgtable_t pgtable; 1097 struct folio *folio; 1098 spinlock_t *pmd_ptl, *pte_ptl; 1099 int result = SCAN_FAIL; 1100 struct vm_area_struct *vma; 1101 struct mmu_notifier_range range; 1102 1103 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1104 1105 /* 1106 * Before allocating the hugepage, release the mmap_lock read lock. 1107 * The allocation can take potentially a long time if it involves 1108 * sync compaction, and we do not need to hold the mmap_lock during 1109 * that. We will recheck the vma after taking it again in write mode. 1110 */ 1111 mmap_read_unlock(mm); 1112 1113 result = alloc_charge_folio(&folio, mm, cc); 1114 if (result != SCAN_SUCCEED) 1115 goto out_nolock; 1116 1117 mmap_read_lock(mm); 1118 result = hugepage_vma_revalidate(mm, address, true, &vma, cc); 1119 if (result != SCAN_SUCCEED) { 1120 mmap_read_unlock(mm); 1121 goto out_nolock; 1122 } 1123 1124 result = find_pmd_or_thp_or_none(mm, address, &pmd); 1125 if (result != SCAN_SUCCEED) { 1126 mmap_read_unlock(mm); 1127 goto out_nolock; 1128 } 1129 1130 if (unmapped) { 1131 /* 1132 * __collapse_huge_page_swapin will return with mmap_lock 1133 * released when it fails. So we jump out_nolock directly in 1134 * that case. Continuing to collapse causes inconsistency. 1135 */ 1136 result = __collapse_huge_page_swapin(mm, vma, address, pmd, 1137 referenced); 1138 if (result != SCAN_SUCCEED) 1139 goto out_nolock; 1140 } 1141 1142 mmap_read_unlock(mm); 1143 /* 1144 * Prevent all access to pagetables with the exception of 1145 * gup_fast later handled by the ptep_clear_flush and the VM 1146 * handled by the anon_vma lock + PG_lock. 1147 * 1148 * UFFDIO_MOVE is prevented to race as well thanks to the 1149 * mmap_lock. 1150 */ 1151 mmap_write_lock(mm); 1152 result = hugepage_vma_revalidate(mm, address, true, &vma, cc); 1153 if (result != SCAN_SUCCEED) 1154 goto out_up_write; 1155 /* check if the pmd is still valid */ 1156 vma_start_write(vma); 1157 result = check_pmd_still_valid(mm, address, pmd); 1158 if (result != SCAN_SUCCEED) 1159 goto out_up_write; 1160 1161 anon_vma_lock_write(vma->anon_vma); 1162 1163 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, 1164 address + HPAGE_PMD_SIZE); 1165 mmu_notifier_invalidate_range_start(&range); 1166 1167 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 1168 /* 1169 * This removes any huge TLB entry from the CPU so we won't allow 1170 * huge and small TLB entries for the same virtual address to 1171 * avoid the risk of CPU bugs in that area. 1172 * 1173 * Parallel GUP-fast is fine since GUP-fast will back off when 1174 * it detects PMD is changed. 1175 */ 1176 _pmd = pmdp_collapse_flush(vma, address, pmd); 1177 spin_unlock(pmd_ptl); 1178 mmu_notifier_invalidate_range_end(&range); 1179 tlb_remove_table_sync_one(); 1180 1181 pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); 1182 if (pte) { 1183 result = __collapse_huge_page_isolate(vma, address, pte, cc, 1184 &compound_pagelist); 1185 spin_unlock(pte_ptl); 1186 } else { 1187 result = SCAN_PMD_NULL; 1188 } 1189 1190 if (unlikely(result != SCAN_SUCCEED)) { 1191 if (pte) 1192 pte_unmap(pte); 1193 spin_lock(pmd_ptl); 1194 BUG_ON(!pmd_none(*pmd)); 1195 /* 1196 * We can only use set_pmd_at when establishing 1197 * hugepmds and never for establishing regular pmds that 1198 * points to regular pagetables. Use pmd_populate for that 1199 */ 1200 pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 1201 spin_unlock(pmd_ptl); 1202 anon_vma_unlock_write(vma->anon_vma); 1203 goto out_up_write; 1204 } 1205 1206 /* 1207 * All pages are isolated and locked so anon_vma rmap 1208 * can't run anymore. 1209 */ 1210 anon_vma_unlock_write(vma->anon_vma); 1211 1212 result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, 1213 vma, address, pte_ptl, 1214 &compound_pagelist); 1215 pte_unmap(pte); 1216 if (unlikely(result != SCAN_SUCCEED)) 1217 goto out_up_write; 1218 1219 /* 1220 * The smp_wmb() inside __folio_mark_uptodate() ensures the 1221 * copy_huge_page writes become visible before the set_pmd_at() 1222 * write. 1223 */ 1224 __folio_mark_uptodate(folio); 1225 pgtable = pmd_pgtable(_pmd); 1226 1227 _pmd = folio_mk_pmd(folio, vma->vm_page_prot); 1228 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 1229 1230 spin_lock(pmd_ptl); 1231 BUG_ON(!pmd_none(*pmd)); 1232 folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); 1233 folio_add_lru_vma(folio, vma); 1234 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1235 set_pmd_at(mm, address, pmd, _pmd); 1236 update_mmu_cache_pmd(vma, address, pmd); 1237 deferred_split_folio(folio, false); 1238 spin_unlock(pmd_ptl); 1239 1240 folio = NULL; 1241 1242 result = SCAN_SUCCEED; 1243 out_up_write: 1244 mmap_write_unlock(mm); 1245 out_nolock: 1246 if (folio) 1247 folio_put(folio); 1248 trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); 1249 return result; 1250 } 1251 1252 static int hpage_collapse_scan_pmd(struct mm_struct *mm, 1253 struct vm_area_struct *vma, 1254 unsigned long start_addr, bool *mmap_locked, 1255 struct collapse_control *cc) 1256 { 1257 pmd_t *pmd; 1258 pte_t *pte, *_pte; 1259 int result = SCAN_FAIL, referenced = 0; 1260 int none_or_zero = 0, shared = 0; 1261 struct page *page = NULL; 1262 struct folio *folio = NULL; 1263 unsigned long addr; 1264 spinlock_t *ptl; 1265 int node = NUMA_NO_NODE, unmapped = 0; 1266 1267 VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); 1268 1269 result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); 1270 if (result != SCAN_SUCCEED) 1271 goto out; 1272 1273 memset(cc->node_load, 0, sizeof(cc->node_load)); 1274 nodes_clear(cc->alloc_nmask); 1275 pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); 1276 if (!pte) { 1277 result = SCAN_PMD_NULL; 1278 goto out; 1279 } 1280 1281 for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; 1282 _pte++, addr += PAGE_SIZE) { 1283 pte_t pteval = ptep_get(_pte); 1284 if (is_swap_pte(pteval)) { 1285 ++unmapped; 1286 if (!cc->is_khugepaged || 1287 unmapped <= khugepaged_max_ptes_swap) { 1288 /* 1289 * Always be strict with uffd-wp 1290 * enabled swap entries. Please see 1291 * comment below for pte_uffd_wp(). 1292 */ 1293 if (pte_swp_uffd_wp_any(pteval)) { 1294 result = SCAN_PTE_UFFD_WP; 1295 goto out_unmap; 1296 } 1297 continue; 1298 } else { 1299 result = SCAN_EXCEED_SWAP_PTE; 1300 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); 1301 goto out_unmap; 1302 } 1303 } 1304 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 1305 ++none_or_zero; 1306 if (!userfaultfd_armed(vma) && 1307 (!cc->is_khugepaged || 1308 none_or_zero <= khugepaged_max_ptes_none)) { 1309 continue; 1310 } else { 1311 result = SCAN_EXCEED_NONE_PTE; 1312 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 1313 goto out_unmap; 1314 } 1315 } 1316 if (pte_uffd_wp(pteval)) { 1317 /* 1318 * Don't collapse the page if any of the small 1319 * PTEs are armed with uffd write protection. 1320 * Here we can also mark the new huge pmd as 1321 * write protected if any of the small ones is 1322 * marked but that could bring unknown 1323 * userfault messages that falls outside of 1324 * the registered range. So, just be simple. 1325 */ 1326 result = SCAN_PTE_UFFD_WP; 1327 goto out_unmap; 1328 } 1329 1330 page = vm_normal_page(vma, addr, pteval); 1331 if (unlikely(!page) || unlikely(is_zone_device_page(page))) { 1332 result = SCAN_PAGE_NULL; 1333 goto out_unmap; 1334 } 1335 folio = page_folio(page); 1336 1337 if (!folio_test_anon(folio)) { 1338 result = SCAN_PAGE_ANON; 1339 goto out_unmap; 1340 } 1341 1342 /* 1343 * We treat a single page as shared if any part of the THP 1344 * is shared. 1345 */ 1346 if (folio_maybe_mapped_shared(folio)) { 1347 ++shared; 1348 if (cc->is_khugepaged && 1349 shared > khugepaged_max_ptes_shared) { 1350 result = SCAN_EXCEED_SHARED_PTE; 1351 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); 1352 goto out_unmap; 1353 } 1354 } 1355 1356 /* 1357 * Record which node the original page is from and save this 1358 * information to cc->node_load[]. 1359 * Khugepaged will allocate hugepage from the node has the max 1360 * hit record. 1361 */ 1362 node = folio_nid(folio); 1363 if (hpage_collapse_scan_abort(node, cc)) { 1364 result = SCAN_SCAN_ABORT; 1365 goto out_unmap; 1366 } 1367 cc->node_load[node]++; 1368 if (!folio_test_lru(folio)) { 1369 result = SCAN_PAGE_LRU; 1370 goto out_unmap; 1371 } 1372 if (folio_test_locked(folio)) { 1373 result = SCAN_PAGE_LOCK; 1374 goto out_unmap; 1375 } 1376 1377 /* 1378 * Check if the page has any GUP (or other external) pins. 1379 * 1380 * Here the check may be racy: 1381 * it may see folio_mapcount() > folio_ref_count(). 1382 * But such case is ephemeral we could always retry collapse 1383 * later. However it may report false positive if the page 1384 * has excessive GUP pins (i.e. 512). Anyway the same check 1385 * will be done again later the risk seems low. 1386 */ 1387 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { 1388 result = SCAN_PAGE_COUNT; 1389 goto out_unmap; 1390 } 1391 1392 /* 1393 * If collapse was initiated by khugepaged, check that there is 1394 * enough young pte to justify collapsing the page 1395 */ 1396 if (cc->is_khugepaged && 1397 (pte_young(pteval) || folio_test_young(folio) || 1398 folio_test_referenced(folio) || 1399 mmu_notifier_test_young(vma->vm_mm, addr))) 1400 referenced++; 1401 } 1402 if (cc->is_khugepaged && 1403 (!referenced || 1404 (unmapped && referenced < HPAGE_PMD_NR / 2))) { 1405 result = SCAN_LACK_REFERENCED_PAGE; 1406 } else { 1407 result = SCAN_SUCCEED; 1408 } 1409 out_unmap: 1410 pte_unmap_unlock(pte, ptl); 1411 if (result == SCAN_SUCCEED) { 1412 result = collapse_huge_page(mm, start_addr, referenced, 1413 unmapped, cc); 1414 /* collapse_huge_page will return with the mmap_lock released */ 1415 *mmap_locked = false; 1416 } 1417 out: 1418 trace_mm_khugepaged_scan_pmd(mm, folio, referenced, 1419 none_or_zero, result, unmapped); 1420 return result; 1421 } 1422 1423 static void collect_mm_slot(struct mm_slot *slot) 1424 { 1425 struct mm_struct *mm = slot->mm; 1426 1427 lockdep_assert_held(&khugepaged_mm_lock); 1428 1429 if (hpage_collapse_test_exit(mm)) { 1430 /* free mm_slot */ 1431 hash_del(&slot->hash); 1432 list_del(&slot->mm_node); 1433 1434 /* 1435 * Not strictly needed because the mm exited already. 1436 * 1437 * mm_flags_clear(MMF_VM_HUGEPAGE, mm); 1438 */ 1439 1440 /* khugepaged_mm_lock actually not necessary for the below */ 1441 mm_slot_free(mm_slot_cache, slot); 1442 mmdrop(mm); 1443 } 1444 } 1445 1446 /* folio must be locked, and mmap_lock must be held */ 1447 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, 1448 pmd_t *pmdp, struct folio *folio, struct page *page) 1449 { 1450 struct mm_struct *mm = vma->vm_mm; 1451 struct vm_fault vmf = { 1452 .vma = vma, 1453 .address = addr, 1454 .flags = 0, 1455 }; 1456 pgd_t *pgdp; 1457 p4d_t *p4dp; 1458 pud_t *pudp; 1459 1460 mmap_assert_locked(vma->vm_mm); 1461 1462 if (!pmdp) { 1463 pgdp = pgd_offset(mm, addr); 1464 p4dp = p4d_alloc(mm, pgdp, addr); 1465 if (!p4dp) 1466 return SCAN_FAIL; 1467 pudp = pud_alloc(mm, p4dp, addr); 1468 if (!pudp) 1469 return SCAN_FAIL; 1470 pmdp = pmd_alloc(mm, pudp, addr); 1471 if (!pmdp) 1472 return SCAN_FAIL; 1473 } 1474 1475 vmf.pmd = pmdp; 1476 if (do_set_pmd(&vmf, folio, page)) 1477 return SCAN_FAIL; 1478 1479 folio_get(folio); 1480 return SCAN_SUCCEED; 1481 } 1482 1483 /** 1484 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at 1485 * address haddr. 1486 * 1487 * @mm: process address space where collapse happens 1488 * @addr: THP collapse address 1489 * @install_pmd: If a huge PMD should be installed 1490 * 1491 * This function checks whether all the PTEs in the PMD are pointing to the 1492 * right THP. If so, retract the page table so the THP can refault in with 1493 * as pmd-mapped. Possibly install a huge PMD mapping the THP. 1494 */ 1495 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, 1496 bool install_pmd) 1497 { 1498 int nr_mapped_ptes = 0, result = SCAN_FAIL; 1499 unsigned int nr_batch_ptes; 1500 struct mmu_notifier_range range; 1501 bool notified = false; 1502 unsigned long haddr = addr & HPAGE_PMD_MASK; 1503 unsigned long end = haddr + HPAGE_PMD_SIZE; 1504 struct vm_area_struct *vma = vma_lookup(mm, haddr); 1505 struct folio *folio; 1506 pte_t *start_pte, *pte; 1507 pmd_t *pmd, pgt_pmd; 1508 spinlock_t *pml = NULL, *ptl; 1509 int i; 1510 1511 mmap_assert_locked(mm); 1512 1513 /* First check VMA found, in case page tables are being torn down */ 1514 if (!vma || !vma->vm_file || 1515 !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) 1516 return SCAN_VMA_CHECK; 1517 1518 /* Fast check before locking page if already PMD-mapped */ 1519 result = find_pmd_or_thp_or_none(mm, haddr, &pmd); 1520 if (result == SCAN_PMD_MAPPED) 1521 return result; 1522 1523 /* 1524 * If we are here, we've succeeded in replacing all the native pages 1525 * in the page cache with a single hugepage. If a mm were to fault-in 1526 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage 1527 * and map it by a PMD, regardless of sysfs THP settings. As such, let's 1528 * analogously elide sysfs THP settings here and force collapse. 1529 */ 1530 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) 1531 return SCAN_VMA_CHECK; 1532 1533 /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ 1534 if (userfaultfd_wp(vma)) 1535 return SCAN_PTE_UFFD_WP; 1536 1537 folio = filemap_lock_folio(vma->vm_file->f_mapping, 1538 linear_page_index(vma, haddr)); 1539 if (IS_ERR(folio)) 1540 return SCAN_PAGE_NULL; 1541 1542 if (folio_order(folio) != HPAGE_PMD_ORDER) { 1543 result = SCAN_PAGE_COMPOUND; 1544 goto drop_folio; 1545 } 1546 1547 result = find_pmd_or_thp_or_none(mm, haddr, &pmd); 1548 switch (result) { 1549 case SCAN_SUCCEED: 1550 break; 1551 case SCAN_PMD_NULL: 1552 case SCAN_PMD_NONE: 1553 /* 1554 * All pte entries have been removed and pmd cleared. 1555 * Skip all the pte checks and just update the pmd mapping. 1556 */ 1557 goto maybe_install_pmd; 1558 default: 1559 goto drop_folio; 1560 } 1561 1562 result = SCAN_FAIL; 1563 start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); 1564 if (!start_pte) /* mmap_lock + page lock should prevent this */ 1565 goto drop_folio; 1566 1567 /* step 1: check all mapped PTEs are to the right huge page */ 1568 for (i = 0, addr = haddr, pte = start_pte; 1569 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 1570 struct page *page; 1571 pte_t ptent = ptep_get(pte); 1572 1573 /* empty pte, skip */ 1574 if (pte_none(ptent)) 1575 continue; 1576 1577 /* page swapped out, abort */ 1578 if (!pte_present(ptent)) { 1579 result = SCAN_PTE_NON_PRESENT; 1580 goto abort; 1581 } 1582 1583 page = vm_normal_page(vma, addr, ptent); 1584 if (WARN_ON_ONCE(page && is_zone_device_page(page))) 1585 page = NULL; 1586 /* 1587 * Note that uprobe, debugger, or MAP_PRIVATE may change the 1588 * page table, but the new page will not be a subpage of hpage. 1589 */ 1590 if (folio_page(folio, i) != page) 1591 goto abort; 1592 } 1593 1594 pte_unmap_unlock(start_pte, ptl); 1595 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1596 haddr, haddr + HPAGE_PMD_SIZE); 1597 mmu_notifier_invalidate_range_start(&range); 1598 notified = true; 1599 1600 /* 1601 * pmd_lock covers a wider range than ptl, and (if split from mm's 1602 * page_table_lock) ptl nests inside pml. The less time we hold pml, 1603 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA 1604 * inserts a valid as-if-COWed PTE without even looking up page cache. 1605 * So page lock of folio does not protect from it, so we must not drop 1606 * ptl before pgt_pmd is removed, so uffd private needs pml taken now. 1607 */ 1608 if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) 1609 pml = pmd_lock(mm, pmd); 1610 1611 start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); 1612 if (!start_pte) /* mmap_lock + page lock should prevent this */ 1613 goto abort; 1614 if (!pml) 1615 spin_lock(ptl); 1616 else if (ptl != pml) 1617 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 1618 1619 if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) 1620 goto abort; 1621 1622 /* step 2: clear page table and adjust rmap */ 1623 for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; 1624 i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, 1625 pte += nr_batch_ptes) { 1626 unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; 1627 struct page *page; 1628 pte_t ptent = ptep_get(pte); 1629 1630 nr_batch_ptes = 1; 1631 1632 if (pte_none(ptent)) 1633 continue; 1634 /* 1635 * We dropped ptl after the first scan, to do the mmu_notifier: 1636 * page lock stops more PTEs of the folio being faulted in, but 1637 * does not stop write faults COWing anon copies from existing 1638 * PTEs; and does not stop those being swapped out or migrated. 1639 */ 1640 if (!pte_present(ptent)) { 1641 result = SCAN_PTE_NON_PRESENT; 1642 goto abort; 1643 } 1644 page = vm_normal_page(vma, addr, ptent); 1645 1646 if (folio_page(folio, i) != page) 1647 goto abort; 1648 1649 nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); 1650 1651 /* 1652 * Must clear entry, or a racing truncate may re-remove it. 1653 * TLB flush can be left until pmdp_collapse_flush() does it. 1654 * PTE dirty? Shmem page is already dirty; file is read-only. 1655 */ 1656 clear_ptes(mm, addr, pte, nr_batch_ptes); 1657 folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); 1658 nr_mapped_ptes += nr_batch_ptes; 1659 } 1660 1661 if (!pml) 1662 spin_unlock(ptl); 1663 1664 /* step 3: set proper refcount and mm_counters. */ 1665 if (nr_mapped_ptes) { 1666 folio_ref_sub(folio, nr_mapped_ptes); 1667 add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); 1668 } 1669 1670 /* step 4: remove empty page table */ 1671 if (!pml) { 1672 pml = pmd_lock(mm, pmd); 1673 if (ptl != pml) { 1674 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 1675 if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { 1676 flush_tlb_mm(mm); 1677 goto unlock; 1678 } 1679 } 1680 } 1681 pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); 1682 pmdp_get_lockless_sync(); 1683 pte_unmap_unlock(start_pte, ptl); 1684 if (ptl != pml) 1685 spin_unlock(pml); 1686 1687 mmu_notifier_invalidate_range_end(&range); 1688 1689 mm_dec_nr_ptes(mm); 1690 page_table_check_pte_clear_range(mm, haddr, pgt_pmd); 1691 pte_free_defer(mm, pmd_pgtable(pgt_pmd)); 1692 1693 maybe_install_pmd: 1694 /* step 5: install pmd entry */ 1695 result = install_pmd 1696 ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) 1697 : SCAN_SUCCEED; 1698 goto drop_folio; 1699 abort: 1700 if (nr_mapped_ptes) { 1701 flush_tlb_mm(mm); 1702 folio_ref_sub(folio, nr_mapped_ptes); 1703 add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); 1704 } 1705 unlock: 1706 if (start_pte) 1707 pte_unmap_unlock(start_pte, ptl); 1708 if (pml && pml != ptl) 1709 spin_unlock(pml); 1710 if (notified) 1711 mmu_notifier_invalidate_range_end(&range); 1712 drop_folio: 1713 folio_unlock(folio); 1714 folio_put(folio); 1715 return result; 1716 } 1717 1718 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 1719 { 1720 struct vm_area_struct *vma; 1721 1722 i_mmap_lock_read(mapping); 1723 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1724 struct mmu_notifier_range range; 1725 struct mm_struct *mm; 1726 unsigned long addr; 1727 pmd_t *pmd, pgt_pmd; 1728 spinlock_t *pml; 1729 spinlock_t *ptl; 1730 bool success = false; 1731 1732 /* 1733 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that 1734 * got written to. These VMAs are likely not worth removing 1735 * page tables from, as PMD-mapping is likely to be split later. 1736 */ 1737 if (READ_ONCE(vma->anon_vma)) 1738 continue; 1739 1740 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 1741 if (addr & ~HPAGE_PMD_MASK || 1742 vma->vm_end < addr + HPAGE_PMD_SIZE) 1743 continue; 1744 1745 mm = vma->vm_mm; 1746 if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) 1747 continue; 1748 1749 if (hpage_collapse_test_exit(mm)) 1750 continue; 1751 /* 1752 * When a vma is registered with uffd-wp, we cannot recycle 1753 * the page table because there may be pte markers installed. 1754 * Other vmas can still have the same file mapped hugely, but 1755 * skip this one: it will always be mapped in small page size 1756 * for uffd-wp registered ranges. 1757 */ 1758 if (userfaultfd_wp(vma)) 1759 continue; 1760 1761 /* PTEs were notified when unmapped; but now for the PMD? */ 1762 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1763 addr, addr + HPAGE_PMD_SIZE); 1764 mmu_notifier_invalidate_range_start(&range); 1765 1766 pml = pmd_lock(mm, pmd); 1767 /* 1768 * The lock of new_folio is still held, we will be blocked in 1769 * the page fault path, which prevents the pte entries from 1770 * being set again. So even though the old empty PTE page may be 1771 * concurrently freed and a new PTE page is filled into the pmd 1772 * entry, it is still empty and can be removed. 1773 * 1774 * So here we only need to recheck if the state of pmd entry 1775 * still meets our requirements, rather than checking pmd_same() 1776 * like elsewhere. 1777 */ 1778 if (check_pmd_state(pmd) != SCAN_SUCCEED) 1779 goto drop_pml; 1780 ptl = pte_lockptr(mm, pmd); 1781 if (ptl != pml) 1782 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 1783 1784 /* 1785 * Huge page lock is still held, so normally the page table 1786 * must remain empty; and we have already skipped anon_vma 1787 * and userfaultfd_wp() vmas. But since the mmap_lock is not 1788 * held, it is still possible for a racing userfaultfd_ioctl() 1789 * to have inserted ptes or markers. Now that we hold ptlock, 1790 * repeating the anon_vma check protects from one category, 1791 * and repeating the userfaultfd_wp() check from another. 1792 */ 1793 if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { 1794 pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); 1795 pmdp_get_lockless_sync(); 1796 success = true; 1797 } 1798 1799 if (ptl != pml) 1800 spin_unlock(ptl); 1801 drop_pml: 1802 spin_unlock(pml); 1803 1804 mmu_notifier_invalidate_range_end(&range); 1805 1806 if (success) { 1807 mm_dec_nr_ptes(mm); 1808 page_table_check_pte_clear_range(mm, addr, pgt_pmd); 1809 pte_free_defer(mm, pmd_pgtable(pgt_pmd)); 1810 } 1811 } 1812 i_mmap_unlock_read(mapping); 1813 } 1814 1815 /** 1816 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. 1817 * 1818 * @mm: process address space where collapse happens 1819 * @addr: virtual collapse start address 1820 * @file: file that collapse on 1821 * @start: collapse start address 1822 * @cc: collapse context and scratchpad 1823 * 1824 * Basic scheme is simple, details are more complex: 1825 * - allocate and lock a new huge page; 1826 * - scan page cache, locking old pages 1827 * + swap/gup in pages if necessary; 1828 * - copy data to new page 1829 * - handle shmem holes 1830 * + re-validate that holes weren't filled by someone else 1831 * + check for userfaultfd 1832 * - finalize updates to the page cache; 1833 * - if replacing succeeds: 1834 * + unlock huge page; 1835 * + free old pages; 1836 * - if replacing failed; 1837 * + unlock old pages 1838 * + unlock and free huge page; 1839 */ 1840 static int collapse_file(struct mm_struct *mm, unsigned long addr, 1841 struct file *file, pgoff_t start, 1842 struct collapse_control *cc) 1843 { 1844 struct address_space *mapping = file->f_mapping; 1845 struct page *dst; 1846 struct folio *folio, *tmp, *new_folio; 1847 pgoff_t index = 0, end = start + HPAGE_PMD_NR; 1848 LIST_HEAD(pagelist); 1849 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 1850 int nr_none = 0, result = SCAN_SUCCEED; 1851 bool is_shmem = shmem_file(file); 1852 1853 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); 1854 VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1855 1856 result = alloc_charge_folio(&new_folio, mm, cc); 1857 if (result != SCAN_SUCCEED) 1858 goto out; 1859 1860 mapping_set_update(&xas, mapping); 1861 1862 __folio_set_locked(new_folio); 1863 if (is_shmem) 1864 __folio_set_swapbacked(new_folio); 1865 new_folio->index = start; 1866 new_folio->mapping = mapping; 1867 1868 /* 1869 * Ensure we have slots for all the pages in the range. This is 1870 * almost certainly a no-op because most of the pages must be present 1871 */ 1872 do { 1873 xas_lock_irq(&xas); 1874 xas_create_range(&xas); 1875 if (!xas_error(&xas)) 1876 break; 1877 xas_unlock_irq(&xas); 1878 if (!xas_nomem(&xas, GFP_KERNEL)) { 1879 result = SCAN_FAIL; 1880 goto rollback; 1881 } 1882 } while (1); 1883 1884 for (index = start; index < end;) { 1885 xas_set(&xas, index); 1886 folio = xas_load(&xas); 1887 1888 VM_BUG_ON(index != xas.xa_index); 1889 if (is_shmem) { 1890 if (!folio) { 1891 /* 1892 * Stop if extent has been truncated or 1893 * hole-punched, and is now completely 1894 * empty. 1895 */ 1896 if (index == start) { 1897 if (!xas_next_entry(&xas, end - 1)) { 1898 result = SCAN_TRUNCATED; 1899 goto xa_locked; 1900 } 1901 } 1902 nr_none++; 1903 index++; 1904 continue; 1905 } 1906 1907 if (xa_is_value(folio) || !folio_test_uptodate(folio)) { 1908 xas_unlock_irq(&xas); 1909 /* swap in or instantiate fallocated page */ 1910 if (shmem_get_folio(mapping->host, index, 0, 1911 &folio, SGP_NOALLOC)) { 1912 result = SCAN_FAIL; 1913 goto xa_unlocked; 1914 } 1915 /* drain lru cache to help folio_isolate_lru() */ 1916 lru_add_drain(); 1917 } else if (folio_trylock(folio)) { 1918 folio_get(folio); 1919 xas_unlock_irq(&xas); 1920 } else { 1921 result = SCAN_PAGE_LOCK; 1922 goto xa_locked; 1923 } 1924 } else { /* !is_shmem */ 1925 if (!folio || xa_is_value(folio)) { 1926 xas_unlock_irq(&xas); 1927 page_cache_sync_readahead(mapping, &file->f_ra, 1928 file, index, 1929 end - index); 1930 /* drain lru cache to help folio_isolate_lru() */ 1931 lru_add_drain(); 1932 folio = filemap_lock_folio(mapping, index); 1933 if (IS_ERR(folio)) { 1934 result = SCAN_FAIL; 1935 goto xa_unlocked; 1936 } 1937 } else if (folio_test_dirty(folio)) { 1938 /* 1939 * khugepaged only works on read-only fd, 1940 * so this page is dirty because it hasn't 1941 * been flushed since first write. There 1942 * won't be new dirty pages. 1943 * 1944 * Trigger async flush here and hope the 1945 * writeback is done when khugepaged 1946 * revisits this page. 1947 * 1948 * This is a one-off situation. We are not 1949 * forcing writeback in loop. 1950 */ 1951 xas_unlock_irq(&xas); 1952 filemap_flush(mapping); 1953 result = SCAN_FAIL; 1954 goto xa_unlocked; 1955 } else if (folio_test_writeback(folio)) { 1956 xas_unlock_irq(&xas); 1957 result = SCAN_FAIL; 1958 goto xa_unlocked; 1959 } else if (folio_trylock(folio)) { 1960 folio_get(folio); 1961 xas_unlock_irq(&xas); 1962 } else { 1963 result = SCAN_PAGE_LOCK; 1964 goto xa_locked; 1965 } 1966 } 1967 1968 /* 1969 * The folio must be locked, so we can drop the i_pages lock 1970 * without racing with truncate. 1971 */ 1972 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 1973 1974 /* make sure the folio is up to date */ 1975 if (unlikely(!folio_test_uptodate(folio))) { 1976 result = SCAN_FAIL; 1977 goto out_unlock; 1978 } 1979 1980 /* 1981 * If file was truncated then extended, or hole-punched, before 1982 * we locked the first folio, then a THP might be there already. 1983 * This will be discovered on the first iteration. 1984 */ 1985 if (folio_order(folio) == HPAGE_PMD_ORDER && 1986 folio->index == start) { 1987 /* Maybe PMD-mapped */ 1988 result = SCAN_PTE_MAPPED_HUGEPAGE; 1989 goto out_unlock; 1990 } 1991 1992 if (folio_mapping(folio) != mapping) { 1993 result = SCAN_TRUNCATED; 1994 goto out_unlock; 1995 } 1996 1997 if (!is_shmem && (folio_test_dirty(folio) || 1998 folio_test_writeback(folio))) { 1999 /* 2000 * khugepaged only works on read-only fd, so this 2001 * folio is dirty because it hasn't been flushed 2002 * since first write. 2003 */ 2004 result = SCAN_FAIL; 2005 goto out_unlock; 2006 } 2007 2008 if (!folio_isolate_lru(folio)) { 2009 result = SCAN_DEL_PAGE_LRU; 2010 goto out_unlock; 2011 } 2012 2013 if (!filemap_release_folio(folio, GFP_KERNEL)) { 2014 result = SCAN_PAGE_HAS_PRIVATE; 2015 folio_putback_lru(folio); 2016 goto out_unlock; 2017 } 2018 2019 if (folio_mapped(folio)) 2020 try_to_unmap(folio, 2021 TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); 2022 2023 xas_lock_irq(&xas); 2024 2025 VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); 2026 2027 /* 2028 * We control 2 + nr_pages references to the folio: 2029 * - we hold a pin on it; 2030 * - nr_pages reference from page cache; 2031 * - one from lru_isolate_folio; 2032 * If those are the only references, then any new usage 2033 * of the folio will have to fetch it from the page 2034 * cache. That requires locking the folio to handle 2035 * truncate, so any new usage will be blocked until we 2036 * unlock folio after collapse/during rollback. 2037 */ 2038 if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { 2039 result = SCAN_PAGE_COUNT; 2040 xas_unlock_irq(&xas); 2041 folio_putback_lru(folio); 2042 goto out_unlock; 2043 } 2044 2045 /* 2046 * Accumulate the folios that are being collapsed. 2047 */ 2048 list_add_tail(&folio->lru, &pagelist); 2049 index += folio_nr_pages(folio); 2050 continue; 2051 out_unlock: 2052 folio_unlock(folio); 2053 folio_put(folio); 2054 goto xa_unlocked; 2055 } 2056 2057 if (!is_shmem) { 2058 filemap_nr_thps_inc(mapping); 2059 /* 2060 * Paired with the fence in do_dentry_open() -> get_write_access() 2061 * to ensure i_writecount is up to date and the update to nr_thps 2062 * is visible. Ensures the page cache will be truncated if the 2063 * file is opened writable. 2064 */ 2065 smp_mb(); 2066 if (inode_is_open_for_write(mapping->host)) { 2067 result = SCAN_FAIL; 2068 filemap_nr_thps_dec(mapping); 2069 } 2070 } 2071 2072 xa_locked: 2073 xas_unlock_irq(&xas); 2074 xa_unlocked: 2075 2076 /* 2077 * If collapse is successful, flush must be done now before copying. 2078 * If collapse is unsuccessful, does flush actually need to be done? 2079 * Do it anyway, to clear the state. 2080 */ 2081 try_to_unmap_flush(); 2082 2083 if (result == SCAN_SUCCEED && nr_none && 2084 !shmem_charge(mapping->host, nr_none)) 2085 result = SCAN_FAIL; 2086 if (result != SCAN_SUCCEED) { 2087 nr_none = 0; 2088 goto rollback; 2089 } 2090 2091 /* 2092 * The old folios are locked, so they won't change anymore. 2093 */ 2094 index = start; 2095 dst = folio_page(new_folio, 0); 2096 list_for_each_entry(folio, &pagelist, lru) { 2097 int i, nr_pages = folio_nr_pages(folio); 2098 2099 while (index < folio->index) { 2100 clear_highpage(dst); 2101 index++; 2102 dst++; 2103 } 2104 2105 for (i = 0; i < nr_pages; i++) { 2106 if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { 2107 result = SCAN_COPY_MC; 2108 goto rollback; 2109 } 2110 index++; 2111 dst++; 2112 } 2113 } 2114 while (index < end) { 2115 clear_highpage(dst); 2116 index++; 2117 dst++; 2118 } 2119 2120 if (nr_none) { 2121 struct vm_area_struct *vma; 2122 int nr_none_check = 0; 2123 2124 i_mmap_lock_read(mapping); 2125 xas_lock_irq(&xas); 2126 2127 xas_set(&xas, start); 2128 for (index = start; index < end; index++) { 2129 if (!xas_next(&xas)) { 2130 xas_store(&xas, XA_RETRY_ENTRY); 2131 if (xas_error(&xas)) { 2132 result = SCAN_STORE_FAILED; 2133 goto immap_locked; 2134 } 2135 nr_none_check++; 2136 } 2137 } 2138 2139 if (nr_none != nr_none_check) { 2140 result = SCAN_PAGE_FILLED; 2141 goto immap_locked; 2142 } 2143 2144 /* 2145 * If userspace observed a missing page in a VMA with 2146 * a MODE_MISSING userfaultfd, then it might expect a 2147 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to 2148 * roll back to avoid suppressing such an event. Since 2149 * wp/minor userfaultfds don't give userspace any 2150 * guarantees that the kernel doesn't fill a missing 2151 * page with a zero page, so they don't matter here. 2152 * 2153 * Any userfaultfds registered after this point will 2154 * not be able to observe any missing pages due to the 2155 * previously inserted retry entries. 2156 */ 2157 vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { 2158 if (userfaultfd_missing(vma)) { 2159 result = SCAN_EXCEED_NONE_PTE; 2160 goto immap_locked; 2161 } 2162 } 2163 2164 immap_locked: 2165 i_mmap_unlock_read(mapping); 2166 if (result != SCAN_SUCCEED) { 2167 xas_set(&xas, start); 2168 for (index = start; index < end; index++) { 2169 if (xas_next(&xas) == XA_RETRY_ENTRY) 2170 xas_store(&xas, NULL); 2171 } 2172 2173 xas_unlock_irq(&xas); 2174 goto rollback; 2175 } 2176 } else { 2177 xas_lock_irq(&xas); 2178 } 2179 2180 if (is_shmem) 2181 __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); 2182 else 2183 __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); 2184 2185 if (nr_none) { 2186 __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); 2187 /* nr_none is always 0 for non-shmem. */ 2188 __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); 2189 } 2190 2191 /* 2192 * Mark new_folio as uptodate before inserting it into the 2193 * page cache so that it isn't mistaken for an fallocated but 2194 * unwritten page. 2195 */ 2196 folio_mark_uptodate(new_folio); 2197 folio_ref_add(new_folio, HPAGE_PMD_NR - 1); 2198 2199 if (is_shmem) 2200 folio_mark_dirty(new_folio); 2201 folio_add_lru(new_folio); 2202 2203 /* Join all the small entries into a single multi-index entry. */ 2204 xas_set_order(&xas, start, HPAGE_PMD_ORDER); 2205 xas_store(&xas, new_folio); 2206 WARN_ON_ONCE(xas_error(&xas)); 2207 xas_unlock_irq(&xas); 2208 2209 /* 2210 * Remove pte page tables, so we can re-fault the page as huge. 2211 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). 2212 */ 2213 retract_page_tables(mapping, start); 2214 if (cc && !cc->is_khugepaged) 2215 result = SCAN_PTE_MAPPED_HUGEPAGE; 2216 folio_unlock(new_folio); 2217 2218 /* 2219 * The collapse has succeeded, so free the old folios. 2220 */ 2221 list_for_each_entry_safe(folio, tmp, &pagelist, lru) { 2222 list_del(&folio->lru); 2223 folio->mapping = NULL; 2224 folio_clear_active(folio); 2225 folio_clear_unevictable(folio); 2226 folio_unlock(folio); 2227 folio_put_refs(folio, 2 + folio_nr_pages(folio)); 2228 } 2229 2230 goto out; 2231 2232 rollback: 2233 /* Something went wrong: roll back page cache changes */ 2234 if (nr_none) { 2235 xas_lock_irq(&xas); 2236 mapping->nrpages -= nr_none; 2237 xas_unlock_irq(&xas); 2238 shmem_uncharge(mapping->host, nr_none); 2239 } 2240 2241 list_for_each_entry_safe(folio, tmp, &pagelist, lru) { 2242 list_del(&folio->lru); 2243 folio_unlock(folio); 2244 folio_putback_lru(folio); 2245 folio_put(folio); 2246 } 2247 /* 2248 * Undo the updates of filemap_nr_thps_inc for non-SHMEM 2249 * file only. This undo is not needed unless failure is 2250 * due to SCAN_COPY_MC. 2251 */ 2252 if (!is_shmem && result == SCAN_COPY_MC) { 2253 filemap_nr_thps_dec(mapping); 2254 /* 2255 * Paired with the fence in do_dentry_open() -> get_write_access() 2256 * to ensure the update to nr_thps is visible. 2257 */ 2258 smp_mb(); 2259 } 2260 2261 new_folio->mapping = NULL; 2262 2263 folio_unlock(new_folio); 2264 folio_put(new_folio); 2265 out: 2266 VM_BUG_ON(!list_empty(&pagelist)); 2267 trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); 2268 return result; 2269 } 2270 2271 static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, 2272 struct file *file, pgoff_t start, 2273 struct collapse_control *cc) 2274 { 2275 struct folio *folio = NULL; 2276 struct address_space *mapping = file->f_mapping; 2277 XA_STATE(xas, &mapping->i_pages, start); 2278 int present, swap; 2279 int node = NUMA_NO_NODE; 2280 int result = SCAN_SUCCEED; 2281 2282 present = 0; 2283 swap = 0; 2284 memset(cc->node_load, 0, sizeof(cc->node_load)); 2285 nodes_clear(cc->alloc_nmask); 2286 rcu_read_lock(); 2287 xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { 2288 if (xas_retry(&xas, folio)) 2289 continue; 2290 2291 if (xa_is_value(folio)) { 2292 swap += 1 << xas_get_order(&xas); 2293 if (cc->is_khugepaged && 2294 swap > khugepaged_max_ptes_swap) { 2295 result = SCAN_EXCEED_SWAP_PTE; 2296 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); 2297 break; 2298 } 2299 continue; 2300 } 2301 2302 if (!folio_try_get(folio)) { 2303 xas_reset(&xas); 2304 continue; 2305 } 2306 2307 if (unlikely(folio != xas_reload(&xas))) { 2308 folio_put(folio); 2309 xas_reset(&xas); 2310 continue; 2311 } 2312 2313 if (folio_order(folio) == HPAGE_PMD_ORDER && 2314 folio->index == start) { 2315 /* Maybe PMD-mapped */ 2316 result = SCAN_PTE_MAPPED_HUGEPAGE; 2317 /* 2318 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing 2319 * by the caller won't touch the page cache, and so 2320 * it's safe to skip LRU and refcount checks before 2321 * returning. 2322 */ 2323 folio_put(folio); 2324 break; 2325 } 2326 2327 node = folio_nid(folio); 2328 if (hpage_collapse_scan_abort(node, cc)) { 2329 result = SCAN_SCAN_ABORT; 2330 folio_put(folio); 2331 break; 2332 } 2333 cc->node_load[node]++; 2334 2335 if (!folio_test_lru(folio)) { 2336 result = SCAN_PAGE_LRU; 2337 folio_put(folio); 2338 break; 2339 } 2340 2341 if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { 2342 result = SCAN_PAGE_COUNT; 2343 folio_put(folio); 2344 break; 2345 } 2346 2347 /* 2348 * We probably should check if the folio is referenced 2349 * here, but nobody would transfer pte_young() to 2350 * folio_test_referenced() for us. And rmap walk here 2351 * is just too costly... 2352 */ 2353 2354 present += folio_nr_pages(folio); 2355 folio_put(folio); 2356 2357 if (need_resched()) { 2358 xas_pause(&xas); 2359 cond_resched_rcu(); 2360 } 2361 } 2362 rcu_read_unlock(); 2363 2364 if (result == SCAN_SUCCEED) { 2365 if (cc->is_khugepaged && 2366 present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { 2367 result = SCAN_EXCEED_NONE_PTE; 2368 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 2369 } else { 2370 result = collapse_file(mm, addr, file, start, cc); 2371 } 2372 } 2373 2374 trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); 2375 return result; 2376 } 2377 2378 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, 2379 struct collapse_control *cc) 2380 __releases(&khugepaged_mm_lock) 2381 __acquires(&khugepaged_mm_lock) 2382 { 2383 struct vma_iterator vmi; 2384 struct mm_slot *slot; 2385 struct mm_struct *mm; 2386 struct vm_area_struct *vma; 2387 int progress = 0; 2388 2389 VM_BUG_ON(!pages); 2390 lockdep_assert_held(&khugepaged_mm_lock); 2391 *result = SCAN_FAIL; 2392 2393 if (khugepaged_scan.mm_slot) { 2394 slot = khugepaged_scan.mm_slot; 2395 } else { 2396 slot = list_first_entry(&khugepaged_scan.mm_head, 2397 struct mm_slot, mm_node); 2398 khugepaged_scan.address = 0; 2399 khugepaged_scan.mm_slot = slot; 2400 } 2401 spin_unlock(&khugepaged_mm_lock); 2402 2403 mm = slot->mm; 2404 /* 2405 * Don't wait for semaphore (to avoid long wait times). Just move to 2406 * the next mm on the list. 2407 */ 2408 vma = NULL; 2409 if (unlikely(!mmap_read_trylock(mm))) 2410 goto breakouterloop_mmap_lock; 2411 2412 progress++; 2413 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) 2414 goto breakouterloop; 2415 2416 vma_iter_init(&vmi, mm, khugepaged_scan.address); 2417 for_each_vma(vmi, vma) { 2418 unsigned long hstart, hend; 2419 2420 cond_resched(); 2421 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { 2422 progress++; 2423 break; 2424 } 2425 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { 2426 skip: 2427 progress++; 2428 continue; 2429 } 2430 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); 2431 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); 2432 if (khugepaged_scan.address > hend) 2433 goto skip; 2434 if (khugepaged_scan.address < hstart) 2435 khugepaged_scan.address = hstart; 2436 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2437 2438 while (khugepaged_scan.address < hend) { 2439 bool mmap_locked = true; 2440 2441 cond_resched(); 2442 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) 2443 goto breakouterloop; 2444 2445 VM_BUG_ON(khugepaged_scan.address < hstart || 2446 khugepaged_scan.address + HPAGE_PMD_SIZE > 2447 hend); 2448 if (!vma_is_anonymous(vma)) { 2449 struct file *file = get_file(vma->vm_file); 2450 pgoff_t pgoff = linear_page_index(vma, 2451 khugepaged_scan.address); 2452 2453 mmap_read_unlock(mm); 2454 mmap_locked = false; 2455 *result = hpage_collapse_scan_file(mm, 2456 khugepaged_scan.address, file, pgoff, cc); 2457 fput(file); 2458 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { 2459 mmap_read_lock(mm); 2460 if (hpage_collapse_test_exit_or_disable(mm)) 2461 goto breakouterloop; 2462 *result = collapse_pte_mapped_thp(mm, 2463 khugepaged_scan.address, false); 2464 if (*result == SCAN_PMD_MAPPED) 2465 *result = SCAN_SUCCEED; 2466 mmap_read_unlock(mm); 2467 } 2468 } else { 2469 *result = hpage_collapse_scan_pmd(mm, vma, 2470 khugepaged_scan.address, &mmap_locked, cc); 2471 } 2472 2473 if (*result == SCAN_SUCCEED) 2474 ++khugepaged_pages_collapsed; 2475 2476 /* move to next address */ 2477 khugepaged_scan.address += HPAGE_PMD_SIZE; 2478 progress += HPAGE_PMD_NR; 2479 if (!mmap_locked) 2480 /* 2481 * We released mmap_lock so break loop. Note 2482 * that we drop mmap_lock before all hugepage 2483 * allocations, so if allocation fails, we are 2484 * guaranteed to break here and report the 2485 * correct result back to caller. 2486 */ 2487 goto breakouterloop_mmap_lock; 2488 if (progress >= pages) 2489 goto breakouterloop; 2490 } 2491 } 2492 breakouterloop: 2493 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ 2494 breakouterloop_mmap_lock: 2495 2496 spin_lock(&khugepaged_mm_lock); 2497 VM_BUG_ON(khugepaged_scan.mm_slot != slot); 2498 /* 2499 * Release the current mm_slot if this mm is about to die, or 2500 * if we scanned all vmas of this mm. 2501 */ 2502 if (hpage_collapse_test_exit(mm) || !vma) { 2503 /* 2504 * Make sure that if mm_users is reaching zero while 2505 * khugepaged runs here, khugepaged_exit will find 2506 * mm_slot not pointing to the exiting mm. 2507 */ 2508 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { 2509 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); 2510 khugepaged_scan.address = 0; 2511 } else { 2512 khugepaged_scan.mm_slot = NULL; 2513 khugepaged_full_scans++; 2514 } 2515 2516 collect_mm_slot(slot); 2517 } 2518 2519 return progress; 2520 } 2521 2522 static int khugepaged_has_work(void) 2523 { 2524 return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); 2525 } 2526 2527 static int khugepaged_wait_event(void) 2528 { 2529 return !list_empty(&khugepaged_scan.mm_head) || 2530 kthread_should_stop(); 2531 } 2532 2533 static void khugepaged_do_scan(struct collapse_control *cc) 2534 { 2535 unsigned int progress = 0, pass_through_head = 0; 2536 unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); 2537 bool wait = true; 2538 int result = SCAN_SUCCEED; 2539 2540 lru_add_drain_all(); 2541 2542 while (true) { 2543 cond_resched(); 2544 2545 if (unlikely(kthread_should_stop())) 2546 break; 2547 2548 spin_lock(&khugepaged_mm_lock); 2549 if (!khugepaged_scan.mm_slot) 2550 pass_through_head++; 2551 if (khugepaged_has_work() && 2552 pass_through_head < 2) 2553 progress += khugepaged_scan_mm_slot(pages - progress, 2554 &result, cc); 2555 else 2556 progress = pages; 2557 spin_unlock(&khugepaged_mm_lock); 2558 2559 if (progress >= pages) 2560 break; 2561 2562 if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { 2563 /* 2564 * If fail to allocate the first time, try to sleep for 2565 * a while. When hit again, cancel the scan. 2566 */ 2567 if (!wait) 2568 break; 2569 wait = false; 2570 khugepaged_alloc_sleep(); 2571 } 2572 } 2573 } 2574 2575 static bool khugepaged_should_wakeup(void) 2576 { 2577 return kthread_should_stop() || 2578 time_after_eq(jiffies, khugepaged_sleep_expire); 2579 } 2580 2581 static void khugepaged_wait_work(void) 2582 { 2583 if (khugepaged_has_work()) { 2584 const unsigned long scan_sleep_jiffies = 2585 msecs_to_jiffies(khugepaged_scan_sleep_millisecs); 2586 2587 if (!scan_sleep_jiffies) 2588 return; 2589 2590 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; 2591 wait_event_freezable_timeout(khugepaged_wait, 2592 khugepaged_should_wakeup(), 2593 scan_sleep_jiffies); 2594 return; 2595 } 2596 2597 if (hugepage_pmd_enabled()) 2598 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2599 } 2600 2601 static int khugepaged(void *none) 2602 { 2603 struct mm_slot *slot; 2604 2605 set_freezable(); 2606 set_user_nice(current, MAX_NICE); 2607 2608 while (!kthread_should_stop()) { 2609 khugepaged_do_scan(&khugepaged_collapse_control); 2610 khugepaged_wait_work(); 2611 } 2612 2613 spin_lock(&khugepaged_mm_lock); 2614 slot = khugepaged_scan.mm_slot; 2615 khugepaged_scan.mm_slot = NULL; 2616 if (slot) 2617 collect_mm_slot(slot); 2618 spin_unlock(&khugepaged_mm_lock); 2619 return 0; 2620 } 2621 2622 static void set_recommended_min_free_kbytes(void) 2623 { 2624 struct zone *zone; 2625 int nr_zones = 0; 2626 unsigned long recommended_min; 2627 2628 if (!hugepage_pmd_enabled()) { 2629 calculate_min_free_kbytes(); 2630 goto update_wmarks; 2631 } 2632 2633 for_each_populated_zone(zone) { 2634 /* 2635 * We don't need to worry about fragmentation of 2636 * ZONE_MOVABLE since it only has movable pages. 2637 */ 2638 if (zone_idx(zone) > gfp_zone(GFP_USER)) 2639 continue; 2640 2641 nr_zones++; 2642 } 2643 2644 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ 2645 recommended_min = pageblock_nr_pages * nr_zones * 2; 2646 2647 /* 2648 * Make sure that on average at least two pageblocks are almost free 2649 * of another type, one for a migratetype to fall back to and a 2650 * second to avoid subsequent fallbacks of other types There are 3 2651 * MIGRATE_TYPES we care about. 2652 */ 2653 recommended_min += pageblock_nr_pages * nr_zones * 2654 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; 2655 2656 /* don't ever allow to reserve more than 5% of the lowmem */ 2657 recommended_min = min(recommended_min, 2658 (unsigned long) nr_free_buffer_pages() / 20); 2659 recommended_min <<= (PAGE_SHIFT-10); 2660 2661 if (recommended_min > min_free_kbytes) { 2662 if (user_min_free_kbytes >= 0) 2663 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", 2664 min_free_kbytes, recommended_min); 2665 2666 min_free_kbytes = recommended_min; 2667 } 2668 2669 update_wmarks: 2670 setup_per_zone_wmarks(); 2671 } 2672 2673 int start_stop_khugepaged(void) 2674 { 2675 int err = 0; 2676 2677 mutex_lock(&khugepaged_mutex); 2678 if (hugepage_pmd_enabled()) { 2679 if (!khugepaged_thread) 2680 khugepaged_thread = kthread_run(khugepaged, NULL, 2681 "khugepaged"); 2682 if (IS_ERR(khugepaged_thread)) { 2683 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 2684 err = PTR_ERR(khugepaged_thread); 2685 khugepaged_thread = NULL; 2686 goto fail; 2687 } 2688 2689 if (!list_empty(&khugepaged_scan.mm_head)) 2690 wake_up_interruptible(&khugepaged_wait); 2691 } else if (khugepaged_thread) { 2692 kthread_stop(khugepaged_thread); 2693 khugepaged_thread = NULL; 2694 } 2695 set_recommended_min_free_kbytes(); 2696 fail: 2697 mutex_unlock(&khugepaged_mutex); 2698 return err; 2699 } 2700 2701 void khugepaged_min_free_kbytes_update(void) 2702 { 2703 mutex_lock(&khugepaged_mutex); 2704 if (hugepage_pmd_enabled() && khugepaged_thread) 2705 set_recommended_min_free_kbytes(); 2706 mutex_unlock(&khugepaged_mutex); 2707 } 2708 2709 bool current_is_khugepaged(void) 2710 { 2711 return kthread_func(current) == khugepaged; 2712 } 2713 2714 static int madvise_collapse_errno(enum scan_result r) 2715 { 2716 /* 2717 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide 2718 * actionable feedback to caller, so they may take an appropriate 2719 * fallback measure depending on the nature of the failure. 2720 */ 2721 switch (r) { 2722 case SCAN_ALLOC_HUGE_PAGE_FAIL: 2723 return -ENOMEM; 2724 case SCAN_CGROUP_CHARGE_FAIL: 2725 case SCAN_EXCEED_NONE_PTE: 2726 return -EBUSY; 2727 /* Resource temporary unavailable - trying again might succeed */ 2728 case SCAN_PAGE_COUNT: 2729 case SCAN_PAGE_LOCK: 2730 case SCAN_PAGE_LRU: 2731 case SCAN_DEL_PAGE_LRU: 2732 case SCAN_PAGE_FILLED: 2733 return -EAGAIN; 2734 /* 2735 * Other: Trying again likely not to succeed / error intrinsic to 2736 * specified memory range. khugepaged likely won't be able to collapse 2737 * either. 2738 */ 2739 default: 2740 return -EINVAL; 2741 } 2742 } 2743 2744 int madvise_collapse(struct vm_area_struct *vma, unsigned long start, 2745 unsigned long end, bool *lock_dropped) 2746 { 2747 struct collapse_control *cc; 2748 struct mm_struct *mm = vma->vm_mm; 2749 unsigned long hstart, hend, addr; 2750 int thps = 0, last_fail = SCAN_FAIL; 2751 bool mmap_locked = true; 2752 2753 BUG_ON(vma->vm_start > start); 2754 BUG_ON(vma->vm_end < end); 2755 2756 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) 2757 return -EINVAL; 2758 2759 cc = kmalloc(sizeof(*cc), GFP_KERNEL); 2760 if (!cc) 2761 return -ENOMEM; 2762 cc->is_khugepaged = false; 2763 2764 mmgrab(mm); 2765 lru_add_drain_all(); 2766 2767 hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2768 hend = end & HPAGE_PMD_MASK; 2769 2770 for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { 2771 int result = SCAN_FAIL; 2772 2773 if (!mmap_locked) { 2774 cond_resched(); 2775 mmap_read_lock(mm); 2776 mmap_locked = true; 2777 result = hugepage_vma_revalidate(mm, addr, false, &vma, 2778 cc); 2779 if (result != SCAN_SUCCEED) { 2780 last_fail = result; 2781 goto out_nolock; 2782 } 2783 2784 hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); 2785 } 2786 mmap_assert_locked(mm); 2787 memset(cc->node_load, 0, sizeof(cc->node_load)); 2788 nodes_clear(cc->alloc_nmask); 2789 if (!vma_is_anonymous(vma)) { 2790 struct file *file = get_file(vma->vm_file); 2791 pgoff_t pgoff = linear_page_index(vma, addr); 2792 2793 mmap_read_unlock(mm); 2794 mmap_locked = false; 2795 result = hpage_collapse_scan_file(mm, addr, file, pgoff, 2796 cc); 2797 fput(file); 2798 } else { 2799 result = hpage_collapse_scan_pmd(mm, vma, addr, 2800 &mmap_locked, cc); 2801 } 2802 if (!mmap_locked) 2803 *lock_dropped = true; 2804 2805 handle_result: 2806 switch (result) { 2807 case SCAN_SUCCEED: 2808 case SCAN_PMD_MAPPED: 2809 ++thps; 2810 break; 2811 case SCAN_PTE_MAPPED_HUGEPAGE: 2812 BUG_ON(mmap_locked); 2813 mmap_read_lock(mm); 2814 result = collapse_pte_mapped_thp(mm, addr, true); 2815 mmap_read_unlock(mm); 2816 goto handle_result; 2817 /* Whitelisted set of results where continuing OK */ 2818 case SCAN_PMD_NULL: 2819 case SCAN_PTE_NON_PRESENT: 2820 case SCAN_PTE_UFFD_WP: 2821 case SCAN_LACK_REFERENCED_PAGE: 2822 case SCAN_PAGE_NULL: 2823 case SCAN_PAGE_COUNT: 2824 case SCAN_PAGE_LOCK: 2825 case SCAN_PAGE_COMPOUND: 2826 case SCAN_PAGE_LRU: 2827 case SCAN_DEL_PAGE_LRU: 2828 last_fail = result; 2829 break; 2830 default: 2831 last_fail = result; 2832 /* Other error, exit */ 2833 goto out_maybelock; 2834 } 2835 } 2836 2837 out_maybelock: 2838 /* Caller expects us to hold mmap_lock on return */ 2839 if (!mmap_locked) 2840 mmap_read_lock(mm); 2841 out_nolock: 2842 mmap_assert_locked(mm); 2843 mmdrop(mm); 2844 kfree(cc); 2845 2846 return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 2847 : madvise_collapse_errno(last_fail); 2848 } 2849