1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/mm.h> 5 #include <linux/sched.h> 6 #include <linux/sched/mm.h> 7 #include <linux/mmu_notifier.h> 8 #include <linux/rmap.h> 9 #include <linux/swap.h> 10 #include <linux/mm_inline.h> 11 #include <linux/kthread.h> 12 #include <linux/khugepaged.h> 13 #include <linux/freezer.h> 14 #include <linux/mman.h> 15 #include <linux/hashtable.h> 16 #include <linux/userfaultfd_k.h> 17 #include <linux/page_idle.h> 18 #include <linux/page_table_check.h> 19 #include <linux/rcupdate_wait.h> 20 #include <linux/leafops.h> 21 #include <linux/shmem_fs.h> 22 #include <linux/dax.h> 23 #include <linux/ksm.h> 24 #include <linux/pgalloc.h> 25 #include <linux/backing-dev.h> 26 27 #include <asm/tlb.h> 28 #include "internal.h" 29 #include "mm_slot.h" 30 31 enum scan_result { 32 SCAN_FAIL, 33 SCAN_SUCCEED, 34 SCAN_NO_PTE_TABLE, 35 SCAN_PMD_MAPPED, 36 SCAN_EXCEED_NONE_PTE, 37 SCAN_EXCEED_SWAP_PTE, 38 SCAN_EXCEED_SHARED_PTE, 39 SCAN_PTE_NON_PRESENT, 40 SCAN_PTE_UFFD_WP, 41 SCAN_PTE_MAPPED_HUGEPAGE, 42 SCAN_LACK_REFERENCED_PAGE, 43 SCAN_PAGE_NULL, 44 SCAN_SCAN_ABORT, 45 SCAN_PAGE_COUNT, 46 SCAN_PAGE_LRU, 47 SCAN_PAGE_LOCK, 48 SCAN_PAGE_ANON, 49 SCAN_PAGE_LAZYFREE, 50 SCAN_PAGE_COMPOUND, 51 SCAN_ANY_PROCESS, 52 SCAN_VMA_NULL, 53 SCAN_VMA_CHECK, 54 SCAN_ADDRESS_RANGE, 55 SCAN_DEL_PAGE_LRU, 56 SCAN_ALLOC_HUGE_PAGE_FAIL, 57 SCAN_CGROUP_CHARGE_FAIL, 58 SCAN_TRUNCATED, 59 SCAN_PAGE_HAS_PRIVATE, 60 SCAN_STORE_FAILED, 61 SCAN_COPY_MC, 62 SCAN_PAGE_FILLED, 63 SCAN_PAGE_DIRTY_OR_WRITEBACK, 64 }; 65 66 #define CREATE_TRACE_POINTS 67 #include <trace/events/huge_memory.h> 68 69 static struct task_struct *khugepaged_thread __read_mostly; 70 static DEFINE_MUTEX(khugepaged_mutex); 71 72 /* 73 * default scan 8*HPAGE_PMD_NR ptes, pte_mapped_hugepage, pmd_mapped, 74 * no_pte_table or vmas every 10 second. 
75 */ 76 static unsigned int khugepaged_pages_to_scan __read_mostly; 77 static unsigned int khugepaged_pages_collapsed; 78 static unsigned int khugepaged_full_scans; 79 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 80 /* during fragmentation poll the hugepage allocator once every minute */ 81 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 82 static unsigned long khugepaged_sleep_expire; 83 static DEFINE_SPINLOCK(khugepaged_mm_lock); 84 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 85 /* 86 * default collapse hugepages if there is at least one pte mapped like 87 * it would have happened if the vma was large enough during page 88 * fault. 89 * 90 * Note that these are only respected if collapse was initiated by khugepaged. 91 */ 92 unsigned int khugepaged_max_ptes_none __read_mostly; 93 static unsigned int khugepaged_max_ptes_swap __read_mostly; 94 static unsigned int khugepaged_max_ptes_shared __read_mostly; 95 96 #define MM_SLOTS_HASH_BITS 10 97 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 98 99 static struct kmem_cache *mm_slot_cache __ro_after_init; 100 101 struct collapse_control { 102 bool is_khugepaged; 103 104 /* Num pages scanned per node */ 105 u32 node_load[MAX_NUMNODES]; 106 107 /* Num pages scanned (see khugepaged_pages_to_scan) */ 108 unsigned int progress; 109 110 /* nodemask for allocation fallback */ 111 nodemask_t alloc_nmask; 112 }; 113 114 /** 115 * struct khugepaged_scan - cursor for scanning 116 * @mm_head: the head of the mm list to scan 117 * @mm_slot: the current mm_slot we are scanning 118 * @address: the next address inside that to be scanned 119 * 120 * There is only the one khugepaged_scan instance of this cursor structure. 
121 */ 122 struct khugepaged_scan { 123 struct list_head mm_head; 124 struct mm_slot *mm_slot; 125 unsigned long address; 126 }; 127 128 static struct khugepaged_scan khugepaged_scan = { 129 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 130 }; 131 132 #ifdef CONFIG_SYSFS 133 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 134 struct kobj_attribute *attr, 135 char *buf) 136 { 137 return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); 138 } 139 140 static ssize_t __sleep_millisecs_store(const char *buf, size_t count, 141 unsigned int *millisecs) 142 { 143 unsigned int msecs; 144 int err; 145 146 err = kstrtouint(buf, 10, &msecs); 147 if (err) 148 return -EINVAL; 149 150 *millisecs = msecs; 151 khugepaged_sleep_expire = 0; 152 wake_up_interruptible(&khugepaged_wait); 153 154 return count; 155 } 156 157 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 158 struct kobj_attribute *attr, 159 const char *buf, size_t count) 160 { 161 return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); 162 } 163 static struct kobj_attribute scan_sleep_millisecs_attr = 164 __ATTR_RW(scan_sleep_millisecs); 165 166 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 167 struct kobj_attribute *attr, 168 char *buf) 169 { 170 return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 171 } 172 173 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 174 struct kobj_attribute *attr, 175 const char *buf, size_t count) 176 { 177 return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); 178 } 179 static struct kobj_attribute alloc_sleep_millisecs_attr = 180 __ATTR_RW(alloc_sleep_millisecs); 181 182 static ssize_t pages_to_scan_show(struct kobject *kobj, 183 struct kobj_attribute *attr, 184 char *buf) 185 { 186 return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); 187 } 188 static ssize_t pages_to_scan_store(struct kobject *kobj, 189 struct kobj_attribute *attr, 190 const char 
*buf, size_t count) 191 { 192 unsigned int pages; 193 int err; 194 195 err = kstrtouint(buf, 10, &pages); 196 if (err || !pages) 197 return -EINVAL; 198 199 khugepaged_pages_to_scan = pages; 200 201 return count; 202 } 203 static struct kobj_attribute pages_to_scan_attr = 204 __ATTR_RW(pages_to_scan); 205 206 static ssize_t pages_collapsed_show(struct kobject *kobj, 207 struct kobj_attribute *attr, 208 char *buf) 209 { 210 return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); 211 } 212 static struct kobj_attribute pages_collapsed_attr = 213 __ATTR_RO(pages_collapsed); 214 215 static ssize_t full_scans_show(struct kobject *kobj, 216 struct kobj_attribute *attr, 217 char *buf) 218 { 219 return sysfs_emit(buf, "%u\n", khugepaged_full_scans); 220 } 221 static struct kobj_attribute full_scans_attr = 222 __ATTR_RO(full_scans); 223 224 static ssize_t defrag_show(struct kobject *kobj, 225 struct kobj_attribute *attr, char *buf) 226 { 227 return single_hugepage_flag_show(kobj, attr, buf, 228 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 229 } 230 static ssize_t defrag_store(struct kobject *kobj, 231 struct kobj_attribute *attr, 232 const char *buf, size_t count) 233 { 234 return single_hugepage_flag_store(kobj, attr, buf, count, 235 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 236 } 237 static struct kobj_attribute khugepaged_defrag_attr = 238 __ATTR_RW(defrag); 239 240 /* 241 * max_ptes_none controls if khugepaged should collapse hugepages over 242 * any unmapped ptes in turn potentially increasing the memory 243 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 244 * reduce the available free memory in the system as it 245 * runs. Increasing max_ptes_none will instead potentially reduce the 246 * free memory in the system during the khugepaged scan. 
247 */ 248 static ssize_t max_ptes_none_show(struct kobject *kobj, 249 struct kobj_attribute *attr, 250 char *buf) 251 { 252 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); 253 } 254 static ssize_t max_ptes_none_store(struct kobject *kobj, 255 struct kobj_attribute *attr, 256 const char *buf, size_t count) 257 { 258 int err; 259 unsigned long max_ptes_none; 260 261 err = kstrtoul(buf, 10, &max_ptes_none); 262 if (err || max_ptes_none > HPAGE_PMD_NR - 1) 263 return -EINVAL; 264 265 khugepaged_max_ptes_none = max_ptes_none; 266 267 return count; 268 } 269 static struct kobj_attribute khugepaged_max_ptes_none_attr = 270 __ATTR_RW(max_ptes_none); 271 272 static ssize_t max_ptes_swap_show(struct kobject *kobj, 273 struct kobj_attribute *attr, 274 char *buf) 275 { 276 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); 277 } 278 279 static ssize_t max_ptes_swap_store(struct kobject *kobj, 280 struct kobj_attribute *attr, 281 const char *buf, size_t count) 282 { 283 int err; 284 unsigned long max_ptes_swap; 285 286 err = kstrtoul(buf, 10, &max_ptes_swap); 287 if (err || max_ptes_swap > HPAGE_PMD_NR - 1) 288 return -EINVAL; 289 290 khugepaged_max_ptes_swap = max_ptes_swap; 291 292 return count; 293 } 294 295 static struct kobj_attribute khugepaged_max_ptes_swap_attr = 296 __ATTR_RW(max_ptes_swap); 297 298 static ssize_t max_ptes_shared_show(struct kobject *kobj, 299 struct kobj_attribute *attr, 300 char *buf) 301 { 302 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); 303 } 304 305 static ssize_t max_ptes_shared_store(struct kobject *kobj, 306 struct kobj_attribute *attr, 307 const char *buf, size_t count) 308 { 309 int err; 310 unsigned long max_ptes_shared; 311 312 err = kstrtoul(buf, 10, &max_ptes_shared); 313 if (err || max_ptes_shared > HPAGE_PMD_NR - 1) 314 return -EINVAL; 315 316 khugepaged_max_ptes_shared = max_ptes_shared; 317 318 return count; 319 } 320 321 static struct kobj_attribute khugepaged_max_ptes_shared_attr = 322 
__ATTR_RW(max_ptes_shared); 323 324 static struct attribute *khugepaged_attr[] = { 325 &khugepaged_defrag_attr.attr, 326 &khugepaged_max_ptes_none_attr.attr, 327 &khugepaged_max_ptes_swap_attr.attr, 328 &khugepaged_max_ptes_shared_attr.attr, 329 &pages_to_scan_attr.attr, 330 &pages_collapsed_attr.attr, 331 &full_scans_attr.attr, 332 &scan_sleep_millisecs_attr.attr, 333 &alloc_sleep_millisecs_attr.attr, 334 NULL, 335 }; 336 337 struct attribute_group khugepaged_attr_group = { 338 .attrs = khugepaged_attr, 339 .name = "khugepaged", 340 }; 341 #endif /* CONFIG_SYSFS */ 342 343 static bool pte_none_or_zero(pte_t pte) 344 { 345 if (pte_none(pte)) 346 return true; 347 return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); 348 } 349 350 int hugepage_madvise(struct vm_area_struct *vma, 351 vm_flags_t *vm_flags, int advice) 352 { 353 switch (advice) { 354 case MADV_HUGEPAGE: 355 *vm_flags &= ~VM_NOHUGEPAGE; 356 *vm_flags |= VM_HUGEPAGE; 357 /* 358 * If the vma become good for khugepaged to scan, 359 * register it here without waiting a page fault that 360 * may not happen any time soon. 361 */ 362 khugepaged_enter_vma(vma, *vm_flags); 363 break; 364 case MADV_NOHUGEPAGE: 365 *vm_flags &= ~VM_HUGEPAGE; 366 *vm_flags |= VM_NOHUGEPAGE; 367 /* 368 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 369 * this vma even if we leave the mm registered in khugepaged if 370 * it got registered before VM_NOHUGEPAGE was set. 
371 */ 372 break; 373 } 374 375 return 0; 376 } 377 378 int __init khugepaged_init(void) 379 { 380 mm_slot_cache = KMEM_CACHE(mm_slot, 0); 381 if (!mm_slot_cache) 382 return -ENOMEM; 383 384 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; 385 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; 386 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; 387 khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; 388 389 return 0; 390 } 391 392 void __init khugepaged_destroy(void) 393 { 394 kmem_cache_destroy(mm_slot_cache); 395 } 396 397 static inline int hpage_collapse_test_exit(struct mm_struct *mm) 398 { 399 return atomic_read(&mm->mm_users) == 0; 400 } 401 402 static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) 403 { 404 return hpage_collapse_test_exit(mm) || 405 mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); 406 } 407 408 static bool hugepage_pmd_enabled(void) 409 { 410 /* 411 * We cover the anon, shmem and the file-backed case here; file-backed 412 * hugepages, when configured in, are determined by the global control. 413 * Anon pmd-sized hugepages are determined by the pmd-size control. 414 * Shmem pmd-sized hugepages are also determined by its pmd-size control, 415 * except when the global shmem_huge is set to SHMEM_HUGE_DENY. 
416 */ 417 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 418 hugepage_global_enabled()) 419 return true; 420 if (test_bit(PMD_ORDER, &huge_anon_orders_always)) 421 return true; 422 if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) 423 return true; 424 if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && 425 hugepage_global_enabled()) 426 return true; 427 if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) 428 return true; 429 return false; 430 } 431 432 void __khugepaged_enter(struct mm_struct *mm) 433 { 434 struct mm_slot *slot; 435 int wakeup; 436 437 /* __khugepaged_exit() must not run from under us */ 438 VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); 439 if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) 440 return; 441 442 slot = mm_slot_alloc(mm_slot_cache); 443 if (!slot) 444 return; 445 446 spin_lock(&khugepaged_mm_lock); 447 mm_slot_insert(mm_slots_hash, mm, slot); 448 /* 449 * Insert just behind the scanning cursor, to let the area settle 450 * down a little. 
451 */ 452 wakeup = list_empty(&khugepaged_scan.mm_head); 453 list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); 454 spin_unlock(&khugepaged_mm_lock); 455 456 mmgrab(mm); 457 if (wakeup) 458 wake_up_interruptible(&khugepaged_wait); 459 } 460 461 void khugepaged_enter_vma(struct vm_area_struct *vma, 462 vm_flags_t vm_flags) 463 { 464 if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && 465 hugepage_pmd_enabled()) { 466 if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) 467 __khugepaged_enter(vma->vm_mm); 468 } 469 } 470 471 void __khugepaged_exit(struct mm_struct *mm) 472 { 473 struct mm_slot *slot; 474 int free = 0; 475 476 spin_lock(&khugepaged_mm_lock); 477 slot = mm_slot_lookup(mm_slots_hash, mm); 478 if (slot && khugepaged_scan.mm_slot != slot) { 479 hash_del(&slot->hash); 480 list_del(&slot->mm_node); 481 free = 1; 482 } 483 spin_unlock(&khugepaged_mm_lock); 484 485 if (free) { 486 mm_flags_clear(MMF_VM_HUGEPAGE, mm); 487 mm_slot_free(mm_slot_cache, slot); 488 mmdrop(mm); 489 } else if (slot) { 490 /* 491 * This is required to serialize against 492 * hpage_collapse_test_exit() (which is guaranteed to run 493 * under mmap sem read mode). Stop here (after we return all 494 * pagetables will be destroyed) until khugepaged has finished 495 * working on the pagetables under the mmap_lock. 
496 */ 497 mmap_write_lock(mm); 498 mmap_write_unlock(mm); 499 } 500 } 501 502 static void release_pte_folio(struct folio *folio) 503 { 504 node_stat_mod_folio(folio, 505 NR_ISOLATED_ANON + folio_is_file_lru(folio), 506 -folio_nr_pages(folio)); 507 folio_unlock(folio); 508 folio_putback_lru(folio); 509 } 510 511 static void release_pte_pages(pte_t *pte, pte_t *_pte, 512 struct list_head *compound_pagelist) 513 { 514 struct folio *folio, *tmp; 515 516 while (--_pte >= pte) { 517 pte_t pteval = ptep_get(_pte); 518 unsigned long pfn; 519 520 if (pte_none(pteval)) 521 continue; 522 VM_WARN_ON_ONCE(!pte_present(pteval)); 523 pfn = pte_pfn(pteval); 524 if (is_zero_pfn(pfn)) 525 continue; 526 folio = pfn_folio(pfn); 527 if (folio_test_large(folio)) 528 continue; 529 release_pte_folio(folio); 530 } 531 532 list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { 533 list_del(&folio->lru); 534 release_pte_folio(folio); 535 } 536 } 537 538 static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, 539 unsigned long start_addr, pte_t *pte, struct collapse_control *cc, 540 struct list_head *compound_pagelist) 541 { 542 struct page *page = NULL; 543 struct folio *folio = NULL; 544 unsigned long addr = start_addr; 545 pte_t *_pte; 546 int none_or_zero = 0, shared = 0, referenced = 0; 547 enum scan_result result = SCAN_FAIL; 548 549 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; 550 _pte++, addr += PAGE_SIZE) { 551 pte_t pteval = ptep_get(_pte); 552 if (pte_none_or_zero(pteval)) { 553 ++none_or_zero; 554 if (!userfaultfd_armed(vma) && 555 (!cc->is_khugepaged || 556 none_or_zero <= khugepaged_max_ptes_none)) { 557 continue; 558 } else { 559 result = SCAN_EXCEED_NONE_PTE; 560 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 561 goto out; 562 } 563 } 564 if (!pte_present(pteval)) { 565 result = SCAN_PTE_NON_PRESENT; 566 goto out; 567 } 568 if (pte_uffd_wp(pteval)) { 569 result = SCAN_PTE_UFFD_WP; 570 goto out; 571 } 572 page = vm_normal_page(vma, addr, pteval); 573 
if (unlikely(!page) || unlikely(is_zone_device_page(page))) { 574 result = SCAN_PAGE_NULL; 575 goto out; 576 } 577 578 folio = page_folio(page); 579 VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); 580 581 /* 582 * If the vma has the VM_DROPPABLE flag, the collapse will 583 * preserve the lazyfree property without needing to skip. 584 */ 585 if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && 586 folio_test_lazyfree(folio) && !pte_dirty(pteval)) { 587 result = SCAN_PAGE_LAZYFREE; 588 goto out; 589 } 590 591 /* See hpage_collapse_scan_pmd(). */ 592 if (folio_maybe_mapped_shared(folio)) { 593 ++shared; 594 if (cc->is_khugepaged && 595 shared > khugepaged_max_ptes_shared) { 596 result = SCAN_EXCEED_SHARED_PTE; 597 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); 598 goto out; 599 } 600 } 601 602 if (folio_test_large(folio)) { 603 struct folio *f; 604 605 /* 606 * Check if we have dealt with the compound page 607 * already 608 */ 609 list_for_each_entry(f, compound_pagelist, lru) { 610 if (folio == f) 611 goto next; 612 } 613 } 614 615 /* 616 * We can do it before folio_isolate_lru because the 617 * folio can't be freed from under us. NOTE: PG_lock 618 * is needed to serialize against split_huge_page 619 * when invoked from the VM. 620 */ 621 if (!folio_trylock(folio)) { 622 result = SCAN_PAGE_LOCK; 623 goto out; 624 } 625 626 /* 627 * Check if the page has any GUP (or other external) pins. 628 * 629 * The page table that maps the page has been already unlinked 630 * from the page table tree and this process cannot get 631 * an additional pin on the page. 632 * 633 * New pins can come later if the page is shared across fork, 634 * but not from this process. The other process cannot write to 635 * the page, only trigger CoW. 
636 */ 637 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { 638 folio_unlock(folio); 639 result = SCAN_PAGE_COUNT; 640 goto out; 641 } 642 643 /* 644 * Isolate the page to avoid collapsing an hugepage 645 * currently in use by the VM. 646 */ 647 if (!folio_isolate_lru(folio)) { 648 folio_unlock(folio); 649 result = SCAN_DEL_PAGE_LRU; 650 goto out; 651 } 652 node_stat_mod_folio(folio, 653 NR_ISOLATED_ANON + folio_is_file_lru(folio), 654 folio_nr_pages(folio)); 655 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 656 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 657 658 if (folio_test_large(folio)) 659 list_add_tail(&folio->lru, compound_pagelist); 660 next: 661 /* 662 * If collapse was initiated by khugepaged, check that there is 663 * enough young pte to justify collapsing the page 664 */ 665 if (cc->is_khugepaged && 666 (pte_young(pteval) || folio_test_young(folio) || 667 folio_test_referenced(folio) || 668 mmu_notifier_test_young(vma->vm_mm, addr))) 669 referenced++; 670 } 671 672 if (unlikely(cc->is_khugepaged && !referenced)) { 673 result = SCAN_LACK_REFERENCED_PAGE; 674 } else { 675 result = SCAN_SUCCEED; 676 trace_mm_collapse_huge_page_isolate(folio, none_or_zero, 677 referenced, result); 678 return result; 679 } 680 out: 681 release_pte_pages(pte, _pte, compound_pagelist); 682 trace_mm_collapse_huge_page_isolate(folio, none_or_zero, 683 referenced, result); 684 return result; 685 } 686 687 static void __collapse_huge_page_copy_succeeded(pte_t *pte, 688 struct vm_area_struct *vma, 689 unsigned long address, 690 spinlock_t *ptl, 691 struct list_head *compound_pagelist) 692 { 693 unsigned long end = address + HPAGE_PMD_SIZE; 694 struct folio *src, *tmp; 695 pte_t pteval; 696 pte_t *_pte; 697 unsigned int nr_ptes; 698 699 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, 700 address += nr_ptes * PAGE_SIZE) { 701 nr_ptes = 1; 702 pteval = ptep_get(_pte); 703 if (pte_none_or_zero(pteval)) { 704 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 
1); 705 if (pte_none(pteval)) 706 continue; 707 /* 708 * ptl mostly unnecessary. 709 */ 710 spin_lock(ptl); 711 ptep_clear(vma->vm_mm, address, _pte); 712 spin_unlock(ptl); 713 ksm_might_unmap_zero_page(vma->vm_mm, pteval); 714 } else { 715 struct page *src_page = pte_page(pteval); 716 717 src = page_folio(src_page); 718 719 if (folio_test_large(src)) { 720 unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; 721 722 nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); 723 } else { 724 release_pte_folio(src); 725 } 726 727 /* 728 * ptl mostly unnecessary, but preempt has to 729 * be disabled to update the per-cpu stats 730 * inside folio_remove_rmap_pte(). 731 */ 732 spin_lock(ptl); 733 clear_ptes(vma->vm_mm, address, _pte, nr_ptes); 734 folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); 735 spin_unlock(ptl); 736 free_swap_cache(src); 737 folio_put_refs(src, nr_ptes); 738 } 739 } 740 741 list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { 742 list_del(&src->lru); 743 node_stat_sub_folio(src, NR_ISOLATED_ANON + 744 folio_is_file_lru(src)); 745 folio_unlock(src); 746 free_swap_cache(src); 747 folio_putback_lru(src); 748 } 749 } 750 751 static void __collapse_huge_page_copy_failed(pte_t *pte, 752 pmd_t *pmd, 753 pmd_t orig_pmd, 754 struct vm_area_struct *vma, 755 struct list_head *compound_pagelist) 756 { 757 spinlock_t *pmd_ptl; 758 759 /* 760 * Re-establish the PMD to point to the original page table 761 * entry. Restoring PMD needs to be done prior to releasing 762 * pages. Since pages are still isolated and locked here, 763 * acquiring anon_vma_lock_write is unnecessary. 764 */ 765 pmd_ptl = pmd_lock(vma->vm_mm, pmd); 766 pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); 767 spin_unlock(pmd_ptl); 768 /* 769 * Release both raw and compound pages isolated 770 * in __collapse_huge_page_isolate. 
771 */ 772 release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); 773 } 774 775 /* 776 * __collapse_huge_page_copy - attempts to copy memory contents from raw 777 * pages to a hugepage. Cleans up the raw pages if copying succeeds; 778 * otherwise restores the original page table and releases isolated raw pages. 779 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 780 * 781 * @pte: starting of the PTEs to copy from 782 * @folio: the new hugepage to copy contents to 783 * @pmd: pointer to the new hugepage's PMD 784 * @orig_pmd: the original raw pages' PMD 785 * @vma: the original raw pages' virtual memory area 786 * @address: starting address to copy 787 * @ptl: lock on raw pages' PTEs 788 * @compound_pagelist: list that stores compound pages 789 */ 790 static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio, 791 pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, 792 unsigned long address, spinlock_t *ptl, 793 struct list_head *compound_pagelist) 794 { 795 unsigned int i; 796 enum scan_result result = SCAN_SUCCEED; 797 798 /* 799 * Copying pages' contents is subject to memory poison at any iteration. 
800 */ 801 for (i = 0; i < HPAGE_PMD_NR; i++) { 802 pte_t pteval = ptep_get(pte + i); 803 struct page *page = folio_page(folio, i); 804 unsigned long src_addr = address + i * PAGE_SIZE; 805 struct page *src_page; 806 807 if (pte_none_or_zero(pteval)) { 808 clear_user_highpage(page, src_addr); 809 continue; 810 } 811 src_page = pte_page(pteval); 812 if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { 813 result = SCAN_COPY_MC; 814 break; 815 } 816 } 817 818 if (likely(result == SCAN_SUCCEED)) 819 __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, 820 compound_pagelist); 821 else 822 __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, 823 compound_pagelist); 824 825 return result; 826 } 827 828 static void khugepaged_alloc_sleep(void) 829 { 830 DEFINE_WAIT(wait); 831 832 add_wait_queue(&khugepaged_wait, &wait); 833 __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 834 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 835 remove_wait_queue(&khugepaged_wait, &wait); 836 } 837 838 static struct collapse_control khugepaged_collapse_control = { 839 .is_khugepaged = true, 840 }; 841 842 static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) 843 { 844 int i; 845 846 /* 847 * If node_reclaim_mode is disabled, then no extra effort is made to 848 * allocate memory locally. 
849 */ 850 if (!node_reclaim_enabled()) 851 return false; 852 853 /* If there is a count for this node already, it must be acceptable */ 854 if (cc->node_load[nid]) 855 return false; 856 857 for (i = 0; i < MAX_NUMNODES; i++) { 858 if (!cc->node_load[i]) 859 continue; 860 if (node_distance(nid, i) > node_reclaim_distance) 861 return true; 862 } 863 return false; 864 } 865 866 #define khugepaged_defrag() \ 867 (transparent_hugepage_flags & \ 868 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) 869 870 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ 871 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) 872 { 873 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; 874 } 875 876 #ifdef CONFIG_NUMA 877 static int hpage_collapse_find_target_node(struct collapse_control *cc) 878 { 879 int nid, target_node = 0, max_value = 0; 880 881 /* find first node with max normal pages hit */ 882 for (nid = 0; nid < MAX_NUMNODES; nid++) 883 if (cc->node_load[nid] > max_value) { 884 max_value = cc->node_load[nid]; 885 target_node = nid; 886 } 887 888 for_each_online_node(nid) { 889 if (max_value == cc->node_load[nid]) 890 node_set(nid, cc->alloc_nmask); 891 } 892 893 return target_node; 894 } 895 #else 896 static int hpage_collapse_find_target_node(struct collapse_control *cc) 897 { 898 return 0; 899 } 900 #endif 901 902 /* 903 * If mmap_lock temporarily dropped, revalidate vma 904 * before taking mmap_lock. 905 * Returns enum scan_result value. 906 */ 907 908 static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, 909 bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) 910 { 911 struct vm_area_struct *vma; 912 enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : 913 TVA_FORCED_COLLAPSE; 914 915 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) 916 return SCAN_ANY_PROCESS; 917 918 *vmap = vma = find_vma(mm, address); 919 if (!vma) 920 return SCAN_VMA_NULL; 921 922 if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) 923 return SCAN_ADDRESS_RANGE; 924 if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) 925 return SCAN_VMA_CHECK; 926 /* 927 * Anon VMA expected, the address may be unmapped then 928 * remapped to file after khugepaged reaquired the mmap_lock. 929 * 930 * thp_vma_allowable_order may return true for qualified file 931 * vmas. 932 */ 933 if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) 934 return SCAN_PAGE_ANON; 935 return SCAN_SUCCEED; 936 } 937 938 static inline enum scan_result check_pmd_state(pmd_t *pmd) 939 { 940 pmd_t pmde = pmdp_get_lockless(pmd); 941 942 if (pmd_none(pmde)) 943 return SCAN_NO_PTE_TABLE; 944 945 /* 946 * The folio may be under migration when khugepaged is trying to 947 * collapse it. Migration success or failure will eventually end 948 * up with a present PMD mapping a folio again. 
949 */ 950 if (pmd_is_migration_entry(pmde)) 951 return SCAN_PMD_MAPPED; 952 if (!pmd_present(pmde)) 953 return SCAN_NO_PTE_TABLE; 954 if (pmd_trans_huge(pmde)) 955 return SCAN_PMD_MAPPED; 956 if (pmd_bad(pmde)) 957 return SCAN_NO_PTE_TABLE; 958 return SCAN_SUCCEED; 959 } 960 961 static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm, 962 unsigned long address, pmd_t **pmd) 963 { 964 *pmd = mm_find_pmd(mm, address); 965 if (!*pmd) 966 return SCAN_NO_PTE_TABLE; 967 968 return check_pmd_state(*pmd); 969 } 970 971 static enum scan_result check_pmd_still_valid(struct mm_struct *mm, 972 unsigned long address, pmd_t *pmd) 973 { 974 pmd_t *new_pmd; 975 enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd); 976 977 if (result != SCAN_SUCCEED) 978 return result; 979 if (new_pmd != pmd) 980 return SCAN_FAIL; 981 return SCAN_SUCCEED; 982 } 983 984 /* 985 * Bring missing pages in from swap, to complete THP collapse. 986 * Only done if hpage_collapse_scan_pmd believes it is worthwhile. 987 * 988 * Called and returns without pte mapped or spinlocks held. 989 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 990 */ 991 static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, 992 struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, 993 int referenced) 994 { 995 int swapped_in = 0; 996 vm_fault_t ret = 0; 997 unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); 998 enum scan_result result; 999 pte_t *pte = NULL; 1000 spinlock_t *ptl; 1001 1002 for (addr = start_addr; addr < end; addr += PAGE_SIZE) { 1003 struct vm_fault vmf = { 1004 .vma = vma, 1005 .address = addr, 1006 .pgoff = linear_page_index(vma, addr), 1007 .flags = FAULT_FLAG_ALLOW_RETRY, 1008 .pmd = pmd, 1009 }; 1010 1011 if (!pte++) { 1012 /* 1013 * Here the ptl is only used to check pte_same() in 1014 * do_swap_page(), so readonly version is enough. 
1015 */ 1016 pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); 1017 if (!pte) { 1018 mmap_read_unlock(mm); 1019 result = SCAN_NO_PTE_TABLE; 1020 goto out; 1021 } 1022 } 1023 1024 vmf.orig_pte = ptep_get_lockless(pte); 1025 if (pte_none(vmf.orig_pte) || 1026 pte_present(vmf.orig_pte)) 1027 continue; 1028 1029 vmf.pte = pte; 1030 vmf.ptl = ptl; 1031 ret = do_swap_page(&vmf); 1032 /* Which unmaps pte (after perhaps re-checking the entry) */ 1033 pte = NULL; 1034 1035 /* 1036 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. 1037 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because 1038 * we do not retry here and swap entry will remain in pagetable 1039 * resulting in later failure. 1040 */ 1041 if (ret & VM_FAULT_RETRY) { 1042 /* Likely, but not guaranteed, that page lock failed */ 1043 result = SCAN_PAGE_LOCK; 1044 goto out; 1045 } 1046 if (ret & VM_FAULT_ERROR) { 1047 mmap_read_unlock(mm); 1048 result = SCAN_FAIL; 1049 goto out; 1050 } 1051 swapped_in++; 1052 } 1053 1054 if (pte) 1055 pte_unmap(pte); 1056 1057 /* Drain LRU cache to remove extra pin on the swapped in pages */ 1058 if (swapped_in) 1059 lru_add_drain(); 1060 1061 result = SCAN_SUCCEED; 1062 out: 1063 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); 1064 return result; 1065 } 1066 1067 static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, 1068 struct collapse_control *cc) 1069 { 1070 gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : 1071 GFP_TRANSHUGE); 1072 int node = hpage_collapse_find_target_node(cc); 1073 struct folio *folio; 1074 1075 folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); 1076 if (!folio) { 1077 *foliop = NULL; 1078 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1079 return SCAN_ALLOC_HUGE_PAGE_FAIL; 1080 } 1081 1082 count_vm_event(THP_COLLAPSE_ALLOC); 1083 if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { 1084 folio_put(folio); 1085 *foliop = NULL; 1086 return SCAN_CGROUP_CHARGE_FAIL; 1087 } 1088 1089 count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); 1090 1091 *foliop = folio; 1092 return SCAN_SUCCEED; 1093 } 1094 1095 static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address, 1096 int referenced, int unmapped, struct collapse_control *cc) 1097 { 1098 LIST_HEAD(compound_pagelist); 1099 pmd_t *pmd, _pmd; 1100 pte_t *pte; 1101 pgtable_t pgtable; 1102 struct folio *folio; 1103 spinlock_t *pmd_ptl, *pte_ptl; 1104 enum scan_result result = SCAN_FAIL; 1105 struct vm_area_struct *vma; 1106 struct mmu_notifier_range range; 1107 1108 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1109 1110 /* 1111 * Before allocating the hugepage, release the mmap_lock read lock. 1112 * The allocation can take potentially a long time if it involves 1113 * sync compaction, and we do not need to hold the mmap_lock during 1114 * that. We will recheck the vma after taking it again in write mode. 
/*
 * Collapse the PTE-mapped anonymous range starting at @address into a single
 * PMD-mapped huge folio.
 *
 * @mm:         address space being collapsed
 * @address:    start of the range; must be HPAGE_PMD_SIZE aligned
 * @referenced: referenced-PTE count from the scan, forwarded to swap-in
 * @unmapped:   number of non-present PTEs found by the scan; non-zero
 *              triggers a swap-in pass before the collapse
 * @cc:         collapse context
 *
 * Entered with mmap_lock held for read: the first action is to drop it.
 * Every exit path leaves mmap_lock released.
 *
 * Return: SCAN_SUCCEED if the huge PMD was installed, otherwise the
 * scan_result of the step that failed.
 */
static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
		int referenced, int unmapped, struct collapse_control *cc)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct folio *folio;
	spinlock_t *pmd_ptl, *pte_ptl;
	enum scan_result result = SCAN_FAIL;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/*
	 * Before allocating the hugepage, release the mmap_lock read lock.
	 * The allocation can potentially take a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_lock during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);

	/* On failure alloc_charge_folio() sets folio to NULL */
	result = alloc_charge_folio(&folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

	/* Retake the lock for read and revalidate: the vma may have changed */
	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	if (unmapped) {
		/*
		 * __collapse_huge_page_swapin will return with mmap_lock
		 * released when it fails. So we jump to out_nolock directly
		 * in that case. Continuing to collapse causes inconsistency.
		 */
		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
						     referenced);
		if (result != SCAN_SUCCEED)
			goto out_nolock;
	}

	mmap_read_unlock(mm);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 *
	 * UFFDIO_MOVE is prevented from racing as well thanks to the
	 * mmap_lock.
	 */
	mmap_write_lock(mm);
	/* The lock was dropped between read and write mode: check again */
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto out_up_write;
	/* check if the pmd is still valid */
	vma_start_write(vma);
	result = check_pmd_still_valid(mm, address, pmd);
	if (result != SCAN_SUCCEED)
		goto out_up_write;

	anon_vma_lock_write(vma->anon_vma);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
				address + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * This removes any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address to
	 * avoid the risk of CPU bugs in that area.
	 *
	 * Parallel GUP-fast is fine since GUP-fast will back off when
	 * it detects PMD is changed.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(&range);
	tlb_remove_table_sync_one();

	/*
	 * *pmd is now cleared; the PTE table is reached through the saved
	 * copy _pmd instead.
	 */
	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
	if (pte) {
		result = __collapse_huge_page_isolate(vma, address, pte, cc,
						      &compound_pagelist);
		spin_unlock(pte_ptl);
	} else {
		result = SCAN_NO_PTE_TABLE;
	}

	if (unlikely(result != SCAN_SUCCEED)) {
		if (pte)
			pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * point to regular pagetables. Use pmd_populate for that.
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		goto out_up_write;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
					   vma, address, pte_ptl,
					   &compound_pagelist);
	pte_unmap(pte);
	if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

	/*
	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
	 * copy_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	pgtable = pmd_pgtable(_pmd);

	spin_lock(pmd_ptl);
	BUG_ON(!pmd_none(*pmd));
	/* Deposit the old PTE table, then map the new folio with a huge PMD */
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
	spin_unlock(pmd_ptl);

	/* The folio is mapped now; keep the exit path from putting it */
	folio = NULL;

	result = SCAN_SUCCEED;
out_up_write:
	mmap_write_unlock(mm);
out_nolock:
	/* Non-NULL here means the folio was allocated but never installed */
	if (folio)
		folio_put(folio);
	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
	return result;
}
/*
 * Scan the HPAGE_PMD_NR PTEs starting at @start_addr and, if every check
 * passes, hand the range to collapse_huge_page().
 *
 * @mm:          address space being scanned
 * @vma:         VMA covering the range
 * @start_addr:  start of the range; must be HPAGE_PMD_SIZE aligned
 * @mmap_locked: set to false when collapse_huge_page() was called, since it
 *               returns with mmap_lock released; otherwise left untouched
 * @cc:          collapse context; node_load[] and alloc_nmask are reset
 *               here and progress is advanced once per PTE examined (or
 *               once if no PTE table could be scanned)
 *
 * The max_ptes_none/swap/shared limits and the referenced-page requirement
 * are only enforced when cc->is_khugepaged is set.
 *
 * Return: the result of collapse_huge_page() if the scan succeeded,
 * otherwise the scan_result naming the check that failed.
 */
static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long start_addr,
		bool *mmap_locked, struct collapse_control *cc)
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int none_or_zero = 0, shared = 0, referenced = 0;
	enum scan_result result = SCAN_FAIL;
	struct page *page = NULL;
	struct folio *folio = NULL;
	unsigned long addr;
	spinlock_t *ptl;
	int node = NUMA_NO_NODE, unmapped = 0;

	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);

	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
	if (result != SCAN_SUCCEED) {
		cc->progress++;
		goto out;
	}

	/* Start per-node accounting afresh for this PMD range */
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
	if (!pte) {
		cc->progress++;
		result = SCAN_NO_PTE_TABLE;
		goto out;
	}

	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
		cc->progress++;

		pte_t pteval = ptep_get(_pte);
		if (pte_none_or_zero(pteval)) {
			++none_or_zero;
			/*
			 * Tolerated only without userfaultfd armed, and for
			 * khugepaged only up to khugepaged_max_ptes_none.
			 */
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out_unmap;
			}
		}
		if (!pte_present(pteval)) {
			++unmapped;
			if (!cc->is_khugepaged ||
			    unmapped <= khugepaged_max_ptes_swap) {
				/*
				 * Always be strict with uffd-wp
				 * enabled swap entries.  Please see
				 * comment below for pte_uffd_wp().
				 */
				if (pte_swp_uffd_wp_any(pteval)) {
					result = SCAN_PTE_UFFD_WP;
					goto out_unmap;
				}
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				goto out_unmap;
			}
		}
		if (pte_uffd_wp(pteval)) {
			/*
			 * Don't collapse the page if any of the small
			 * PTEs are armed with uffd write protection.
			 * Here we could also mark the new huge pmd as
			 * write protected if any of the small ones is
			 * marked, but that could bring unknown
			 * userfault messages that fall outside of
			 * the registered range.  So, just be simple.
			 */
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}

		page = vm_normal_page(vma, addr, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}
		folio = page_folio(page);

		/*
		 * If the vma has the VM_DROPPABLE flag, the collapse will
		 * preserve the lazyfree property without needing to skip.
		 */
		if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) &&
		    folio_test_lazyfree(folio) && !pte_dirty(pteval)) {
			result = SCAN_PAGE_LAZYFREE;
			goto out_unmap;
		}

		if (!folio_test_anon(folio)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}

		/*
		 * We treat a single page as shared if any part of the THP
		 * is shared.
		 */
		if (folio_maybe_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out_unmap;
			}
		}

		/*
		 * Record which node the original page is from and save this
		 * information to cc->node_load[].
		 * Khugepaged will allocate the hugepage from the node that
		 * has the highest hit count.
		 */
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		cc->node_load[node]++;
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (folio_test_locked(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * Here the check may be racy:
		 * it may see folio_mapcount() > folio_ref_count().
		 * But such a case is ephemeral; we could always retry the
		 * collapse later.  However it may report a false positive if
		 * the page has excessive GUP pins (i.e. 512).  Anyway the same
		 * check will be done again later, so the risk seems low.
		 */
		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}

		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young pte to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) ||
		     mmu_notifier_test_young(vma->vm_mm, addr)))
			referenced++;
	}
	/*
	 * khugepaged needs at least one referenced PTE, and at least half of
	 * them referenced when some PTEs are not present.
	 */
	if (cc->is_khugepaged &&
	    (!referenced ||
	     (unmapped && referenced < HPAGE_PMD_NR / 2))) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
		result = collapse_huge_page(mm, start_addr, referenced,
					    unmapped, cc);
		/* collapse_huge_page will return with the mmap_lock released */
		*mmap_locked = false;
	}
out:
	/* folio is the last folio examined, or NULL if none was reached */
	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
				     none_or_zero, result, unmapped);
	return result;
}
1398 */ 1399 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { 1400 result = SCAN_PAGE_COUNT; 1401 goto out_unmap; 1402 } 1403 1404 /* 1405 * If collapse was initiated by khugepaged, check that there is 1406 * enough young pte to justify collapsing the page 1407 */ 1408 if (cc->is_khugepaged && 1409 (pte_young(pteval) || folio_test_young(folio) || 1410 folio_test_referenced(folio) || 1411 mmu_notifier_test_young(vma->vm_mm, addr))) 1412 referenced++; 1413 } 1414 if (cc->is_khugepaged && 1415 (!referenced || 1416 (unmapped && referenced < HPAGE_PMD_NR / 2))) { 1417 result = SCAN_LACK_REFERENCED_PAGE; 1418 } else { 1419 result = SCAN_SUCCEED; 1420 } 1421 out_unmap: 1422 pte_unmap_unlock(pte, ptl); 1423 if (result == SCAN_SUCCEED) { 1424 result = collapse_huge_page(mm, start_addr, referenced, 1425 unmapped, cc); 1426 /* collapse_huge_page will return with the mmap_lock released */ 1427 *mmap_locked = false; 1428 } 1429 out: 1430 trace_mm_khugepaged_scan_pmd(mm, folio, referenced, 1431 none_or_zero, result, unmapped); 1432 return result; 1433 } 1434 1435 static void collect_mm_slot(struct mm_slot *slot) 1436 { 1437 struct mm_struct *mm = slot->mm; 1438 1439 lockdep_assert_held(&khugepaged_mm_lock); 1440 1441 if (hpage_collapse_test_exit(mm)) { 1442 /* free mm_slot */ 1443 hash_del(&slot->hash); 1444 list_del(&slot->mm_node); 1445 1446 /* 1447 * Not strictly needed because the mm exited already. 
1448 * 1449 * mm_flags_clear(MMF_VM_HUGEPAGE, mm); 1450 */ 1451 1452 /* khugepaged_mm_lock actually not necessary for the below */ 1453 mm_slot_free(mm_slot_cache, slot); 1454 mmdrop(mm); 1455 } 1456 } 1457 1458 /* folio must be locked, and mmap_lock must be held */ 1459 static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, 1460 pmd_t *pmdp, struct folio *folio, struct page *page) 1461 { 1462 struct mm_struct *mm = vma->vm_mm; 1463 struct vm_fault vmf = { 1464 .vma = vma, 1465 .address = addr, 1466 .flags = 0, 1467 }; 1468 pgd_t *pgdp; 1469 p4d_t *p4dp; 1470 pud_t *pudp; 1471 1472 mmap_assert_locked(vma->vm_mm); 1473 1474 if (!pmdp) { 1475 pgdp = pgd_offset(mm, addr); 1476 p4dp = p4d_alloc(mm, pgdp, addr); 1477 if (!p4dp) 1478 return SCAN_FAIL; 1479 pudp = pud_alloc(mm, p4dp, addr); 1480 if (!pudp) 1481 return SCAN_FAIL; 1482 pmdp = pmd_alloc(mm, pudp, addr); 1483 if (!pmdp) 1484 return SCAN_FAIL; 1485 } 1486 1487 vmf.pmd = pmdp; 1488 if (do_set_pmd(&vmf, folio, page)) 1489 return SCAN_FAIL; 1490 1491 folio_get(folio); 1492 return SCAN_SUCCEED; 1493 } 1494 1495 static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, 1496 bool install_pmd) 1497 { 1498 enum scan_result result = SCAN_FAIL; 1499 int nr_mapped_ptes = 0; 1500 unsigned int nr_batch_ptes; 1501 struct mmu_notifier_range range; 1502 bool notified = false; 1503 unsigned long haddr = addr & HPAGE_PMD_MASK; 1504 unsigned long end = haddr + HPAGE_PMD_SIZE; 1505 struct vm_area_struct *vma = vma_lookup(mm, haddr); 1506 struct folio *folio; 1507 pte_t *start_pte, *pte; 1508 pmd_t *pmd, pgt_pmd; 1509 spinlock_t *pml = NULL, *ptl; 1510 int i; 1511 1512 mmap_assert_locked(mm); 1513 1514 /* First check VMA found, in case page tables are being torn down */ 1515 if (!vma || !vma->vm_file || 1516 !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) 1517 return SCAN_VMA_CHECK; 1518 1519 /* Fast check before locking page if already PMD-mapped */ 1520 result 
/*
 * Try to turn a PTE-mapped file THP in @mm into a PMD mapping.
 *
 * @mm:          address space to operate on; mmap_lock must be held
 * @addr:        any address inside the target range (rounded down to the
 *               PMD boundary internally)
 * @install_pmd: if true, install a huge PMD after the PTE table is removed;
 *               if false, only remove the PTE table
 *
 * The folio is looked up and locked in the VMA's page cache, then:
 *   step 1: verify every present PTE maps the matching subpage of the folio;
 *   step 2: clear those PTEs and drop their rmap;
 *   step 3: fix up the folio refcount and the mm counter;
 *   step 4: remove the now-empty PTE table;
 *   step 5: optionally install the huge PMD.
 *
 * Return: SCAN_SUCCEED on success, SCAN_PMD_MAPPED if already PMD-mapped,
 * otherwise the scan_result describing why the collapse was not done.
 */
static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
		bool install_pmd)
{
	enum scan_result result = SCAN_FAIL;
	int nr_mapped_ptes = 0;
	unsigned int nr_batch_ptes;
	struct mmu_notifier_range range;
	bool notified = false;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	unsigned long end = haddr + HPAGE_PMD_SIZE;
	struct vm_area_struct *vma = vma_lookup(mm, haddr);
	struct folio *folio;
	pte_t *start_pte, *pte;
	pmd_t *pmd, pgt_pmd;
	spinlock_t *pml = NULL, *ptl;
	int i;

	mmap_assert_locked(mm);

	/* First check VMA found, in case page tables are being torn down */
	if (!vma || !vma->vm_file ||
	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
		return SCAN_VMA_CHECK;

	/* Fast check before locking page if already PMD-mapped */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	if (result == SCAN_PMD_MAPPED)
		return result;

	/*
	 * If we are here, we've succeeded in replacing all the native pages
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here and force collapse.
	 */
	if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
	if (userfaultfd_wp(vma))
		return SCAN_PTE_UFFD_WP;

	folio = filemap_lock_folio(vma->vm_file->f_mapping,
			       linear_page_index(vma, haddr));
	if (IS_ERR(folio))
		return SCAN_PAGE_NULL;

	/* From here on every exit goes through drop_folio (unlock + put) */
	if (folio_order(folio) != HPAGE_PMD_ORDER) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_folio;
	}

	/* Look the PMD up again now that the folio lock is held */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	case SCAN_NO_PTE_TABLE:
		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
		 */
		goto maybe_install_pmd;
	default:
		goto drop_folio;
	}

	result = SCAN_FAIL;
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto drop_folio;

	/* step 1: check all mapped PTEs are to the right huge page */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* empty pte, skip */
		if (pte_none(ptent))
			continue;

		/* page swapped out, abort */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			page = NULL;
		/*
		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
		 * page table, but the new page will not be a subpage of hpage.
		 */
		if (folio_page(folio, i) != page)
			goto abort;
	}

	pte_unmap_unlock(start_pte, ptl);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	/* Tells the exit paths that range_end must still be issued */
	notified = true;

	/*
	 * pmd_lock covers a wider range than ptl, and (if split from mm's
	 * page_table_lock) ptl nests inside pml. The less time we hold pml,
	 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
	 * inserts a valid as-if-COWed PTE without even looking up page cache.
	 * So page lock of folio does not protect from it, so we must not drop
	 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
	 */
	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
		pml = pmd_lock(mm, pmd);

	/*
	 * NOTE(review): if this mapping fails while pml is held, the unlock
	 * path compares pml against ptl, which may still hold the value from
	 * step 1 - confirm pml cannot be left locked in that case.
	 */
	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;
	if (!pml)
		spin_lock(ptl);
	else if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* The PTE table was mapped without its lock: recheck the pmd */
	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
		goto abort;

	/* step 2: clear page table and adjust rmap */
	for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
	     i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
	     pte += nr_batch_ptes) {
		unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* Default stride; raised below when a batch of PTEs is found */
		nr_batch_ptes = 1;

		if (pte_none(ptent))
			continue;
		/*
		 * We dropped ptl after the first scan, to do the mmu_notifier:
		 * page lock stops more PTEs of the folio being faulted in, but
		 * does not stop write faults COWing anon copies from existing
		 * PTEs; and does not stop those being swapped out or migrated.
		 */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}
		page = vm_normal_page(vma, addr, ptent);

		if (folio_page(folio, i) != page)
			goto abort;

		nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);

		/*
		 * Must clear entry, or a racing truncate may re-remove it.
		 * TLB flush can be left until pmdp_collapse_flush() does it.
		 * PTE dirty? Shmem page is already dirty; file is read-only.
		 */
		clear_ptes(mm, addr, pte, nr_batch_ptes);
		folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
		nr_mapped_ptes += nr_batch_ptes;
	}

	if (!pml)
		spin_unlock(ptl);

	/* step 3: set proper refcount and mm_counters. */
	if (nr_mapped_ptes) {
		folio_ref_sub(folio, nr_mapped_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
	}

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml) {
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
			/*
			 * ptl was dropped above, so the pmd may have changed.
			 * Step 3 already did the accounting, so skip the
			 * abort label and only flush and unlock.
			 */
			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
				flush_tlb_mm(mm);
				goto unlock;
			}
		}
	}
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	pte_unmap_unlock(start_pte, ptl);
	if (ptl != pml)
		spin_unlock(pml);

	mmu_notifier_invalidate_range_end(&range);

	mm_dec_nr_ptes(mm);
	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
	pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
	/* step 5: install pmd entry */
	result = install_pmd
			? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
			: SCAN_SUCCEED;
	goto drop_folio;
abort:
	/* Undo the accounting for PTEs cleared in step 2 before the abort */
	if (nr_mapped_ptes) {
		flush_tlb_mm(mm);
		folio_ref_sub(folio, nr_mapped_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
	}
unlock:
	if (start_pte)
		pte_unmap_unlock(start_pte, ptl);
	if (pml && pml != ptl)
		spin_unlock(pml);
	if (notified)
		mmu_notifier_invalidate_range_end(&range);
drop_folio:
	folio_unlock(folio);
	folio_put(folio);
	return result;
}
1729 */ 1730 void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, 1731 bool install_pmd) 1732 { 1733 try_collapse_pte_mapped_thp(mm, addr, install_pmd); 1734 } 1735 1736 /* Can we retract page tables for this file-backed VMA? */ 1737 static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) 1738 { 1739 /* 1740 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that 1741 * got written to. These VMAs are likely not worth removing 1742 * page tables from, as PMD-mapping is likely to be split later. 1743 */ 1744 if (READ_ONCE(vma->anon_vma)) 1745 return false; 1746 1747 /* 1748 * When a vma is registered with uffd-wp, we cannot recycle 1749 * the page table because there may be pte markers installed. 1750 * Other vmas can still have the same file mapped hugely, but 1751 * skip this one: it will always be mapped in small page size 1752 * for uffd-wp registered ranges. 1753 */ 1754 if (userfaultfd_wp(vma)) 1755 return false; 1756 1757 /* 1758 * If the VMA contains guard regions then we can't collapse it. 1759 * 1760 * This is set atomically on guard marker installation under mmap/VMA 1761 * read lock, and here we may not hold any VMA or mmap lock at all. 1762 * 1763 * This is therefore serialised on the PTE page table lock, which is 1764 * obtained on guard region installation after the flag is set, so this 1765 * check being performed under this lock excludes races. 
1766 */ 1767 if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT)) 1768 return false; 1769 1770 return true; 1771 } 1772 1773 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 1774 { 1775 struct vm_area_struct *vma; 1776 1777 i_mmap_lock_read(mapping); 1778 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1779 struct mmu_notifier_range range; 1780 struct mm_struct *mm; 1781 unsigned long addr; 1782 pmd_t *pmd, pgt_pmd; 1783 spinlock_t *pml; 1784 spinlock_t *ptl; 1785 bool success = false; 1786 1787 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 1788 if (addr & ~HPAGE_PMD_MASK || 1789 vma->vm_end < addr + HPAGE_PMD_SIZE) 1790 continue; 1791 1792 mm = vma->vm_mm; 1793 if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) 1794 continue; 1795 1796 if (hpage_collapse_test_exit(mm)) 1797 continue; 1798 1799 if (!file_backed_vma_is_retractable(vma)) 1800 continue; 1801 1802 /* PTEs were notified when unmapped; but now for the PMD? */ 1803 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1804 addr, addr + HPAGE_PMD_SIZE); 1805 mmu_notifier_invalidate_range_start(&range); 1806 1807 pml = pmd_lock(mm, pmd); 1808 /* 1809 * The lock of new_folio is still held, we will be blocked in 1810 * the page fault path, which prevents the pte entries from 1811 * being set again. So even though the old empty PTE page may be 1812 * concurrently freed and a new PTE page is filled into the pmd 1813 * entry, it is still empty and can be removed. 1814 * 1815 * So here we only need to recheck if the state of pmd entry 1816 * still meets our requirements, rather than checking pmd_same() 1817 * like elsewhere. 
/*
 * Remove the PTE page tables that map the PMD-sized extent of @mapping
 * starting at @pgoff, in every VMA where that is possible, so the range can
 * be re-faulted as huge.
 *
 * @mapping: address space whose mappings are walked (i_mmap is held for
 *           read for the whole walk)
 * @pgoff:   page offset of the start of the extent
 *
 * A VMA is skipped when the extent is not PMD-aligned or not fully inside
 * it, when no PTE table is found, when the mm is exiting, or when
 * file_backed_vma_is_retractable() says no. mmap_lock is not taken here.
 */
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		struct mmu_notifier_range range;
		struct mm_struct *mm;
		unsigned long addr;
		pmd_t *pmd, pgt_pmd;
		spinlock_t *pml;
		spinlock_t *ptl;
		bool success = false;

		/* Virtual address at which this VMA maps @pgoff */
		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (addr & ~HPAGE_PMD_MASK ||
		    vma->vm_end < addr + HPAGE_PMD_SIZE)
			continue;

		mm = vma->vm_mm;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
			continue;

		if (hpage_collapse_test_exit(mm))
			continue;

		/* Unlocked pre-check; repeated under the ptlock below */
		if (!file_backed_vma_is_retractable(vma))
			continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);

		pml = pmd_lock(mm, pmd);
		/*
		 * The lock of new_folio is still held, we will be blocked in
		 * the page fault path, which prevents the pte entries from
		 * being set again. So even though the old empty PTE page may be
		 * concurrently freed and a new PTE page is filled into the pmd
		 * entry, it is still empty and can be removed.
		 *
		 * So here we only need to recheck if the state of pmd entry
		 * still meets our requirements, rather than checking pmd_same()
		 * like elsewhere.
		 */
		if (check_pmd_state(pmd) != SCAN_SUCCEED)
			goto drop_pml;
		ptl = pte_lockptr(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

		/*
		 * Huge page lock is still held, so normally the page table must
		 * remain empty; and we have already skipped anon_vma and
		 * userfaultfd_wp() vmas.  But since the mmap_lock is not held,
		 * it is still possible for a racing userfaultfd_ioctl() or
		 * madvise() to have inserted ptes or markers.  Now that we hold
		 * ptlock, repeating the retractable checks protects us from
		 * races against the prior checks.
		 */
		if (likely(file_backed_vma_is_retractable(vma))) {
			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
			pmdp_get_lockless_sync();
			success = true;
		}

		if (ptl != pml)
			spin_unlock(ptl);
drop_pml:
		spin_unlock(pml);

		mmu_notifier_invalidate_range_end(&range);

		/* Free the detached PTE table after all locks are dropped */
		if (success) {
			mm_dec_nr_ptes(mm);
			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
		}
	}
	i_mmap_unlock_read(mapping);
}
1858 * 1859 * @mm: process address space where collapse happens 1860 * @addr: virtual collapse start address 1861 * @file: file that collapse on 1862 * @start: collapse start address 1863 * @cc: collapse context and scratchpad 1864 * 1865 * Basic scheme is simple, details are more complex: 1866 * - allocate and lock a new huge page; 1867 * - scan page cache, locking old pages 1868 * + swap/gup in pages if necessary; 1869 * - copy data to new page 1870 * - handle shmem holes 1871 * + re-validate that holes weren't filled by someone else 1872 * + check for userfaultfd 1873 * - finalize updates to the page cache; 1874 * - if replacing succeeds: 1875 * + unlock huge page; 1876 * + free old pages; 1877 * - if replacing failed; 1878 * + unlock old pages 1879 * + unlock and free huge page; 1880 */ 1881 static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, 1882 struct file *file, pgoff_t start, struct collapse_control *cc) 1883 { 1884 struct address_space *mapping = file->f_mapping; 1885 struct page *dst; 1886 struct folio *folio, *tmp, *new_folio; 1887 pgoff_t index = 0, end = start + HPAGE_PMD_NR; 1888 LIST_HEAD(pagelist); 1889 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 1890 enum scan_result result = SCAN_SUCCEED; 1891 int nr_none = 0; 1892 bool is_shmem = shmem_file(file); 1893 1894 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); 1895 VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1896 1897 result = alloc_charge_folio(&new_folio, mm, cc); 1898 if (result != SCAN_SUCCEED) 1899 goto out; 1900 1901 mapping_set_update(&xas, mapping); 1902 1903 __folio_set_locked(new_folio); 1904 if (is_shmem) 1905 __folio_set_swapbacked(new_folio); 1906 new_folio->index = start; 1907 new_folio->mapping = mapping; 1908 1909 /* 1910 * Ensure we have slots for all the pages in the range. 
This is 1911 * almost certainly a no-op because most of the pages must be present 1912 */ 1913 do { 1914 xas_lock_irq(&xas); 1915 xas_create_range(&xas); 1916 if (!xas_error(&xas)) 1917 break; 1918 xas_unlock_irq(&xas); 1919 if (!xas_nomem(&xas, GFP_KERNEL)) { 1920 result = SCAN_FAIL; 1921 goto rollback; 1922 } 1923 } while (1); 1924 1925 for (index = start; index < end;) { 1926 xas_set(&xas, index); 1927 folio = xas_load(&xas); 1928 1929 VM_BUG_ON(index != xas.xa_index); 1930 if (is_shmem) { 1931 if (!folio) { 1932 /* 1933 * Stop if extent has been truncated or 1934 * hole-punched, and is now completely 1935 * empty. 1936 */ 1937 if (index == start) { 1938 if (!xas_next_entry(&xas, end - 1)) { 1939 result = SCAN_TRUNCATED; 1940 goto xa_locked; 1941 } 1942 } 1943 nr_none++; 1944 index++; 1945 continue; 1946 } 1947 1948 if (xa_is_value(folio) || !folio_test_uptodate(folio)) { 1949 xas_unlock_irq(&xas); 1950 /* swap in or instantiate fallocated page */ 1951 if (shmem_get_folio(mapping->host, index, 0, 1952 &folio, SGP_NOALLOC)) { 1953 result = SCAN_FAIL; 1954 goto xa_unlocked; 1955 } 1956 /* drain lru cache to help folio_isolate_lru() */ 1957 lru_add_drain(); 1958 } else if (folio_trylock(folio)) { 1959 folio_get(folio); 1960 xas_unlock_irq(&xas); 1961 } else { 1962 result = SCAN_PAGE_LOCK; 1963 goto xa_locked; 1964 } 1965 } else { /* !is_shmem */ 1966 if (!folio || xa_is_value(folio)) { 1967 xas_unlock_irq(&xas); 1968 page_cache_sync_readahead(mapping, &file->f_ra, 1969 file, index, 1970 end - index); 1971 /* drain lru cache to help folio_isolate_lru() */ 1972 lru_add_drain(); 1973 folio = filemap_lock_folio(mapping, index); 1974 if (IS_ERR(folio)) { 1975 result = SCAN_FAIL; 1976 goto xa_unlocked; 1977 } 1978 } else if (folio_test_dirty(folio)) { 1979 /* 1980 * khugepaged only works on read-only fd, 1981 * so this page is dirty because it hasn't 1982 * been flushed since first write. There 1983 * won't be new dirty pages. 
1984 * 1985 * Trigger async flush here and hope the 1986 * writeback is done when khugepaged 1987 * revisits this page. 1988 * 1989 * This is a one-off situation. We are not 1990 * forcing writeback in loop. 1991 */ 1992 xas_unlock_irq(&xas); 1993 filemap_flush(mapping); 1994 result = SCAN_PAGE_DIRTY_OR_WRITEBACK; 1995 goto xa_unlocked; 1996 } else if (folio_test_writeback(folio)) { 1997 xas_unlock_irq(&xas); 1998 result = SCAN_PAGE_DIRTY_OR_WRITEBACK; 1999 goto xa_unlocked; 2000 } else if (folio_trylock(folio)) { 2001 folio_get(folio); 2002 xas_unlock_irq(&xas); 2003 } else { 2004 result = SCAN_PAGE_LOCK; 2005 goto xa_locked; 2006 } 2007 } 2008 2009 /* 2010 * The folio must be locked, so we can drop the i_pages lock 2011 * without racing with truncate. 2012 */ 2013 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 2014 2015 /* make sure the folio is up to date */ 2016 if (unlikely(!folio_test_uptodate(folio))) { 2017 result = SCAN_FAIL; 2018 goto out_unlock; 2019 } 2020 2021 /* 2022 * If file was truncated then extended, or hole-punched, before 2023 * we locked the first folio, then a THP might be there already. 2024 * This will be discovered on the first iteration. 2025 */ 2026 if (folio_order(folio) == HPAGE_PMD_ORDER) { 2027 result = SCAN_PTE_MAPPED_HUGEPAGE; 2028 goto out_unlock; 2029 } 2030 2031 if (folio_mapping(folio) != mapping) { 2032 result = SCAN_TRUNCATED; 2033 goto out_unlock; 2034 } 2035 2036 if (!is_shmem && (folio_test_dirty(folio) || 2037 folio_test_writeback(folio))) { 2038 /* 2039 * khugepaged only works on read-only fd, so this 2040 * folio is dirty because it hasn't been flushed 2041 * since first write. 
2042 */ 2043 result = SCAN_PAGE_DIRTY_OR_WRITEBACK; 2044 goto out_unlock; 2045 } 2046 2047 if (!folio_isolate_lru(folio)) { 2048 result = SCAN_DEL_PAGE_LRU; 2049 goto out_unlock; 2050 } 2051 2052 if (!filemap_release_folio(folio, GFP_KERNEL)) { 2053 result = SCAN_PAGE_HAS_PRIVATE; 2054 folio_putback_lru(folio); 2055 goto out_unlock; 2056 } 2057 2058 if (folio_mapped(folio)) 2059 try_to_unmap(folio, 2060 TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); 2061 2062 xas_lock_irq(&xas); 2063 2064 VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); 2065 2066 /* 2067 * We control 2 + nr_pages references to the folio: 2068 * - we hold a pin on it; 2069 * - nr_pages reference from page cache; 2070 * - one from lru_isolate_folio; 2071 * If those are the only references, then any new usage 2072 * of the folio will have to fetch it from the page 2073 * cache. That requires locking the folio to handle 2074 * truncate, so any new usage will be blocked until we 2075 * unlock folio after collapse/during rollback. 2076 */ 2077 if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { 2078 result = SCAN_PAGE_COUNT; 2079 xas_unlock_irq(&xas); 2080 folio_putback_lru(folio); 2081 goto out_unlock; 2082 } 2083 2084 /* 2085 * Accumulate the folios that are being collapsed. 2086 */ 2087 list_add_tail(&folio->lru, &pagelist); 2088 index += folio_nr_pages(folio); 2089 continue; 2090 out_unlock: 2091 folio_unlock(folio); 2092 folio_put(folio); 2093 goto xa_unlocked; 2094 } 2095 2096 if (!is_shmem) { 2097 filemap_nr_thps_inc(mapping); 2098 /* 2099 * Paired with the fence in do_dentry_open() -> get_write_access() 2100 * to ensure i_writecount is up to date and the update to nr_thps 2101 * is visible. Ensures the page cache will be truncated if the 2102 * file is opened writable. 
2103 */ 2104 smp_mb(); 2105 if (inode_is_open_for_write(mapping->host)) { 2106 result = SCAN_FAIL; 2107 filemap_nr_thps_dec(mapping); 2108 } 2109 } 2110 2111 xa_locked: 2112 xas_unlock_irq(&xas); 2113 xa_unlocked: 2114 2115 /* 2116 * If collapse is successful, flush must be done now before copying. 2117 * If collapse is unsuccessful, does flush actually need to be done? 2118 * Do it anyway, to clear the state. 2119 */ 2120 try_to_unmap_flush(); 2121 2122 if (result == SCAN_SUCCEED && nr_none && 2123 !shmem_charge(mapping->host, nr_none)) 2124 result = SCAN_FAIL; 2125 if (result != SCAN_SUCCEED) { 2126 nr_none = 0; 2127 goto rollback; 2128 } 2129 2130 /* 2131 * The old folios are locked, so they won't change anymore. 2132 */ 2133 index = start; 2134 dst = folio_page(new_folio, 0); 2135 list_for_each_entry(folio, &pagelist, lru) { 2136 int i, nr_pages = folio_nr_pages(folio); 2137 2138 while (index < folio->index) { 2139 clear_highpage(dst); 2140 index++; 2141 dst++; 2142 } 2143 2144 for (i = 0; i < nr_pages; i++) { 2145 if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { 2146 result = SCAN_COPY_MC; 2147 goto rollback; 2148 } 2149 index++; 2150 dst++; 2151 } 2152 } 2153 while (index < end) { 2154 clear_highpage(dst); 2155 index++; 2156 dst++; 2157 } 2158 2159 if (nr_none) { 2160 struct vm_area_struct *vma; 2161 int nr_none_check = 0; 2162 2163 i_mmap_lock_read(mapping); 2164 xas_lock_irq(&xas); 2165 2166 xas_set(&xas, start); 2167 for (index = start; index < end; index++) { 2168 if (!xas_next(&xas)) { 2169 xas_store(&xas, XA_RETRY_ENTRY); 2170 if (xas_error(&xas)) { 2171 result = SCAN_STORE_FAILED; 2172 goto immap_locked; 2173 } 2174 nr_none_check++; 2175 } 2176 } 2177 2178 if (nr_none != nr_none_check) { 2179 result = SCAN_PAGE_FILLED; 2180 goto immap_locked; 2181 } 2182 2183 /* 2184 * If userspace observed a missing page in a VMA with 2185 * a MODE_MISSING userfaultfd, then it might expect a 2186 * UFFD_EVENT_PAGEFAULT for that page. 
If so, we need to 2187 * roll back to avoid suppressing such an event. Since 2188 * wp/minor userfaultfds don't give userspace any 2189 * guarantees that the kernel doesn't fill a missing 2190 * page with a zero page, so they don't matter here. 2191 * 2192 * Any userfaultfds registered after this point will 2193 * not be able to observe any missing pages due to the 2194 * previously inserted retry entries. 2195 */ 2196 vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { 2197 if (userfaultfd_missing(vma)) { 2198 result = SCAN_EXCEED_NONE_PTE; 2199 goto immap_locked; 2200 } 2201 } 2202 2203 immap_locked: 2204 i_mmap_unlock_read(mapping); 2205 if (result != SCAN_SUCCEED) { 2206 xas_set(&xas, start); 2207 for (index = start; index < end; index++) { 2208 if (xas_next(&xas) == XA_RETRY_ENTRY) 2209 xas_store(&xas, NULL); 2210 } 2211 2212 xas_unlock_irq(&xas); 2213 goto rollback; 2214 } 2215 } else { 2216 xas_lock_irq(&xas); 2217 } 2218 2219 if (is_shmem) { 2220 lruvec_stat_mod_folio(new_folio, NR_SHMEM, HPAGE_PMD_NR); 2221 lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); 2222 } else { 2223 lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); 2224 } 2225 lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, HPAGE_PMD_NR); 2226 2227 /* 2228 * Mark new_folio as uptodate before inserting it into the 2229 * page cache so that it isn't mistaken for an fallocated but 2230 * unwritten page. 2231 */ 2232 folio_mark_uptodate(new_folio); 2233 folio_ref_add(new_folio, HPAGE_PMD_NR - 1); 2234 2235 if (is_shmem) 2236 folio_mark_dirty(new_folio); 2237 folio_add_lru(new_folio); 2238 2239 /* Join all the small entries into a single multi-index entry. */ 2240 xas_set_order(&xas, start, HPAGE_PMD_ORDER); 2241 xas_store(&xas, new_folio); 2242 WARN_ON_ONCE(xas_error(&xas)); 2243 xas_unlock_irq(&xas); 2244 2245 /* 2246 * Remove pte page tables, so we can re-fault the page as huge. 2247 * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). 
2248 */ 2249 retract_page_tables(mapping, start); 2250 if (cc && !cc->is_khugepaged) 2251 result = SCAN_PTE_MAPPED_HUGEPAGE; 2252 folio_unlock(new_folio); 2253 2254 /* 2255 * The collapse has succeeded, so free the old folios. 2256 */ 2257 list_for_each_entry_safe(folio, tmp, &pagelist, lru) { 2258 list_del(&folio->lru); 2259 lruvec_stat_mod_folio(folio, NR_FILE_PAGES, 2260 -folio_nr_pages(folio)); 2261 if (is_shmem) 2262 lruvec_stat_mod_folio(folio, NR_SHMEM, 2263 -folio_nr_pages(folio)); 2264 folio->mapping = NULL; 2265 folio_clear_active(folio); 2266 folio_clear_unevictable(folio); 2267 folio_unlock(folio); 2268 folio_put_refs(folio, 2 + folio_nr_pages(folio)); 2269 } 2270 2271 goto out; 2272 2273 rollback: 2274 /* Something went wrong: roll back page cache changes */ 2275 if (nr_none) { 2276 xas_lock_irq(&xas); 2277 mapping->nrpages -= nr_none; 2278 xas_unlock_irq(&xas); 2279 shmem_uncharge(mapping->host, nr_none); 2280 } 2281 2282 list_for_each_entry_safe(folio, tmp, &pagelist, lru) { 2283 list_del(&folio->lru); 2284 folio_unlock(folio); 2285 folio_putback_lru(folio); 2286 folio_put(folio); 2287 } 2288 /* 2289 * Undo the updates of filemap_nr_thps_inc for non-SHMEM 2290 * file only. This undo is not needed unless failure is 2291 * due to SCAN_COPY_MC. 2292 */ 2293 if (!is_shmem && result == SCAN_COPY_MC) { 2294 filemap_nr_thps_dec(mapping); 2295 /* 2296 * Paired with the fence in do_dentry_open() -> get_write_access() 2297 * to ensure the update to nr_thps is visible. 
2298 */ 2299 smp_mb(); 2300 } 2301 2302 new_folio->mapping = NULL; 2303 2304 folio_unlock(new_folio); 2305 folio_put(new_folio); 2306 out: 2307 VM_BUG_ON(!list_empty(&pagelist)); 2308 trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); 2309 return result; 2310 } 2311 2312 static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, 2313 unsigned long addr, struct file *file, pgoff_t start, 2314 struct collapse_control *cc) 2315 { 2316 struct folio *folio = NULL; 2317 struct address_space *mapping = file->f_mapping; 2318 XA_STATE(xas, &mapping->i_pages, start); 2319 int present, swap; 2320 int node = NUMA_NO_NODE; 2321 enum scan_result result = SCAN_SUCCEED; 2322 2323 present = 0; 2324 swap = 0; 2325 memset(cc->node_load, 0, sizeof(cc->node_load)); 2326 nodes_clear(cc->alloc_nmask); 2327 rcu_read_lock(); 2328 xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { 2329 if (xas_retry(&xas, folio)) 2330 continue; 2331 2332 if (xa_is_value(folio)) { 2333 swap += 1 << xas_get_order(&xas); 2334 if (cc->is_khugepaged && 2335 swap > khugepaged_max_ptes_swap) { 2336 result = SCAN_EXCEED_SWAP_PTE; 2337 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); 2338 break; 2339 } 2340 continue; 2341 } 2342 2343 if (!folio_try_get(folio)) { 2344 xas_reset(&xas); 2345 continue; 2346 } 2347 2348 if (unlikely(folio != xas_reload(&xas))) { 2349 folio_put(folio); 2350 xas_reset(&xas); 2351 continue; 2352 } 2353 2354 if (folio_order(folio) == HPAGE_PMD_ORDER) { 2355 result = SCAN_PTE_MAPPED_HUGEPAGE; 2356 /* 2357 * PMD-sized THP implies that we can only try 2358 * retracting the PTE table. 
2359 */ 2360 folio_put(folio); 2361 break; 2362 } 2363 2364 node = folio_nid(folio); 2365 if (hpage_collapse_scan_abort(node, cc)) { 2366 result = SCAN_SCAN_ABORT; 2367 folio_put(folio); 2368 break; 2369 } 2370 cc->node_load[node]++; 2371 2372 if (!folio_test_lru(folio)) { 2373 result = SCAN_PAGE_LRU; 2374 folio_put(folio); 2375 break; 2376 } 2377 2378 if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { 2379 result = SCAN_PAGE_COUNT; 2380 folio_put(folio); 2381 break; 2382 } 2383 2384 /* 2385 * We probably should check if the folio is referenced 2386 * here, but nobody would transfer pte_young() to 2387 * folio_test_referenced() for us. And rmap walk here 2388 * is just too costly... 2389 */ 2390 2391 present += folio_nr_pages(folio); 2392 folio_put(folio); 2393 2394 if (need_resched()) { 2395 xas_pause(&xas); 2396 cond_resched_rcu(); 2397 } 2398 } 2399 rcu_read_unlock(); 2400 if (result == SCAN_PTE_MAPPED_HUGEPAGE) 2401 cc->progress++; 2402 else 2403 cc->progress += HPAGE_PMD_NR; 2404 2405 if (result == SCAN_SUCCEED) { 2406 if (cc->is_khugepaged && 2407 present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { 2408 result = SCAN_EXCEED_NONE_PTE; 2409 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 2410 } else { 2411 result = collapse_file(mm, addr, file, start, cc); 2412 } 2413 } 2414 2415 trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); 2416 return result; 2417 } 2418 2419 static void khugepaged_scan_mm_slot(unsigned int progress_max, 2420 enum scan_result *result, struct collapse_control *cc) 2421 __releases(&khugepaged_mm_lock) 2422 __acquires(&khugepaged_mm_lock) 2423 { 2424 struct vma_iterator vmi; 2425 struct mm_slot *slot; 2426 struct mm_struct *mm; 2427 struct vm_area_struct *vma; 2428 unsigned int progress_prev = cc->progress; 2429 2430 lockdep_assert_held(&khugepaged_mm_lock); 2431 *result = SCAN_FAIL; 2432 2433 if (khugepaged_scan.mm_slot) { 2434 slot = khugepaged_scan.mm_slot; 2435 } else { 2436 slot = 
list_first_entry(&khugepaged_scan.mm_head, 2437 struct mm_slot, mm_node); 2438 khugepaged_scan.address = 0; 2439 khugepaged_scan.mm_slot = slot; 2440 } 2441 spin_unlock(&khugepaged_mm_lock); 2442 2443 mm = slot->mm; 2444 /* 2445 * Don't wait for semaphore (to avoid long wait times). Just move to 2446 * the next mm on the list. 2447 */ 2448 vma = NULL; 2449 if (unlikely(!mmap_read_trylock(mm))) 2450 goto breakouterloop_mmap_lock; 2451 2452 cc->progress++; 2453 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) 2454 goto breakouterloop; 2455 2456 vma_iter_init(&vmi, mm, khugepaged_scan.address); 2457 for_each_vma(vmi, vma) { 2458 unsigned long hstart, hend; 2459 2460 cond_resched(); 2461 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { 2462 cc->progress++; 2463 break; 2464 } 2465 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { 2466 cc->progress++; 2467 continue; 2468 } 2469 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); 2470 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); 2471 if (khugepaged_scan.address > hend) { 2472 cc->progress++; 2473 continue; 2474 } 2475 if (khugepaged_scan.address < hstart) 2476 khugepaged_scan.address = hstart; 2477 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2478 2479 while (khugepaged_scan.address < hend) { 2480 bool mmap_locked = true; 2481 2482 cond_resched(); 2483 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) 2484 goto breakouterloop; 2485 2486 VM_BUG_ON(khugepaged_scan.address < hstart || 2487 khugepaged_scan.address + HPAGE_PMD_SIZE > 2488 hend); 2489 if (!vma_is_anonymous(vma)) { 2490 struct file *file = get_file(vma->vm_file); 2491 pgoff_t pgoff = linear_page_index(vma, 2492 khugepaged_scan.address); 2493 2494 mmap_read_unlock(mm); 2495 mmap_locked = false; 2496 *result = hpage_collapse_scan_file(mm, 2497 khugepaged_scan.address, file, pgoff, cc); 2498 fput(file); 2499 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { 2500 mmap_read_lock(mm); 2501 if 
(hpage_collapse_test_exit_or_disable(mm)) 2502 goto breakouterloop; 2503 *result = try_collapse_pte_mapped_thp(mm, 2504 khugepaged_scan.address, false); 2505 if (*result == SCAN_PMD_MAPPED) 2506 *result = SCAN_SUCCEED; 2507 mmap_read_unlock(mm); 2508 } 2509 } else { 2510 *result = hpage_collapse_scan_pmd(mm, vma, 2511 khugepaged_scan.address, &mmap_locked, cc); 2512 } 2513 2514 if (*result == SCAN_SUCCEED) 2515 ++khugepaged_pages_collapsed; 2516 2517 /* move to next address */ 2518 khugepaged_scan.address += HPAGE_PMD_SIZE; 2519 if (!mmap_locked) 2520 /* 2521 * We released mmap_lock so break loop. Note 2522 * that we drop mmap_lock before all hugepage 2523 * allocations, so if allocation fails, we are 2524 * guaranteed to break here and report the 2525 * correct result back to caller. 2526 */ 2527 goto breakouterloop_mmap_lock; 2528 if (cc->progress >= progress_max) 2529 goto breakouterloop; 2530 } 2531 } 2532 breakouterloop: 2533 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ 2534 breakouterloop_mmap_lock: 2535 2536 spin_lock(&khugepaged_mm_lock); 2537 VM_BUG_ON(khugepaged_scan.mm_slot != slot); 2538 /* 2539 * Release the current mm_slot if this mm is about to die, or 2540 * if we scanned all vmas of this mm, or THP got disabled. 2541 */ 2542 if (hpage_collapse_test_exit_or_disable(mm) || !vma) { 2543 /* 2544 * Make sure that if mm_users is reaching zero while 2545 * khugepaged runs here, khugepaged_exit will find 2546 * mm_slot not pointing to the exiting mm. 
2547 */ 2548 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { 2549 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); 2550 khugepaged_scan.address = 0; 2551 } else { 2552 khugepaged_scan.mm_slot = NULL; 2553 khugepaged_full_scans++; 2554 } 2555 2556 collect_mm_slot(slot); 2557 } 2558 2559 trace_mm_khugepaged_scan(mm, cc->progress - progress_prev, 2560 khugepaged_scan.mm_slot == NULL); 2561 } 2562 2563 static int khugepaged_has_work(void) 2564 { 2565 return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); 2566 } 2567 2568 static int khugepaged_wait_event(void) 2569 { 2570 return !list_empty(&khugepaged_scan.mm_head) || 2571 kthread_should_stop(); 2572 } 2573 2574 static void khugepaged_do_scan(struct collapse_control *cc) 2575 { 2576 const unsigned int progress_max = READ_ONCE(khugepaged_pages_to_scan); 2577 unsigned int pass_through_head = 0; 2578 bool wait = true; 2579 enum scan_result result = SCAN_SUCCEED; 2580 2581 lru_add_drain_all(); 2582 2583 cc->progress = 0; 2584 while (true) { 2585 cond_resched(); 2586 2587 if (unlikely(kthread_should_stop())) 2588 break; 2589 2590 spin_lock(&khugepaged_mm_lock); 2591 if (!khugepaged_scan.mm_slot) 2592 pass_through_head++; 2593 if (khugepaged_has_work() && 2594 pass_through_head < 2) 2595 khugepaged_scan_mm_slot(progress_max, &result, cc); 2596 else 2597 cc->progress = progress_max; 2598 spin_unlock(&khugepaged_mm_lock); 2599 2600 if (cc->progress >= progress_max) 2601 break; 2602 2603 if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { 2604 /* 2605 * If fail to allocate the first time, try to sleep for 2606 * a while. When hit again, cancel the scan. 
2607 */ 2608 if (!wait) 2609 break; 2610 wait = false; 2611 khugepaged_alloc_sleep(); 2612 } 2613 } 2614 } 2615 2616 static bool khugepaged_should_wakeup(void) 2617 { 2618 return kthread_should_stop() || 2619 time_after_eq(jiffies, khugepaged_sleep_expire); 2620 } 2621 2622 static void khugepaged_wait_work(void) 2623 { 2624 if (khugepaged_has_work()) { 2625 const unsigned long scan_sleep_jiffies = 2626 msecs_to_jiffies(khugepaged_scan_sleep_millisecs); 2627 2628 if (!scan_sleep_jiffies) 2629 return; 2630 2631 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; 2632 wait_event_freezable_timeout(khugepaged_wait, 2633 khugepaged_should_wakeup(), 2634 scan_sleep_jiffies); 2635 return; 2636 } 2637 2638 if (hugepage_pmd_enabled()) 2639 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2640 } 2641 2642 static int khugepaged(void *none) 2643 { 2644 struct mm_slot *slot; 2645 2646 set_freezable(); 2647 set_user_nice(current, MAX_NICE); 2648 2649 while (!kthread_should_stop()) { 2650 khugepaged_do_scan(&khugepaged_collapse_control); 2651 khugepaged_wait_work(); 2652 } 2653 2654 spin_lock(&khugepaged_mm_lock); 2655 slot = khugepaged_scan.mm_slot; 2656 khugepaged_scan.mm_slot = NULL; 2657 if (slot) 2658 collect_mm_slot(slot); 2659 spin_unlock(&khugepaged_mm_lock); 2660 return 0; 2661 } 2662 2663 static void set_recommended_min_free_kbytes(void) 2664 { 2665 struct zone *zone; 2666 int nr_zones = 0; 2667 unsigned long recommended_min; 2668 2669 if (!hugepage_pmd_enabled()) { 2670 calculate_min_free_kbytes(); 2671 goto update_wmarks; 2672 } 2673 2674 for_each_populated_zone(zone) { 2675 /* 2676 * We don't need to worry about fragmentation of 2677 * ZONE_MOVABLE since it only has movable pages. 
2678 */ 2679 if (zone_idx(zone) > gfp_zone(GFP_USER)) 2680 continue; 2681 2682 nr_zones++; 2683 } 2684 2685 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ 2686 recommended_min = pageblock_nr_pages * nr_zones * 2; 2687 2688 /* 2689 * Make sure that on average at least two pageblocks are almost free 2690 * of another type, one for a migratetype to fall back to and a 2691 * second to avoid subsequent fallbacks of other types There are 3 2692 * MIGRATE_TYPES we care about. 2693 */ 2694 recommended_min += pageblock_nr_pages * nr_zones * 2695 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; 2696 2697 /* don't ever allow to reserve more than 5% of the lowmem */ 2698 recommended_min = min(recommended_min, 2699 (unsigned long) nr_free_buffer_pages() / 20); 2700 recommended_min <<= (PAGE_SHIFT-10); 2701 2702 if (recommended_min > min_free_kbytes) { 2703 if (user_min_free_kbytes >= 0) 2704 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", 2705 min_free_kbytes, recommended_min); 2706 2707 min_free_kbytes = recommended_min; 2708 } 2709 2710 update_wmarks: 2711 setup_per_zone_wmarks(); 2712 } 2713 2714 int start_stop_khugepaged(void) 2715 { 2716 int err = 0; 2717 2718 mutex_lock(&khugepaged_mutex); 2719 if (hugepage_pmd_enabled()) { 2720 if (!khugepaged_thread) 2721 khugepaged_thread = kthread_run(khugepaged, NULL, 2722 "khugepaged"); 2723 if (IS_ERR(khugepaged_thread)) { 2724 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 2725 err = PTR_ERR(khugepaged_thread); 2726 khugepaged_thread = NULL; 2727 goto fail; 2728 } 2729 2730 if (!list_empty(&khugepaged_scan.mm_head)) 2731 wake_up_interruptible(&khugepaged_wait); 2732 } else if (khugepaged_thread) { 2733 kthread_stop(khugepaged_thread); 2734 khugepaged_thread = NULL; 2735 } 2736 set_recommended_min_free_kbytes(); 2737 fail: 2738 mutex_unlock(&khugepaged_mutex); 2739 return err; 2740 } 2741 2742 void khugepaged_min_free_kbytes_update(void) 2743 { 2744 
mutex_lock(&khugepaged_mutex); 2745 if (hugepage_pmd_enabled() && khugepaged_thread) 2746 set_recommended_min_free_kbytes(); 2747 mutex_unlock(&khugepaged_mutex); 2748 } 2749 2750 bool current_is_khugepaged(void) 2751 { 2752 return kthread_func(current) == khugepaged; 2753 } 2754 2755 static int madvise_collapse_errno(enum scan_result r) 2756 { 2757 /* 2758 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide 2759 * actionable feedback to caller, so they may take an appropriate 2760 * fallback measure depending on the nature of the failure. 2761 */ 2762 switch (r) { 2763 case SCAN_ALLOC_HUGE_PAGE_FAIL: 2764 return -ENOMEM; 2765 case SCAN_CGROUP_CHARGE_FAIL: 2766 case SCAN_EXCEED_NONE_PTE: 2767 return -EBUSY; 2768 /* Resource temporary unavailable - trying again might succeed */ 2769 case SCAN_PAGE_COUNT: 2770 case SCAN_PAGE_LOCK: 2771 case SCAN_PAGE_LRU: 2772 case SCAN_DEL_PAGE_LRU: 2773 case SCAN_PAGE_FILLED: 2774 case SCAN_PAGE_DIRTY_OR_WRITEBACK: 2775 return -EAGAIN; 2776 /* 2777 * Other: Trying again likely not to succeed / error intrinsic to 2778 * specified memory range. khugepaged likely won't be able to collapse 2779 * either. 
2780 */ 2781 default: 2782 return -EINVAL; 2783 } 2784 } 2785 2786 int madvise_collapse(struct vm_area_struct *vma, unsigned long start, 2787 unsigned long end, bool *lock_dropped) 2788 { 2789 struct collapse_control *cc; 2790 struct mm_struct *mm = vma->vm_mm; 2791 unsigned long hstart, hend, addr; 2792 enum scan_result last_fail = SCAN_FAIL; 2793 int thps = 0; 2794 bool mmap_locked = true; 2795 2796 BUG_ON(vma->vm_start > start); 2797 BUG_ON(vma->vm_end < end); 2798 2799 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) 2800 return -EINVAL; 2801 2802 cc = kmalloc_obj(*cc); 2803 if (!cc) 2804 return -ENOMEM; 2805 cc->is_khugepaged = false; 2806 cc->progress = 0; 2807 2808 mmgrab(mm); 2809 lru_add_drain_all(); 2810 2811 hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2812 hend = end & HPAGE_PMD_MASK; 2813 2814 for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { 2815 enum scan_result result = SCAN_FAIL; 2816 bool triggered_wb = false; 2817 2818 retry: 2819 if (!mmap_locked) { 2820 cond_resched(); 2821 mmap_read_lock(mm); 2822 mmap_locked = true; 2823 result = hugepage_vma_revalidate(mm, addr, false, &vma, 2824 cc); 2825 if (result != SCAN_SUCCEED) { 2826 last_fail = result; 2827 goto out_nolock; 2828 } 2829 2830 hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); 2831 } 2832 mmap_assert_locked(mm); 2833 if (!vma_is_anonymous(vma)) { 2834 struct file *file = get_file(vma->vm_file); 2835 pgoff_t pgoff = linear_page_index(vma, addr); 2836 2837 mmap_read_unlock(mm); 2838 mmap_locked = false; 2839 *lock_dropped = true; 2840 result = hpage_collapse_scan_file(mm, addr, file, pgoff, 2841 cc); 2842 2843 if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && 2844 mapping_can_writeback(file->f_mapping)) { 2845 loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; 2846 loff_t lend = lstart + HPAGE_PMD_SIZE - 1; 2847 2848 filemap_write_and_wait_range(file->f_mapping, lstart, lend); 2849 triggered_wb = true; 2850 fput(file); 2851 goto retry; 
2852 } 2853 fput(file); 2854 } else { 2855 result = hpage_collapse_scan_pmd(mm, vma, addr, 2856 &mmap_locked, cc); 2857 } 2858 if (!mmap_locked) 2859 *lock_dropped = true; 2860 2861 handle_result: 2862 switch (result) { 2863 case SCAN_SUCCEED: 2864 case SCAN_PMD_MAPPED: 2865 ++thps; 2866 break; 2867 case SCAN_PTE_MAPPED_HUGEPAGE: 2868 BUG_ON(mmap_locked); 2869 mmap_read_lock(mm); 2870 result = try_collapse_pte_mapped_thp(mm, addr, true); 2871 mmap_read_unlock(mm); 2872 goto handle_result; 2873 /* Whitelisted set of results where continuing OK */ 2874 case SCAN_NO_PTE_TABLE: 2875 case SCAN_PTE_NON_PRESENT: 2876 case SCAN_PTE_UFFD_WP: 2877 case SCAN_LACK_REFERENCED_PAGE: 2878 case SCAN_PAGE_NULL: 2879 case SCAN_PAGE_COUNT: 2880 case SCAN_PAGE_LOCK: 2881 case SCAN_PAGE_COMPOUND: 2882 case SCAN_PAGE_LRU: 2883 case SCAN_DEL_PAGE_LRU: 2884 last_fail = result; 2885 break; 2886 default: 2887 last_fail = result; 2888 /* Other error, exit */ 2889 goto out_maybelock; 2890 } 2891 } 2892 2893 out_maybelock: 2894 /* Caller expects us to hold mmap_lock on return */ 2895 if (!mmap_locked) 2896 mmap_read_lock(mm); 2897 out_nolock: 2898 mmap_assert_locked(mm); 2899 mmdrop(mm); 2900 kfree(cc); 2901 2902 return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 2903 : madvise_collapse_errno(last_fail); 2904 } 2905