1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/mm.h> 5 #include <linux/sched.h> 6 #include <linux/sched/mm.h> 7 #include <linux/mmu_notifier.h> 8 #include <linux/rmap.h> 9 #include <linux/swap.h> 10 #include <linux/mm_inline.h> 11 #include <linux/kthread.h> 12 #include <linux/khugepaged.h> 13 #include <linux/freezer.h> 14 #include <linux/mman.h> 15 #include <linux/hashtable.h> 16 #include <linux/userfaultfd_k.h> 17 #include <linux/page_idle.h> 18 #include <linux/page_table_check.h> 19 #include <linux/rcupdate_wait.h> 20 #include <linux/leafops.h> 21 #include <linux/shmem_fs.h> 22 #include <linux/dax.h> 23 #include <linux/ksm.h> 24 #include <linux/pgalloc.h> 25 #include <linux/backing-dev.h> 26 27 #include <asm/tlb.h> 28 #include "internal.h" 29 #include "mm_slot.h" 30 31 enum scan_result { 32 SCAN_FAIL, 33 SCAN_SUCCEED, 34 SCAN_NO_PTE_TABLE, 35 SCAN_PMD_MAPPED, 36 SCAN_EXCEED_NONE_PTE, 37 SCAN_EXCEED_SWAP_PTE, 38 SCAN_EXCEED_SHARED_PTE, 39 SCAN_PTE_NON_PRESENT, 40 SCAN_PTE_UFFD_WP, 41 SCAN_PTE_MAPPED_HUGEPAGE, 42 SCAN_LACK_REFERENCED_PAGE, 43 SCAN_PAGE_NULL, 44 SCAN_SCAN_ABORT, 45 SCAN_PAGE_COUNT, 46 SCAN_PAGE_LRU, 47 SCAN_PAGE_LOCK, 48 SCAN_PAGE_ANON, 49 SCAN_PAGE_LAZYFREE, 50 SCAN_PAGE_COMPOUND, 51 SCAN_ANY_PROCESS, 52 SCAN_VMA_NULL, 53 SCAN_VMA_CHECK, 54 SCAN_ADDRESS_RANGE, 55 SCAN_DEL_PAGE_LRU, 56 SCAN_ALLOC_HUGE_PAGE_FAIL, 57 SCAN_CGROUP_CHARGE_FAIL, 58 SCAN_TRUNCATED, 59 SCAN_PAGE_HAS_PRIVATE, 60 SCAN_STORE_FAILED, 61 SCAN_COPY_MC, 62 SCAN_PAGE_FILLED, 63 SCAN_PAGE_DIRTY_OR_WRITEBACK, 64 }; 65 66 #define CREATE_TRACE_POINTS 67 #include <trace/events/huge_memory.h> 68 69 static struct task_struct *khugepaged_thread __read_mostly; 70 static DEFINE_MUTEX(khugepaged_mutex); 71 72 /* 73 * default scan 8*HPAGE_PMD_NR ptes, pte_mapped_hugepage, pmd_mapped, 74 * no_pte_table or vmas every 10 second. 
75 */ 76 static unsigned int khugepaged_pages_to_scan __read_mostly; 77 static unsigned int khugepaged_pages_collapsed; 78 static unsigned int khugepaged_full_scans; 79 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 80 /* during fragmentation poll the hugepage allocator once every minute */ 81 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 82 static unsigned long khugepaged_sleep_expire; 83 static DEFINE_SPINLOCK(khugepaged_mm_lock); 84 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 85 /* 86 * default collapse hugepages if there is at least one pte mapped like 87 * it would have happened if the vma was large enough during page 88 * fault. 89 * 90 * Note that these are only respected if collapse was initiated by khugepaged. 91 */ 92 #define KHUGEPAGED_MAX_PTES_LIMIT (HPAGE_PMD_NR - 1) 93 unsigned int khugepaged_max_ptes_none __read_mostly; 94 static unsigned int khugepaged_max_ptes_swap __read_mostly; 95 static unsigned int khugepaged_max_ptes_shared __read_mostly; 96 97 #define MM_SLOTS_HASH_BITS 10 98 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 99 100 static struct kmem_cache *mm_slot_cache __ro_after_init; 101 102 struct collapse_control { 103 bool is_khugepaged; 104 105 /* Num pages scanned per node */ 106 u32 node_load[MAX_NUMNODES]; 107 108 /* Num pages scanned (see khugepaged_pages_to_scan) */ 109 unsigned int progress; 110 111 /* nodemask for allocation fallback */ 112 nodemask_t alloc_nmask; 113 }; 114 115 /** 116 * struct khugepaged_scan - cursor for scanning 117 * @mm_head: the head of the mm list to scan 118 * @mm_slot: the current mm_slot we are scanning 119 * @address: the next address inside that to be scanned 120 * 121 * There is only the one khugepaged_scan instance of this cursor structure. 
122 */ 123 struct khugepaged_scan { 124 struct list_head mm_head; 125 struct mm_slot *mm_slot; 126 unsigned long address; 127 }; 128 129 static struct khugepaged_scan khugepaged_scan = { 130 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 131 }; 132 133 #ifdef CONFIG_SYSFS 134 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 135 struct kobj_attribute *attr, 136 char *buf) 137 { 138 return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); 139 } 140 141 static ssize_t __sleep_millisecs_store(const char *buf, size_t count, 142 unsigned int *millisecs) 143 { 144 unsigned int msecs; 145 int err; 146 147 err = kstrtouint(buf, 10, &msecs); 148 if (err) 149 return -EINVAL; 150 151 *millisecs = msecs; 152 khugepaged_sleep_expire = 0; 153 wake_up_interruptible(&khugepaged_wait); 154 155 return count; 156 } 157 158 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 159 struct kobj_attribute *attr, 160 const char *buf, size_t count) 161 { 162 return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); 163 } 164 static struct kobj_attribute scan_sleep_millisecs_attr = 165 __ATTR_RW(scan_sleep_millisecs); 166 167 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 168 struct kobj_attribute *attr, 169 char *buf) 170 { 171 return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 172 } 173 174 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 175 struct kobj_attribute *attr, 176 const char *buf, size_t count) 177 { 178 return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); 179 } 180 static struct kobj_attribute alloc_sleep_millisecs_attr = 181 __ATTR_RW(alloc_sleep_millisecs); 182 183 static ssize_t pages_to_scan_show(struct kobject *kobj, 184 struct kobj_attribute *attr, 185 char *buf) 186 { 187 return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); 188 } 189 static ssize_t pages_to_scan_store(struct kobject *kobj, 190 struct kobj_attribute *attr, 191 const char 
*buf, size_t count) 192 { 193 unsigned int pages; 194 int err; 195 196 err = kstrtouint(buf, 10, &pages); 197 if (err || !pages) 198 return -EINVAL; 199 200 khugepaged_pages_to_scan = pages; 201 202 return count; 203 } 204 static struct kobj_attribute pages_to_scan_attr = 205 __ATTR_RW(pages_to_scan); 206 207 static ssize_t pages_collapsed_show(struct kobject *kobj, 208 struct kobj_attribute *attr, 209 char *buf) 210 { 211 return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); 212 } 213 static struct kobj_attribute pages_collapsed_attr = 214 __ATTR_RO(pages_collapsed); 215 216 static ssize_t full_scans_show(struct kobject *kobj, 217 struct kobj_attribute *attr, 218 char *buf) 219 { 220 return sysfs_emit(buf, "%u\n", khugepaged_full_scans); 221 } 222 static struct kobj_attribute full_scans_attr = 223 __ATTR_RO(full_scans); 224 225 static ssize_t defrag_show(struct kobject *kobj, 226 struct kobj_attribute *attr, char *buf) 227 { 228 return single_hugepage_flag_show(kobj, attr, buf, 229 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 230 } 231 static ssize_t defrag_store(struct kobject *kobj, 232 struct kobj_attribute *attr, 233 const char *buf, size_t count) 234 { 235 return single_hugepage_flag_store(kobj, attr, buf, count, 236 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 237 } 238 static struct kobj_attribute khugepaged_defrag_attr = 239 __ATTR_RW(defrag); 240 241 /* 242 * max_ptes_none controls if khugepaged should collapse hugepages over 243 * any unmapped ptes in turn potentially increasing the memory 244 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 245 * reduce the available free memory in the system as it 246 * runs. Increasing max_ptes_none will instead potentially reduce the 247 * free memory in the system during the khugepaged scan. 
248 */ 249 static ssize_t max_ptes_none_show(struct kobject *kobj, 250 struct kobj_attribute *attr, 251 char *buf) 252 { 253 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); 254 } 255 static ssize_t max_ptes_none_store(struct kobject *kobj, 256 struct kobj_attribute *attr, 257 const char *buf, size_t count) 258 { 259 int err; 260 unsigned long max_ptes_none; 261 262 err = kstrtoul(buf, 10, &max_ptes_none); 263 if (err || max_ptes_none > KHUGEPAGED_MAX_PTES_LIMIT) 264 return -EINVAL; 265 266 khugepaged_max_ptes_none = max_ptes_none; 267 268 return count; 269 } 270 static struct kobj_attribute khugepaged_max_ptes_none_attr = 271 __ATTR_RW(max_ptes_none); 272 273 static ssize_t max_ptes_swap_show(struct kobject *kobj, 274 struct kobj_attribute *attr, 275 char *buf) 276 { 277 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); 278 } 279 280 static ssize_t max_ptes_swap_store(struct kobject *kobj, 281 struct kobj_attribute *attr, 282 const char *buf, size_t count) 283 { 284 int err; 285 unsigned long max_ptes_swap; 286 287 err = kstrtoul(buf, 10, &max_ptes_swap); 288 if (err || max_ptes_swap > KHUGEPAGED_MAX_PTES_LIMIT) 289 return -EINVAL; 290 291 khugepaged_max_ptes_swap = max_ptes_swap; 292 293 return count; 294 } 295 296 static struct kobj_attribute khugepaged_max_ptes_swap_attr = 297 __ATTR_RW(max_ptes_swap); 298 299 static ssize_t max_ptes_shared_show(struct kobject *kobj, 300 struct kobj_attribute *attr, 301 char *buf) 302 { 303 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); 304 } 305 306 static ssize_t max_ptes_shared_store(struct kobject *kobj, 307 struct kobj_attribute *attr, 308 const char *buf, size_t count) 309 { 310 int err; 311 unsigned long max_ptes_shared; 312 313 err = kstrtoul(buf, 10, &max_ptes_shared); 314 if (err || max_ptes_shared > KHUGEPAGED_MAX_PTES_LIMIT) 315 return -EINVAL; 316 317 khugepaged_max_ptes_shared = max_ptes_shared; 318 319 return count; 320 } 321 322 static struct kobj_attribute 
khugepaged_max_ptes_shared_attr = 323 __ATTR_RW(max_ptes_shared); 324 325 static struct attribute *khugepaged_attr[] = { 326 &khugepaged_defrag_attr.attr, 327 &khugepaged_max_ptes_none_attr.attr, 328 &khugepaged_max_ptes_swap_attr.attr, 329 &khugepaged_max_ptes_shared_attr.attr, 330 &pages_to_scan_attr.attr, 331 &pages_collapsed_attr.attr, 332 &full_scans_attr.attr, 333 &scan_sleep_millisecs_attr.attr, 334 &alloc_sleep_millisecs_attr.attr, 335 NULL, 336 }; 337 338 struct attribute_group khugepaged_attr_group = { 339 .attrs = khugepaged_attr, 340 .name = "khugepaged", 341 }; 342 #endif /* CONFIG_SYSFS */ 343 344 static bool pte_none_or_zero(pte_t pte) 345 { 346 if (pte_none(pte)) 347 return true; 348 return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); 349 } 350 351 int hugepage_madvise(struct vm_area_struct *vma, 352 vm_flags_t *vm_flags, int advice) 353 { 354 switch (advice) { 355 case MADV_HUGEPAGE: 356 *vm_flags &= ~VM_NOHUGEPAGE; 357 *vm_flags |= VM_HUGEPAGE; 358 /* 359 * If the vma become good for khugepaged to scan, 360 * register it here without waiting a page fault that 361 * may not happen any time soon. 362 */ 363 khugepaged_enter_vma(vma, *vm_flags); 364 break; 365 case MADV_NOHUGEPAGE: 366 *vm_flags &= ~VM_HUGEPAGE; 367 *vm_flags |= VM_NOHUGEPAGE; 368 /* 369 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 370 * this vma even if we leave the mm registered in khugepaged if 371 * it got registered before VM_NOHUGEPAGE was set. 
372 */ 373 break; 374 } 375 376 return 0; 377 } 378 379 int __init khugepaged_init(void) 380 { 381 mm_slot_cache = KMEM_CACHE(mm_slot, 0); 382 if (!mm_slot_cache) 383 return -ENOMEM; 384 385 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; 386 khugepaged_max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT; 387 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; 388 khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; 389 390 return 0; 391 } 392 393 void __init khugepaged_destroy(void) 394 { 395 kmem_cache_destroy(mm_slot_cache); 396 } 397 398 static inline int collapse_test_exit(struct mm_struct *mm) 399 { 400 return atomic_read(&mm->mm_users) == 0; 401 } 402 403 static inline int collapse_test_exit_or_disable(struct mm_struct *mm) 404 { 405 return collapse_test_exit(mm) || 406 mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); 407 } 408 409 static bool hugepage_pmd_enabled(void) 410 { 411 /* 412 * We cover the anon, shmem and the file-backed case here; file-backed 413 * hugepages, when configured in, are determined by the global control. 414 * Anon pmd-sized hugepages are determined by the pmd-size control. 415 * Shmem pmd-sized hugepages are also determined by its pmd-size control, 416 * except when the global shmem_huge is set to SHMEM_HUGE_DENY. 
417 */ 418 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 419 hugepage_global_enabled()) 420 return true; 421 if (test_bit(PMD_ORDER, &huge_anon_orders_always)) 422 return true; 423 if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) 424 return true; 425 if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && 426 hugepage_global_enabled()) 427 return true; 428 if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) 429 return true; 430 return false; 431 } 432 433 void __khugepaged_enter(struct mm_struct *mm) 434 { 435 struct mm_slot *slot; 436 int wakeup; 437 438 /* __khugepaged_exit() must not run from under us */ 439 VM_BUG_ON_MM(collapse_test_exit(mm), mm); 440 if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) 441 return; 442 443 slot = mm_slot_alloc(mm_slot_cache); 444 if (!slot) 445 return; 446 447 spin_lock(&khugepaged_mm_lock); 448 mm_slot_insert(mm_slots_hash, mm, slot); 449 /* 450 * Insert just behind the scanning cursor, to let the area settle 451 * down a little. 452 */ 453 wakeup = list_empty(&khugepaged_scan.mm_head); 454 list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); 455 spin_unlock(&khugepaged_mm_lock); 456 457 mmgrab(mm); 458 if (wakeup) 459 wake_up_interruptible(&khugepaged_wait); 460 } 461 462 void khugepaged_enter_vma(struct vm_area_struct *vma, 463 vm_flags_t vm_flags) 464 { 465 if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && 466 hugepage_pmd_enabled()) { 467 if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) 468 __khugepaged_enter(vma->vm_mm); 469 } 470 } 471 472 void __khugepaged_exit(struct mm_struct *mm) 473 { 474 struct mm_slot *slot; 475 int free = 0; 476 477 spin_lock(&khugepaged_mm_lock); 478 slot = mm_slot_lookup(mm_slots_hash, mm); 479 if (slot && khugepaged_scan.mm_slot != slot) { 480 hash_del(&slot->hash); 481 list_del(&slot->mm_node); 482 free = 1; 483 } 484 spin_unlock(&khugepaged_mm_lock); 485 486 if (free) { 487 mm_flags_clear(MMF_VM_HUGEPAGE, mm); 488 mm_slot_free(mm_slot_cache, slot); 
489 mmdrop(mm); 490 } else if (slot) { 491 /* 492 * This is required to serialize against 493 * collapse_test_exit() (which is guaranteed to run 494 * under mmap sem read mode). Stop here (after we return all 495 * pagetables will be destroyed) until khugepaged has finished 496 * working on the pagetables under the mmap_lock. 497 */ 498 mmap_write_lock(mm); 499 mmap_write_unlock(mm); 500 } 501 } 502 503 static void release_pte_folio(struct folio *folio) 504 { 505 node_stat_mod_folio(folio, 506 NR_ISOLATED_ANON + folio_is_file_lru(folio), 507 -folio_nr_pages(folio)); 508 folio_unlock(folio); 509 folio_putback_lru(folio); 510 } 511 512 static void release_pte_pages(pte_t *pte, pte_t *_pte, 513 struct list_head *compound_pagelist) 514 { 515 struct folio *folio, *tmp; 516 517 while (--_pte >= pte) { 518 pte_t pteval = ptep_get(_pte); 519 unsigned long pfn; 520 521 if (pte_none(pteval)) 522 continue; 523 VM_WARN_ON_ONCE(!pte_present(pteval)); 524 pfn = pte_pfn(pteval); 525 if (is_zero_pfn(pfn)) 526 continue; 527 folio = pfn_folio(pfn); 528 if (folio_test_large(folio)) 529 continue; 530 release_pte_folio(folio); 531 } 532 533 list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { 534 list_del(&folio->lru); 535 release_pte_folio(folio); 536 } 537 } 538 539 static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, 540 unsigned long start_addr, pte_t *pte, struct collapse_control *cc, 541 struct list_head *compound_pagelist) 542 { 543 struct page *page = NULL; 544 struct folio *folio = NULL; 545 unsigned long addr = start_addr; 546 pte_t *_pte; 547 int none_or_zero = 0, shared = 0, referenced = 0; 548 enum scan_result result = SCAN_FAIL; 549 550 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; 551 _pte++, addr += PAGE_SIZE) { 552 pte_t pteval = ptep_get(_pte); 553 if (pte_none_or_zero(pteval)) { 554 ++none_or_zero; 555 if (!userfaultfd_armed(vma) && 556 (!cc->is_khugepaged || 557 none_or_zero <= khugepaged_max_ptes_none)) { 558 continue; 559 } else 
{ 560 result = SCAN_EXCEED_NONE_PTE; 561 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 562 goto out; 563 } 564 } 565 if (!pte_present(pteval)) { 566 result = SCAN_PTE_NON_PRESENT; 567 goto out; 568 } 569 if (pte_uffd_wp(pteval)) { 570 result = SCAN_PTE_UFFD_WP; 571 goto out; 572 } 573 page = vm_normal_page(vma, addr, pteval); 574 if (unlikely(!page) || unlikely(is_zone_device_page(page))) { 575 result = SCAN_PAGE_NULL; 576 goto out; 577 } 578 579 folio = page_folio(page); 580 VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); 581 582 /* 583 * If the vma has the VM_DROPPABLE flag, the collapse will 584 * preserve the lazyfree property without needing to skip. 585 */ 586 if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && 587 folio_test_lazyfree(folio) && !pte_dirty(pteval)) { 588 result = SCAN_PAGE_LAZYFREE; 589 goto out; 590 } 591 592 /* See collapse_scan_pmd(). */ 593 if (folio_maybe_mapped_shared(folio)) { 594 ++shared; 595 if (cc->is_khugepaged && 596 shared > khugepaged_max_ptes_shared) { 597 result = SCAN_EXCEED_SHARED_PTE; 598 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); 599 goto out; 600 } 601 } 602 603 if (folio_test_large(folio)) { 604 struct folio *f; 605 606 /* 607 * Check if we have dealt with the compound page 608 * already 609 */ 610 list_for_each_entry(f, compound_pagelist, lru) { 611 if (folio == f) 612 goto next; 613 } 614 } 615 616 /* 617 * We can do it before folio_isolate_lru because the 618 * folio can't be freed from under us. NOTE: PG_lock 619 * is needed to serialize against split_huge_page 620 * when invoked from the VM. 621 */ 622 if (!folio_trylock(folio)) { 623 result = SCAN_PAGE_LOCK; 624 goto out; 625 } 626 627 /* 628 * Check if the page has any GUP (or other external) pins. 629 * 630 * The page table that maps the page has been already unlinked 631 * from the page table tree and this process cannot get 632 * an additional pin on the page. 
633 * 634 * New pins can come later if the page is shared across fork, 635 * but not from this process. The other process cannot write to 636 * the page, only trigger CoW. 637 */ 638 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { 639 folio_unlock(folio); 640 result = SCAN_PAGE_COUNT; 641 goto out; 642 } 643 644 /* 645 * Isolate the page to avoid collapsing an hugepage 646 * currently in use by the VM. 647 */ 648 if (!folio_isolate_lru(folio)) { 649 folio_unlock(folio); 650 result = SCAN_DEL_PAGE_LRU; 651 goto out; 652 } 653 node_stat_mod_folio(folio, 654 NR_ISOLATED_ANON + folio_is_file_lru(folio), 655 folio_nr_pages(folio)); 656 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 657 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 658 659 if (folio_test_large(folio)) 660 list_add_tail(&folio->lru, compound_pagelist); 661 next: 662 /* 663 * If collapse was initiated by khugepaged, check that there is 664 * enough young pte to justify collapsing the page 665 */ 666 if (cc->is_khugepaged && 667 (pte_young(pteval) || folio_test_young(folio) || 668 folio_test_referenced(folio) || 669 mmu_notifier_test_young(vma->vm_mm, addr))) 670 referenced++; 671 } 672 673 if (unlikely(cc->is_khugepaged && !referenced)) { 674 result = SCAN_LACK_REFERENCED_PAGE; 675 } else { 676 result = SCAN_SUCCEED; 677 trace_mm_collapse_huge_page_isolate(folio, none_or_zero, 678 referenced, result); 679 return result; 680 } 681 out: 682 release_pte_pages(pte, _pte, compound_pagelist); 683 trace_mm_collapse_huge_page_isolate(folio, none_or_zero, 684 referenced, result); 685 return result; 686 } 687 688 static void __collapse_huge_page_copy_succeeded(pte_t *pte, 689 struct vm_area_struct *vma, 690 unsigned long address, 691 spinlock_t *ptl, 692 struct list_head *compound_pagelist) 693 { 694 unsigned long end = address + HPAGE_PMD_SIZE; 695 struct folio *src, *tmp; 696 pte_t pteval; 697 pte_t *_pte; 698 unsigned int nr_ptes; 699 700 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += 
nr_ptes, 701 address += nr_ptes * PAGE_SIZE) { 702 nr_ptes = 1; 703 pteval = ptep_get(_pte); 704 if (pte_none_or_zero(pteval)) { 705 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 706 if (pte_none(pteval)) 707 continue; 708 /* 709 * ptl mostly unnecessary. 710 */ 711 spin_lock(ptl); 712 ptep_clear(vma->vm_mm, address, _pte); 713 spin_unlock(ptl); 714 ksm_might_unmap_zero_page(vma->vm_mm, pteval); 715 } else { 716 struct page *src_page = pte_page(pteval); 717 718 src = page_folio(src_page); 719 720 if (folio_test_large(src)) { 721 unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; 722 723 nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); 724 } else { 725 release_pte_folio(src); 726 } 727 728 /* 729 * ptl mostly unnecessary, but preempt has to 730 * be disabled to update the per-cpu stats 731 * inside folio_remove_rmap_pte(). 732 */ 733 spin_lock(ptl); 734 clear_ptes(vma->vm_mm, address, _pte, nr_ptes); 735 folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); 736 spin_unlock(ptl); 737 free_swap_cache(src); 738 folio_put_refs(src, nr_ptes); 739 } 740 } 741 742 list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { 743 list_del(&src->lru); 744 node_stat_sub_folio(src, NR_ISOLATED_ANON + 745 folio_is_file_lru(src)); 746 folio_unlock(src); 747 free_swap_cache(src); 748 folio_putback_lru(src); 749 } 750 } 751 752 static void __collapse_huge_page_copy_failed(pte_t *pte, 753 pmd_t *pmd, 754 pmd_t orig_pmd, 755 struct vm_area_struct *vma, 756 struct list_head *compound_pagelist) 757 { 758 spinlock_t *pmd_ptl; 759 760 /* 761 * Re-establish the PMD to point to the original page table 762 * entry. Restoring PMD needs to be done prior to releasing 763 * pages. Since pages are still isolated and locked here, 764 * acquiring anon_vma_lock_write is unnecessary. 
765 */ 766 pmd_ptl = pmd_lock(vma->vm_mm, pmd); 767 pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); 768 spin_unlock(pmd_ptl); 769 /* 770 * Release both raw and compound pages isolated 771 * in __collapse_huge_page_isolate. 772 */ 773 release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); 774 } 775 776 /* 777 * __collapse_huge_page_copy - attempts to copy memory contents from raw 778 * pages to a hugepage. Cleans up the raw pages if copying succeeds; 779 * otherwise restores the original page table and releases isolated raw pages. 780 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 781 * 782 * @pte: starting of the PTEs to copy from 783 * @folio: the new hugepage to copy contents to 784 * @pmd: pointer to the new hugepage's PMD 785 * @orig_pmd: the original raw pages' PMD 786 * @vma: the original raw pages' virtual memory area 787 * @address: starting address to copy 788 * @ptl: lock on raw pages' PTEs 789 * @compound_pagelist: list that stores compound pages 790 */ 791 static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio, 792 pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, 793 unsigned long address, spinlock_t *ptl, 794 struct list_head *compound_pagelist) 795 { 796 unsigned int i; 797 enum scan_result result = SCAN_SUCCEED; 798 799 /* 800 * Copying pages' contents is subject to memory poison at any iteration. 
801 */ 802 for (i = 0; i < HPAGE_PMD_NR; i++) { 803 pte_t pteval = ptep_get(pte + i); 804 struct page *page = folio_page(folio, i); 805 unsigned long src_addr = address + i * PAGE_SIZE; 806 struct page *src_page; 807 808 if (pte_none_or_zero(pteval)) { 809 clear_user_highpage(page, src_addr); 810 continue; 811 } 812 src_page = pte_page(pteval); 813 if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { 814 result = SCAN_COPY_MC; 815 break; 816 } 817 } 818 819 if (likely(result == SCAN_SUCCEED)) 820 __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, 821 compound_pagelist); 822 else 823 __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, 824 compound_pagelist); 825 826 return result; 827 } 828 829 static void khugepaged_alloc_sleep(void) 830 { 831 DEFINE_WAIT(wait); 832 833 add_wait_queue(&khugepaged_wait, &wait); 834 __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 835 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 836 remove_wait_queue(&khugepaged_wait, &wait); 837 } 838 839 static struct collapse_control khugepaged_collapse_control = { 840 .is_khugepaged = true, 841 }; 842 843 static bool collapse_scan_abort(int nid, struct collapse_control *cc) 844 { 845 int i; 846 847 /* 848 * If node_reclaim_mode is disabled, then no extra effort is made to 849 * allocate memory locally. 
850 */ 851 if (!node_reclaim_enabled()) 852 return false; 853 854 /* If there is a count for this node already, it must be acceptable */ 855 if (cc->node_load[nid]) 856 return false; 857 858 for (i = 0; i < MAX_NUMNODES; i++) { 859 if (!cc->node_load[i]) 860 continue; 861 if (node_distance(nid, i) > node_reclaim_distance) 862 return true; 863 } 864 return false; 865 } 866 867 #define khugepaged_defrag() \ 868 (transparent_hugepage_flags & \ 869 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) 870 871 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ 872 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) 873 { 874 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; 875 } 876 877 #ifdef CONFIG_NUMA 878 static int collapse_find_target_node(struct collapse_control *cc) 879 { 880 int nid, target_node = 0, max_value = 0; 881 882 /* find first node with max normal pages hit */ 883 for (nid = 0; nid < MAX_NUMNODES; nid++) 884 if (cc->node_load[nid] > max_value) { 885 max_value = cc->node_load[nid]; 886 target_node = nid; 887 } 888 889 for_each_online_node(nid) { 890 if (max_value == cc->node_load[nid]) 891 node_set(nid, cc->alloc_nmask); 892 } 893 894 return target_node; 895 } 896 #else 897 static int collapse_find_target_node(struct collapse_control *cc) 898 { 899 return 0; 900 } 901 #endif 902 903 /* 904 * If mmap_lock temporarily dropped, revalidate vma 905 * before taking mmap_lock. 906 * Returns enum scan_result value. 907 */ 908 909 static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, 910 bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) 911 { 912 struct vm_area_struct *vma; 913 enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : 914 TVA_FORCED_COLLAPSE; 915 916 if (unlikely(collapse_test_exit_or_disable(mm))) 917 return SCAN_ANY_PROCESS; 918 919 *vmap = vma = find_vma(mm, address); 920 if (!vma) 921 return SCAN_VMA_NULL; 922 923 if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) 924 return SCAN_ADDRESS_RANGE; 925 if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) 926 return SCAN_VMA_CHECK; 927 /* 928 * Anon VMA expected, the address may be unmapped then 929 * remapped to file after khugepaged reaquired the mmap_lock. 930 * 931 * thp_vma_allowable_order may return true for qualified file 932 * vmas. 933 */ 934 if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) 935 return SCAN_PAGE_ANON; 936 return SCAN_SUCCEED; 937 } 938 939 static inline enum scan_result check_pmd_state(pmd_t *pmd) 940 { 941 pmd_t pmde = pmdp_get_lockless(pmd); 942 943 if (pmd_none(pmde)) 944 return SCAN_NO_PTE_TABLE; 945 946 /* 947 * The folio may be under migration when khugepaged is trying to 948 * collapse it. Migration success or failure will eventually end 949 * up with a present PMD mapping a folio again. 
950 */ 951 if (pmd_is_migration_entry(pmde)) 952 return SCAN_PMD_MAPPED; 953 if (!pmd_present(pmde)) 954 return SCAN_NO_PTE_TABLE; 955 if (pmd_trans_huge(pmde)) 956 return SCAN_PMD_MAPPED; 957 if (pmd_bad(pmde)) 958 return SCAN_NO_PTE_TABLE; 959 return SCAN_SUCCEED; 960 } 961 962 static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm, 963 unsigned long address, pmd_t **pmd) 964 { 965 *pmd = mm_find_pmd(mm, address); 966 if (!*pmd) 967 return SCAN_NO_PTE_TABLE; 968 969 return check_pmd_state(*pmd); 970 } 971 972 static enum scan_result check_pmd_still_valid(struct mm_struct *mm, 973 unsigned long address, pmd_t *pmd) 974 { 975 pmd_t *new_pmd; 976 enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd); 977 978 if (result != SCAN_SUCCEED) 979 return result; 980 if (new_pmd != pmd) 981 return SCAN_FAIL; 982 return SCAN_SUCCEED; 983 } 984 985 /* 986 * Bring missing pages in from swap, to complete THP collapse. 987 * Only done if khugepaged_scan_pmd believes it is worthwhile. 988 * 989 * Called and returns without pte mapped or spinlocks held. 990 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 991 */ 992 static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, 993 struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, 994 int referenced) 995 { 996 int swapped_in = 0; 997 vm_fault_t ret = 0; 998 unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); 999 enum scan_result result; 1000 pte_t *pte = NULL; 1001 spinlock_t *ptl; 1002 1003 for (addr = start_addr; addr < end; addr += PAGE_SIZE) { 1004 struct vm_fault vmf = { 1005 .vma = vma, 1006 .address = addr, 1007 .pgoff = linear_page_index(vma, addr), 1008 .flags = FAULT_FLAG_ALLOW_RETRY, 1009 .pmd = pmd, 1010 }; 1011 1012 if (!pte++) { 1013 /* 1014 * Here the ptl is only used to check pte_same() in 1015 * do_swap_page(), so readonly version is enough. 
 */
			pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
			if (!pte) {
				mmap_read_unlock(mm);
				result = SCAN_NO_PTE_TABLE;
				goto out;
			}
		}

		vmf.orig_pte = ptep_get_lockless(pte);
		if (pte_none(vmf.orig_pte) ||
		    pte_present(vmf.orig_pte))
			continue;

		vmf.pte = pte;
		vmf.ptl = ptl;
		ret = do_swap_page(&vmf);
		/* Which unmaps pte (after perhaps re-checking the entry) */
		pte = NULL;

		/*
		 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
		 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
		 * we do not retry here and swap entry will remain in pagetable
		 * resulting in later failure.
		 */
		if (ret & VM_FAULT_RETRY) {
			/* Likely, but not guaranteed, that page lock failed */
			result = SCAN_PAGE_LOCK;
			goto out;
		}
		if (ret & VM_FAULT_ERROR) {
			mmap_read_unlock(mm);
			result = SCAN_FAIL;
			goto out;
		}
		swapped_in++;
	}

	if (pte)
		pte_unmap(pte);

	/* Drain LRU cache to remove extra pin on the swapped in pages */
	if (swapped_in)
		lru_add_drain();

	result = SCAN_SUCCEED;
out:
	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
	return result;
}

/*
 * Allocate a HPAGE_PMD_ORDER folio for a collapse and charge it against @mm
 * with mem_cgroup_charge().
 *
 * The gfp mask depends on who initiated the collapse: khugepaged uses
 * alloc_hugepage_khugepaged_gfpmask(), everything else GFP_TRANSHUGE. The
 * preferred node comes from collapse_find_target_node() and cc->alloc_nmask
 * is handed to the allocator as the nodemask.
 *
 * On success *foliop holds the new folio and SCAN_SUCCEED is returned.
 * On failure *foliop is set to NULL and the return value is
 * SCAN_ALLOC_HUGE_PAGE_FAIL (allocation failed) or SCAN_CGROUP_CHARGE_FAIL
 * (charge failed; the folio has already been dropped with folio_put()).
 */
static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
		struct collapse_control *cc)
{
	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
		     GFP_TRANSHUGE);
	int node = collapse_find_target_node(cc);
	struct folio *folio;

	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
	if (!folio) {
		*foliop = NULL;
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return SCAN_ALLOC_HUGE_PAGE_FAIL;
	}

	/* The allocation is counted even if the charge below fails. */
	count_vm_event(THP_COLLAPSE_ALLOC);
	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
		folio_put(folio);
		*foliop = NULL;
		return SCAN_CGROUP_CHARGE_FAIL;
	}

	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

	*foliop = folio;
	return SCAN_SUCCEED;
}

/*
 * Collapse the PTE-mapped anonymous range at @address (which must be
 * HPAGE_PMD_SIZE aligned) into a single PMD-mapped folio.
 *
 * @referenced is forwarded to __collapse_huge_page_swapin(); a non-zero
 * @unmapped requests that swap-in pass before the collapse.
 *
 * Entered with mmap_lock held for read; the lock is dropped first thing and
 * every exit path leaves it released. The sequence is: allocate and charge
 * the new folio without mmap_lock, revalidate the VMA and PMD under the read
 * lock (swapping pages in if requested), then retake mmap_lock for write,
 * revalidate again, clear the PMD, isolate and copy the small pages, and
 * finally install the huge PMD.
 *
 * Returns SCAN_SUCCEED on success, otherwise the scan_result of whichever
 * step failed.
 */
static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
		int referenced, int unmapped, struct collapse_control *cc)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct folio *folio;
	spinlock_t *pmd_ptl, *pte_ptl;
	enum scan_result result = SCAN_FAIL;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/*
	 * Before allocating the hugepage, release the mmap_lock read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_lock during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);

	/* On failure alloc_charge_folio() leaves folio == NULL. */
	result = alloc_charge_folio(&folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	if (unmapped) {
		/*
		 * __collapse_huge_page_swapin will return with mmap_lock
		 * released when it fails. So we jump out_nolock directly in
		 * that case.  Continuing to collapse causes inconsistency.
		 */
		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
						     referenced);
		if (result != SCAN_SUCCEED)
			goto out_nolock;
	}

	mmap_read_unlock(mm);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 *
	 * UFFDIO_MOVE is prevented to race as well thanks to the
	 * mmap_lock.
	 */
	mmap_write_lock(mm);
	/* mmap_lock was dropped in between, so the VMA must be looked up again. */
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto out_up_write;
	/* check if the pmd is still valid */
	vma_start_write(vma);
	result = check_pmd_still_valid(mm, address, pmd);
	if (result != SCAN_SUCCEED)
		goto out_up_write;

	anon_vma_lock_write(vma->anon_vma);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
				address + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * This removes any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address to
	 * avoid the risk of CPU bugs in that area.
	 *
	 * Parallel GUP-fast is fine since GUP-fast will back off when
	 * it detects PMD is changed.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(&range);
	tlb_remove_table_sync_one();

	/*
	 * *pmd is now cleared; the PTE table is reached through the saved
	 * copy _pmd from here on.
	 */
	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
	if (pte) {
		result = __collapse_huge_page_isolate(vma, address, pte, cc,
						      &compound_pagelist);
		spin_unlock(pte_ptl);
	} else {
		result = SCAN_NO_PTE_TABLE;
	}

	if (unlikely(result != SCAN_SUCCEED)) {
		/* Isolation failed: put the original page table back. */
		if (pte)
			pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * points to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		goto out_up_write;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
					   vma, address, pte_ptl,
					   &compound_pagelist);
	pte_unmap(pte);
	if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

	/*
	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
	 * copy_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	pgtable = pmd_pgtable(_pmd);

	spin_lock(pmd_ptl);
	BUG_ON(!pmd_none(*pmd));
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
	spin_unlock(pmd_ptl);

	/* The folio is mapped now; clear it so the exit path does not drop it. */
	folio = NULL;

	result = SCAN_SUCCEED;
out_up_write:
	mmap_write_unlock(mm);
out_nolock:
	if (folio)
		folio_put(folio);
	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
	return result;
}

/*
 * Scan the HPAGE_PMD_NR PTEs that cover @start_addr (which must be
 * HPAGE_PMD_SIZE aligned) in @vma and decide whether the range is worth
 * collapsing; if so, hand it to collapse_huge_page().
 *
 * The scan runs under the PTE lock and gives up on the first PTE that rules
 * the collapse out. The khugepaged_max_ptes_{none,swap,shared} limits, the
 * lazyfree check and the "enough referenced pages" check are only enforced
 * when cc->is_khugepaged is set. cc->progress is bumped once per PTE looked
 * at (and once when no PTE table could be scanned at all), and
 * cc->node_load[] records which nodes the scanned folios live on.
 *
 * *lock_dropped is set to true when collapse_huge_page() was called, since
 * that returns with mmap_lock released; it is left untouched otherwise.
 *
 * Returns SCAN_SUCCEED if the range was collapsed, otherwise the reason the
 * scan or the collapse stopped.
 */
static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long start_addr,
		bool *lock_dropped, struct collapse_control *cc)
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int none_or_zero = 0, shared = 0, referenced = 0;
	enum scan_result result = SCAN_FAIL;
	struct page *page = NULL;
	struct folio *folio = NULL;
	unsigned long addr;
	spinlock_t *ptl;
	int node = NUMA_NO_NODE, unmapped = 0;

	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);

	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
	if (result != SCAN_SUCCEED) {
		cc->progress++;
		goto out;
	}

	/* Start the per-node statistics afresh for this PMD range. */
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
	if (!pte) {
		cc->progress++;
		result = SCAN_NO_PTE_TABLE;
		goto out;
	}

	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
		cc->progress++;

		pte_t pteval = ptep_get(_pte);
		if (pte_none_or_zero(pteval)) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out_unmap;
			}
		}
		if (!pte_present(pteval)) {
			++unmapped;
			if (!cc->is_khugepaged ||
			    unmapped <= khugepaged_max_ptes_swap) {
				/*
				 * Always be strict with uffd-wp
				 * enabled swap entries.  Please see
				 * comment below for pte_uffd_wp().
				 */
				if (pte_swp_uffd_wp_any(pteval)) {
					result = SCAN_PTE_UFFD_WP;
					goto out_unmap;
				}
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				goto out_unmap;
			}
		}
		if (pte_uffd_wp(pteval)) {
			/*
			 * Don't collapse the page if any of the small
			 * PTEs are armed with uffd write protection.
			 * Here we can also mark the new huge pmd as
			 * write protected if any of the small ones is
			 * marked but that could bring unknown
			 * userfault messages that falls outside of
			 * the registered range.  So, just be simple.
			 */
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}

		page = vm_normal_page(vma, addr, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}
		folio = page_folio(page);

		/*
		 * If the vma has the VM_DROPPABLE flag, the collapse will
		 * preserve the lazyfree property without needing to skip.
		 */
		if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) &&
		    folio_test_lazyfree(folio) && !pte_dirty(pteval)) {
			result = SCAN_PAGE_LAZYFREE;
			goto out_unmap;
		}

		if (!folio_test_anon(folio)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}

		/*
		 * We treat a single page as shared if any part of the THP
		 * is shared.
		 */
		if (folio_maybe_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out_unmap;
			}
		}

		/*
		 * Record which node the original page is from and save this
		 * information to cc->node_load[].
		 * Khugepaged will allocate hugepage from the node that has
		 * the max hit record.
		 */
		node = folio_nid(folio);
		if (collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		cc->node_load[node]++;
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (folio_test_locked(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * Here the check may be racy:
		 * it may see folio_mapcount() > folio_ref_count().
		 * But such a case is ephemeral; we could always retry
		 * collapse later.  However it may report false positive if
		 * the page has excessive GUP pins (i.e. 512).  Anyway, the
		 * same check will be done again later, so the risk seems low.
		 */
		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}

		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young pte to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) ||
		     mmu_notifier_test_young(vma->vm_mm, addr)))
			referenced++;
	}
	/*
	 * khugepaged wants at least one referenced page, and at least half of
	 * the range referenced when some of it would have to be swapped in.
	 */
	if (cc->is_khugepaged &&
	    (!referenced ||
	     (unmapped && referenced < HPAGE_PMD_NR / 2))) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
		result = collapse_huge_page(mm, start_addr, referenced,
					    unmapped, cc);
		/* collapse_huge_page will return with the mmap_lock released */
		*lock_dropped = true;
	}
out:
	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
				     none_or_zero, result, unmapped);
	return result;
}

/*
 * Free @slot if collapse_test_exit() reports that its mm is exiting: unhash
 * it, take it off the mm list, free it and drop the mm reference with
 * mmdrop(). Does nothing otherwise.
 *
 * Must be called with khugepaged_mm_lock held.
 */
static void collect_mm_slot(struct mm_slot *slot)
{
	struct mm_struct *mm = slot->mm;

	lockdep_assert_held(&khugepaged_mm_lock);

	if (collapse_test_exit(mm)) {
		/* free mm_slot */
		hash_del(&slot->hash);
		list_del(&slot->mm_node);

		/*
		 * Not strictly needed because the mm exited already.
		 *
		 * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
		 */

		/* khugepaged_mm_lock actually not necessary for the below */
		mm_slot_free(mm_slot_cache, slot);
		mmdrop(mm);
	}
}

/*
 * Map @folio at @addr in @vma with a huge PMD via do_set_pmd().
 *
 * @pmdp may be NULL, in which case the p4d/pud/pmd levels for @addr are
 * allocated as needed to obtain the PMD slot. On success an extra reference
 * is taken on @folio and SCAN_SUCCEED is returned; SCAN_FAIL is returned if
 * a page-table level could not be allocated or do_set_pmd() failed.
 *
 * folio must be locked, and mmap_lock must be held
 */
static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmdp, struct folio *folio, struct page *page)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_fault vmf = {
		.vma = vma,
		.address = addr,
		.flags = 0,
	};
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;

	mmap_assert_locked(vma->vm_mm);

	if (!pmdp) {
		pgdp = pgd_offset(mm, addr);
		p4dp = p4d_alloc(mm, pgdp, addr);
		if (!p4dp)
			return SCAN_FAIL;
		pudp = pud_alloc(mm, p4dp, addr);
		if (!pudp)
			return SCAN_FAIL;
		pmdp = pmd_alloc(mm, pudp, addr);
		if (!pmdp)
			return SCAN_FAIL;
	}

	vmf.pmd = pmdp;
	if (do_set_pmd(&vmf, folio, page))
		return SCAN_FAIL;

	folio_get(folio);
	return SCAN_SUCCEED;
}

/*
 * Try to retract the PTE table that maps a file THP at @addr in @mm and,
 * if @install_pmd is set, map the THP with a huge PMD instead.
 *
 * @addr is rounded down to a HPAGE_PMD_SIZE boundary. The page cache folio
 * at that offset is looked up and locked, every present PTE in the range is
 * checked to point at the matching subpage of that folio, the PTEs are
 * cleared (adjusting rmap, refcount and mm counters) and the then-empty page
 * table is removed. If the PMD is already none, the PTE steps are skipped
 * and only the optional huge PMD install is done.
 *
 * mmap_lock must be held (asserted below).
 *
 * Returns SCAN_SUCCEED on success (or the result of set_huge_pmd() when
 * @install_pmd is set), SCAN_PMD_MAPPED if the range is already PMD-mapped,
 * and otherwise the reason the collapse was not possible.
 */
static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
		bool install_pmd)
{
	enum scan_result result = SCAN_FAIL;
	int nr_mapped_ptes = 0;
	unsigned int nr_batch_ptes;
	struct mmu_notifier_range range;
	bool notified = false;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	unsigned long end = haddr + HPAGE_PMD_SIZE;
	struct vm_area_struct *vma = vma_lookup(mm, haddr);
	struct folio *folio;
	pte_t *start_pte, *pte;
	pmd_t *pmd, pgt_pmd;
	spinlock_t *pml = NULL, *ptl;
	int i;

	mmap_assert_locked(mm);

	/* First check VMA found, in case page tables are being torn down */
	if (!vma || !vma->vm_file ||
	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
		return SCAN_VMA_CHECK;

	/* Fast check before locking page if already PMD-mapped */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	if (result == SCAN_PMD_MAPPED)
		return result;

	/*
	 * If we are here, we've succeeded in replacing all the native pages
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here and force collapse.
	 */
	if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
	if (userfaultfd_wp(vma))
		return SCAN_PTE_UFFD_WP;

	folio = filemap_lock_folio(vma->vm_file->f_mapping,
			       linear_page_index(vma, haddr));
	if (IS_ERR(folio))
		return SCAN_PAGE_NULL;

	if (!is_pmd_order(folio_order(folio))) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_folio;
	}

	/* Look the PMD up again now that the folio is locked. */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	case SCAN_NO_PTE_TABLE:
		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
		 */
		goto maybe_install_pmd;
	default:
		goto drop_folio;
	}

	result = SCAN_FAIL;
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto drop_folio;

	/* step 1: check all mapped PTEs are to the right huge page */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* empty pte, skip */
		if (pte_none(ptent))
			continue;

		/* page swapped out, abort */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			page = NULL;
		/*
		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
		 * page table, but the new page will not be a subpage of hpage.
		 */
		if (folio_page(folio, i) != page)
			goto abort;
	}

	pte_unmap_unlock(start_pte, ptl);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	/* From here on the exit paths must end the notifier range. */
	notified = true;

	/*
	 * pmd_lock covers a wider range than ptl, and (if split from mm's
	 * page_table_lock) ptl nests inside pml. The less time we hold pml,
	 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
	 * inserts a valid as-if-COWed PTE without even looking up page cache.
	 * So page lock of folio does not protect from it, so we must not drop
	 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
	 */
	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
		pml = pmd_lock(mm, pmd);

	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;
	if (!pml)
		spin_lock(ptl);
	else if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* The PMD may have changed before ptl was taken; recheck under the lock. */
	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
		goto abort;

	/* step 2: clear page table and adjust rmap */
	for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
	     i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
	     pte += nr_batch_ptes) {
		unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* Default stride; raised below when a batch of PTEs is handled. */
		nr_batch_ptes = 1;

		if (pte_none(ptent))
			continue;
		/*
		 * We dropped ptl after the first scan, to do the mmu_notifier:
		 * page lock stops more PTEs of the folio being faulted in, but
		 * does not stop write faults COWing anon copies from existing
		 * PTEs; and does not stop those being swapped out or migrated.
		 */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}
		page = vm_normal_page(vma, addr, ptent);

		if (folio_page(folio, i) != page)
			goto abort;

		nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);

		/*
		 * Must clear entry, or a racing truncate may re-remove it.
		 * TLB flush can be left until pmdp_collapse_flush() does it.
		 * PTE dirty? Shmem page is already dirty; file is read-only.
		 */
		clear_ptes(mm, addr, pte, nr_batch_ptes);
		folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
		nr_mapped_ptes += nr_batch_ptes;
	}

	if (!pml)
		spin_unlock(ptl);

	/* step 3: set proper refcount and mm_counters. */
	if (nr_mapped_ptes) {
		folio_ref_sub(folio, nr_mapped_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
	}

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml) {
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
			if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
				flush_tlb_mm(mm);
				goto unlock;
			}
		}
	}
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	pte_unmap_unlock(start_pte, ptl);
	if (ptl != pml)
		spin_unlock(pml);

	mmu_notifier_invalidate_range_end(&range);

	mm_dec_nr_ptes(mm);
	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
	pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
	/* step 5: install pmd entry */
	result = install_pmd
			? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
			: SCAN_SUCCEED;
	goto drop_folio;
abort:
	/* Account for any PTEs that step 2 cleared before it bailed out. */
	if (nr_mapped_ptes) {
		flush_tlb_mm(mm);
		folio_ref_sub(folio, nr_mapped_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
	}
unlock:
	if (start_pte)
		pte_unmap_unlock(start_pte, ptl);
	if (pml && pml != ptl)
		spin_unlock(pml);
	if (notified)
		mmu_notifier_invalidate_range_end(&range);
drop_folio:
	folio_unlock(folio);
	folio_put(folio);
	return result;
}

/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address haddr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
1730 */ 1731 void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, 1732 bool install_pmd) 1733 { 1734 try_collapse_pte_mapped_thp(mm, addr, install_pmd); 1735 } 1736 1737 /* Can we retract page tables for this file-backed VMA? */ 1738 static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) 1739 { 1740 /* 1741 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that 1742 * got written to. These VMAs are likely not worth removing 1743 * page tables from, as PMD-mapping is likely to be split later. 1744 */ 1745 if (READ_ONCE(vma->anon_vma)) 1746 return false; 1747 1748 /* 1749 * When a vma is registered with uffd-wp, we cannot recycle 1750 * the page table because there may be pte markers installed. 1751 * Other vmas can still have the same file mapped hugely, but 1752 * skip this one: it will always be mapped in small page size 1753 * for uffd-wp registered ranges. 1754 */ 1755 if (userfaultfd_wp(vma)) 1756 return false; 1757 1758 /* 1759 * If the VMA contains guard regions then we can't collapse it. 1760 * 1761 * This is set atomically on guard marker installation under mmap/VMA 1762 * read lock, and here we may not hold any VMA or mmap lock at all. 1763 * 1764 * This is therefore serialised on the PTE page table lock, which is 1765 * obtained on guard region installation after the flag is set, so this 1766 * check being performed under this lock excludes races. 
1767 */ 1768 if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT)) 1769 return false; 1770 1771 return true; 1772 } 1773 1774 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 1775 { 1776 struct vm_area_struct *vma; 1777 1778 i_mmap_lock_read(mapping); 1779 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1780 struct mmu_notifier_range range; 1781 struct mm_struct *mm; 1782 unsigned long addr; 1783 pmd_t *pmd, pgt_pmd; 1784 spinlock_t *pml; 1785 spinlock_t *ptl; 1786 bool success = false; 1787 1788 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 1789 if (addr & ~HPAGE_PMD_MASK || 1790 vma->vm_end < addr + HPAGE_PMD_SIZE) 1791 continue; 1792 1793 mm = vma->vm_mm; 1794 if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) 1795 continue; 1796 1797 if (collapse_test_exit(mm)) 1798 continue; 1799 1800 if (!file_backed_vma_is_retractable(vma)) 1801 continue; 1802 1803 /* PTEs were notified when unmapped; but now for the PMD? */ 1804 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1805 addr, addr + HPAGE_PMD_SIZE); 1806 mmu_notifier_invalidate_range_start(&range); 1807 1808 pml = pmd_lock(mm, pmd); 1809 /* 1810 * The lock of new_folio is still held, we will be blocked in 1811 * the page fault path, which prevents the pte entries from 1812 * being set again. So even though the old empty PTE page may be 1813 * concurrently freed and a new PTE page is filled into the pmd 1814 * entry, it is still empty and can be removed. 1815 * 1816 * So here we only need to recheck if the state of pmd entry 1817 * still meets our requirements, rather than checking pmd_same() 1818 * like elsewhere. 
1819 */ 1820 if (check_pmd_state(pmd) != SCAN_SUCCEED) 1821 goto drop_pml; 1822 ptl = pte_lockptr(mm, pmd); 1823 if (ptl != pml) 1824 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 1825 1826 /* 1827 * Huge page lock is still held, so normally the page table must 1828 * remain empty; and we have already skipped anon_vma and 1829 * userfaultfd_wp() vmas. But since the mmap_lock is not held, 1830 * it is still possible for a racing userfaultfd_ioctl() or 1831 * madvise() to have inserted ptes or markers. Now that we hold 1832 * ptlock, repeating the retractable checks protects us from 1833 * races against the prior checks. 1834 */ 1835 if (likely(file_backed_vma_is_retractable(vma))) { 1836 pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); 1837 pmdp_get_lockless_sync(); 1838 success = true; 1839 } 1840 1841 if (ptl != pml) 1842 spin_unlock(ptl); 1843 drop_pml: 1844 spin_unlock(pml); 1845 1846 mmu_notifier_invalidate_range_end(&range); 1847 1848 if (success) { 1849 mm_dec_nr_ptes(mm); 1850 page_table_check_pte_clear_range(mm, addr, pgt_pmd); 1851 pte_free_defer(mm, pmd_pgtable(pgt_pmd)); 1852 } 1853 } 1854 i_mmap_unlock_read(mapping); 1855 } 1856 1857 /** 1858 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. 
1859 * 1860 * @mm: process address space where collapse happens 1861 * @addr: virtual collapse start address 1862 * @file: file that collapse on 1863 * @start: collapse start address 1864 * @cc: collapse context and scratchpad 1865 * 1866 * Basic scheme is simple, details are more complex: 1867 * - allocate and lock a new huge page; 1868 * - scan page cache, locking old pages 1869 * + swap/gup in pages if necessary; 1870 * - copy data to new page 1871 * - handle shmem holes 1872 * + re-validate that holes weren't filled by someone else 1873 * + check for userfaultfd 1874 * - finalize updates to the page cache; 1875 * - if replacing succeeds: 1876 * + unlock huge page; 1877 * + free old pages; 1878 * - if replacing failed; 1879 * + unlock old pages 1880 * + unlock and free huge page; 1881 */ 1882 static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, 1883 struct file *file, pgoff_t start, struct collapse_control *cc) 1884 { 1885 struct address_space *mapping = file->f_mapping; 1886 struct page *dst; 1887 struct folio *folio, *tmp, *new_folio; 1888 pgoff_t index = 0, end = start + HPAGE_PMD_NR; 1889 LIST_HEAD(pagelist); 1890 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 1891 enum scan_result result = SCAN_SUCCEED; 1892 int nr_none = 0; 1893 bool is_shmem = shmem_file(file); 1894 1895 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); 1896 VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1897 1898 result = alloc_charge_folio(&new_folio, mm, cc); 1899 if (result != SCAN_SUCCEED) 1900 goto out; 1901 1902 mapping_set_update(&xas, mapping); 1903 1904 __folio_set_locked(new_folio); 1905 if (is_shmem) 1906 __folio_set_swapbacked(new_folio); 1907 new_folio->index = start; 1908 new_folio->mapping = mapping; 1909 1910 /* 1911 * Ensure we have slots for all the pages in the range. 
This is 1912 * almost certainly a no-op because most of the pages must be present 1913 */ 1914 do { 1915 xas_lock_irq(&xas); 1916 xas_create_range(&xas); 1917 if (!xas_error(&xas)) 1918 break; 1919 xas_unlock_irq(&xas); 1920 if (!xas_nomem(&xas, GFP_KERNEL)) { 1921 result = SCAN_FAIL; 1922 goto rollback; 1923 } 1924 } while (1); 1925 1926 for (index = start; index < end;) { 1927 xas_set(&xas, index); 1928 folio = xas_load(&xas); 1929 1930 VM_BUG_ON(index != xas.xa_index); 1931 if (is_shmem) { 1932 if (!folio) { 1933 /* 1934 * Stop if extent has been truncated or 1935 * hole-punched, and is now completely 1936 * empty. 1937 */ 1938 if (index == start) { 1939 if (!xas_next_entry(&xas, end - 1)) { 1940 result = SCAN_TRUNCATED; 1941 goto xa_locked; 1942 } 1943 } 1944 nr_none++; 1945 index++; 1946 continue; 1947 } 1948 1949 if (xa_is_value(folio) || !folio_test_uptodate(folio)) { 1950 xas_unlock_irq(&xas); 1951 /* swap in or instantiate fallocated page */ 1952 if (shmem_get_folio(mapping->host, index, 0, 1953 &folio, SGP_NOALLOC)) { 1954 result = SCAN_FAIL; 1955 goto xa_unlocked; 1956 } 1957 /* drain lru cache to help folio_isolate_lru() */ 1958 lru_add_drain(); 1959 } else if (folio_trylock(folio)) { 1960 folio_get(folio); 1961 xas_unlock_irq(&xas); 1962 } else { 1963 result = SCAN_PAGE_LOCK; 1964 goto xa_locked; 1965 } 1966 } else { /* !is_shmem */ 1967 if (!folio || xa_is_value(folio)) { 1968 xas_unlock_irq(&xas); 1969 page_cache_sync_readahead(mapping, &file->f_ra, 1970 file, index, 1971 end - index); 1972 /* drain lru cache to help folio_isolate_lru() */ 1973 lru_add_drain(); 1974 folio = filemap_lock_folio(mapping, index); 1975 if (IS_ERR(folio)) { 1976 result = SCAN_FAIL; 1977 goto xa_unlocked; 1978 } 1979 } else if (folio_test_dirty(folio)) { 1980 /* 1981 * khugepaged only works on read-only fd, 1982 * so this page is dirty because it hasn't 1983 * been flushed since first write. There 1984 * won't be new dirty pages. 
1985 * 1986 * Trigger async flush here and hope the 1987 * writeback is done when khugepaged 1988 * revisits this page. 1989 * 1990 * This is a one-off situation. We are not 1991 * forcing writeback in loop. 1992 */ 1993 xas_unlock_irq(&xas); 1994 filemap_flush(mapping); 1995 result = SCAN_PAGE_DIRTY_OR_WRITEBACK; 1996 goto xa_unlocked; 1997 } else if (folio_test_writeback(folio)) { 1998 xas_unlock_irq(&xas); 1999 result = SCAN_PAGE_DIRTY_OR_WRITEBACK; 2000 goto xa_unlocked; 2001 } else if (folio_trylock(folio)) { 2002 folio_get(folio); 2003 xas_unlock_irq(&xas); 2004 } else { 2005 result = SCAN_PAGE_LOCK; 2006 goto xa_locked; 2007 } 2008 } 2009 2010 /* 2011 * The folio must be locked, so we can drop the i_pages lock 2012 * without racing with truncate. 2013 */ 2014 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 2015 2016 /* make sure the folio is up to date */ 2017 if (unlikely(!folio_test_uptodate(folio))) { 2018 result = SCAN_FAIL; 2019 goto out_unlock; 2020 } 2021 2022 /* 2023 * If file was truncated then extended, or hole-punched, before 2024 * we locked the first folio, then a THP might be there already. 2025 * This will be discovered on the first iteration. 2026 */ 2027 if (is_pmd_order(folio_order(folio))) { 2028 result = SCAN_PTE_MAPPED_HUGEPAGE; 2029 goto out_unlock; 2030 } 2031 2032 if (folio_mapping(folio) != mapping) { 2033 result = SCAN_TRUNCATED; 2034 goto out_unlock; 2035 } 2036 2037 if (!is_shmem && (folio_test_dirty(folio) || 2038 folio_test_writeback(folio))) { 2039 /* 2040 * khugepaged only works on read-only fd, so this 2041 * folio is dirty because it hasn't been flushed 2042 * since first write. 
2043 */ 2044 result = SCAN_PAGE_DIRTY_OR_WRITEBACK; 2045 goto out_unlock; 2046 } 2047 2048 if (!folio_isolate_lru(folio)) { 2049 result = SCAN_DEL_PAGE_LRU; 2050 goto out_unlock; 2051 } 2052 2053 if (!filemap_release_folio(folio, GFP_KERNEL)) { 2054 result = SCAN_PAGE_HAS_PRIVATE; 2055 folio_putback_lru(folio); 2056 goto out_unlock; 2057 } 2058 2059 if (folio_mapped(folio)) 2060 try_to_unmap(folio, 2061 TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); 2062 2063 xas_lock_irq(&xas); 2064 2065 VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); 2066 2067 /* 2068 * We control 2 + nr_pages references to the folio: 2069 * - we hold a pin on it; 2070 * - nr_pages reference from page cache; 2071 * - one from lru_isolate_folio; 2072 * If those are the only references, then any new usage 2073 * of the folio will have to fetch it from the page 2074 * cache. That requires locking the folio to handle 2075 * truncate, so any new usage will be blocked until we 2076 * unlock folio after collapse/during rollback. 2077 */ 2078 if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { 2079 result = SCAN_PAGE_COUNT; 2080 xas_unlock_irq(&xas); 2081 folio_putback_lru(folio); 2082 goto out_unlock; 2083 } 2084 2085 /* 2086 * Accumulate the folios that are being collapsed. 2087 */ 2088 list_add_tail(&folio->lru, &pagelist); 2089 index += folio_nr_pages(folio); 2090 continue; 2091 out_unlock: 2092 folio_unlock(folio); 2093 folio_put(folio); 2094 goto xa_unlocked; 2095 } 2096 2097 if (!is_shmem) { 2098 filemap_nr_thps_inc(mapping); 2099 /* 2100 * Paired with the fence in do_dentry_open() -> get_write_access() 2101 * to ensure i_writecount is up to date and the update to nr_thps 2102 * is visible. Ensures the page cache will be truncated if the 2103 * file is opened writable. 
2104 */ 2105 smp_mb(); 2106 if (inode_is_open_for_write(mapping->host)) { 2107 result = SCAN_FAIL; 2108 filemap_nr_thps_dec(mapping); 2109 } 2110 } 2111 2112 xa_locked: 2113 xas_unlock_irq(&xas); 2114 xa_unlocked: 2115 2116 /* 2117 * If collapse is successful, flush must be done now before copying. 2118 * If collapse is unsuccessful, does flush actually need to be done? 2119 * Do it anyway, to clear the state. 2120 */ 2121 try_to_unmap_flush(); 2122 2123 if (result == SCAN_SUCCEED && nr_none && 2124 !shmem_charge(mapping->host, nr_none)) 2125 result = SCAN_FAIL; 2126 if (result != SCAN_SUCCEED) { 2127 nr_none = 0; 2128 goto rollback; 2129 } 2130 2131 /* 2132 * The old folios are locked, so they won't change anymore. 2133 */ 2134 index = start; 2135 dst = folio_page(new_folio, 0); 2136 list_for_each_entry(folio, &pagelist, lru) { 2137 int i, nr_pages = folio_nr_pages(folio); 2138 2139 while (index < folio->index) { 2140 clear_highpage(dst); 2141 index++; 2142 dst++; 2143 } 2144 2145 for (i = 0; i < nr_pages; i++) { 2146 if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { 2147 result = SCAN_COPY_MC; 2148 goto rollback; 2149 } 2150 index++; 2151 dst++; 2152 } 2153 } 2154 while (index < end) { 2155 clear_highpage(dst); 2156 index++; 2157 dst++; 2158 } 2159 2160 if (nr_none) { 2161 struct vm_area_struct *vma; 2162 int nr_none_check = 0; 2163 2164 i_mmap_lock_read(mapping); 2165 xas_lock_irq(&xas); 2166 2167 xas_set(&xas, start); 2168 for (index = start; index < end; index++) { 2169 if (!xas_next(&xas)) { 2170 xas_store(&xas, XA_RETRY_ENTRY); 2171 if (xas_error(&xas)) { 2172 result = SCAN_STORE_FAILED; 2173 goto immap_locked; 2174 } 2175 nr_none_check++; 2176 } 2177 } 2178 2179 if (nr_none != nr_none_check) { 2180 result = SCAN_PAGE_FILLED; 2181 goto immap_locked; 2182 } 2183 2184 /* 2185 * If userspace observed a missing page in a VMA with 2186 * a MODE_MISSING userfaultfd, then it might expect a 2187 * UFFD_EVENT_PAGEFAULT for that page. 
If so, we need to 2188 * roll back to avoid suppressing such an event. Since 2189 * wp/minor userfaultfds don't give userspace any 2190 * guarantees that the kernel doesn't fill a missing 2191 * page with a zero page, so they don't matter here. 2192 * 2193 * Any userfaultfds registered after this point will 2194 * not be able to observe any missing pages due to the 2195 * previously inserted retry entries. 2196 */ 2197 vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { 2198 if (userfaultfd_missing(vma)) { 2199 result = SCAN_EXCEED_NONE_PTE; 2200 goto immap_locked; 2201 } 2202 } 2203 2204 immap_locked: 2205 i_mmap_unlock_read(mapping); 2206 if (result != SCAN_SUCCEED) { 2207 xas_set(&xas, start); 2208 for (index = start; index < end; index++) { 2209 if (xas_next(&xas) == XA_RETRY_ENTRY) 2210 xas_store(&xas, NULL); 2211 } 2212 2213 xas_unlock_irq(&xas); 2214 goto rollback; 2215 } 2216 } else { 2217 xas_lock_irq(&xas); 2218 } 2219 2220 if (is_shmem) { 2221 lruvec_stat_mod_folio(new_folio, NR_SHMEM, HPAGE_PMD_NR); 2222 lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); 2223 } else { 2224 lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); 2225 } 2226 lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, HPAGE_PMD_NR); 2227 2228 /* 2229 * Mark new_folio as uptodate before inserting it into the 2230 * page cache so that it isn't mistaken for an fallocated but 2231 * unwritten page. 2232 */ 2233 folio_mark_uptodate(new_folio); 2234 folio_ref_add(new_folio, HPAGE_PMD_NR - 1); 2235 2236 if (is_shmem) 2237 folio_mark_dirty(new_folio); 2238 folio_add_lru(new_folio); 2239 2240 /* Join all the small entries into a single multi-index entry. */ 2241 xas_set_order(&xas, start, HPAGE_PMD_ORDER); 2242 xas_store(&xas, new_folio); 2243 WARN_ON_ONCE(xas_error(&xas)); 2244 xas_unlock_irq(&xas); 2245 2246 /* 2247 * Remove pte page tables, so we can re-fault the page as huge. 2248 * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). 
2249 */ 2250 retract_page_tables(mapping, start); 2251 if (cc && !cc->is_khugepaged) 2252 result = SCAN_PTE_MAPPED_HUGEPAGE; 2253 folio_unlock(new_folio); 2254 2255 /* 2256 * The collapse has succeeded, so free the old folios. 2257 */ 2258 list_for_each_entry_safe(folio, tmp, &pagelist, lru) { 2259 list_del(&folio->lru); 2260 lruvec_stat_mod_folio(folio, NR_FILE_PAGES, 2261 -folio_nr_pages(folio)); 2262 if (is_shmem) 2263 lruvec_stat_mod_folio(folio, NR_SHMEM, 2264 -folio_nr_pages(folio)); 2265 folio->mapping = NULL; 2266 folio_clear_active(folio); 2267 folio_clear_unevictable(folio); 2268 folio_unlock(folio); 2269 folio_put_refs(folio, 2 + folio_nr_pages(folio)); 2270 } 2271 2272 goto out; 2273 2274 rollback: 2275 /* Something went wrong: roll back page cache changes */ 2276 if (nr_none) { 2277 xas_lock_irq(&xas); 2278 mapping->nrpages -= nr_none; 2279 xas_unlock_irq(&xas); 2280 shmem_uncharge(mapping->host, nr_none); 2281 } 2282 2283 list_for_each_entry_safe(folio, tmp, &pagelist, lru) { 2284 list_del(&folio->lru); 2285 folio_unlock(folio); 2286 folio_putback_lru(folio); 2287 folio_put(folio); 2288 } 2289 /* 2290 * Undo the updates of filemap_nr_thps_inc for non-SHMEM 2291 * file only. This undo is not needed unless failure is 2292 * due to SCAN_COPY_MC. 2293 */ 2294 if (!is_shmem && result == SCAN_COPY_MC) { 2295 filemap_nr_thps_dec(mapping); 2296 /* 2297 * Paired with the fence in do_dentry_open() -> get_write_access() 2298 * to ensure the update to nr_thps is visible. 
2299 */ 2300 smp_mb(); 2301 } 2302 2303 new_folio->mapping = NULL; 2304 2305 folio_unlock(new_folio); 2306 folio_put(new_folio); 2307 out: 2308 VM_BUG_ON(!list_empty(&pagelist)); 2309 trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); 2310 return result; 2311 } 2312 2313 static enum scan_result collapse_scan_file(struct mm_struct *mm, 2314 unsigned long addr, struct file *file, pgoff_t start, 2315 struct collapse_control *cc) 2316 { 2317 struct folio *folio = NULL; 2318 struct address_space *mapping = file->f_mapping; 2319 XA_STATE(xas, &mapping->i_pages, start); 2320 int present, swap; 2321 int node = NUMA_NO_NODE; 2322 enum scan_result result = SCAN_SUCCEED; 2323 2324 present = 0; 2325 swap = 0; 2326 memset(cc->node_load, 0, sizeof(cc->node_load)); 2327 nodes_clear(cc->alloc_nmask); 2328 rcu_read_lock(); 2329 xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { 2330 if (xas_retry(&xas, folio)) 2331 continue; 2332 2333 if (xa_is_value(folio)) { 2334 swap += 1 << xas_get_order(&xas); 2335 if (cc->is_khugepaged && 2336 swap > khugepaged_max_ptes_swap) { 2337 result = SCAN_EXCEED_SWAP_PTE; 2338 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); 2339 break; 2340 } 2341 continue; 2342 } 2343 2344 if (!folio_try_get(folio)) { 2345 xas_reset(&xas); 2346 continue; 2347 } 2348 2349 if (unlikely(folio != xas_reload(&xas))) { 2350 folio_put(folio); 2351 xas_reset(&xas); 2352 continue; 2353 } 2354 2355 if (is_pmd_order(folio_order(folio))) { 2356 result = SCAN_PTE_MAPPED_HUGEPAGE; 2357 /* 2358 * PMD-sized THP implies that we can only try 2359 * retracting the PTE table. 
2360 */ 2361 folio_put(folio); 2362 break; 2363 } 2364 2365 node = folio_nid(folio); 2366 if (collapse_scan_abort(node, cc)) { 2367 result = SCAN_SCAN_ABORT; 2368 folio_put(folio); 2369 break; 2370 } 2371 cc->node_load[node]++; 2372 2373 if (!folio_test_lru(folio)) { 2374 result = SCAN_PAGE_LRU; 2375 folio_put(folio); 2376 break; 2377 } 2378 2379 if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { 2380 result = SCAN_PAGE_COUNT; 2381 folio_put(folio); 2382 break; 2383 } 2384 2385 /* 2386 * We probably should check if the folio is referenced 2387 * here, but nobody would transfer pte_young() to 2388 * folio_test_referenced() for us. And rmap walk here 2389 * is just too costly... 2390 */ 2391 2392 present += folio_nr_pages(folio); 2393 folio_put(folio); 2394 2395 if (need_resched()) { 2396 xas_pause(&xas); 2397 cond_resched_rcu(); 2398 } 2399 } 2400 rcu_read_unlock(); 2401 if (result == SCAN_PTE_MAPPED_HUGEPAGE) 2402 cc->progress++; 2403 else 2404 cc->progress += HPAGE_PMD_NR; 2405 2406 if (result == SCAN_SUCCEED) { 2407 if (cc->is_khugepaged && 2408 present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { 2409 result = SCAN_EXCEED_NONE_PTE; 2410 count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 2411 } else { 2412 result = collapse_file(mm, addr, file, start, cc); 2413 } 2414 } 2415 2416 trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); 2417 return result; 2418 } 2419 2420 /* 2421 * Try to collapse a single PMD starting at a PMD aligned addr, and return 2422 * the results. 
2423 */ 2424 static enum scan_result collapse_single_pmd(unsigned long addr, 2425 struct vm_area_struct *vma, bool *lock_dropped, 2426 struct collapse_control *cc) 2427 { 2428 struct mm_struct *mm = vma->vm_mm; 2429 bool triggered_wb = false; 2430 enum scan_result result; 2431 struct file *file; 2432 pgoff_t pgoff; 2433 2434 mmap_assert_locked(mm); 2435 2436 if (vma_is_anonymous(vma)) { 2437 result = collapse_scan_pmd(mm, vma, addr, lock_dropped, cc); 2438 goto end; 2439 } 2440 2441 file = get_file(vma->vm_file); 2442 pgoff = linear_page_index(vma, addr); 2443 2444 mmap_read_unlock(mm); 2445 *lock_dropped = true; 2446 retry: 2447 result = collapse_scan_file(mm, addr, file, pgoff, cc); 2448 2449 /* 2450 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback, 2451 * then retry the collapse one time. 2452 */ 2453 if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK && 2454 !triggered_wb && mapping_can_writeback(file->f_mapping)) { 2455 const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; 2456 const loff_t lend = lstart + HPAGE_PMD_SIZE - 1; 2457 2458 filemap_write_and_wait_range(file->f_mapping, lstart, lend); 2459 triggered_wb = true; 2460 goto retry; 2461 } 2462 fput(file); 2463 2464 if (result == SCAN_PTE_MAPPED_HUGEPAGE) { 2465 mmap_read_lock(mm); 2466 if (collapse_test_exit_or_disable(mm)) 2467 result = SCAN_ANY_PROCESS; 2468 else 2469 result = try_collapse_pte_mapped_thp(mm, addr, 2470 !cc->is_khugepaged); 2471 if (result == SCAN_PMD_MAPPED) 2472 result = SCAN_SUCCEED; 2473 mmap_read_unlock(mm); 2474 } 2475 end: 2476 if (cc->is_khugepaged && result == SCAN_SUCCEED) 2477 ++khugepaged_pages_collapsed; 2478 return result; 2479 } 2480 2481 static void collapse_scan_mm_slot(unsigned int progress_max, 2482 enum scan_result *result, struct collapse_control *cc) 2483 __releases(&khugepaged_mm_lock) 2484 __acquires(&khugepaged_mm_lock) 2485 { 2486 struct vma_iterator vmi; 2487 struct mm_slot *slot; 2488 struct mm_struct *mm; 2489 struct 
vm_area_struct *vma; 2490 unsigned int progress_prev = cc->progress; 2491 2492 lockdep_assert_held(&khugepaged_mm_lock); 2493 *result = SCAN_FAIL; 2494 2495 if (khugepaged_scan.mm_slot) { 2496 slot = khugepaged_scan.mm_slot; 2497 } else { 2498 slot = list_first_entry(&khugepaged_scan.mm_head, 2499 struct mm_slot, mm_node); 2500 khugepaged_scan.address = 0; 2501 khugepaged_scan.mm_slot = slot; 2502 } 2503 spin_unlock(&khugepaged_mm_lock); 2504 2505 mm = slot->mm; 2506 /* 2507 * Don't wait for semaphore (to avoid long wait times). Just move to 2508 * the next mm on the list. 2509 */ 2510 vma = NULL; 2511 if (unlikely(!mmap_read_trylock(mm))) 2512 goto breakouterloop_mmap_lock; 2513 2514 cc->progress++; 2515 if (unlikely(collapse_test_exit_or_disable(mm))) 2516 goto breakouterloop; 2517 2518 vma_iter_init(&vmi, mm, khugepaged_scan.address); 2519 for_each_vma(vmi, vma) { 2520 unsigned long hstart, hend; 2521 2522 cond_resched(); 2523 if (unlikely(collapse_test_exit_or_disable(mm))) { 2524 cc->progress++; 2525 break; 2526 } 2527 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { 2528 cc->progress++; 2529 continue; 2530 } 2531 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); 2532 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); 2533 if (khugepaged_scan.address > hend) { 2534 cc->progress++; 2535 continue; 2536 } 2537 if (khugepaged_scan.address < hstart) 2538 khugepaged_scan.address = hstart; 2539 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2540 2541 while (khugepaged_scan.address < hend) { 2542 bool lock_dropped = false; 2543 2544 cond_resched(); 2545 if (unlikely(collapse_test_exit_or_disable(mm))) 2546 goto breakouterloop; 2547 2548 VM_WARN_ON_ONCE(khugepaged_scan.address < hstart || 2549 khugepaged_scan.address + HPAGE_PMD_SIZE > 2550 hend); 2551 2552 *result = collapse_single_pmd(khugepaged_scan.address, 2553 vma, &lock_dropped, cc); 2554 /* move to next address */ 2555 khugepaged_scan.address += HPAGE_PMD_SIZE; 2556 if 
(lock_dropped) 2557 /* 2558 * We released mmap_lock so break loop. Note 2559 * that we drop mmap_lock before all hugepage 2560 * allocations, so if allocation fails, we are 2561 * guaranteed to break here and report the 2562 * correct result back to caller. 2563 */ 2564 goto breakouterloop_mmap_lock; 2565 if (cc->progress >= progress_max) 2566 goto breakouterloop; 2567 } 2568 } 2569 breakouterloop: 2570 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ 2571 breakouterloop_mmap_lock: 2572 2573 spin_lock(&khugepaged_mm_lock); 2574 VM_BUG_ON(khugepaged_scan.mm_slot != slot); 2575 /* 2576 * Release the current mm_slot if this mm is about to die, or 2577 * if we scanned all vmas of this mm, or THP got disabled. 2578 */ 2579 if (collapse_test_exit_or_disable(mm) || !vma) { 2580 /* 2581 * Make sure that if mm_users is reaching zero while 2582 * khugepaged runs here, khugepaged_exit will find 2583 * mm_slot not pointing to the exiting mm. 2584 */ 2585 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { 2586 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); 2587 khugepaged_scan.address = 0; 2588 } else { 2589 khugepaged_scan.mm_slot = NULL; 2590 khugepaged_full_scans++; 2591 } 2592 2593 collect_mm_slot(slot); 2594 } 2595 2596 trace_mm_khugepaged_scan(mm, cc->progress - progress_prev, 2597 khugepaged_scan.mm_slot == NULL); 2598 } 2599 2600 static int khugepaged_has_work(void) 2601 { 2602 return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); 2603 } 2604 2605 static int khugepaged_wait_event(void) 2606 { 2607 return !list_empty(&khugepaged_scan.mm_head) || 2608 kthread_should_stop(); 2609 } 2610 2611 static void khugepaged_do_scan(struct collapse_control *cc) 2612 { 2613 const unsigned int progress_max = READ_ONCE(khugepaged_pages_to_scan); 2614 unsigned int pass_through_head = 0; 2615 bool wait = true; 2616 enum scan_result result = SCAN_SUCCEED; 2617 2618 lru_add_drain_all(); 2619 2620 cc->progress = 0; 2621 while (true) 
{ 2622 cond_resched(); 2623 2624 if (unlikely(kthread_should_stop())) 2625 break; 2626 2627 spin_lock(&khugepaged_mm_lock); 2628 if (!khugepaged_scan.mm_slot) 2629 pass_through_head++; 2630 if (khugepaged_has_work() && 2631 pass_through_head < 2) 2632 collapse_scan_mm_slot(progress_max, &result, cc); 2633 else 2634 cc->progress = progress_max; 2635 spin_unlock(&khugepaged_mm_lock); 2636 2637 if (cc->progress >= progress_max) 2638 break; 2639 2640 if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { 2641 /* 2642 * If fail to allocate the first time, try to sleep for 2643 * a while. When hit again, cancel the scan. 2644 */ 2645 if (!wait) 2646 break; 2647 wait = false; 2648 khugepaged_alloc_sleep(); 2649 } 2650 } 2651 } 2652 2653 static bool khugepaged_should_wakeup(void) 2654 { 2655 return kthread_should_stop() || 2656 time_after_eq(jiffies, khugepaged_sleep_expire); 2657 } 2658 2659 static void khugepaged_wait_work(void) 2660 { 2661 if (khugepaged_has_work()) { 2662 const unsigned long scan_sleep_jiffies = 2663 msecs_to_jiffies(khugepaged_scan_sleep_millisecs); 2664 2665 if (!scan_sleep_jiffies) 2666 return; 2667 2668 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; 2669 wait_event_freezable_timeout(khugepaged_wait, 2670 khugepaged_should_wakeup(), 2671 scan_sleep_jiffies); 2672 return; 2673 } 2674 2675 if (hugepage_pmd_enabled()) 2676 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2677 } 2678 2679 static int khugepaged(void *none) 2680 { 2681 struct mm_slot *slot; 2682 2683 set_freezable(); 2684 set_user_nice(current, MAX_NICE); 2685 2686 while (!kthread_should_stop()) { 2687 khugepaged_do_scan(&khugepaged_collapse_control); 2688 khugepaged_wait_work(); 2689 } 2690 2691 spin_lock(&khugepaged_mm_lock); 2692 slot = khugepaged_scan.mm_slot; 2693 khugepaged_scan.mm_slot = NULL; 2694 if (slot) 2695 collect_mm_slot(slot); 2696 spin_unlock(&khugepaged_mm_lock); 2697 return 0; 2698 } 2699 2700 void set_recommended_min_free_kbytes(void) 2701 { 2702 struct 
zone *zone; 2703 int nr_zones = 0; 2704 unsigned long recommended_min; 2705 2706 if (!hugepage_pmd_enabled()) { 2707 calculate_min_free_kbytes(); 2708 goto update_wmarks; 2709 } 2710 2711 for_each_populated_zone(zone) { 2712 /* 2713 * We don't need to worry about fragmentation of 2714 * ZONE_MOVABLE since it only has movable pages. 2715 */ 2716 if (zone_idx(zone) > gfp_zone(GFP_USER)) 2717 continue; 2718 2719 nr_zones++; 2720 } 2721 2722 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ 2723 recommended_min = pageblock_nr_pages * nr_zones * 2; 2724 2725 /* 2726 * Make sure that on average at least two pageblocks are almost free 2727 * of another type, one for a migratetype to fall back to and a 2728 * second to avoid subsequent fallbacks of other types There are 3 2729 * MIGRATE_TYPES we care about. 2730 */ 2731 recommended_min += pageblock_nr_pages * nr_zones * 2732 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; 2733 2734 /* don't ever allow to reserve more than 5% of the lowmem */ 2735 recommended_min = min(recommended_min, 2736 (unsigned long) nr_free_buffer_pages() / 20); 2737 recommended_min <<= (PAGE_SHIFT-10); 2738 2739 if (recommended_min > min_free_kbytes) { 2740 if (user_min_free_kbytes >= 0) 2741 pr_info_ratelimited("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", 2742 min_free_kbytes, recommended_min); 2743 2744 min_free_kbytes = recommended_min; 2745 } 2746 2747 update_wmarks: 2748 setup_per_zone_wmarks(); 2749 } 2750 2751 int start_stop_khugepaged(void) 2752 { 2753 int err = 0; 2754 2755 mutex_lock(&khugepaged_mutex); 2756 if (hugepage_pmd_enabled()) { 2757 if (!khugepaged_thread) 2758 khugepaged_thread = kthread_run(khugepaged, NULL, 2759 "khugepaged"); 2760 if (IS_ERR(khugepaged_thread)) { 2761 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 2762 err = PTR_ERR(khugepaged_thread); 2763 khugepaged_thread = NULL; 2764 goto fail; 2765 } 2766 2767 if (!list_empty(&khugepaged_scan.mm_head)) 2768 
wake_up_interruptible(&khugepaged_wait); 2769 } else if (khugepaged_thread) { 2770 kthread_stop(khugepaged_thread); 2771 khugepaged_thread = NULL; 2772 } 2773 set_recommended_min_free_kbytes(); 2774 fail: 2775 mutex_unlock(&khugepaged_mutex); 2776 return err; 2777 } 2778 2779 void khugepaged_min_free_kbytes_update(void) 2780 { 2781 mutex_lock(&khugepaged_mutex); 2782 if (hugepage_pmd_enabled() && khugepaged_thread) 2783 set_recommended_min_free_kbytes(); 2784 mutex_unlock(&khugepaged_mutex); 2785 } 2786 2787 bool current_is_khugepaged(void) 2788 { 2789 return kthread_func(current) == khugepaged; 2790 } 2791 2792 static int madvise_collapse_errno(enum scan_result r) 2793 { 2794 /* 2795 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide 2796 * actionable feedback to caller, so they may take an appropriate 2797 * fallback measure depending on the nature of the failure. 2798 */ 2799 switch (r) { 2800 case SCAN_ALLOC_HUGE_PAGE_FAIL: 2801 return -ENOMEM; 2802 case SCAN_CGROUP_CHARGE_FAIL: 2803 case SCAN_EXCEED_NONE_PTE: 2804 return -EBUSY; 2805 /* Resource temporary unavailable - trying again might succeed */ 2806 case SCAN_PAGE_COUNT: 2807 case SCAN_PAGE_LOCK: 2808 case SCAN_PAGE_LRU: 2809 case SCAN_DEL_PAGE_LRU: 2810 case SCAN_PAGE_FILLED: 2811 case SCAN_PAGE_DIRTY_OR_WRITEBACK: 2812 return -EAGAIN; 2813 /* 2814 * Other: Trying again likely not to succeed / error intrinsic to 2815 * specified memory range. khugepaged likely won't be able to collapse 2816 * either. 
2817 */ 2818 default: 2819 return -EINVAL; 2820 } 2821 } 2822 2823 int madvise_collapse(struct vm_area_struct *vma, unsigned long start, 2824 unsigned long end, bool *lock_dropped) 2825 { 2826 struct collapse_control *cc; 2827 struct mm_struct *mm = vma->vm_mm; 2828 unsigned long hstart, hend, addr; 2829 enum scan_result last_fail = SCAN_FAIL; 2830 int thps = 0; 2831 bool mmap_unlocked = false; 2832 2833 BUG_ON(vma->vm_start > start); 2834 BUG_ON(vma->vm_end < end); 2835 2836 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) 2837 return -EINVAL; 2838 2839 cc = kmalloc_obj(*cc); 2840 if (!cc) 2841 return -ENOMEM; 2842 cc->is_khugepaged = false; 2843 cc->progress = 0; 2844 2845 mmgrab(mm); 2846 lru_add_drain_all(); 2847 2848 hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2849 hend = end & HPAGE_PMD_MASK; 2850 2851 for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { 2852 enum scan_result result = SCAN_FAIL; 2853 2854 if (mmap_unlocked) { 2855 cond_resched(); 2856 mmap_read_lock(mm); 2857 mmap_unlocked = false; 2858 *lock_dropped = true; 2859 result = hugepage_vma_revalidate(mm, addr, false, &vma, 2860 cc); 2861 if (result != SCAN_SUCCEED) { 2862 last_fail = result; 2863 goto out_nolock; 2864 } 2865 2866 hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); 2867 } 2868 2869 result = collapse_single_pmd(addr, vma, &mmap_unlocked, cc); 2870 2871 switch (result) { 2872 case SCAN_SUCCEED: 2873 case SCAN_PMD_MAPPED: 2874 ++thps; 2875 break; 2876 /* Whitelisted set of results where continuing OK */ 2877 case SCAN_NO_PTE_TABLE: 2878 case SCAN_PTE_NON_PRESENT: 2879 case SCAN_PTE_UFFD_WP: 2880 case SCAN_LACK_REFERENCED_PAGE: 2881 case SCAN_PAGE_NULL: 2882 case SCAN_PAGE_COUNT: 2883 case SCAN_PAGE_LOCK: 2884 case SCAN_PAGE_COMPOUND: 2885 case SCAN_PAGE_LRU: 2886 case SCAN_DEL_PAGE_LRU: 2887 last_fail = result; 2888 break; 2889 default: 2890 last_fail = result; 2891 /* Other error, exit */ 2892 goto out_maybelock; 2893 } 2894 } 2895 2896 
out_maybelock: 2897 /* Caller expects us to hold mmap_lock on return */ 2898 if (mmap_unlocked) { 2899 *lock_dropped = true; 2900 mmap_read_lock(mm); 2901 } 2902 out_nolock: 2903 mmap_assert_locked(mm); 2904 mmdrop(mm); 2905 kfree(cc); 2906 2907 return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 2908 : madvise_collapse_errno(last_fail); 2909 } 2910