1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2009 Red Hat, Inc. 4 */ 5 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/mm.h> 9 #include <linux/sched.h> 10 #include <linux/sched/mm.h> 11 #include <linux/sched/numa_balancing.h> 12 #include <linux/highmem.h> 13 #include <linux/hugetlb.h> 14 #include <linux/mmu_notifier.h> 15 #include <linux/rmap.h> 16 #include <linux/swap.h> 17 #include <linux/shrinker.h> 18 #include <linux/mm_inline.h> 19 #include <linux/swapops.h> 20 #include <linux/backing-dev.h> 21 #include <linux/dax.h> 22 #include <linux/mm_types.h> 23 #include <linux/khugepaged.h> 24 #include <linux/freezer.h> 25 #include <linux/mman.h> 26 #include <linux/memremap.h> 27 #include <linux/pagemap.h> 28 #include <linux/debugfs.h> 29 #include <linux/migrate.h> 30 #include <linux/hashtable.h> 31 #include <linux/userfaultfd_k.h> 32 #include <linux/page_idle.h> 33 #include <linux/shmem_fs.h> 34 #include <linux/oom.h> 35 #include <linux/numa.h> 36 #include <linux/page_owner.h> 37 #include <linux/sched/sysctl.h> 38 #include <linux/memory-tiers.h> 39 #include <linux/compat.h> 40 #include <linux/pgalloc.h> 41 #include <linux/pgalloc_tag.h> 42 #include <linux/pagewalk.h> 43 44 #include <asm/tlb.h> 45 #include "internal.h" 46 #include "swap.h" 47 48 #define CREATE_TRACE_POINTS 49 #include <trace/events/thp.h> 50 51 /* 52 * By default, transparent hugepage support is disabled in order to avoid 53 * risking an increased memory footprint for applications that are not 54 * guaranteed to benefit from it. When transparent hugepage support is 55 * enabled, it is for all mappings, and khugepaged scans all mappings. 56 * Defrag is invoked by khugepaged hugepage allocations and by page faults 57 * for all hugepage allocations. 
58 */ 59 unsigned long transparent_hugepage_flags __read_mostly = 60 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 61 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 62 #endif 63 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 64 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 65 #endif 66 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| 67 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 68 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 69 70 static struct shrinker *deferred_split_shrinker; 71 static unsigned long deferred_split_count(struct shrinker *shrink, 72 struct shrink_control *sc); 73 static unsigned long deferred_split_scan(struct shrinker *shrink, 74 struct shrink_control *sc); 75 static bool split_underused_thp = true; 76 77 static atomic_t huge_zero_refcount; 78 struct folio *huge_zero_folio __read_mostly; 79 unsigned long huge_zero_pfn __read_mostly = ~0UL; 80 unsigned long huge_anon_orders_always __read_mostly; 81 unsigned long huge_anon_orders_madvise __read_mostly; 82 unsigned long huge_anon_orders_inherit __read_mostly; 83 static bool anon_orders_configured __initdata; 84 85 static inline bool file_thp_enabled(struct vm_area_struct *vma) 86 { 87 struct inode *inode; 88 89 if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) 90 return false; 91 92 if (!vma->vm_file) 93 return false; 94 95 inode = file_inode(vma->vm_file); 96 97 if (IS_ANON_FILE(inode)) 98 return false; 99 100 return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); 101 } 102 103 /* If returns true, we are unable to access the VMA's folios. 
*/ 104 static bool vma_is_special_huge(const struct vm_area_struct *vma) 105 { 106 if (vma_is_dax(vma)) 107 return false; 108 return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT); 109 } 110 111 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, 112 vm_flags_t vm_flags, 113 enum tva_type type, 114 unsigned long orders) 115 { 116 const bool smaps = type == TVA_SMAPS; 117 const bool in_pf = type == TVA_PAGEFAULT; 118 const bool forced_collapse = type == TVA_FORCED_COLLAPSE; 119 unsigned long supported_orders; 120 121 /* Check the intersection of requested and supported orders. */ 122 if (vma_is_anonymous(vma)) 123 supported_orders = THP_ORDERS_ALL_ANON; 124 else if (vma_is_dax(vma) || vma_is_special_huge(vma)) 125 supported_orders = THP_ORDERS_ALL_SPECIAL_DAX; 126 else 127 supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; 128 129 orders &= supported_orders; 130 if (!orders) 131 return 0; 132 133 if (!vma->vm_mm) /* vdso */ 134 return 0; 135 136 if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) 137 return 0; 138 139 /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ 140 if (vma_is_dax(vma)) 141 return in_pf ? orders : 0; 142 143 /* 144 * khugepaged special VMA and hugetlb VMA. 145 * Must be checked after dax since some dax mappings may have 146 * VM_MIXEDMAP set. 147 */ 148 if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) 149 return 0; 150 151 /* 152 * Check alignment for file vma and size for both file and anon vma by 153 * filtering out the unsuitable orders. 154 * 155 * Skip the check for page fault. Huge fault does the check in fault 156 * handlers. 
157 */ 158 if (!in_pf) { 159 int order = highest_order(orders); 160 unsigned long addr; 161 162 while (orders) { 163 addr = vma->vm_end - (PAGE_SIZE << order); 164 if (thp_vma_suitable_order(vma, addr, order)) 165 break; 166 order = next_order(&orders, order); 167 } 168 169 if (!orders) 170 return 0; 171 } 172 173 /* 174 * Enabled via shmem mount options or sysfs settings. 175 * Must be done before hugepage flags check since shmem has its 176 * own flags. 177 */ 178 if (!in_pf && shmem_file(vma->vm_file)) 179 return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), 180 vma, vma->vm_pgoff, 0, 181 forced_collapse); 182 183 if (!vma_is_anonymous(vma)) { 184 /* 185 * Enforce THP collapse requirements as necessary. Anonymous vmas 186 * were already handled in thp_vma_allowable_orders(). 187 */ 188 if (!forced_collapse && 189 (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && 190 !hugepage_global_always()))) 191 return 0; 192 193 /* 194 * Trust that ->huge_fault() handlers know what they are doing 195 * in fault path. 196 */ 197 if (((in_pf || smaps)) && vma->vm_ops->huge_fault) 198 return orders; 199 /* Only regular file is valid in collapse path */ 200 if (((!in_pf || smaps)) && file_thp_enabled(vma)) 201 return orders; 202 return 0; 203 } 204 205 if (vma_is_temporary_stack(vma)) 206 return 0; 207 208 /* 209 * THPeligible bit of smaps should show 1 for proper VMAs even 210 * though anon_vma is not initialized yet. 211 * 212 * Allow page fault since anon_vma may be not initialized until 213 * the first page fault. 214 */ 215 if (!vma->anon_vma) 216 return (smaps || in_pf) ? 
orders : 0; 217 218 return orders; 219 } 220 221 static bool get_huge_zero_folio(void) 222 { 223 struct folio *zero_folio; 224 retry: 225 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 226 return true; 227 228 zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & 229 ~__GFP_MOVABLE, 230 HPAGE_PMD_ORDER); 231 if (!zero_folio) { 232 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 233 return false; 234 } 235 /* Ensure zero folio won't have large_rmappable flag set. */ 236 folio_clear_large_rmappable(zero_folio); 237 preempt_disable(); 238 if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { 239 preempt_enable(); 240 folio_put(zero_folio); 241 goto retry; 242 } 243 WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio)); 244 245 /* We take additional reference here. It will be put back by shrinker */ 246 atomic_set(&huge_zero_refcount, 2); 247 preempt_enable(); 248 count_vm_event(THP_ZERO_PAGE_ALLOC); 249 return true; 250 } 251 252 static void put_huge_zero_folio(void) 253 { 254 /* 255 * Counter should never go to zero here. Only shrinker can put 256 * last reference. 
257 */ 258 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 259 } 260 261 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) 262 { 263 if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) 264 return huge_zero_folio; 265 266 if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) 267 return READ_ONCE(huge_zero_folio); 268 269 if (!get_huge_zero_folio()) 270 return NULL; 271 272 if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) 273 put_huge_zero_folio(); 274 275 return READ_ONCE(huge_zero_folio); 276 } 277 278 void mm_put_huge_zero_folio(struct mm_struct *mm) 279 { 280 if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) 281 return; 282 283 if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) 284 put_huge_zero_folio(); 285 } 286 287 static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink, 288 struct shrink_control *sc) 289 { 290 /* we can free zero page only if last reference remains */ 291 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 292 } 293 294 static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink, 295 struct shrink_control *sc) 296 { 297 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 298 struct folio *zero_folio = xchg(&huge_zero_folio, NULL); 299 BUG_ON(zero_folio == NULL); 300 WRITE_ONCE(huge_zero_pfn, ~0UL); 301 folio_put(zero_folio); 302 return HPAGE_PMD_NR; 303 } 304 305 return 0; 306 } 307 308 static struct shrinker *huge_zero_folio_shrinker; 309 310 #ifdef CONFIG_SYSFS 311 static ssize_t enabled_show(struct kobject *kobj, 312 struct kobj_attribute *attr, char *buf) 313 { 314 const char *output; 315 316 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) 317 output = "[always] madvise never"; 318 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 319 &transparent_hugepage_flags)) 320 output = "always [madvise] never"; 321 else 322 output = "always madvise [never]"; 323 324 return sysfs_emit(buf, "%s\n", output); 325 } 326 327 enum anon_enabled_mode { 328 ANON_ENABLED_ALWAYS = 0, 329 
ANON_ENABLED_INHERIT = 1, 330 ANON_ENABLED_MADVISE = 2, 331 ANON_ENABLED_NEVER = 3, 332 }; 333 334 static const char * const anon_enabled_mode_strings[] = { 335 [ANON_ENABLED_ALWAYS] = "always", 336 [ANON_ENABLED_INHERIT] = "inherit", 337 [ANON_ENABLED_MADVISE] = "madvise", 338 [ANON_ENABLED_NEVER] = "never", 339 }; 340 341 enum global_enabled_mode { 342 GLOBAL_ENABLED_ALWAYS = 0, 343 GLOBAL_ENABLED_MADVISE = 1, 344 GLOBAL_ENABLED_NEVER = 2, 345 }; 346 347 static const char * const global_enabled_mode_strings[] = { 348 [GLOBAL_ENABLED_ALWAYS] = "always", 349 [GLOBAL_ENABLED_MADVISE] = "madvise", 350 [GLOBAL_ENABLED_NEVER] = "never", 351 }; 352 353 static bool set_global_enabled_mode(enum global_enabled_mode mode) 354 { 355 static const unsigned long thp_flags[] = { 356 TRANSPARENT_HUGEPAGE_FLAG, 357 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 358 }; 359 enum global_enabled_mode m; 360 bool changed = false; 361 362 for (m = 0; m < ARRAY_SIZE(thp_flags); m++) { 363 if (m == mode) 364 changed |= !test_and_set_bit(thp_flags[m], 365 &transparent_hugepage_flags); 366 else 367 changed |= test_and_clear_bit(thp_flags[m], 368 &transparent_hugepage_flags); 369 } 370 371 return changed; 372 } 373 374 static ssize_t enabled_store(struct kobject *kobj, 375 struct kobj_attribute *attr, 376 const char *buf, size_t count) 377 { 378 int mode; 379 380 mode = sysfs_match_string(global_enabled_mode_strings, buf); 381 if (mode < 0) 382 return -EINVAL; 383 384 if (set_global_enabled_mode(mode)) { 385 int err = start_stop_khugepaged(); 386 387 if (err) 388 return err; 389 } else { 390 /* 391 * Recalculate watermarks even when the mode didn't 392 * change, as the previous code always called 393 * start_stop_khugepaged() which does this internally. 
394 */ 395 set_recommended_min_free_kbytes(); 396 } 397 return count; 398 } 399 400 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); 401 402 ssize_t single_hugepage_flag_show(struct kobject *kobj, 403 struct kobj_attribute *attr, char *buf, 404 enum transparent_hugepage_flag flag) 405 { 406 return sysfs_emit(buf, "%d\n", 407 !!test_bit(flag, &transparent_hugepage_flags)); 408 } 409 410 ssize_t single_hugepage_flag_store(struct kobject *kobj, 411 struct kobj_attribute *attr, 412 const char *buf, size_t count, 413 enum transparent_hugepage_flag flag) 414 { 415 unsigned long value; 416 int ret; 417 418 ret = kstrtoul(buf, 10, &value); 419 if (ret < 0) 420 return ret; 421 if (value > 1) 422 return -EINVAL; 423 424 if (value) 425 set_bit(flag, &transparent_hugepage_flags); 426 else 427 clear_bit(flag, &transparent_hugepage_flags); 428 429 return count; 430 } 431 432 enum defrag_mode { 433 DEFRAG_ALWAYS = 0, 434 DEFRAG_DEFER, 435 DEFRAG_DEFER_MADVISE, 436 DEFRAG_MADVISE, 437 DEFRAG_NEVER, 438 }; 439 440 static const char * const defrag_mode_strings[] = { 441 [DEFRAG_ALWAYS] = "always", 442 [DEFRAG_DEFER] = "defer", 443 [DEFRAG_DEFER_MADVISE] = "defer+madvise", 444 [DEFRAG_MADVISE] = "madvise", 445 [DEFRAG_NEVER] = "never", 446 }; 447 448 static const enum transparent_hugepage_flag defrag_flags[] = { 449 [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 450 [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 451 [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, 452 [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 453 }; 454 455 static ssize_t defrag_show(struct kobject *kobj, 456 struct kobj_attribute *attr, char *buf) 457 { 458 int active = DEFRAG_NEVER; 459 int len = 0; 460 int i; 461 462 for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) { 463 if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) { 464 active = i; 465 break; 466 } 467 } 468 469 for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) { 
470 if (i == active) 471 len += sysfs_emit_at(buf, len, "[%s] ", 472 defrag_mode_strings[i]); 473 else 474 len += sysfs_emit_at(buf, len, "%s ", 475 defrag_mode_strings[i]); 476 } 477 478 /* Replace trailing space with newline */ 479 buf[len - 1] = '\n'; 480 481 return len; 482 } 483 484 static ssize_t defrag_store(struct kobject *kobj, 485 struct kobj_attribute *attr, 486 const char *buf, size_t count) 487 { 488 int mode, m; 489 490 mode = sysfs_match_string(defrag_mode_strings, buf); 491 if (mode < 0) 492 return -EINVAL; 493 494 for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) { 495 if (m == mode) 496 set_bit(defrag_flags[m], &transparent_hugepage_flags); 497 else 498 clear_bit(defrag_flags[m], &transparent_hugepage_flags); 499 } 500 501 return count; 502 } 503 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); 504 505 static ssize_t use_zero_page_show(struct kobject *kobj, 506 struct kobj_attribute *attr, char *buf) 507 { 508 return single_hugepage_flag_show(kobj, attr, buf, 509 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 510 } 511 static ssize_t use_zero_page_store(struct kobject *kobj, 512 struct kobj_attribute *attr, const char *buf, size_t count) 513 { 514 return single_hugepage_flag_store(kobj, attr, buf, count, 515 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 516 } 517 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); 518 519 static ssize_t hpage_pmd_size_show(struct kobject *kobj, 520 struct kobj_attribute *attr, char *buf) 521 { 522 return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); 523 } 524 static struct kobj_attribute hpage_pmd_size_attr = 525 __ATTR_RO(hpage_pmd_size); 526 527 static ssize_t split_underused_thp_show(struct kobject *kobj, 528 struct kobj_attribute *attr, char *buf) 529 { 530 return sysfs_emit(buf, "%d\n", split_underused_thp); 531 } 532 533 static ssize_t split_underused_thp_store(struct kobject *kobj, 534 struct kobj_attribute *attr, 535 const char *buf, size_t count) 536 { 537 int err = kstrtobool(buf, 
&split_underused_thp); 538 539 if (err < 0) 540 return err; 541 542 return count; 543 } 544 545 static struct kobj_attribute split_underused_thp_attr = __ATTR( 546 shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); 547 548 static struct attribute *hugepage_attr[] = { 549 &enabled_attr.attr, 550 &defrag_attr.attr, 551 &use_zero_page_attr.attr, 552 &hpage_pmd_size_attr.attr, 553 #ifdef CONFIG_SHMEM 554 &shmem_enabled_attr.attr, 555 #endif 556 &split_underused_thp_attr.attr, 557 NULL, 558 }; 559 560 static const struct attribute_group hugepage_attr_group = { 561 .attrs = hugepage_attr, 562 }; 563 564 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); 565 static void thpsize_release(struct kobject *kobj); 566 static DEFINE_SPINLOCK(huge_anon_orders_lock); 567 static LIST_HEAD(thpsize_list); 568 569 static ssize_t anon_enabled_show(struct kobject *kobj, 570 struct kobj_attribute *attr, char *buf) 571 { 572 int order = to_thpsize(kobj)->order; 573 const char *output; 574 575 if (test_bit(order, &huge_anon_orders_always)) 576 output = "[always] inherit madvise never"; 577 else if (test_bit(order, &huge_anon_orders_inherit)) 578 output = "always [inherit] madvise never"; 579 else if (test_bit(order, &huge_anon_orders_madvise)) 580 output = "always inherit [madvise] never"; 581 else 582 output = "always inherit madvise [never]"; 583 584 return sysfs_emit(buf, "%s\n", output); 585 } 586 587 static bool set_anon_enabled_mode(int order, enum anon_enabled_mode mode) 588 { 589 static unsigned long *enabled_orders[] = { 590 &huge_anon_orders_always, 591 &huge_anon_orders_inherit, 592 &huge_anon_orders_madvise, 593 }; 594 enum anon_enabled_mode m; 595 bool changed = false; 596 597 spin_lock(&huge_anon_orders_lock); 598 for (m = 0; m < ARRAY_SIZE(enabled_orders); m++) { 599 if (m == mode) 600 changed |= !__test_and_set_bit(order, enabled_orders[m]); 601 else 602 changed |= __test_and_clear_bit(order, enabled_orders[m]); 603 } 604 
spin_unlock(&huge_anon_orders_lock); 605 606 return changed; 607 } 608 609 static ssize_t anon_enabled_store(struct kobject *kobj, 610 struct kobj_attribute *attr, 611 const char *buf, size_t count) 612 { 613 int order = to_thpsize(kobj)->order; 614 int mode; 615 616 mode = sysfs_match_string(anon_enabled_mode_strings, buf); 617 if (mode < 0) 618 return -EINVAL; 619 620 if (set_anon_enabled_mode(order, mode)) { 621 int err = start_stop_khugepaged(); 622 623 if (err) 624 return err; 625 } else { 626 /* 627 * Recalculate watermarks even when the mode didn't 628 * change, as the previous code always called 629 * start_stop_khugepaged() which does this internally. 630 */ 631 set_recommended_min_free_kbytes(); 632 } 633 634 return count; 635 } 636 637 static struct kobj_attribute anon_enabled_attr = 638 __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); 639 640 static struct attribute *anon_ctrl_attrs[] = { 641 &anon_enabled_attr.attr, 642 NULL, 643 }; 644 645 static const struct attribute_group anon_ctrl_attr_grp = { 646 .attrs = anon_ctrl_attrs, 647 }; 648 649 static struct attribute *file_ctrl_attrs[] = { 650 #ifdef CONFIG_SHMEM 651 &thpsize_shmem_enabled_attr.attr, 652 #endif 653 NULL, 654 }; 655 656 static const struct attribute_group file_ctrl_attr_grp = { 657 .attrs = file_ctrl_attrs, 658 }; 659 660 static struct attribute *any_ctrl_attrs[] = { 661 NULL, 662 }; 663 664 static const struct attribute_group any_ctrl_attr_grp = { 665 .attrs = any_ctrl_attrs, 666 }; 667 668 static const struct kobj_type thpsize_ktype = { 669 .release = &thpsize_release, 670 .sysfs_ops = &kobj_sysfs_ops, 671 }; 672 673 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; 674 675 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) 676 { 677 unsigned long sum = 0; 678 int cpu; 679 680 for_each_possible_cpu(cpu) { 681 struct mthp_stat *this = &per_cpu(mthp_stats, cpu); 682 683 sum += this->stats[order][item]; 684 } 685 686 return sum; 687 } 688 689 
#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ 690 static ssize_t _name##_show(struct kobject *kobj, \ 691 struct kobj_attribute *attr, char *buf) \ 692 { \ 693 int order = to_thpsize(kobj)->order; \ 694 \ 695 return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ 696 } \ 697 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 698 699 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); 700 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); 701 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); 702 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); 703 DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); 704 DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); 705 DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); 706 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); 707 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); 708 #ifdef CONFIG_SHMEM 709 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); 710 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); 711 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); 712 #endif 713 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); 714 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); 715 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); 716 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); 717 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); 718 719 static struct attribute *anon_stats_attrs[] = { 720 &anon_fault_alloc_attr.attr, 721 &anon_fault_fallback_attr.attr, 722 &anon_fault_fallback_charge_attr.attr, 723 #ifndef CONFIG_SHMEM 724 &zswpout_attr.attr, 725 &swpin_attr.attr, 726 &swpin_fallback_attr.attr, 727 &swpin_fallback_charge_attr.attr, 728 &swpout_attr.attr, 729 &swpout_fallback_attr.attr, 730 #endif 731 &split_deferred_attr.attr, 732 &nr_anon_attr.attr, 733 
&nr_anon_partially_mapped_attr.attr, 734 NULL, 735 }; 736 737 static struct attribute_group anon_stats_attr_grp = { 738 .name = "stats", 739 .attrs = anon_stats_attrs, 740 }; 741 742 static struct attribute *file_stats_attrs[] = { 743 #ifdef CONFIG_SHMEM 744 &shmem_alloc_attr.attr, 745 &shmem_fallback_attr.attr, 746 &shmem_fallback_charge_attr.attr, 747 #endif 748 NULL, 749 }; 750 751 static struct attribute_group file_stats_attr_grp = { 752 .name = "stats", 753 .attrs = file_stats_attrs, 754 }; 755 756 static struct attribute *any_stats_attrs[] = { 757 #ifdef CONFIG_SHMEM 758 &zswpout_attr.attr, 759 &swpin_attr.attr, 760 &swpin_fallback_attr.attr, 761 &swpin_fallback_charge_attr.attr, 762 &swpout_attr.attr, 763 &swpout_fallback_attr.attr, 764 #endif 765 &split_attr.attr, 766 &split_failed_attr.attr, 767 NULL, 768 }; 769 770 static struct attribute_group any_stats_attr_grp = { 771 .name = "stats", 772 .attrs = any_stats_attrs, 773 }; 774 775 static int sysfs_add_group(struct kobject *kobj, 776 const struct attribute_group *grp) 777 { 778 int ret = -ENOENT; 779 780 /* 781 * If the group is named, try to merge first, assuming the subdirectory 782 * was already created. This avoids the warning emitted by 783 * sysfs_create_group() if the directory already exists. 
784 */ 785 if (grp->name) 786 ret = sysfs_merge_group(kobj, grp); 787 if (ret) 788 ret = sysfs_create_group(kobj, grp); 789 790 return ret; 791 } 792 793 static struct thpsize *thpsize_create(int order, struct kobject *parent) 794 { 795 unsigned long size = (PAGE_SIZE << order) / SZ_1K; 796 struct thpsize *thpsize; 797 int ret = -ENOMEM; 798 799 thpsize = kzalloc_obj(*thpsize); 800 if (!thpsize) 801 goto err; 802 803 thpsize->order = order; 804 805 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, 806 "hugepages-%lukB", size); 807 if (ret) { 808 kfree(thpsize); 809 goto err; 810 } 811 812 813 ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); 814 if (ret) 815 goto err_put; 816 817 ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); 818 if (ret) 819 goto err_put; 820 821 if (BIT(order) & THP_ORDERS_ALL_ANON) { 822 ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); 823 if (ret) 824 goto err_put; 825 826 ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); 827 if (ret) 828 goto err_put; 829 } 830 831 if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { 832 ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); 833 if (ret) 834 goto err_put; 835 836 ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); 837 if (ret) 838 goto err_put; 839 } 840 841 return thpsize; 842 err_put: 843 kobject_put(&thpsize->kobj); 844 err: 845 return ERR_PTR(ret); 846 } 847 848 static void thpsize_release(struct kobject *kobj) 849 { 850 kfree(to_thpsize(kobj)); 851 } 852 853 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 854 { 855 int err; 856 struct thpsize *thpsize; 857 unsigned long orders; 858 int order; 859 860 /* 861 * Default to setting PMD-sized THP to inherit the global setting and 862 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time 863 * constant so we have to do this here. 
864 */ 865 if (!anon_orders_configured) 866 huge_anon_orders_inherit = BIT(PMD_ORDER); 867 868 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 869 if (unlikely(!*hugepage_kobj)) { 870 pr_err("failed to create transparent hugepage kobject\n"); 871 return -ENOMEM; 872 } 873 874 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 875 if (err) { 876 pr_err("failed to register transparent hugepage group\n"); 877 goto delete_obj; 878 } 879 880 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 881 if (err) { 882 pr_err("failed to register transparent hugepage group\n"); 883 goto remove_hp_group; 884 } 885 886 orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; 887 order = highest_order(orders); 888 while (orders) { 889 thpsize = thpsize_create(order, *hugepage_kobj); 890 if (IS_ERR(thpsize)) { 891 pr_err("failed to create thpsize for order %d\n", order); 892 err = PTR_ERR(thpsize); 893 goto remove_all; 894 } 895 list_add(&thpsize->node, &thpsize_list); 896 order = next_order(&orders, order); 897 } 898 899 return 0; 900 901 remove_all: 902 hugepage_exit_sysfs(*hugepage_kobj); 903 return err; 904 remove_hp_group: 905 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 906 delete_obj: 907 kobject_put(*hugepage_kobj); 908 return err; 909 } 910 911 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 912 { 913 struct thpsize *thpsize, *tmp; 914 915 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { 916 list_del(&thpsize->node); 917 kobject_put(&thpsize->kobj); 918 } 919 920 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 921 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 922 kobject_put(hugepage_kobj); 923 } 924 #else 925 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 926 { 927 return 0; 928 } 929 930 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 931 { 932 } 933 #endif /* CONFIG_SYSFS */ 934 935 static int __init 
thp_shrinker_init(void) 936 { 937 deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | 938 SHRINKER_MEMCG_AWARE | 939 SHRINKER_NONSLAB, 940 "thp-deferred_split"); 941 if (!deferred_split_shrinker) 942 return -ENOMEM; 943 944 deferred_split_shrinker->count_objects = deferred_split_count; 945 deferred_split_shrinker->scan_objects = deferred_split_scan; 946 shrinker_register(deferred_split_shrinker); 947 948 if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { 949 /* 950 * Bump the reference of the huge_zero_folio and do not 951 * initialize the shrinker. 952 * 953 * huge_zero_folio will always be NULL on failure. We assume 954 * that get_huge_zero_folio() will most likely not fail as 955 * thp_shrinker_init() is invoked early on during boot. 956 */ 957 if (!get_huge_zero_folio()) 958 pr_warn("Allocating persistent huge zero folio failed\n"); 959 return 0; 960 } 961 962 huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); 963 if (!huge_zero_folio_shrinker) { 964 shrinker_free(deferred_split_shrinker); 965 return -ENOMEM; 966 } 967 968 huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count; 969 huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; 970 shrinker_register(huge_zero_folio_shrinker); 971 972 return 0; 973 } 974 975 static void __init thp_shrinker_exit(void) 976 { 977 shrinker_free(huge_zero_folio_shrinker); 978 shrinker_free(deferred_split_shrinker); 979 } 980 981 static int __init hugepage_init(void) 982 { 983 int err; 984 struct kobject *hugepage_kobj; 985 986 if (!has_transparent_hugepage()) { 987 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; 988 return -EINVAL; 989 } 990 991 /* 992 * hugepages can't be allocated by the buddy allocator 993 */ 994 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); 995 996 err = hugepage_init_sysfs(&hugepage_kobj); 997 if (err) 998 goto err_sysfs; 999 1000 err = khugepaged_init(); 1001 if (err) 1002 goto err_slab; 1003 1004 err = thp_shrinker_init(); 
1005 if (err) 1006 goto err_shrinker; 1007 1008 /* 1009 * By default disable transparent hugepages on smaller systems, 1010 * where the extra memory used could hurt more than TLB overhead 1011 * is likely to save. The admin can still enable it through /sys. 1012 */ 1013 if (totalram_pages() < MB_TO_PAGES(512)) { 1014 transparent_hugepage_flags = 0; 1015 return 0; 1016 } 1017 1018 err = start_stop_khugepaged(); 1019 if (err) 1020 goto err_khugepaged; 1021 1022 return 0; 1023 err_khugepaged: 1024 thp_shrinker_exit(); 1025 err_shrinker: 1026 khugepaged_destroy(); 1027 err_slab: 1028 hugepage_exit_sysfs(hugepage_kobj); 1029 err_sysfs: 1030 return err; 1031 } 1032 subsys_initcall(hugepage_init); 1033 1034 static int __init setup_transparent_hugepage(char *str) 1035 { 1036 int ret = 0; 1037 if (!str) 1038 goto out; 1039 if (!strcmp(str, "always")) { 1040 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 1041 &transparent_hugepage_flags); 1042 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 1043 &transparent_hugepage_flags); 1044 ret = 1; 1045 } else if (!strcmp(str, "madvise")) { 1046 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 1047 &transparent_hugepage_flags); 1048 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 1049 &transparent_hugepage_flags); 1050 ret = 1; 1051 } else if (!strcmp(str, "never")) { 1052 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 1053 &transparent_hugepage_flags); 1054 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 1055 &transparent_hugepage_flags); 1056 ret = 1; 1057 } 1058 out: 1059 if (!ret) 1060 pr_warn("transparent_hugepage= cannot parse, ignored\n"); 1061 return ret; 1062 } 1063 __setup("transparent_hugepage=", setup_transparent_hugepage); 1064 1065 static char str_dup[PAGE_SIZE] __initdata; 1066 static int __init setup_thp_anon(char *str) 1067 { 1068 char *token, *range, *policy, *subtoken; 1069 unsigned long always, inherit, madvise; 1070 char *start_size, *end_size; 1071 int start, end, nr; 1072 char *p; 1073 1074 if (!str || strlen(str) + 1 > PAGE_SIZE) 1075 goto err; 1076 
strscpy(str_dup, str); 1077 1078 always = huge_anon_orders_always; 1079 madvise = huge_anon_orders_madvise; 1080 inherit = huge_anon_orders_inherit; 1081 p = str_dup; 1082 while ((token = strsep(&p, ";")) != NULL) { 1083 range = strsep(&token, ":"); 1084 policy = token; 1085 1086 if (!policy) 1087 goto err; 1088 1089 while ((subtoken = strsep(&range, ",")) != NULL) { 1090 if (strchr(subtoken, '-')) { 1091 start_size = strsep(&subtoken, "-"); 1092 end_size = subtoken; 1093 1094 start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON); 1095 end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON); 1096 } else { 1097 start_size = end_size = subtoken; 1098 start = end = get_order_from_str(subtoken, 1099 THP_ORDERS_ALL_ANON); 1100 } 1101 1102 if (start == -EINVAL) { 1103 pr_err("invalid size %s in thp_anon boot parameter\n", start_size); 1104 goto err; 1105 } 1106 1107 if (end == -EINVAL) { 1108 pr_err("invalid size %s in thp_anon boot parameter\n", end_size); 1109 goto err; 1110 } 1111 1112 if (start < 0 || end < 0 || start > end) 1113 goto err; 1114 1115 nr = end - start + 1; 1116 if (!strcmp(policy, "always")) { 1117 bitmap_set(&always, start, nr); 1118 bitmap_clear(&inherit, start, nr); 1119 bitmap_clear(&madvise, start, nr); 1120 } else if (!strcmp(policy, "madvise")) { 1121 bitmap_set(&madvise, start, nr); 1122 bitmap_clear(&inherit, start, nr); 1123 bitmap_clear(&always, start, nr); 1124 } else if (!strcmp(policy, "inherit")) { 1125 bitmap_set(&inherit, start, nr); 1126 bitmap_clear(&madvise, start, nr); 1127 bitmap_clear(&always, start, nr); 1128 } else if (!strcmp(policy, "never")) { 1129 bitmap_clear(&inherit, start, nr); 1130 bitmap_clear(&madvise, start, nr); 1131 bitmap_clear(&always, start, nr); 1132 } else { 1133 pr_err("invalid policy %s in thp_anon boot parameter\n", policy); 1134 goto err; 1135 } 1136 } 1137 } 1138 1139 huge_anon_orders_always = always; 1140 huge_anon_orders_madvise = madvise; 1141 huge_anon_orders_inherit = inherit; 1142 
anon_orders_configured = true; 1143 return 1; 1144 1145 err: 1146 pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str); 1147 return 0; 1148 } 1149 __setup("thp_anon=", setup_thp_anon); 1150 1151 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 1152 { 1153 if (likely(vma->vm_flags & VM_WRITE)) 1154 pmd = pmd_mkwrite(pmd, vma); 1155 return pmd; 1156 } 1157 1158 static struct deferred_split *split_queue_node(int nid) 1159 { 1160 struct pglist_data *pgdata = NODE_DATA(nid); 1161 1162 return &pgdata->deferred_split_queue; 1163 } 1164 1165 #ifdef CONFIG_MEMCG 1166 static inline 1167 struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, 1168 struct deferred_split *queue) 1169 { 1170 if (mem_cgroup_disabled()) 1171 return NULL; 1172 if (split_queue_node(folio_nid(folio)) == queue) 1173 return NULL; 1174 return container_of(queue, struct mem_cgroup, deferred_split_queue); 1175 } 1176 1177 static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) 1178 { 1179 return memcg ? &memcg->deferred_split_queue : split_queue_node(nid); 1180 } 1181 #else 1182 static inline 1183 struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, 1184 struct deferred_split *queue) 1185 { 1186 return NULL; 1187 } 1188 1189 static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) 1190 { 1191 return split_queue_node(nid); 1192 } 1193 #endif 1194 1195 static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) 1196 { 1197 struct deferred_split *queue; 1198 1199 retry: 1200 queue = memcg_split_queue(nid, memcg); 1201 spin_lock(&queue->split_queue_lock); 1202 /* 1203 * There is a period between setting memcg to dying and reparenting 1204 * deferred split queue, and during this period the THPs in the deferred 1205 * split queue will be hidden from the shrinker side. 
1206 */ 1207 if (unlikely(memcg_is_dying(memcg))) { 1208 spin_unlock(&queue->split_queue_lock); 1209 memcg = parent_mem_cgroup(memcg); 1210 goto retry; 1211 } 1212 1213 return queue; 1214 } 1215 1216 static struct deferred_split * 1217 split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) 1218 { 1219 struct deferred_split *queue; 1220 1221 retry: 1222 queue = memcg_split_queue(nid, memcg); 1223 spin_lock_irqsave(&queue->split_queue_lock, *flags); 1224 if (unlikely(memcg_is_dying(memcg))) { 1225 spin_unlock_irqrestore(&queue->split_queue_lock, *flags); 1226 memcg = parent_mem_cgroup(memcg); 1227 goto retry; 1228 } 1229 1230 return queue; 1231 } 1232 1233 static struct deferred_split *folio_split_queue_lock(struct folio *folio) 1234 { 1235 struct deferred_split *queue; 1236 1237 rcu_read_lock(); 1238 queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); 1239 /* 1240 * The memcg destruction path is acquiring the split queue lock for 1241 * reparenting. Once you have it locked, it's safe to drop the rcu lock. 
1242 */ 1243 rcu_read_unlock(); 1244 1245 return queue; 1246 } 1247 1248 static struct deferred_split * 1249 folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) 1250 { 1251 struct deferred_split *queue; 1252 1253 rcu_read_lock(); 1254 queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); 1255 rcu_read_unlock(); 1256 1257 return queue; 1258 } 1259 1260 static inline void split_queue_unlock(struct deferred_split *queue) 1261 { 1262 spin_unlock(&queue->split_queue_lock); 1263 } 1264 1265 static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, 1266 unsigned long flags) 1267 { 1268 spin_unlock_irqrestore(&queue->split_queue_lock, flags); 1269 } 1270 1271 static inline bool is_transparent_hugepage(const struct folio *folio) 1272 { 1273 if (!folio_test_large(folio)) 1274 return false; 1275 1276 return is_huge_zero_folio(folio) || 1277 folio_test_large_rmappable(folio); 1278 } 1279 1280 static unsigned long __thp_get_unmapped_area(struct file *filp, 1281 unsigned long addr, unsigned long len, 1282 loff_t off, unsigned long flags, unsigned long size, 1283 vm_flags_t vm_flags) 1284 { 1285 loff_t off_end = off + len; 1286 loff_t off_align = round_up(off, size); 1287 unsigned long len_pad, ret, off_sub; 1288 1289 if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) 1290 return 0; 1291 1292 if (off_end <= off_align || (off_end - off_align) < size) 1293 return 0; 1294 1295 len_pad = len + size; 1296 if (len_pad < len || (off + len_pad) < off) 1297 return 0; 1298 1299 ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, 1300 off >> PAGE_SHIFT, flags, vm_flags); 1301 1302 /* 1303 * The failure might be due to length padding. The caller will retry 1304 * without the padding. 1305 */ 1306 if (IS_ERR_VALUE(ret)) 1307 return 0; 1308 1309 /* 1310 * Do not try to align to THP boundary if allocation at the address 1311 * hint succeeds. 
1312 */ 1313 if (ret == addr) 1314 return addr; 1315 1316 off_sub = (off - ret) & (size - 1); 1317 1318 if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) 1319 return ret + size; 1320 1321 ret += off_sub; 1322 return ret; 1323 } 1324 1325 unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, 1326 unsigned long len, unsigned long pgoff, unsigned long flags, 1327 vm_flags_t vm_flags) 1328 { 1329 unsigned long ret; 1330 loff_t off = (loff_t)pgoff << PAGE_SHIFT; 1331 1332 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags); 1333 if (ret) 1334 return ret; 1335 1336 return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 1337 vm_flags); 1338 } 1339 1340 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, 1341 unsigned long len, unsigned long pgoff, unsigned long flags) 1342 { 1343 return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); 1344 } 1345 EXPORT_SYMBOL_GPL(thp_get_unmapped_area); 1346 1347 static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, 1348 unsigned long addr) 1349 { 1350 gfp_t gfp = vma_thp_gfp_mask(vma); 1351 const int order = HPAGE_PMD_ORDER; 1352 struct folio *folio; 1353 1354 folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK); 1355 1356 if (unlikely(!folio)) { 1357 count_vm_event(THP_FAULT_FALLBACK); 1358 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); 1359 return NULL; 1360 } 1361 1362 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 1363 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { 1364 folio_put(folio); 1365 count_vm_event(THP_FAULT_FALLBACK); 1366 count_vm_event(THP_FAULT_FALLBACK_CHARGE); 1367 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); 1368 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); 1369 return NULL; 1370 } 1371 folio_throttle_swaprate(folio, gfp); 1372 1373 /* 1374 * When a folio is not zeroed during allocation (__GFP_ZERO not used) 1375 * or user folios 
require special handling, folio_zero_user() is used to 1376 * make sure that the page corresponding to the faulting address will be 1377 * hot in the cache after zeroing. 1378 */ 1379 if (user_alloc_needs_zeroing()) 1380 folio_zero_user(folio, addr); 1381 /* 1382 * The memory barrier inside __folio_mark_uptodate makes sure that 1383 * folio_zero_user writes become visible before the set_pmd_at() 1384 * write. 1385 */ 1386 __folio_mark_uptodate(folio); 1387 return folio; 1388 } 1389 1390 void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, 1391 struct vm_area_struct *vma, unsigned long haddr) 1392 { 1393 pmd_t entry; 1394 1395 entry = folio_mk_pmd(folio, vma->vm_page_prot); 1396 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1397 folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); 1398 folio_add_lru_vma(folio, vma); 1399 set_pmd_at(vma->vm_mm, haddr, pmd, entry); 1400 update_mmu_cache_pmd(vma, haddr, pmd); 1401 deferred_split_folio(folio, false); 1402 } 1403 1404 static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd, 1405 struct vm_area_struct *vma, unsigned long haddr) 1406 { 1407 map_anon_folio_pmd_nopf(folio, pmd, vma, haddr); 1408 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1409 count_vm_event(THP_FAULT_ALLOC); 1410 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); 1411 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); 1412 } 1413 1414 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) 1415 { 1416 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1417 struct vm_area_struct *vma = vmf->vma; 1418 struct folio *folio; 1419 pgtable_t pgtable; 1420 vm_fault_t ret = 0; 1421 1422 folio = vma_alloc_anon_folio_pmd(vma, vmf->address); 1423 if (unlikely(!folio)) 1424 return VM_FAULT_FALLBACK; 1425 1426 pgtable = pte_alloc_one(vma->vm_mm); 1427 if (unlikely(!pgtable)) { 1428 ret = VM_FAULT_OOM; 1429 goto release; 1430 } 1431 1432 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1433 if 
(unlikely(!pmd_none(*vmf->pmd))) { 1434 goto unlock_release; 1435 } else { 1436 ret = check_stable_address_space(vma->vm_mm); 1437 if (ret) 1438 goto unlock_release; 1439 1440 /* Deliver the page fault to userland */ 1441 if (userfaultfd_missing(vma)) { 1442 spin_unlock(vmf->ptl); 1443 folio_put(folio); 1444 pte_free(vma->vm_mm, pgtable); 1445 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1446 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1447 return ret; 1448 } 1449 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 1450 map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); 1451 mm_inc_nr_ptes(vma->vm_mm); 1452 spin_unlock(vmf->ptl); 1453 } 1454 1455 return 0; 1456 unlock_release: 1457 spin_unlock(vmf->ptl); 1458 release: 1459 if (pgtable) 1460 pte_free(vma->vm_mm, pgtable); 1461 folio_put(folio); 1462 return ret; 1463 1464 } 1465 1466 vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) 1467 { 1468 struct vm_area_struct *vma = vmf->vma; 1469 vm_fault_t ret = 0; 1470 spinlock_t *ptl; 1471 softleaf_t entry; 1472 struct page *page; 1473 struct folio *folio; 1474 1475 if (vmf->flags & FAULT_FLAG_VMA_LOCK) { 1476 vma_end_read(vma); 1477 return VM_FAULT_RETRY; 1478 } 1479 1480 ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1481 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { 1482 spin_unlock(ptl); 1483 return 0; 1484 } 1485 1486 entry = softleaf_from_pmd(vmf->orig_pmd); 1487 page = softleaf_to_page(entry); 1488 folio = page_folio(page); 1489 vmf->page = page; 1490 vmf->pte = NULL; 1491 if (folio_trylock(folio)) { 1492 folio_get(folio); 1493 spin_unlock(ptl); 1494 ret = page_pgmap(page)->ops->migrate_to_ram(vmf); 1495 folio_unlock(folio); 1496 folio_put(folio); 1497 } else { 1498 spin_unlock(ptl); 1499 } 1500 1501 return ret; 1502 } 1503 1504 /* 1505 * always: directly stall for all thp allocations 1506 * defer: wake kswapd and fail if not immediately available 1507 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise 1508 * fail if not 
immediately available 1509 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately 1510 * available 1511 * never: never stall for any thp allocation 1512 */ 1513 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) 1514 { 1515 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); 1516 1517 /* Always do synchronous compaction */ 1518 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 1519 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 1520 1521 /* Kick kcompactd and fail quickly */ 1522 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 1523 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 1524 1525 /* Synchronous compaction if madvised, otherwise kick kcompactd */ 1526 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 1527 return GFP_TRANSHUGE_LIGHT | 1528 (vma_madvised ? __GFP_DIRECT_RECLAIM : 1529 __GFP_KSWAPD_RECLAIM); 1530 1531 /* Only do synchronous compaction if madvised */ 1532 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 1533 return GFP_TRANSHUGE_LIGHT | 1534 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); 1535 1536 return GFP_TRANSHUGE_LIGHT; 1537 } 1538 1539 /* Caller must hold page table lock. 
*/ 1540 static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, 1541 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 1542 struct folio *zero_folio) 1543 { 1544 pmd_t entry; 1545 entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); 1546 entry = pmd_mkspecial(entry); 1547 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1548 set_pmd_at(mm, haddr, pmd, entry); 1549 mm_inc_nr_ptes(mm); 1550 } 1551 1552 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) 1553 { 1554 struct vm_area_struct *vma = vmf->vma; 1555 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1556 vm_fault_t ret; 1557 1558 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) 1559 return VM_FAULT_FALLBACK; 1560 ret = vmf_anon_prepare(vmf); 1561 if (ret) 1562 return ret; 1563 khugepaged_enter_vma(vma, vma->vm_flags); 1564 1565 if (!(vmf->flags & FAULT_FLAG_WRITE) && 1566 !mm_forbids_zeropage(vma->vm_mm) && 1567 transparent_hugepage_use_zero_page()) { 1568 pgtable_t pgtable; 1569 struct folio *zero_folio; 1570 vm_fault_t ret; 1571 1572 pgtable = pte_alloc_one(vma->vm_mm); 1573 if (unlikely(!pgtable)) 1574 return VM_FAULT_OOM; 1575 zero_folio = mm_get_huge_zero_folio(vma->vm_mm); 1576 if (unlikely(!zero_folio)) { 1577 pte_free(vma->vm_mm, pgtable); 1578 count_vm_event(THP_FAULT_FALLBACK); 1579 return VM_FAULT_FALLBACK; 1580 } 1581 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1582 ret = 0; 1583 if (pmd_none(*vmf->pmd)) { 1584 ret = check_stable_address_space(vma->vm_mm); 1585 if (ret) { 1586 spin_unlock(vmf->ptl); 1587 pte_free(vma->vm_mm, pgtable); 1588 } else if (userfaultfd_missing(vma)) { 1589 spin_unlock(vmf->ptl); 1590 pte_free(vma->vm_mm, pgtable); 1591 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1592 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1593 } else { 1594 set_huge_zero_folio(pgtable, vma->vm_mm, vma, 1595 haddr, vmf->pmd, zero_folio); 1596 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1597 spin_unlock(vmf->ptl); 1598 } 1599 } else { 1600 spin_unlock(vmf->ptl); 
1601 pte_free(vma->vm_mm, pgtable); 1602 } 1603 return ret; 1604 } 1605 1606 return __do_huge_pmd_anonymous_page(vmf); 1607 } 1608 1609 struct folio_or_pfn { 1610 union { 1611 struct folio *folio; 1612 unsigned long pfn; 1613 }; 1614 bool is_folio; 1615 }; 1616 1617 static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, 1618 pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, 1619 bool write) 1620 { 1621 struct mm_struct *mm = vma->vm_mm; 1622 pgtable_t pgtable = NULL; 1623 spinlock_t *ptl; 1624 pmd_t entry; 1625 1626 if (addr < vma->vm_start || addr >= vma->vm_end) 1627 return VM_FAULT_SIGBUS; 1628 1629 if (arch_needs_pgtable_deposit()) { 1630 pgtable = pte_alloc_one(vma->vm_mm); 1631 if (!pgtable) 1632 return VM_FAULT_OOM; 1633 } 1634 1635 ptl = pmd_lock(mm, pmd); 1636 if (!pmd_none(*pmd)) { 1637 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : 1638 fop.pfn; 1639 1640 if (write) { 1641 if (pmd_pfn(*pmd) != pfn) { 1642 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); 1643 goto out_unlock; 1644 } 1645 entry = pmd_mkyoung(*pmd); 1646 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1647 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) 1648 update_mmu_cache_pmd(vma, addr, pmd); 1649 } 1650 goto out_unlock; 1651 } 1652 1653 if (fop.is_folio) { 1654 entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); 1655 1656 if (is_huge_zero_folio(fop.folio)) { 1657 entry = pmd_mkspecial(entry); 1658 } else { 1659 folio_get(fop.folio); 1660 folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); 1661 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); 1662 } 1663 } else { 1664 entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); 1665 entry = pmd_mkspecial(entry); 1666 } 1667 if (write) { 1668 entry = pmd_mkyoung(pmd_mkdirty(entry)); 1669 entry = maybe_pmd_mkwrite(entry, vma); 1670 } 1671 1672 if (pgtable) { 1673 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1674 mm_inc_nr_ptes(mm); 1675 pgtable = NULL; 1676 } 1677 1678 set_pmd_at(mm, addr, pmd, 
entry); 1679 update_mmu_cache_pmd(vma, addr, pmd); 1680 1681 out_unlock: 1682 spin_unlock(ptl); 1683 if (pgtable) 1684 pte_free(mm, pgtable); 1685 return VM_FAULT_NOPAGE; 1686 } 1687 1688 /** 1689 * vmf_insert_pfn_pmd - insert a pmd size pfn 1690 * @vmf: Structure describing the fault 1691 * @pfn: pfn to insert 1692 * @write: whether it's a write fault 1693 * 1694 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. 1695 * 1696 * Return: vm_fault_t value. 1697 */ 1698 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, 1699 bool write) 1700 { 1701 unsigned long addr = vmf->address & PMD_MASK; 1702 struct vm_area_struct *vma = vmf->vma; 1703 pgprot_t pgprot = vma->vm_page_prot; 1704 struct folio_or_pfn fop = { 1705 .pfn = pfn, 1706 }; 1707 1708 /* 1709 * If we had pmd_special, we could avoid all these restrictions, 1710 * but we need to be consistent with PTEs and architectures that 1711 * can't support a 'special' bit. 1712 */ 1713 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1714 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1715 (VM_PFNMAP|VM_MIXEDMAP)); 1716 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1717 1718 pfnmap_setup_cachemode_pfn(pfn, &pgprot); 1719 1720 return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write); 1721 } 1722 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 1723 1724 vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, 1725 bool write) 1726 { 1727 struct vm_area_struct *vma = vmf->vma; 1728 unsigned long addr = vmf->address & PMD_MASK; 1729 struct folio_or_pfn fop = { 1730 .folio = folio, 1731 .is_folio = true, 1732 }; 1733 1734 if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) 1735 return VM_FAULT_SIGBUS; 1736 1737 return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write); 1738 } 1739 EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); 1740 1741 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1742 static pud_t maybe_pud_mkwrite(pud_t pud, struct 
vm_area_struct *vma) 1743 { 1744 if (likely(vma->vm_flags & VM_WRITE)) 1745 pud = pud_mkwrite(pud); 1746 return pud; 1747 } 1748 1749 static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr, 1750 pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) 1751 { 1752 struct mm_struct *mm = vma->vm_mm; 1753 spinlock_t *ptl; 1754 pud_t entry; 1755 1756 if (addr < vma->vm_start || addr >= vma->vm_end) 1757 return VM_FAULT_SIGBUS; 1758 1759 ptl = pud_lock(mm, pud); 1760 if (!pud_none(*pud)) { 1761 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : 1762 fop.pfn; 1763 1764 if (write) { 1765 if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) 1766 goto out_unlock; 1767 entry = pud_mkyoung(*pud); 1768 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); 1769 if (pudp_set_access_flags(vma, addr, pud, entry, 1)) 1770 update_mmu_cache_pud(vma, addr, pud); 1771 } 1772 goto out_unlock; 1773 } 1774 1775 if (fop.is_folio) { 1776 entry = folio_mk_pud(fop.folio, vma->vm_page_prot); 1777 1778 folio_get(fop.folio); 1779 folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma); 1780 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR); 1781 } else { 1782 entry = pud_mkhuge(pfn_pud(fop.pfn, prot)); 1783 entry = pud_mkspecial(entry); 1784 } 1785 if (write) { 1786 entry = pud_mkyoung(pud_mkdirty(entry)); 1787 entry = maybe_pud_mkwrite(entry, vma); 1788 } 1789 set_pud_at(mm, addr, pud, entry); 1790 update_mmu_cache_pud(vma, addr, pud); 1791 out_unlock: 1792 spin_unlock(ptl); 1793 return VM_FAULT_NOPAGE; 1794 } 1795 1796 /** 1797 * vmf_insert_pfn_pud - insert a pud size pfn 1798 * @vmf: Structure describing the fault 1799 * @pfn: pfn to insert 1800 * @write: whether it's a write fault 1801 * 1802 * Insert a pud size pfn. See vmf_insert_pfn() for additional info. 1803 * 1804 * Return: vm_fault_t value. 
1805 */ 1806 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, 1807 bool write) 1808 { 1809 unsigned long addr = vmf->address & PUD_MASK; 1810 struct vm_area_struct *vma = vmf->vma; 1811 pgprot_t pgprot = vma->vm_page_prot; 1812 struct folio_or_pfn fop = { 1813 .pfn = pfn, 1814 }; 1815 1816 /* 1817 * If we had pud_special, we could avoid all these restrictions, 1818 * but we need to be consistent with PTEs and architectures that 1819 * can't support a 'special' bit. 1820 */ 1821 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1822 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1823 (VM_PFNMAP|VM_MIXEDMAP)); 1824 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1825 1826 pfnmap_setup_cachemode_pfn(pfn, &pgprot); 1827 1828 return insert_pud(vma, addr, vmf->pud, fop, pgprot, write); 1829 } 1830 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 1831 1832 /** 1833 * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry 1834 * @vmf: Structure describing the fault 1835 * @folio: folio to insert 1836 * @write: whether it's a write fault 1837 * 1838 * Return: vm_fault_t value. 
1839 */ 1840 vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, 1841 bool write) 1842 { 1843 struct vm_area_struct *vma = vmf->vma; 1844 unsigned long addr = vmf->address & PUD_MASK; 1845 struct folio_or_pfn fop = { 1846 .folio = folio, 1847 .is_folio = true, 1848 }; 1849 1850 if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) 1851 return VM_FAULT_SIGBUS; 1852 1853 return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); 1854 } 1855 EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); 1856 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1857 1858 /** 1859 * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) 1860 * @vma: The VMA covering @addr 1861 * @addr: The virtual address 1862 * @pmd: pmd pointer into the page table mapping @addr 1863 * @write: Whether it's a write access 1864 * 1865 * Return: whether the pmd entry is changed 1866 */ 1867 bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1868 pmd_t *pmd, bool write) 1869 { 1870 pmd_t entry; 1871 1872 entry = pmd_mkyoung(*pmd); 1873 if (write) 1874 entry = pmd_mkdirty(entry); 1875 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1876 pmd, entry, write)) { 1877 update_mmu_cache_pmd(vma, addr, pmd); 1878 return true; 1879 } 1880 1881 return false; 1882 } 1883 1884 static void copy_huge_non_present_pmd( 1885 struct mm_struct *dst_mm, struct mm_struct *src_mm, 1886 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1887 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 1888 pmd_t pmd, pgtable_t pgtable) 1889 { 1890 softleaf_t entry = softleaf_from_pmd(pmd); 1891 struct folio *src_folio; 1892 1893 VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd)); 1894 1895 if (softleaf_is_migration_write(entry) || 1896 softleaf_is_migration_read_exclusive(entry)) { 1897 entry = make_readable_migration_entry(swp_offset(entry)); 1898 pmd = swp_entry_to_pmd(entry); 1899 if (pmd_swp_soft_dirty(*src_pmd)) 1900 pmd = pmd_swp_mksoft_dirty(pmd); 1901 if 
(pmd_swp_uffd_wp(*src_pmd)) 1902 pmd = pmd_swp_mkuffd_wp(pmd); 1903 set_pmd_at(src_mm, addr, src_pmd, pmd); 1904 } else if (softleaf_is_device_private(entry)) { 1905 /* 1906 * For device private entries, since there are no 1907 * read exclusive entries, writable = !readable 1908 */ 1909 if (softleaf_is_device_private_write(entry)) { 1910 entry = make_readable_device_private_entry(swp_offset(entry)); 1911 pmd = swp_entry_to_pmd(entry); 1912 1913 if (pmd_swp_soft_dirty(*src_pmd)) 1914 pmd = pmd_swp_mksoft_dirty(pmd); 1915 if (pmd_swp_uffd_wp(*src_pmd)) 1916 pmd = pmd_swp_mkuffd_wp(pmd); 1917 set_pmd_at(src_mm, addr, src_pmd, pmd); 1918 } 1919 1920 src_folio = softleaf_to_folio(entry); 1921 VM_WARN_ON(!folio_test_large(src_folio)); 1922 1923 folio_get(src_folio); 1924 /* 1925 * folio_try_dup_anon_rmap_pmd does not fail for 1926 * device private entries. 1927 */ 1928 folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, 1929 dst_vma, src_vma); 1930 } 1931 1932 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1933 mm_inc_nr_ptes(dst_mm); 1934 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1935 if (!userfaultfd_wp(dst_vma)) 1936 pmd = pmd_swp_clear_uffd_wp(pmd); 1937 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1938 } 1939 1940 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1941 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1942 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) 1943 { 1944 spinlock_t *dst_ptl, *src_ptl; 1945 struct page *src_page; 1946 struct folio *src_folio; 1947 pmd_t pmd; 1948 pgtable_t pgtable = NULL; 1949 int ret = -ENOMEM; 1950 1951 pmd = pmdp_get_lockless(src_pmd); 1952 if (unlikely(pmd_present(pmd) && pmd_special(pmd) && 1953 !is_huge_zero_pmd(pmd))) { 1954 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1955 src_ptl = pmd_lockptr(src_mm, src_pmd); 1956 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1957 /* 1958 * No need to recheck the pmd, it can't change with write 1959 * mmap lock held here. 
1960 * 1961 * Meanwhile, making sure it's not a CoW VMA with writable 1962 * mapping, otherwise it means either the anon page wrongly 1963 * applied special bit, or we made the PRIVATE mapping be 1964 * able to wrongly write to the backend MMIO. 1965 */ 1966 VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); 1967 goto set_pmd; 1968 } 1969 1970 /* Skip if can be re-fill on fault */ 1971 if (!vma_is_anonymous(dst_vma)) 1972 return 0; 1973 1974 pgtable = pte_alloc_one(dst_mm); 1975 if (unlikely(!pgtable)) 1976 goto out; 1977 1978 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1979 src_ptl = pmd_lockptr(src_mm, src_pmd); 1980 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1981 1982 ret = -EAGAIN; 1983 pmd = *src_pmd; 1984 1985 if (unlikely(thp_migration_supported() && 1986 pmd_is_valid_softleaf(pmd))) { 1987 copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, 1988 dst_vma, src_vma, pmd, pgtable); 1989 ret = 0; 1990 goto out_unlock; 1991 } 1992 1993 if (unlikely(!pmd_trans_huge(pmd))) { 1994 pte_free(dst_mm, pgtable); 1995 goto out_unlock; 1996 } 1997 /* 1998 * When page table lock is held, the huge zero pmd should not be 1999 * under splitting since we don't split the page itself, only pmd to 2000 * a page table. 2001 */ 2002 if (is_huge_zero_pmd(pmd)) { 2003 /* 2004 * mm_get_huge_zero_folio() will never allocate a new 2005 * folio here, since we already have a zero page to 2006 * copy. It just takes a reference. 2007 */ 2008 mm_get_huge_zero_folio(dst_mm); 2009 goto out_zero_page; 2010 } 2011 2012 src_page = pmd_page(pmd); 2013 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 2014 src_folio = page_folio(src_page); 2015 2016 folio_get(src_folio); 2017 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) { 2018 /* Page maybe pinned: split and retry the fault on PTEs. 
*/ 2019 folio_put(src_folio); 2020 pte_free(dst_mm, pgtable); 2021 spin_unlock(src_ptl); 2022 spin_unlock(dst_ptl); 2023 __split_huge_pmd(src_vma, src_pmd, addr, false); 2024 return -EAGAIN; 2025 } 2026 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 2027 out_zero_page: 2028 mm_inc_nr_ptes(dst_mm); 2029 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 2030 pmdp_set_wrprotect(src_mm, addr, src_pmd); 2031 if (!userfaultfd_wp(dst_vma)) 2032 pmd = pmd_clear_uffd_wp(pmd); 2033 pmd = pmd_wrprotect(pmd); 2034 set_pmd: 2035 pmd = pmd_mkold(pmd); 2036 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 2037 2038 ret = 0; 2039 out_unlock: 2040 spin_unlock(src_ptl); 2041 spin_unlock(dst_ptl); 2042 out: 2043 return ret; 2044 } 2045 2046 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2047 void touch_pud(struct vm_area_struct *vma, unsigned long addr, 2048 pud_t *pud, bool write) 2049 { 2050 pud_t _pud; 2051 2052 _pud = pud_mkyoung(*pud); 2053 if (write) 2054 _pud = pud_mkdirty(_pud); 2055 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, 2056 pud, _pud, write)) 2057 update_mmu_cache_pud(vma, addr, pud); 2058 } 2059 2060 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, 2061 pud_t *dst_pud, pud_t *src_pud, unsigned long addr, 2062 struct vm_area_struct *vma) 2063 { 2064 spinlock_t *dst_ptl, *src_ptl; 2065 pud_t pud; 2066 int ret; 2067 2068 dst_ptl = pud_lock(dst_mm, dst_pud); 2069 src_ptl = pud_lockptr(src_mm, src_pud); 2070 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 2071 2072 ret = -EAGAIN; 2073 pud = *src_pud; 2074 if (unlikely(!pud_trans_huge(pud))) 2075 goto out_unlock; 2076 2077 /* 2078 * TODO: once we support anonymous pages, use 2079 * folio_try_dup_anon_rmap_*() and split if duplicating fails. 
2080 */ 2081 if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { 2082 pudp_set_wrprotect(src_mm, addr, src_pud); 2083 pud = pud_wrprotect(pud); 2084 } 2085 pud = pud_mkold(pud); 2086 set_pud_at(dst_mm, addr, dst_pud, pud); 2087 2088 ret = 0; 2089 out_unlock: 2090 spin_unlock(src_ptl); 2091 spin_unlock(dst_ptl); 2092 return ret; 2093 } 2094 2095 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) 2096 { 2097 bool write = vmf->flags & FAULT_FLAG_WRITE; 2098 2099 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); 2100 if (unlikely(!pud_same(*vmf->pud, orig_pud))) 2101 goto unlock; 2102 2103 touch_pud(vmf->vma, vmf->address, vmf->pud, write); 2104 unlock: 2105 spin_unlock(vmf->ptl); 2106 } 2107 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2108 2109 bool huge_pmd_set_accessed(struct vm_fault *vmf) 2110 { 2111 bool write = vmf->flags & FAULT_FLAG_WRITE; 2112 2113 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) 2114 return false; 2115 2116 return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); 2117 } 2118 2119 static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) 2120 { 2121 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 2122 struct vm_area_struct *vma = vmf->vma; 2123 struct mmu_notifier_range range; 2124 struct folio *folio; 2125 vm_fault_t ret = 0; 2126 2127 folio = vma_alloc_anon_folio_pmd(vma, vmf->address); 2128 if (unlikely(!folio)) 2129 return VM_FAULT_FALLBACK; 2130 2131 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, 2132 haddr + HPAGE_PMD_SIZE); 2133 mmu_notifier_invalidate_range_start(&range); 2134 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 2135 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) 2136 goto release; 2137 ret = check_stable_address_space(vma->vm_mm); 2138 if (ret) 2139 goto release; 2140 (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); 2141 map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); 2142 goto unlock; 2143 release: 2144 folio_put(folio); 2145 unlock: 2146 
/*
 * do_huge_pmd_wp_page - handle a write-protect or unshare fault on a huge PMD.
 * @vmf: fault descriptor; vmf->orig_pmd is the PMD value the fault was taken
 *       on and vmf->ptl is (re)initialised here.
 *
 * The VMA must already have an anon_vma (asserted below).
 *
 * Three outcomes are possible:
 *  - the huge zero PMD is handed to do_huge_zero_wp_pmd(); its result is
 *    returned unless it asks for fallback;
 *  - the mapped folio is (or can be made) exclusive to this mapping, in which
 *    case the PMD is upgraded in place (skipped for unshare faults) and 0 is
 *    returned;
 *  - otherwise the PMD is split and VM_FAULT_FALLBACK is returned.
 *
 * 0 is also returned when the PMD changed while it was being examined.
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	/* Only look the lock up here; it is taken further down. */
	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd)) {
		/* do_huge_zero_wp_pmd() takes and drops the PT lock itself. */
		vm_fault_t ret = do_huge_zero_wp_pmd(vmf);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;

		/* Fallback to splitting PMD if THP cannot be allocated */
		goto fallback;
	}

	spin_lock(vmf->ptl);

	/* Someone changed the PMD before we got the lock: nothing to do. */
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		/*
		 * The folio lock is contended. Take a reference so the folio
		 * stays around, drop the PT lock, wait for the folio lock,
		 * then retake the PT lock and make sure the PMD is unchanged.
		 */
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 *
	 * The allowed count is one reference, plus one per page while the
	 * folio is in the swapcache.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		/* Sole reference: claim the folio for this VMA. */
		folio_move_anon_rmap(folio, vma);
		SetPageAnonExclusive(page);
		folio_unlock(folio);
reuse:
		/*
		 * Also reached by goto from the PageAnonExclusive() checks
		 * above, with the PT lock held and the folio lock not held.
		 */
		if (unlikely(unshare)) {
			/* Unshare fault: the PMD itself is left untouched. */
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	/* Neither lock is held here; split the PMD and report fallback. */
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false);
	return VM_FAULT_FALLBACK;
}
/*
 * NUMA hinting page fault entry point for trans huge pmds
 *
 * Either migrates the folio mapped by the faulting PMD to the node picked by
 * numa_migrate_check(), or restores the PMD to the VMA's page protection.
 * Every path returns 0.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int nid = NUMA_NO_NODE;
	int target_nid, last_cpupid;
	pmd_t pmd, old_pmd;
	bool writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	old_pmd = pmdp_get(vmf->pmd);

	/* The PMD changed since the fault was taken: nothing to do. */
	if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	pmd = pmd_modify(old_pmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out_map;

	/* From here on nid is valid, so the fault is reported on exit. */
	nid = folio_nid(folio);

	target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
					&last_cpupid);
	if (target_nid == NUMA_NO_NODE)
		goto out_map;
	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
		flags |= TNF_MIGRATE_FAIL;
		goto out_map;
	}
	/* The folio is isolated and isolation code holds a folio reference. */
	spin_unlock(vmf->ptl);
	/* The PT lock is gone, so the writability computed above is stale. */
	writable = false;

	if (!migrate_misplaced_folio(folio, target_nid)) {
		/* Migrated: account the fault against the new node. */
		flags |= TNF_MIGRATED;
		nid = target_nid;
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
		return 0;
	}

	/* Migration failed: retake the PT lock and recheck the PMD. */
	flags |= TNF_MIGRATE_FAIL;
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}
out_map:
	/* Restore the PMD */
	pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);

	/* nid stays NUMA_NO_NODE only when no folio was found above. */
	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
	return 0;
}
2385 */ 2386 if (folio_maybe_mapped_shared(folio)) 2387 goto out; 2388 2389 if (!folio_trylock(folio)) 2390 goto out; 2391 2392 /* 2393 * If user want to discard part-pages of THP, split it so MADV_FREE 2394 * will deactivate only them. 2395 */ 2396 if (next - addr != HPAGE_PMD_SIZE) { 2397 folio_get(folio); 2398 spin_unlock(ptl); 2399 split_folio(folio); 2400 folio_unlock(folio); 2401 folio_put(folio); 2402 goto out_unlocked; 2403 } 2404 2405 if (folio_test_dirty(folio)) 2406 folio_clear_dirty(folio); 2407 folio_unlock(folio); 2408 2409 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 2410 pmdp_invalidate(vma, addr, pmd); 2411 orig_pmd = pmd_mkold(orig_pmd); 2412 orig_pmd = pmd_mkclean(orig_pmd); 2413 2414 set_pmd_at(mm, addr, pmd, orig_pmd); 2415 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2416 } 2417 2418 folio_mark_lazyfree(folio); 2419 ret = true; 2420 out: 2421 spin_unlock(ptl); 2422 out_unlocked: 2423 return ret; 2424 } 2425 2426 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 2427 { 2428 pgtable_t pgtable; 2429 2430 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2431 pte_free(mm, pgtable); 2432 mm_dec_nr_ptes(mm); 2433 } 2434 2435 static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, 2436 pmd_t pmdval, struct folio *folio, bool is_present) 2437 { 2438 const bool is_device_private = folio_is_device_private(folio); 2439 2440 /* Present and device private folios are rmappable. */ 2441 if (is_present || is_device_private) 2442 folio_remove_rmap_pmd(folio, &folio->page, vma); 2443 2444 if (folio_test_anon(folio)) { 2445 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); 2446 } else { 2447 add_mm_counter(mm, mm_counter_file(folio), 2448 -HPAGE_PMD_NR); 2449 2450 if (is_present && pmd_young(pmdval) && 2451 likely(vma_has_recency(vma))) 2452 folio_mark_accessed(folio); 2453 } 2454 2455 /* Device private folios are pinned. 
*/ 2456 if (is_device_private) 2457 folio_put(folio); 2458 } 2459 2460 static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma, 2461 unsigned long addr, pmd_t pmdval, bool is_present) 2462 { 2463 if (is_present) 2464 return vm_normal_folio_pmd(vma, addr, pmdval); 2465 2466 if (!thp_migration_supported()) 2467 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 2468 return pmd_to_softleaf_folio(pmdval); 2469 } 2470 2471 static bool has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval, 2472 struct folio *folio) 2473 { 2474 /* Some architectures require unconditional depositing. */ 2475 if (arch_needs_pgtable_deposit()) 2476 return true; 2477 2478 /* 2479 * Huge zero always deposited except for DAX which handles itself, see 2480 * set_huge_zero_folio(). 2481 */ 2482 if (is_huge_zero_pmd(pmdval)) 2483 return !vma_is_dax(vma); 2484 2485 /* 2486 * Otherwise, only anonymous folios are deposited, see 2487 * __do_huge_pmd_anonymous_page(). 2488 */ 2489 return folio && folio_test_anon(folio); 2490 } 2491 2492 /** 2493 * zap_huge_pmd - Zap a huge THP which is of PMD size. 2494 * @tlb: The MMU gather TLB state associated with the operation. 2495 * @vma: The VMA containing the range to zap. 2496 * @pmd: A pointer to the leaf PMD entry. 2497 * @addr: The virtual address for the range to zap. 2498 * 2499 * Returns: %true on success, %false otherwise. 2500 */ 2501 bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2502 pmd_t *pmd, unsigned long addr) 2503 { 2504 struct mm_struct *mm = tlb->mm; 2505 struct folio *folio = NULL; 2506 bool is_present = false; 2507 bool has_deposit; 2508 spinlock_t *ptl; 2509 pmd_t orig_pmd; 2510 2511 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2512 2513 ptl = __pmd_trans_huge_lock(pmd, vma); 2514 if (!ptl) 2515 return false; 2516 /* 2517 * For architectures like ppc64 we look at deposited pgtable 2518 * when calling pmdp_huge_get_and_clear. 
So do the 2519 * pgtable_trans_huge_withdraw after finishing pmdp related 2520 * operations. 2521 */ 2522 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, 2523 tlb->fullmm); 2524 arch_check_zapped_pmd(vma, orig_pmd); 2525 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2526 2527 is_present = pmd_present(orig_pmd); 2528 folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present); 2529 has_deposit = has_deposited_pgtable(vma, orig_pmd, folio); 2530 if (folio) 2531 zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present); 2532 if (has_deposit) 2533 zap_deposited_table(mm, pmd); 2534 2535 spin_unlock(ptl); 2536 if (is_present && folio) 2537 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); 2538 return true; 2539 } 2540 2541 #ifndef pmd_move_must_withdraw 2542 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 2543 spinlock_t *old_pmd_ptl, 2544 struct vm_area_struct *vma) 2545 { 2546 /* 2547 * With split pmd lock we also need to move preallocated 2548 * PTE page table if new_pmd is on different PMD page table. 2549 * 2550 * We also don't deposit and withdraw tables for file pages. 
2551 */ 2552 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 2553 } 2554 #endif 2555 2556 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 2557 { 2558 if (pgtable_supports_soft_dirty()) { 2559 if (unlikely(pmd_is_migration_entry(pmd))) 2560 pmd = pmd_swp_mksoft_dirty(pmd); 2561 else if (pmd_present(pmd)) 2562 pmd = pmd_mksoft_dirty(pmd); 2563 } 2564 2565 return pmd; 2566 } 2567 2568 static pmd_t clear_uffd_wp_pmd(pmd_t pmd) 2569 { 2570 if (pmd_none(pmd)) 2571 return pmd; 2572 if (pmd_present(pmd)) 2573 pmd = pmd_clear_uffd_wp(pmd); 2574 else 2575 pmd = pmd_swp_clear_uffd_wp(pmd); 2576 2577 return pmd; 2578 } 2579 2580 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 2581 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) 2582 { 2583 spinlock_t *old_ptl, *new_ptl; 2584 pmd_t pmd; 2585 struct mm_struct *mm = vma->vm_mm; 2586 bool force_flush = false; 2587 2588 /* 2589 * The destination pmd shouldn't be established, free_pgtables() 2590 * should have released it; but move_page_tables() might have already 2591 * inserted a page table, if racing against shmem/file collapse. 2592 */ 2593 if (!pmd_none(*new_pmd)) { 2594 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 2595 return false; 2596 } 2597 2598 /* 2599 * We don't have to worry about the ordering of src and dst 2600 * ptlocks because exclusive mmap_lock prevents deadlock. 
2601 */ 2602 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 2603 if (old_ptl) { 2604 new_ptl = pmd_lockptr(mm, new_pmd); 2605 if (new_ptl != old_ptl) 2606 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 2607 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 2608 if (pmd_present(pmd)) 2609 force_flush = true; 2610 VM_BUG_ON(!pmd_none(*new_pmd)); 2611 2612 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 2613 pgtable_t pgtable; 2614 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 2615 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 2616 } 2617 pmd = move_soft_dirty_pmd(pmd); 2618 if (vma_has_uffd_without_event_remap(vma)) 2619 pmd = clear_uffd_wp_pmd(pmd); 2620 set_pmd_at(mm, new_addr, new_pmd, pmd); 2621 if (force_flush) 2622 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 2623 if (new_ptl != old_ptl) 2624 spin_unlock(new_ptl); 2625 spin_unlock(old_ptl); 2626 return true; 2627 } 2628 return false; 2629 } 2630 2631 static void change_non_present_huge_pmd(struct mm_struct *mm, 2632 unsigned long addr, pmd_t *pmd, bool uffd_wp, 2633 bool uffd_wp_resolve) 2634 { 2635 softleaf_t entry = softleaf_from_pmd(*pmd); 2636 const struct folio *folio = softleaf_to_folio(entry); 2637 pmd_t newpmd; 2638 2639 VM_WARN_ON(!pmd_is_valid_softleaf(*pmd)); 2640 if (softleaf_is_migration_write(entry)) { 2641 /* 2642 * A protection check is difficult so 2643 * just be safe and disable write 2644 */ 2645 if (folio_test_anon(folio)) 2646 entry = make_readable_exclusive_migration_entry(swp_offset(entry)); 2647 else 2648 entry = make_readable_migration_entry(swp_offset(entry)); 2649 newpmd = swp_entry_to_pmd(entry); 2650 if (pmd_swp_soft_dirty(*pmd)) 2651 newpmd = pmd_swp_mksoft_dirty(newpmd); 2652 } else if (softleaf_is_device_private_write(entry)) { 2653 entry = make_readable_device_private_entry(swp_offset(entry)); 2654 newpmd = swp_entry_to_pmd(entry); 2655 } else { 2656 newpmd = *pmd; 2657 } 2658 2659 if (uffd_wp) 2660 newpmd = pmd_swp_mkuffd_wp(newpmd); 2661 else 
/*
 * change_huge_pmd - apply a protection change to a huge PMD.
 * @tlb: mmu_gather used to record the page size and any required flush.
 * @vma: VMA containing @addr.
 * @pmd: the huge PMD entry to change.
 * @addr: virtual address mapped by @pmd.
 * @newprot: protection to apply to the entry.
 * @cp_flags: MM_CP_* flags; MM_CP_PROT_NUMA, MM_CP_UFFD_WP,
 *            MM_CP_UFFD_WP_RESOLVE and MM_CP_TRY_CHANGE_WRITABLE are
 *            examined here.
 *
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pmd_t oldpmd, entry;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
	int ret = 1;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	/* Bail out before taking the lock; counts as "unchanged". */
	if (prot_numa && !thp_migration_supported())
		return 1;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;

	/* Non-present (softleaf) entries have their own helper; ret stays 1. */
	if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
		change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
					    uffd_wp_resolve);
		goto unlock;
	}

	if (prot_numa) {

		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (is_huge_zero_pmd(*pmd))
			goto unlock;

		/* Already a NUMA hinting entry: leave it as it is. */
		if (pmd_protnone(*pmd))
			goto unlock;

		if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
					     vma_is_single_threaded_private(vma)))
			goto unlock;
	}
	/*
	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
	 * which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
	 * which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

	entry = pmd_modify(oldpmd, newprot);
	if (uffd_wp)
		entry = pmd_mkuffd_wp(entry);
	else if (uffd_wp_resolve)
		/*
		 * Leave the write bit to be handled by PF interrupt
		 * handler, then things like COW could be properly
		 * handled.
		 */
		entry = pmd_clear_uffd_wp(entry);

	/* See change_pte_range(). */
	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
	    can_change_pmd_writable(vma, addr, entry))
		entry = pmd_mkwrite(entry, vma);

	/* The entry is rewritten from here on, so report a change. */
	ret = HPAGE_PMD_NR;
	set_pmd_at(mm, addr, pmd, entry);

	/* Queue a flush only when the old/new pair requires one. */
	if (huge_pmd_needs_flush(oldpmd, entry))
		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
	spin_unlock(ptl);
	return ret;
}
/*
 * move_pages_huge_pmd - move a PMD-mapped huge page from @src_pmd to @dst_pmd.
 * @mm: mm that owns both page tables.
 * @dst_pmd: destination PMD slot; expected to still hold @dst_pmdval.
 * @src_pmd: source PMD; its PT lock is held on entry.
 * @dst_pmdval: destination PMD value sampled by the caller; must be none.
 * @dst_vma: destination VMA (asserted locked).
 * @src_vma: source VMA (asserted locked).
 * @dst_addr: destination address; must be HPAGE_PMD aligned.
 * @src_addr: source address; must be HPAGE_PMD aligned.
 *
 * Every return path has released the source PT lock.
 *
 * Return:
 *  0       - the mapping was moved;
 *  -EINVAL - @dst_pmdval is not none or an address is misaligned;
 *  -ENOENT - the source is neither a trans-huge PMD nor a migration entry;
 *  -EAGAIN - the source was a migration entry (after waiting for it), or a
 *            PMD changed while the PT lock was dropped;
 *  -EBUSY  - the source page is not anon-exclusive, the folio may be DMA
 *            pinned, or the folio is not an anonymous head folio.
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
			unsigned long dst_addr, unsigned long src_addr)
{
	pmd_t _dst_pmd, src_pmdval;
	struct page *src_page;
	struct folio *src_folio;
	spinlock_t *src_ptl, *dst_ptl;
	pgtable_t src_pgtable;
	struct mmu_notifier_range range;
	int err = 0;

	src_pmdval = *src_pmd;
	src_ptl = pmd_lockptr(mm, src_pmd);

	lockdep_assert_held(src_ptl);
	vma_assert_locked(src_vma);
	vma_assert_locked(dst_vma);

	/* Sanity checks before the operation */
	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
	    WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
		spin_unlock(src_ptl);
		return -EINVAL;
	}

	if (!pmd_trans_huge(src_pmdval)) {
		spin_unlock(src_ptl);
		if (pmd_is_migration_entry(src_pmdval)) {
			/* Wait on the local copy; the caller retries. */
			pmd_migration_entry_wait(mm, &src_pmdval);
			return -EAGAIN;
		}
		return -ENOENT;
	}

	src_page = pmd_page(src_pmdval);

	/* src_folio == NULL marks the huge zero page for the rest of the function. */
	if (!is_huge_zero_pmd(src_pmdval)) {
		if (unlikely(!PageAnonExclusive(src_page))) {
			spin_unlock(src_ptl);
			return -EBUSY;
		}

		src_folio = page_folio(src_page);
		/* Hold a reference across the window where the PT lock is dropped. */
		folio_get(src_folio);
	} else
		src_folio = NULL;

	spin_unlock(src_ptl);

	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
				src_addr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/* Taken with no PT lock held; released after the PT locks below. */
	if (src_folio)
		folio_lock(src_folio);

	dst_ptl = pmd_lockptr(mm, dst_pmd);
	double_pt_lock(src_ptl, dst_ptl);
	/* Both entries must still hold the values sampled earlier. */
	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
		     !pmd_same(*dst_pmd, dst_pmdval))) {
		err = -EAGAIN;
		goto unlock_ptls;
	}
	if (src_folio) {
		if (folio_maybe_dma_pinned(src_folio) ||
		    !PageAnonExclusive(&src_folio->page)) {
			err = -EBUSY;
			goto unlock_ptls;
		}

		if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
		    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
			err = -EBUSY;
			goto unlock_ptls;
		}

		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
		/* Folio got pinned from under us. Put it back and fail the move. */
		if (folio_maybe_dma_pinned(src_folio)) {
			set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
			err = -EBUSY;
			goto unlock_ptls;
		}

		/* Re-home the folio in the destination VMA. */
		folio_move_anon_rmap(src_folio, dst_vma);
		src_folio->index = linear_page_index(dst_vma, dst_addr);

		_dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
		/* Follow mremap() behavior and treat the entry dirty after the move */
		_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
	} else {
		/* Huge zero page: carry the entry over, adjusting only soft-dirty and uffd-wp. */
		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
		_dst_pmd = move_soft_dirty_pmd(src_pmdval);
		_dst_pmd = clear_uffd_wp_pmd(_dst_pmd);
	}
	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

	/* The deposited page table follows the entry to the destination. */
	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
	double_pt_unlock(src_ptl, dst_ptl);
	/* unblock rmap walks */
	if (src_folio)
		folio_unlock(src_folio);
	mmu_notifier_invalidate_range_end(&range);
	/* Drop the reference taken before the PT lock was released. */
	if (src_folio)
		folio_put(src_folio);
	return err;
}
2940 */ 2941 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2942 { 2943 spinlock_t *ptl; 2944 2945 ptl = pmd_lock(vma->vm_mm, pmd); 2946 if (likely(pmd_is_huge(*pmd))) 2947 return ptl; 2948 spin_unlock(ptl); 2949 return NULL; 2950 } 2951 2952 /* 2953 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 2954 * 2955 * Note that if it returns page table lock pointer, this routine returns without 2956 * unlocking page table lock. So callers must unlock it. 2957 */ 2958 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2959 { 2960 spinlock_t *ptl; 2961 2962 ptl = pud_lock(vma->vm_mm, pud); 2963 if (likely(pud_trans_huge(*pud))) 2964 return ptl; 2965 spin_unlock(ptl); 2966 return NULL; 2967 } 2968 2969 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2970 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2971 pud_t *pud, unsigned long addr) 2972 { 2973 spinlock_t *ptl; 2974 pud_t orig_pud; 2975 2976 ptl = __pud_trans_huge_lock(pud, vma); 2977 if (!ptl) 2978 return 0; 2979 2980 orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 2981 arch_check_zapped_pud(vma, orig_pud); 2982 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2983 if (vma_is_special_huge(vma)) { 2984 spin_unlock(ptl); 2985 /* No zero page support yet */ 2986 } else { 2987 struct page *page = NULL; 2988 struct folio *folio; 2989 2990 /* No support for anonymous PUD pages or migration yet */ 2991 VM_WARN_ON_ONCE(vma_is_anonymous(vma) || 2992 !pud_present(orig_pud)); 2993 2994 page = pud_page(orig_pud); 2995 folio = page_folio(page); 2996 folio_remove_rmap_pud(folio, page, vma); 2997 add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR); 2998 2999 spin_unlock(ptl); 3000 tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE); 3001 } 3002 return 1; 3003 } 3004 3005 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 3006 unsigned long haddr) 3007 { 3008 struct folio *folio; 3009 
struct page *page; 3010 pud_t old_pud; 3011 3012 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 3013 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 3014 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 3015 VM_BUG_ON(!pud_trans_huge(*pud)); 3016 3017 count_vm_event(THP_SPLIT_PUD); 3018 3019 old_pud = pudp_huge_clear_flush(vma, haddr, pud); 3020 3021 if (!vma_is_dax(vma)) 3022 return; 3023 3024 page = pud_page(old_pud); 3025 folio = page_folio(page); 3026 3027 if (!folio_test_dirty(folio) && pud_dirty(old_pud)) 3028 folio_mark_dirty(folio); 3029 if (!folio_test_referenced(folio) && pud_young(old_pud)) 3030 folio_set_referenced(folio); 3031 folio_remove_rmap_pud(folio, page, vma); 3032 add_mm_counter(vma->vm_mm, mm_counter_file(folio), 3033 -HPAGE_PUD_NR); 3034 folio_put(folio); 3035 } 3036 3037 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 3038 unsigned long address) 3039 { 3040 spinlock_t *ptl; 3041 struct mmu_notifier_range range; 3042 3043 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 3044 address & HPAGE_PUD_MASK, 3045 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 3046 mmu_notifier_invalidate_range_start(&range); 3047 ptl = pud_lock(vma->vm_mm, pud); 3048 if (unlikely(!pud_trans_huge(*pud))) 3049 goto out; 3050 __split_huge_pud_locked(vma, pud, range.start); 3051 3052 out: 3053 spin_unlock(ptl); 3054 mmu_notifier_invalidate_range_end(&range); 3055 } 3056 #else 3057 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 3058 unsigned long address) 3059 { 3060 } 3061 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 3062 3063 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 3064 unsigned long haddr, pmd_t *pmd) 3065 { 3066 struct mm_struct *mm = vma->vm_mm; 3067 pgtable_t pgtable; 3068 pmd_t _pmd, old_pmd; 3069 unsigned long addr; 3070 pte_t *pte; 3071 int i; 3072 3073 /* 3074 * Leave pmd empty until pte is filled note that it is fine to delay 3075 * notification until mmu_notifier_invalidate_range_end() 
as we are 3076 * replacing a zero pmd write protected page with a zero pte write 3077 * protected page. 3078 * 3079 * See Documentation/mm/mmu_notifier.rst 3080 */ 3081 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 3082 3083 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 3084 pmd_populate(mm, &_pmd, pgtable); 3085 3086 pte = pte_offset_map(&_pmd, haddr); 3087 VM_BUG_ON(!pte); 3088 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 3089 pte_t entry; 3090 3091 entry = pfn_pte(zero_pfn(addr), vma->vm_page_prot); 3092 entry = pte_mkspecial(entry); 3093 if (pmd_uffd_wp(old_pmd)) 3094 entry = pte_mkuffd_wp(entry); 3095 VM_BUG_ON(!pte_none(ptep_get(pte))); 3096 set_pte_at(mm, addr, pte, entry); 3097 pte++; 3098 } 3099 pte_unmap(pte - 1); 3100 smp_wmb(); /* make pte visible before pmd */ 3101 pmd_populate(mm, pmd, pgtable); 3102 } 3103 3104 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 3105 unsigned long haddr, bool freeze) 3106 { 3107 struct mm_struct *mm = vma->vm_mm; 3108 struct folio *folio; 3109 struct page *page; 3110 pgtable_t pgtable; 3111 pmd_t old_pmd, _pmd; 3112 bool soft_dirty, uffd_wp = false, young = false, write = false; 3113 bool anon_exclusive = false, dirty = false; 3114 unsigned long addr; 3115 pte_t *pte; 3116 int i; 3117 3118 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 3119 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 3120 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 3121 3122 VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd)); 3123 3124 count_vm_event(THP_SPLIT_PMD); 3125 3126 if (!vma_is_anonymous(vma)) { 3127 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 3128 /* 3129 * We are going to unmap this huge page. 
So 3130 * just go ahead and zap it 3131 */ 3132 if (arch_needs_pgtable_deposit()) 3133 zap_deposited_table(mm, pmd); 3134 if (vma_is_special_huge(vma)) 3135 return; 3136 if (unlikely(pmd_is_migration_entry(old_pmd))) { 3137 const softleaf_t old_entry = softleaf_from_pmd(old_pmd); 3138 3139 folio = softleaf_to_folio(old_entry); 3140 } else if (is_huge_zero_pmd(old_pmd)) { 3141 return; 3142 } else { 3143 page = pmd_page(old_pmd); 3144 folio = page_folio(page); 3145 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) 3146 folio_mark_dirty(folio); 3147 if (!folio_test_referenced(folio) && pmd_young(old_pmd)) 3148 folio_set_referenced(folio); 3149 folio_remove_rmap_pmd(folio, page, vma); 3150 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 3151 folio_put(folio); 3152 return; 3153 } 3154 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 3155 return; 3156 } 3157 3158 if (is_huge_zero_pmd(*pmd)) { 3159 /* 3160 * FIXME: Do we want to invalidate secondary mmu by calling 3161 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below 3162 * inside __split_huge_pmd() ? 3163 * 3164 * We are going from a zero huge page write protected to zero 3165 * small page also write protected so it does not seems useful 3166 * to invalidate secondary mmu at this time. 
3167 */ 3168 return __split_huge_zero_page_pmd(vma, haddr, pmd); 3169 } 3170 3171 if (pmd_is_migration_entry(*pmd)) { 3172 softleaf_t entry; 3173 3174 old_pmd = *pmd; 3175 entry = softleaf_from_pmd(old_pmd); 3176 page = softleaf_to_page(entry); 3177 folio = page_folio(page); 3178 3179 soft_dirty = pmd_swp_soft_dirty(old_pmd); 3180 uffd_wp = pmd_swp_uffd_wp(old_pmd); 3181 3182 write = softleaf_is_migration_write(entry); 3183 if (PageAnon(page)) 3184 anon_exclusive = softleaf_is_migration_read_exclusive(entry); 3185 young = softleaf_is_migration_young(entry); 3186 dirty = softleaf_is_migration_dirty(entry); 3187 } else if (pmd_is_device_private_entry(*pmd)) { 3188 softleaf_t entry; 3189 3190 old_pmd = *pmd; 3191 entry = softleaf_from_pmd(old_pmd); 3192 page = softleaf_to_page(entry); 3193 folio = page_folio(page); 3194 3195 soft_dirty = pmd_swp_soft_dirty(old_pmd); 3196 uffd_wp = pmd_swp_uffd_wp(old_pmd); 3197 3198 write = softleaf_is_device_private_write(entry); 3199 anon_exclusive = PageAnonExclusive(page); 3200 3201 /* 3202 * Device private THP should be treated the same as regular 3203 * folios w.r.t anon exclusive handling. See the comments for 3204 * folio handling and anon_exclusive below. 3205 */ 3206 if (freeze && anon_exclusive && 3207 folio_try_share_anon_rmap_pmd(folio, page)) 3208 freeze = false; 3209 if (!freeze) { 3210 rmap_t rmap_flags = RMAP_NONE; 3211 3212 folio_ref_add(folio, HPAGE_PMD_NR - 1); 3213 if (anon_exclusive) 3214 rmap_flags |= RMAP_EXCLUSIVE; 3215 3216 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, 3217 vma, haddr, rmap_flags); 3218 } 3219 } else { 3220 /* 3221 * Up to this point the pmd is present and huge and userland has 3222 * the whole access to the hugepage during the split (which 3223 * happens in place). 
If we overwrite the pmd with the not-huge 3224 * version pointing to the pte here (which of course we could if 3225 * all CPUs were bug free), userland could trigger a small page 3226 * size TLB miss on the small sized TLB while the hugepage TLB 3227 * entry is still established in the huge TLB. Some CPU doesn't 3228 * like that. See 3229 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum 3230 * 383 on page 105. Intel should be safe but is also warns that 3231 * it's only safe if the permission and cache attributes of the 3232 * two entries loaded in the two TLB is identical (which should 3233 * be the case here). But it is generally safer to never allow 3234 * small and huge TLB entries for the same virtual address to be 3235 * loaded simultaneously. So instead of doing "pmd_populate(); 3236 * flush_pmd_tlb_range();" we first mark the current pmd 3237 * notpresent (atomically because here the pmd_trans_huge must 3238 * remain set at all times on the pmd until the split is 3239 * complete for this pmd), then we flush the SMP TLB and finally 3240 * we write the non-huge version of the pmd entry with 3241 * pmd_populate. 3242 */ 3243 old_pmd = pmdp_invalidate(vma, haddr, pmd); 3244 page = pmd_page(old_pmd); 3245 folio = page_folio(page); 3246 if (pmd_dirty(old_pmd)) { 3247 dirty = true; 3248 folio_set_dirty(folio); 3249 } 3250 write = pmd_write(old_pmd); 3251 young = pmd_young(old_pmd); 3252 soft_dirty = pmd_soft_dirty(old_pmd); 3253 uffd_wp = pmd_uffd_wp(old_pmd); 3254 3255 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); 3256 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 3257 3258 /* 3259 * Without "freeze", we'll simply split the PMD, propagating the 3260 * PageAnonExclusive() flag for each PTE by setting it for 3261 * each subpage -- no need to (temporarily) clear. 3262 * 3263 * With "freeze" we want to replace mapped pages by 3264 * migration entries right away. 
This is only possible if we 3265 * managed to clear PageAnonExclusive() -- see 3266 * set_pmd_migration_entry(). 3267 * 3268 * In case we cannot clear PageAnonExclusive(), split the PMD 3269 * only and let try_to_migrate_one() fail later. 3270 * 3271 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 3272 */ 3273 anon_exclusive = PageAnonExclusive(page); 3274 if (freeze && anon_exclusive && 3275 folio_try_share_anon_rmap_pmd(folio, page)) 3276 freeze = false; 3277 if (!freeze) { 3278 rmap_t rmap_flags = RMAP_NONE; 3279 3280 folio_ref_add(folio, HPAGE_PMD_NR - 1); 3281 if (anon_exclusive) 3282 rmap_flags |= RMAP_EXCLUSIVE; 3283 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, 3284 vma, haddr, rmap_flags); 3285 } 3286 } 3287 3288 /* 3289 * Withdraw the table only after we mark the pmd entry invalid. 3290 * This's critical for some architectures (Power). 3291 */ 3292 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 3293 pmd_populate(mm, &_pmd, pgtable); 3294 3295 pte = pte_offset_map(&_pmd, haddr); 3296 VM_BUG_ON(!pte); 3297 3298 /* 3299 * Note that NUMA hinting access restrictions are not transferred to 3300 * avoid any possibility of altering permissions across VMAs. 
3301 */ 3302 if (freeze || pmd_is_migration_entry(old_pmd)) { 3303 pte_t entry; 3304 swp_entry_t swp_entry; 3305 3306 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 3307 if (write) 3308 swp_entry = make_writable_migration_entry( 3309 page_to_pfn(page + i)); 3310 else if (anon_exclusive) 3311 swp_entry = make_readable_exclusive_migration_entry( 3312 page_to_pfn(page + i)); 3313 else 3314 swp_entry = make_readable_migration_entry( 3315 page_to_pfn(page + i)); 3316 if (young) 3317 swp_entry = make_migration_entry_young(swp_entry); 3318 if (dirty) 3319 swp_entry = make_migration_entry_dirty(swp_entry); 3320 entry = swp_entry_to_pte(swp_entry); 3321 if (soft_dirty) 3322 entry = pte_swp_mksoft_dirty(entry); 3323 if (uffd_wp) 3324 entry = pte_swp_mkuffd_wp(entry); 3325 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 3326 set_pte_at(mm, addr, pte + i, entry); 3327 } 3328 } else if (pmd_is_device_private_entry(old_pmd)) { 3329 pte_t entry; 3330 swp_entry_t swp_entry; 3331 3332 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 3333 /* 3334 * anon_exclusive was already propagated to the relevant 3335 * pages corresponding to the pte entries when freeze 3336 * is false. 
3337 */ 3338 if (write) 3339 swp_entry = make_writable_device_private_entry( 3340 page_to_pfn(page + i)); 3341 else 3342 swp_entry = make_readable_device_private_entry( 3343 page_to_pfn(page + i)); 3344 /* 3345 * Young and dirty bits are not progated via swp_entry 3346 */ 3347 entry = swp_entry_to_pte(swp_entry); 3348 if (soft_dirty) 3349 entry = pte_swp_mksoft_dirty(entry); 3350 if (uffd_wp) 3351 entry = pte_swp_mkuffd_wp(entry); 3352 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 3353 set_pte_at(mm, addr, pte + i, entry); 3354 } 3355 } else { 3356 pte_t entry; 3357 3358 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); 3359 if (write) 3360 entry = pte_mkwrite(entry, vma); 3361 if (!young) 3362 entry = pte_mkold(entry); 3363 /* NOTE: this may set soft-dirty too on some archs */ 3364 if (dirty) 3365 entry = pte_mkdirty(entry); 3366 if (soft_dirty) 3367 entry = pte_mksoft_dirty(entry); 3368 if (uffd_wp) 3369 entry = pte_mkuffd_wp(entry); 3370 3371 for (i = 0; i < HPAGE_PMD_NR; i++) 3372 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 3373 3374 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); 3375 } 3376 pte_unmap(pte); 3377 3378 if (!pmd_is_migration_entry(*pmd)) 3379 folio_remove_rmap_pmd(folio, page, vma); 3380 if (freeze) 3381 put_page(page); 3382 3383 smp_wmb(); /* make pte visible before pmd */ 3384 pmd_populate(mm, pmd, pgtable); 3385 } 3386 3387 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, 3388 pmd_t *pmd, bool freeze) 3389 { 3390 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); 3391 if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd)) 3392 __split_huge_pmd_locked(vma, pmd, address, freeze); 3393 } 3394 3395 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 3396 unsigned long address, bool freeze) 3397 { 3398 spinlock_t *ptl; 3399 struct mmu_notifier_range range; 3400 3401 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 3402 address & HPAGE_PMD_MASK, 3403 (address & HPAGE_PMD_MASK) + 
HPAGE_PMD_SIZE); 3404 mmu_notifier_invalidate_range_start(&range); 3405 ptl = pmd_lock(vma->vm_mm, pmd); 3406 split_huge_pmd_locked(vma, range.start, pmd, freeze); 3407 spin_unlock(ptl); 3408 mmu_notifier_invalidate_range_end(&range); 3409 } 3410 3411 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 3412 bool freeze) 3413 { 3414 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); 3415 3416 if (!pmd) 3417 return; 3418 3419 __split_huge_pmd(vma, pmd, address, freeze); 3420 } 3421 3422 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) 3423 { 3424 /* 3425 * If the new address isn't hpage aligned and it could previously 3426 * contain an hugepage: check if we need to split an huge pmd. 3427 */ 3428 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && 3429 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), 3430 ALIGN(address, HPAGE_PMD_SIZE))) 3431 split_huge_pmd_address(vma, address, false); 3432 } 3433 3434 void vma_adjust_trans_huge(struct vm_area_struct *vma, 3435 unsigned long start, 3436 unsigned long end, 3437 struct vm_area_struct *next) 3438 { 3439 /* Check if we need to split start first. */ 3440 split_huge_pmd_if_needed(vma, start); 3441 3442 /* Check if we need to split end next. */ 3443 split_huge_pmd_if_needed(vma, end); 3444 3445 /* If we're incrementing next->vm_start, we might need to split it. */ 3446 if (next) 3447 split_huge_pmd_if_needed(next, end); 3448 } 3449 3450 static void unmap_folio(struct folio *folio) 3451 { 3452 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | 3453 TTU_BATCH_FLUSH; 3454 3455 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3456 3457 if (folio_test_pmd_mappable(folio)) 3458 ttu_flags |= TTU_SPLIT_HUGE_PMD; 3459 3460 /* 3461 * Anon pages need migration entries to preserve them, but file 3462 * pages can simply be left unmapped, then faulted back on demand. 3463 * If that is ever changed (perhaps for mlock), update remap_page(). 
/*
 * Try to discard the anonymous folio mapped by the huge PMD at @pmdp.
 *
 * Returns true when the PMD was cleared and the folio's PMD rmap, the
 * deposited page table, the MM_ANONPAGES accounting and one folio reference
 * were all dropped.
 *
 * Returns false when the folio is (or became) dirty while the VMA is not
 * VM_DROPPABLE, or when the folio holds references other than its mappings
 * plus one.  In the dirty case the folio is marked swapbacked.  If the PMD had
 * already been cleared by then, its original value is written back.
 */
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
					    unsigned long addr, pmd_t *pmdp,
					    struct folio *folio)
{
	struct mm_struct *mm = vma->vm_mm;
	int ref_count, map_count;
	pmd_t orig_pmd = *pmdp;

	/*
	 * First dirty check, done on the PMD value read above, before the PMD
	 * is cleared: nothing needs to be restored when bailing out here.
	 */
	if (pmd_dirty(orig_pmd))
		folio_set_dirty(folio);
	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
		folio_set_swapbacked(folio);
		return false;
	}

	/* From here on orig_pmd is the value returned by the clear+flush. */
	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);

	/*
	 * Syncing against concurrent GUP-fast:
	 * - clear PMD; barrier; read refcount
	 * - inc refcount; barrier; read PMD
	 */
	smp_mb();

	ref_count = folio_ref_count(folio);
	map_count = folio_mapcount(folio);

	/*
	 * Order reads for folio refcount and dirty flag
	 * (see comments in __remove_mapping()).
	 */
	smp_rmb();

	/*
	 * If the folio or its PMD is redirtied at this point, or if there
	 * are unexpected references, we will give up to discard this folio
	 * and remap it.
	 *
	 * The only folio refs must be one from isolation plus the rmap(s).
	 */
	if (pmd_dirty(orig_pmd))
		folio_set_dirty(folio);
	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
		folio_set_swapbacked(folio);
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}

	if (ref_count != map_count + 1) {
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}

	/* Commit: tear down the mapping and drop the reference it held. */
	folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
	zap_deposited_table(mm, pmdp);
	add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
	if (vma->vm_flags & VM_LOCKED)
		mlock_drain_local();
	folio_put(folio);

	return true;
}

/*
 * Unmap the huge PMD at @pmdp that maps @folio by discarding the folio.
 *
 * Warns unless @folio is PMD-mappable, locked, anonymous and not swapbacked,
 * and unless @addr is HPAGE_PMD_SIZE aligned.  The result is that of
 * __discard_anon_folio_pmd_locked(): true if the mapping was removed.
 */
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
			   pmd_t *pmdp, struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
	VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));

	return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
}

/*
 * Remove the migration entries of an anonymous folio and of the folios that
 * follow it, until @nr pages have been covered.
 *
 * @folio: first folio to process; non-anonymous folios are left alone
 * @nr:    number of pages to cover, counted in folio_nr_pages() steps
 * @flags: extra flags OR-ed with TTU_RMAP_LOCKED for remove_migration_ptes()
 */
static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
	int i = 0;

	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
	if (!folio_test_anon(folio))
		return;
	for (;;) {
		remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags);
		i += folio_nr_pages(folio);
		if (i >= nr)
			break;
		/* Advance to the folio that follows the one just handled. */
		folio = folio_next(folio);
	}
}
folio_get(new_folio); 3577 list_add_tail(&new_folio->lru, list); 3578 } else { 3579 /* head is still on lru (and we have it frozen) */ 3580 VM_WARN_ON(!folio_test_lru(folio)); 3581 if (folio_test_unevictable(folio)) 3582 new_folio->mlock_count = 0; 3583 else 3584 list_add_tail(&new_folio->lru, &folio->lru); 3585 folio_set_lru(new_folio); 3586 } 3587 } 3588 3589 static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) 3590 { 3591 for (; nr_pages; page++, nr_pages--) 3592 if (PageHWPoison(page)) 3593 return true; 3594 return false; 3595 } 3596 3597 /* 3598 * It splits @folio into @new_order folios and copies the @folio metadata to 3599 * all the resulting folios. 3600 */ 3601 static void __split_folio_to_order(struct folio *folio, int old_order, 3602 int new_order) 3603 { 3604 /* Scan poisoned pages when split a poisoned folio to large folios */ 3605 const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; 3606 long new_nr_pages = 1 << new_order; 3607 long nr_pages = 1 << old_order; 3608 long i; 3609 3610 folio_clear_has_hwpoisoned(folio); 3611 3612 /* Check first new_nr_pages since the loop below skips them */ 3613 if (handle_hwpoison && 3614 page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) 3615 folio_set_has_hwpoisoned(folio); 3616 /* 3617 * Skip the first new_nr_pages, since the new folio from them have all 3618 * the flags from the original folio. 3619 */ 3620 for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { 3621 struct page *new_head = &folio->page + i; 3622 /* 3623 * Careful: new_folio is not a "real" folio before we cleared PageTail. 3624 * Don't pass it around before clear_compound_head(). 3625 */ 3626 struct folio *new_folio = (struct folio *)new_head; 3627 3628 VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head); 3629 3630 /* 3631 * Clone page flags before unfreezing refcount. 
3632 * 3633 * After successful get_page_unless_zero() might follow flags change, 3634 * for example lock_page() which set PG_waiters. 3635 * 3636 * Note that for mapped sub-pages of an anonymous THP, 3637 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in 3638 * the migration entry instead from where remap_page() will restore it. 3639 * We can still have PG_anon_exclusive set on effectively unmapped and 3640 * unreferenced sub-pages of an anonymous THP: we can simply drop 3641 * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 3642 */ 3643 new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; 3644 new_folio->flags.f |= (folio->flags.f & 3645 ((1L << PG_referenced) | 3646 (1L << PG_swapbacked) | 3647 (1L << PG_swapcache) | 3648 (1L << PG_mlocked) | 3649 (1L << PG_uptodate) | 3650 (1L << PG_active) | 3651 (1L << PG_workingset) | 3652 (1L << PG_locked) | 3653 (1L << PG_unevictable) | 3654 #ifdef CONFIG_ARCH_USES_PG_ARCH_2 3655 (1L << PG_arch_2) | 3656 #endif 3657 #ifdef CONFIG_ARCH_USES_PG_ARCH_3 3658 (1L << PG_arch_3) | 3659 #endif 3660 (1L << PG_dirty) | 3661 LRU_GEN_MASK | LRU_REFS_MASK)); 3662 3663 if (handle_hwpoison && 3664 page_range_has_hwpoisoned(new_head, new_nr_pages)) 3665 folio_set_has_hwpoisoned(new_folio); 3666 3667 new_folio->mapping = folio->mapping; 3668 new_folio->index = folio->index + i; 3669 3670 if (folio_test_swapcache(folio)) 3671 new_folio->swap.val = folio->swap.val + i; 3672 3673 /* Page flags must be visible before we make the page non-compound. */ 3674 smp_wmb(); 3675 3676 /* 3677 * Clear PageTail before unfreezing page refcount. 3678 * 3679 * After successful get_page_unless_zero() might follow put_page() 3680 * which needs correct compound_head(). 
3681 */ 3682 clear_compound_head(new_head); 3683 if (new_order) { 3684 prep_compound_page(new_head, new_order); 3685 folio_set_large_rmappable(new_folio); 3686 } 3687 3688 if (folio_test_young(folio)) 3689 folio_set_young(new_folio); 3690 if (folio_test_idle(folio)) 3691 folio_set_idle(new_folio); 3692 #ifdef CONFIG_MEMCG 3693 new_folio->memcg_data = folio->memcg_data; 3694 #endif 3695 3696 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); 3697 } 3698 3699 if (new_order) 3700 folio_set_order(folio, new_order); 3701 else 3702 ClearPageCompound(&folio->page); 3703 } 3704 3705 /** 3706 * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in 3707 * two ways: uniform split or non-uniform split. 3708 * @folio: the to-be-split folio 3709 * @new_order: the smallest order of the after split folios (since buddy 3710 * allocator like split generates folios with orders from @folio's 3711 * order - 1 to new_order). 3712 * @split_at: in buddy allocator like split, the folio containing @split_at 3713 * will be split until its order becomes @new_order. 3714 * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller 3715 * @mapping: @folio->mapping 3716 * @split_type: if the split is uniform or not (buddy allocator like split) 3717 * 3718 * 3719 * 1. uniform split: the given @folio into multiple @new_order small folios, 3720 * where all small folios have the same order. This is done when 3721 * split_type is SPLIT_TYPE_UNIFORM. 3722 * 2. buddy allocator like (non-uniform) split: the given @folio is split into 3723 * half and one of the half (containing the given page) is split into half 3724 * until the given @folio's order becomes @new_order. This is done when 3725 * split_type is SPLIT_TYPE_NON_UNIFORM. 3726 * 3727 * The high level flow for these two methods are: 3728 * 3729 * 1. 
uniform split: @xas is split with no expectation of failure and a single 3730 * __split_folio_to_order() is called to split the @folio into @new_order 3731 * along with stats update. 3732 * 2. non-uniform split: folio_order - @new_order calls to 3733 * __split_folio_to_order() are expected to be made in a for loop to split 3734 * the @folio to one lower order at a time. The folio containing @split_at 3735 * is split in each iteration. @xas is split into half in each iteration and 3736 * can fail. A failed @xas split leaves split folios as is without merging 3737 * them back. 3738 * 3739 * After splitting, the caller's folio reference will be transferred to the 3740 * folio containing @split_at. The caller needs to unlock and/or free 3741 * after-split folios if necessary. 3742 * 3743 * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be 3744 * split but not to @new_order, the caller needs to check) 3745 */ 3746 static int __split_unmapped_folio(struct folio *folio, int new_order, 3747 struct page *split_at, struct xa_state *xas, 3748 struct address_space *mapping, enum split_type split_type) 3749 { 3750 const bool is_anon = folio_test_anon(folio); 3751 int old_order = folio_order(folio); 3752 int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1; 3753 struct folio *old_folio = folio; 3754 int split_order; 3755 3756 /* 3757 * split to new_order one order at a time. For uniform split, 3758 * folio is split to new_order directly. 3759 */ 3760 for (split_order = start_order; 3761 split_order >= new_order; 3762 split_order--) { 3763 int nr_new_folios = 1UL << (old_order - split_order); 3764 3765 /* order-1 anonymous folio is not supported */ 3766 if (is_anon && split_order == 1) 3767 continue; 3768 3769 if (mapping) { 3770 /* 3771 * uniform split has xas_split_alloc() called before 3772 * irq is disabled to allocate enough memory, whereas 3773 * non-uniform split can handle ENOMEM. 
3774 * Use the to-be-split folio, so that a parallel 3775 * folio_try_get() waits on it until xarray is updated 3776 * with after-split folios and the original one is 3777 * unfrozen. 3778 */ 3779 if (split_type == SPLIT_TYPE_UNIFORM) { 3780 xas_split(xas, old_folio, old_order); 3781 } else { 3782 xas_set_order(xas, folio->index, split_order); 3783 xas_try_split(xas, old_folio, old_order); 3784 if (xas_error(xas)) 3785 return xas_error(xas); 3786 } 3787 } 3788 3789 folio_split_memcg_refs(folio, old_order, split_order); 3790 split_page_owner(&folio->page, old_order, split_order); 3791 pgalloc_tag_split(folio, old_order, split_order); 3792 __split_folio_to_order(folio, old_order, split_order); 3793 3794 if (is_anon) { 3795 mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); 3796 mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios); 3797 } 3798 /* 3799 * If uniform split, the process is complete. 3800 * If non-uniform, continue splitting the folio at @split_at 3801 * as long as the next @split_order is >= @new_order. 3802 */ 3803 folio = page_folio(split_at); 3804 old_order = split_order; 3805 } 3806 3807 return 0; 3808 } 3809 3810 /** 3811 * folio_check_splittable() - check if a folio can be split to a given order 3812 * @folio: folio to be split 3813 * @new_order: the smallest order of the after split folios (since buddy 3814 * allocator like split generates folios with orders from @folio's 3815 * order - 1 to new_order). 3816 * @split_type: uniform or non-uniform split 3817 * 3818 * folio_check_splittable() checks if @folio can be split to @new_order using 3819 * @split_type method. The truncated folio check must come first. 3820 * 3821 * Context: folio must be locked. 3822 * 3823 * Return: 0 - @folio can be split to @new_order, otherwise an error number is 3824 * returned. 
3825 */ 3826 int folio_check_splittable(struct folio *folio, unsigned int new_order, 3827 enum split_type split_type) 3828 { 3829 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 3830 /* 3831 * Folios that just got truncated cannot get split. Signal to the 3832 * caller that there was a race. 3833 * 3834 * TODO: this will also currently refuse folios without a mapping in the 3835 * swapcache (shmem or to-be-anon folios). 3836 */ 3837 if (!folio->mapping && !folio_test_anon(folio)) 3838 return -EBUSY; 3839 3840 if (folio_test_anon(folio)) { 3841 /* order-1 is not supported for anonymous THP. */ 3842 if (new_order == 1) 3843 return -EINVAL; 3844 } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) { 3845 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3846 !mapping_large_folio_support(folio->mapping)) { 3847 /* 3848 * We can always split a folio down to a single page 3849 * (new_order == 0) uniformly. 3850 * 3851 * For any other scenario 3852 * a) uniform split targeting a large folio 3853 * (new_order > 0) 3854 * b) any non-uniform split 3855 * we must confirm that the file system supports large 3856 * folios. 3857 * 3858 * Note that we might still have THPs in such 3859 * mappings, which is created from khugepaged when 3860 * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that 3861 * case, the mapping does not actually support large 3862 * folios properly. 3863 */ 3864 return -EINVAL; 3865 } 3866 } 3867 3868 /* 3869 * swapcache folio could only be split to order 0 3870 * 3871 * non-uniform split creates after-split folios with orders from 3872 * folio_order(folio) - 1 to new_order, making it not suitable for any 3873 * swapcache folio split. Only uniform split to order-0 can be used 3874 * here. 
3875 */ 3876 if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) { 3877 return -EINVAL; 3878 } 3879 3880 if (is_huge_zero_folio(folio)) 3881 return -EINVAL; 3882 3883 if (folio_test_writeback(folio)) 3884 return -EBUSY; 3885 3886 return 0; 3887 } 3888 3889 /* Number of folio references from the pagecache or the swapcache. */ 3890 static unsigned int folio_cache_ref_count(const struct folio *folio) 3891 { 3892 if (folio_test_anon(folio) && !folio_test_swapcache(folio)) 3893 return 0; 3894 return folio_nr_pages(folio); 3895 } 3896 3897 static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, 3898 struct page *split_at, struct xa_state *xas, 3899 struct address_space *mapping, bool do_lru, 3900 struct list_head *list, enum split_type split_type, 3901 pgoff_t end, int *nr_shmem_dropped) 3902 { 3903 struct folio *end_folio = folio_next(folio); 3904 struct folio *new_folio, *next; 3905 int old_order = folio_order(folio); 3906 int ret = 0; 3907 struct deferred_split *ds_queue; 3908 3909 VM_WARN_ON_ONCE(!mapping && end); 3910 /* Prevent deferred_split_scan() touching ->_refcount */ 3911 ds_queue = folio_split_queue_lock(folio); 3912 if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) { 3913 struct swap_cluster_info *ci = NULL; 3914 struct lruvec *lruvec; 3915 3916 if (old_order > 1) { 3917 if (!list_empty(&folio->_deferred_list)) { 3918 ds_queue->split_queue_len--; 3919 /* 3920 * Reinitialize page_deferred_list after removing the 3921 * page from the split_queue, otherwise a subsequent 3922 * split will see list corruption when checking the 3923 * page_deferred_list. 
3924 */ 3925 list_del_init(&folio->_deferred_list); 3926 } 3927 if (folio_test_partially_mapped(folio)) { 3928 folio_clear_partially_mapped(folio); 3929 mod_mthp_stat(old_order, 3930 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 3931 } 3932 } 3933 split_queue_unlock(ds_queue); 3934 if (mapping) { 3935 int nr = folio_nr_pages(folio); 3936 3937 if (folio_test_pmd_mappable(folio) && 3938 new_order < HPAGE_PMD_ORDER) { 3939 if (folio_test_swapbacked(folio)) { 3940 lruvec_stat_mod_folio(folio, 3941 NR_SHMEM_THPS, -nr); 3942 } else { 3943 lruvec_stat_mod_folio(folio, 3944 NR_FILE_THPS, -nr); 3945 filemap_nr_thps_dec(mapping); 3946 } 3947 } 3948 } 3949 3950 if (folio_test_swapcache(folio)) { 3951 if (mapping) { 3952 VM_WARN_ON_ONCE_FOLIO(mapping, folio); 3953 return -EINVAL; 3954 } 3955 3956 ci = swap_cluster_get_and_lock(folio); 3957 } 3958 3959 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 3960 if (do_lru) 3961 lruvec = folio_lruvec_lock(folio); 3962 3963 ret = __split_unmapped_folio(folio, new_order, split_at, xas, 3964 mapping, split_type); 3965 3966 /* 3967 * Unfreeze after-split folios and put them back to the right 3968 * list. @folio should be kept frozon until page cache 3969 * entries are updated with all the other after-split folios 3970 * to prevent others seeing stale page cache entries. 3971 * As a result, new_folio starts from the next folio of 3972 * @folio. 3973 */ 3974 for (new_folio = folio_next(folio); new_folio != end_folio; 3975 new_folio = next) { 3976 unsigned long nr_pages = folio_nr_pages(new_folio); 3977 3978 next = folio_next(new_folio); 3979 3980 zone_device_private_split_cb(folio, new_folio); 3981 3982 folio_ref_unfreeze(new_folio, 3983 folio_cache_ref_count(new_folio) + 1); 3984 3985 if (do_lru) 3986 lru_add_split_folio(folio, new_folio, lruvec, list); 3987 3988 /* 3989 * Anonymous folio with swap cache. 3990 * NOTE: shmem in swap cache is not supported yet. 
3991 */ 3992 if (ci) { 3993 __swap_cache_replace_folio(ci, folio, new_folio); 3994 continue; 3995 } 3996 3997 /* Anonymous folio without swap cache */ 3998 if (!mapping) 3999 continue; 4000 4001 /* Add the new folio to the page cache. */ 4002 if (new_folio->index < end) { 4003 __xa_store(&mapping->i_pages, new_folio->index, 4004 new_folio, 0); 4005 continue; 4006 } 4007 4008 VM_WARN_ON_ONCE(!nr_shmem_dropped); 4009 /* Drop folio beyond EOF: ->index >= end */ 4010 if (shmem_mapping(mapping) && nr_shmem_dropped) 4011 *nr_shmem_dropped += nr_pages; 4012 else if (folio_test_clear_dirty(new_folio)) 4013 folio_account_cleaned( 4014 new_folio, inode_to_wb(mapping->host)); 4015 __filemap_remove_folio(new_folio, NULL); 4016 folio_put_refs(new_folio, nr_pages); 4017 } 4018 4019 zone_device_private_split_cb(folio, NULL); 4020 /* 4021 * Unfreeze @folio only after all page cache entries, which 4022 * used to point to it, have been updated with new folios. 4023 * Otherwise, a parallel folio_try_get() can grab @folio 4024 * and its caller can see stale page cache entries. 4025 */ 4026 folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1); 4027 4028 if (do_lru) 4029 lruvec_unlock(lruvec); 4030 4031 if (ci) 4032 swap_cluster_unlock(ci); 4033 } else { 4034 split_queue_unlock(ds_queue); 4035 return -EAGAIN; 4036 } 4037 4038 return ret; 4039 } 4040 4041 /** 4042 * __folio_split() - split a folio at @split_at to a @new_order folio 4043 * @folio: folio to split 4044 * @new_order: the order of the new folio 4045 * @split_at: a page within the new folio 4046 * @lock_at: a page within @folio to be left locked to caller 4047 * @list: after-split folios will be put on it if non NULL 4048 * @split_type: perform uniform split or not (non-uniform split) 4049 * 4050 * It calls __split_unmapped_folio() to perform uniform and non-uniform split. 4051 * It is in charge of checking whether the split is supported or not and 4052 * preparing @folio for __split_unmapped_folio(). 
4053 * 4054 * After splitting, the after-split folio containing @lock_at remains locked 4055 * and others are unlocked: 4056 * 1. for uniform split, @lock_at points to one of @folio's subpages; 4057 * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. 4058 * 4059 * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be 4060 * split but not to @new_order, the caller needs to check) 4061 */ 4062 static int __folio_split(struct folio *folio, unsigned int new_order, 4063 struct page *split_at, struct page *lock_at, 4064 struct list_head *list, enum split_type split_type) 4065 { 4066 XA_STATE(xas, &folio->mapping->i_pages, folio->index); 4067 struct folio *end_folio = folio_next(folio); 4068 bool is_anon = folio_test_anon(folio); 4069 struct address_space *mapping = NULL; 4070 struct anon_vma *anon_vma = NULL; 4071 int old_order = folio_order(folio); 4072 struct folio *new_folio, *next; 4073 int nr_shmem_dropped = 0; 4074 enum ttu_flags ttu_flags = 0; 4075 int ret; 4076 pgoff_t end = 0; 4077 4078 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 4079 VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); 4080 4081 if (folio != page_folio(split_at) || folio != page_folio(lock_at)) { 4082 ret = -EINVAL; 4083 goto out; 4084 } 4085 4086 if (new_order >= old_order) { 4087 ret = -EINVAL; 4088 goto out; 4089 } 4090 4091 ret = folio_check_splittable(folio, new_order, split_type); 4092 if (ret) { 4093 VM_WARN_ONCE(ret == -EINVAL, "Tried to split an unsplittable folio"); 4094 goto out; 4095 } 4096 4097 if (is_anon) { 4098 /* 4099 * The caller does not necessarily hold an mmap_lock that would 4100 * prevent the anon_vma disappearing so we first we take a 4101 * reference to it and then lock the anon_vma for write. This 4102 * is similar to folio_lock_anon_vma_read except the write lock 4103 * is taken to serialise against parallel split or collapse 4104 * operations. 
4105 */ 4106 anon_vma = folio_get_anon_vma(folio); 4107 if (!anon_vma) { 4108 ret = -EBUSY; 4109 goto out; 4110 } 4111 anon_vma_lock_write(anon_vma); 4112 mapping = NULL; 4113 } else { 4114 unsigned int min_order; 4115 gfp_t gfp; 4116 4117 mapping = folio->mapping; 4118 min_order = mapping_min_folio_order(folio->mapping); 4119 if (new_order < min_order) { 4120 ret = -EINVAL; 4121 goto out; 4122 } 4123 4124 gfp = current_gfp_context(mapping_gfp_mask(mapping) & 4125 GFP_RECLAIM_MASK); 4126 4127 if (!filemap_release_folio(folio, gfp)) { 4128 ret = -EBUSY; 4129 goto out; 4130 } 4131 4132 if (split_type == SPLIT_TYPE_UNIFORM) { 4133 xas_set_order(&xas, folio->index, new_order); 4134 xas_split_alloc(&xas, folio, old_order, gfp); 4135 if (xas_error(&xas)) { 4136 ret = xas_error(&xas); 4137 goto out; 4138 } 4139 } 4140 4141 anon_vma = NULL; 4142 i_mmap_lock_read(mapping); 4143 4144 /* 4145 *__split_unmapped_folio() may need to trim off pages beyond 4146 * EOF: but on 32-bit, i_size_read() takes an irq-unsafe 4147 * seqlock, which cannot be nested inside the page tree lock. 4148 * So note end now: i_size itself may be changed at any moment, 4149 * but folio lock is good enough to serialize the trimming. 4150 */ 4151 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 4152 if (shmem_mapping(mapping)) 4153 end = shmem_fallocend(mapping->host, end); 4154 } 4155 4156 /* 4157 * Racy check if we can split the page, before unmap_folio() will 4158 * split PMDs 4159 */ 4160 if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) { 4161 ret = -EAGAIN; 4162 goto out_unlock; 4163 } 4164 4165 unmap_folio(folio); 4166 4167 /* block interrupt reentry in xa_lock and spinlock */ 4168 local_irq_disable(); 4169 if (mapping) { 4170 /* 4171 * Check if the folio is present in page cache. 4172 * We assume all tail are present too, if folio is there. 
4173 */ 4174 xas_lock(&xas); 4175 xas_reset(&xas); 4176 if (xas_load(&xas) != folio) { 4177 ret = -EAGAIN; 4178 goto fail; 4179 } 4180 } 4181 4182 ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, 4183 true, list, split_type, end, &nr_shmem_dropped); 4184 fail: 4185 if (mapping) 4186 xas_unlock(&xas); 4187 4188 local_irq_enable(); 4189 4190 if (nr_shmem_dropped) 4191 shmem_uncharge(mapping->host, nr_shmem_dropped); 4192 4193 if (!ret && is_anon && !folio_is_device_private(folio)) 4194 ttu_flags = TTU_USE_SHARED_ZEROPAGE; 4195 4196 remap_page(folio, 1 << old_order, ttu_flags); 4197 4198 /* 4199 * Unlock all after-split folios except the one containing 4200 * @lock_at page. If @folio is not split, it will be kept locked. 4201 */ 4202 for (new_folio = folio; new_folio != end_folio; new_folio = next) { 4203 next = folio_next(new_folio); 4204 if (new_folio == page_folio(lock_at)) 4205 continue; 4206 4207 folio_unlock(new_folio); 4208 /* 4209 * Subpages whose mapping has been zapped may be freed 4210 * earlier, but freeing them requires taking the 4211 * lru_lock, so we defer put_page() on tail pages until 4212 * after the split completes. 4213 */ 4214 free_folio_and_swap_cache(new_folio); 4215 } 4216 4217 out_unlock: 4218 if (anon_vma) { 4219 anon_vma_unlock_write(anon_vma); 4220 put_anon_vma(anon_vma); 4221 } 4222 if (mapping) 4223 i_mmap_unlock_read(mapping); 4224 out: 4225 xas_destroy(&xas); 4226 if (is_pmd_order(old_order)) 4227 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 4228 count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); 4229 return ret; 4230 } 4231 4232 /** 4233 * folio_split_unmapped() - split a large anon folio that is already unmapped 4234 * @folio: folio to split 4235 * @new_order: the order of folios after split 4236 * 4237 * This function is a helper for splitting folios that have already been 4238 * unmapped. 
The use case is that the device or the CPU can refuse to migrate 4239 * THP pages in the middle of migration, due to allocation issues on either 4240 * side. 4241 * 4242 * anon_vma_lock is not required to be held, mmap_read_lock() or 4243 * mmap_write_lock() should be held. @folio is expected to be locked by the 4244 * caller. device-private and non device-private folios are supported along 4245 * with folios that are in the swapcache. @folio should also be unmapped and 4246 * isolated from LRU (if applicable) 4247 * 4248 * Upon return, the folio is not remapped, split folios are not added to LRU, 4249 * free_folio_and_swap_cache() is not called, and new folios remain locked. 4250 * 4251 * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to 4252 * insufficient reference count or extra pins). 4253 */ 4254 int folio_split_unmapped(struct folio *folio, unsigned int new_order) 4255 { 4256 int ret = 0; 4257 4258 VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); 4259 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 4260 VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); 4261 VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); 4262 4263 if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) 4264 return -EAGAIN; 4265 4266 local_irq_disable(); 4267 ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, 4268 NULL, false, NULL, SPLIT_TYPE_UNIFORM, 4269 0, NULL); 4270 local_irq_enable(); 4271 return ret; 4272 } 4273 4274 /* 4275 * This function splits a large folio into smaller folios of order @new_order. 4276 * @page can point to any page of the large folio to split. The split operation 4277 * does not change the position of @page. 4278 * 4279 * Prerequisites: 4280 * 4281 * 1) The caller must hold a reference on the @page's owning folio, also known 4282 * as the large folio. 4283 * 4284 * 2) The large folio must be locked. 4285 * 4286 * 3) The folio must not be pinned. 
 *    Any unexpected folio references, including GUP pins, will result in
 *    the folio not getting split; instead, the caller will receive -EAGAIN.
 *
 * 4) @new_order must not be 1 for non-file-backed (anonymous) folios.
 *    Splitting such folios to order-1 is not supported, because
 *    folio->_deferred_list, which is used by partially mapped folios, is
 *    stored in subpage 2, but an order-1 folio only has subpages 0 and 1.
 *    File-backed order-1 folios are supported, since they do not use
 *    _deferred_list.
 *
 * After splitting, the caller's folio reference will be transferred to @page,
 * resulting in a raised refcount of @page after this call. The other pages may
 * be freed if they are not mapped.
 *
 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
 *
 * Pages in @new_order will inherit the mapping, flags, and so on from the
 * huge page.
 *
 * Returns 0 if the huge page was split successfully.
 *
 * Returns -EAGAIN if the folio has unexpected references (e.g., GUP) or if
 * the folio was concurrently removed from the page cache.
 *
 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
 * under writeback, if fs-specific folio metadata cannot currently be
 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
 * truncation).
 *
 * Callers should ensure that the order respects the address space mapping
 * min-order if one is set for non-anonymous folios.
 *
 * Returns -EINVAL when trying to split to an order that is incompatible
 * with the folio. Splitting to order 0 is compatible with all folios.
4320 */ 4321 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 4322 unsigned int new_order) 4323 { 4324 struct folio *folio = page_folio(page); 4325 4326 return __folio_split(folio, new_order, &folio->page, page, list, 4327 SPLIT_TYPE_UNIFORM); 4328 } 4329 4330 /** 4331 * folio_split() - split a folio at @split_at to a @new_order folio 4332 * @folio: folio to split 4333 * @new_order: the order of the new folio 4334 * @split_at: a page within the new folio 4335 * @list: after-split folios are added to @list if not null, otherwise to LRU 4336 * list 4337 * 4338 * It has the same prerequisites and returns as 4339 * split_huge_page_to_list_to_order(). 4340 * 4341 * Split a folio at @split_at to a new_order folio, leave the 4342 * remaining subpages of the original folio as large as possible. For example, 4343 * in the case of splitting an order-9 folio at its third order-3 subpages to 4344 * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio. 4345 * After the split, there will be a group of folios with different orders and 4346 * the new folio containing @split_at is marked in bracket: 4347 * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. 4348 * 4349 * After split, folio is left locked for caller. 4350 * 4351 * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be 4352 * split but not to @new_order, the caller needs to check) 4353 */ 4354 int folio_split(struct folio *folio, unsigned int new_order, 4355 struct page *split_at, struct list_head *list) 4356 { 4357 return __folio_split(folio, new_order, split_at, &folio->page, list, 4358 SPLIT_TYPE_NON_UNIFORM); 4359 } 4360 4361 /** 4362 * min_order_for_split() - get the minimum order @folio can be split to 4363 * @folio: folio to split 4364 * 4365 * min_order_for_split() tells the minimum order @folio can be split to. 4366 * If a file-backed folio is truncated, 0 will be returned. 
Any subsequent 4367 * split attempt should get -EBUSY from split checking code. 4368 * 4369 * Return: @folio's minimum order for split 4370 */ 4371 unsigned int min_order_for_split(struct folio *folio) 4372 { 4373 if (folio_test_anon(folio)) 4374 return 0; 4375 4376 /* 4377 * If the folio got truncated, we don't know the previous mapping and 4378 * consequently the old min order. But it doesn't matter, as any split 4379 * attempt will immediately fail with -EBUSY as the folio cannot get 4380 * split until freed. 4381 */ 4382 if (!folio->mapping) 4383 return 0; 4384 4385 return mapping_min_folio_order(folio->mapping); 4386 } 4387 4388 int split_folio_to_list(struct folio *folio, struct list_head *list) 4389 { 4390 return split_huge_page_to_list_to_order(&folio->page, list, 0); 4391 } 4392 4393 /* 4394 * __folio_unqueue_deferred_split() is not to be called directly: 4395 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h 4396 * limits its calls to those folios which may have a _deferred_list for 4397 * queueing THP splits, and that list is (racily observed to be) non-empty. 4398 * 4399 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is 4400 * zero: because even when split_queue_lock is held, a non-empty _deferred_list 4401 * might be in use on deferred_split_scan()'s unlocked on-stack list. 4402 * 4403 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is 4404 * therefore important to unqueue deferred split before changing folio memcg. 
4405 */ 4406 bool __folio_unqueue_deferred_split(struct folio *folio) 4407 { 4408 struct deferred_split *ds_queue; 4409 unsigned long flags; 4410 bool unqueued = false; 4411 4412 WARN_ON_ONCE(folio_ref_count(folio)); 4413 WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); 4414 4415 ds_queue = folio_split_queue_lock_irqsave(folio, &flags); 4416 if (!list_empty(&folio->_deferred_list)) { 4417 ds_queue->split_queue_len--; 4418 if (folio_test_partially_mapped(folio)) { 4419 folio_clear_partially_mapped(folio); 4420 mod_mthp_stat(folio_order(folio), 4421 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 4422 } 4423 list_del_init(&folio->_deferred_list); 4424 unqueued = true; 4425 } 4426 split_queue_unlock_irqrestore(ds_queue, flags); 4427 4428 return unqueued; /* useful for debug warnings */ 4429 } 4430 4431 /* partially_mapped=false won't clear PG_partially_mapped folio flag */ 4432 void deferred_split_folio(struct folio *folio, bool partially_mapped) 4433 { 4434 struct deferred_split *ds_queue; 4435 unsigned long flags; 4436 4437 /* 4438 * Order 1 folios have no space for a deferred list, but we also 4439 * won't waste much memory by not adding them to the deferred list. 4440 */ 4441 if (folio_order(folio) <= 1) 4442 return; 4443 4444 if (!partially_mapped && !split_underused_thp) 4445 return; 4446 4447 /* 4448 * Exclude swapcache: originally to avoid a corrupt deferred split 4449 * queue. Nowadays that is fully prevented by __memcg1_swapout(); 4450 * but if page reclaim is already handling the same folio, it is 4451 * unnecessary to handle it again in the shrinker, so excluding 4452 * swapcache here may still be a useful optimization. 
4453 */ 4454 if (folio_test_swapcache(folio)) 4455 return; 4456 4457 ds_queue = folio_split_queue_lock_irqsave(folio, &flags); 4458 if (partially_mapped) { 4459 if (!folio_test_partially_mapped(folio)) { 4460 folio_set_partially_mapped(folio); 4461 if (folio_test_pmd_mappable(folio)) 4462 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 4463 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); 4464 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); 4465 4466 } 4467 } else { 4468 /* partially mapped folios cannot become non-partially mapped */ 4469 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); 4470 } 4471 if (list_empty(&folio->_deferred_list)) { 4472 struct mem_cgroup *memcg; 4473 4474 memcg = folio_split_queue_memcg(folio, ds_queue); 4475 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); 4476 ds_queue->split_queue_len++; 4477 if (memcg) 4478 set_shrinker_bit(memcg, folio_nid(folio), 4479 shrinker_id(deferred_split_shrinker)); 4480 } 4481 split_queue_unlock_irqrestore(ds_queue, flags); 4482 } 4483 4484 static unsigned long deferred_split_count(struct shrinker *shrink, 4485 struct shrink_control *sc) 4486 { 4487 struct pglist_data *pgdata = NODE_DATA(sc->nid); 4488 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 4489 4490 #ifdef CONFIG_MEMCG 4491 if (sc->memcg) 4492 ds_queue = &sc->memcg->deferred_split_queue; 4493 #endif 4494 return READ_ONCE(ds_queue->split_queue_len); 4495 } 4496 4497 static bool thp_underused(struct folio *folio) 4498 { 4499 int num_zero_pages = 0, num_filled_pages = 0; 4500 int i; 4501 4502 if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) 4503 return false; 4504 4505 if (folio_contain_hwpoisoned_page(folio)) 4506 return false; 4507 4508 for (i = 0; i < folio_nr_pages(folio); i++) { 4509 if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) { 4510 if (++num_zero_pages > khugepaged_max_ptes_none) 4511 return true; 4512 } else { 4513 /* 4514 * Another path for early exit 
once the number 4515 * of non-zero filled pages exceeds threshold. 4516 */ 4517 if (++num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) 4518 return false; 4519 } 4520 } 4521 return false; 4522 } 4523 4524 static unsigned long deferred_split_scan(struct shrinker *shrink, 4525 struct shrink_control *sc) 4526 { 4527 struct deferred_split *ds_queue; 4528 unsigned long flags; 4529 struct folio *folio, *next; 4530 int split = 0, i; 4531 struct folio_batch fbatch; 4532 4533 folio_batch_init(&fbatch); 4534 4535 retry: 4536 ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); 4537 /* Take pin on all head pages to avoid freeing them under us */ 4538 list_for_each_entry_safe(folio, next, &ds_queue->split_queue, 4539 _deferred_list) { 4540 if (folio_try_get(folio)) { 4541 folio_batch_add(&fbatch, folio); 4542 } else if (folio_test_partially_mapped(folio)) { 4543 /* We lost race with folio_put() */ 4544 folio_clear_partially_mapped(folio); 4545 mod_mthp_stat(folio_order(folio), 4546 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 4547 } 4548 list_del_init(&folio->_deferred_list); 4549 ds_queue->split_queue_len--; 4550 if (!--sc->nr_to_scan) 4551 break; 4552 if (!folio_batch_space(&fbatch)) 4553 break; 4554 } 4555 split_queue_unlock_irqrestore(ds_queue, flags); 4556 4557 for (i = 0; i < folio_batch_count(&fbatch); i++) { 4558 bool did_split = false; 4559 bool underused = false; 4560 struct deferred_split *fqueue; 4561 4562 folio = fbatch.folios[i]; 4563 if (!folio_test_partially_mapped(folio)) { 4564 /* 4565 * See try_to_map_unused_to_zeropage(): we cannot 4566 * optimize zero-filled pages after splitting an 4567 * mlocked folio. 
4568 */ 4569 if (folio_test_mlocked(folio)) 4570 goto next; 4571 underused = thp_underused(folio); 4572 if (!underused) 4573 goto next; 4574 } 4575 if (!folio_trylock(folio)) 4576 goto requeue; 4577 if (!split_folio(folio)) { 4578 did_split = true; 4579 if (underused) 4580 count_vm_event(THP_UNDERUSED_SPLIT_PAGE); 4581 split++; 4582 } 4583 folio_unlock(folio); 4584 next: 4585 /* 4586 * If thp_underused() returns false, or if split_folio() 4587 * succeeds, or if split_folio() fails in the case it was 4588 * underused, then consider it used and don't add it back to 4589 * split_queue. 4590 */ 4591 if (did_split || !folio_test_partially_mapped(folio)) 4592 continue; 4593 requeue: 4594 /* 4595 * Add back partially mapped folios, or underused folios that 4596 * we could not lock this round. 4597 */ 4598 fqueue = folio_split_queue_lock_irqsave(folio, &flags); 4599 if (list_empty(&folio->_deferred_list)) { 4600 list_add_tail(&folio->_deferred_list, &fqueue->split_queue); 4601 fqueue->split_queue_len++; 4602 } 4603 split_queue_unlock_irqrestore(fqueue, flags); 4604 } 4605 folios_put(&fbatch); 4606 4607 if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { 4608 cond_resched(); 4609 goto retry; 4610 } 4611 4612 /* 4613 * Stop shrinker if we didn't split any page, but the queue is empty. 4614 * This can happen if pages were freed under us. 
4615 */ 4616 if (!split && list_empty(&ds_queue->split_queue)) 4617 return SHRINK_STOP; 4618 return split; 4619 } 4620 4621 #ifdef CONFIG_MEMCG 4622 void reparent_deferred_split_queue(struct mem_cgroup *memcg) 4623 { 4624 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4625 struct deferred_split *ds_queue = &memcg->deferred_split_queue; 4626 struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; 4627 int nid; 4628 4629 spin_lock_irq(&ds_queue->split_queue_lock); 4630 spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); 4631 4632 if (!ds_queue->split_queue_len) 4633 goto unlock; 4634 4635 list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); 4636 parent_ds_queue->split_queue_len += ds_queue->split_queue_len; 4637 ds_queue->split_queue_len = 0; 4638 4639 for_each_node(nid) 4640 set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); 4641 4642 unlock: 4643 spin_unlock(&parent_ds_queue->split_queue_lock); 4644 spin_unlock_irq(&ds_queue->split_queue_lock); 4645 } 4646 #endif 4647 4648 #ifdef CONFIG_DEBUG_FS 4649 static void split_huge_pages_all(void) 4650 { 4651 struct zone *zone; 4652 struct page *page; 4653 struct folio *folio; 4654 unsigned long pfn, max_zone_pfn; 4655 unsigned long total = 0, split = 0; 4656 4657 pr_debug("Split all THPs\n"); 4658 for_each_zone(zone) { 4659 if (!managed_zone(zone)) 4660 continue; 4661 max_zone_pfn = zone_end_pfn(zone); 4662 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 4663 int nr_pages; 4664 4665 page = pfn_to_online_page(pfn); 4666 if (!page || PageTail(page)) 4667 continue; 4668 folio = page_folio(page); 4669 if (!folio_try_get(folio)) 4670 continue; 4671 4672 if (unlikely(page_folio(page) != folio)) 4673 goto next; 4674 4675 if (zone != folio_zone(folio)) 4676 goto next; 4677 4678 if (!folio_test_large(folio) 4679 || folio_test_hugetlb(folio) 4680 || !folio_test_lru(folio)) 4681 goto next; 4682 4683 total++; 4684 folio_lock(folio); 
4685 nr_pages = folio_nr_pages(folio); 4686 if (!split_folio(folio)) 4687 split++; 4688 pfn += nr_pages - 1; 4689 folio_unlock(folio); 4690 next: 4691 folio_put(folio); 4692 cond_resched(); 4693 } 4694 } 4695 4696 pr_debug("%lu of %lu THP split\n", split, total); 4697 } 4698 4699 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) 4700 { 4701 if (vma_is_dax(vma)) 4702 return true; 4703 if (vma_is_special_huge(vma)) 4704 return true; 4705 if (vma_test(vma, VMA_IO_BIT)) 4706 return true; 4707 if (is_vm_hugetlb_page(vma)) 4708 return true; 4709 4710 return false; 4711 } 4712 4713 static int split_huge_pages_pid(int pid, unsigned long vaddr_start, 4714 unsigned long vaddr_end, unsigned int new_order, 4715 long in_folio_offset) 4716 { 4717 int ret = 0; 4718 struct task_struct *task; 4719 struct mm_struct *mm; 4720 unsigned long total = 0, split = 0; 4721 unsigned long addr; 4722 4723 vaddr_start &= PAGE_MASK; 4724 vaddr_end &= PAGE_MASK; 4725 4726 task = find_get_task_by_vpid(pid); 4727 if (!task) { 4728 ret = -ESRCH; 4729 goto out; 4730 } 4731 4732 /* Find the mm_struct */ 4733 mm = get_task_mm(task); 4734 put_task_struct(task); 4735 4736 if (!mm) { 4737 ret = -EINVAL; 4738 goto out; 4739 } 4740 4741 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", 4742 pid, vaddr_start, vaddr_end, new_order, in_folio_offset); 4743 4744 mmap_read_lock(mm); 4745 /* 4746 * always increase addr by PAGE_SIZE, since we could have a PTE page 4747 * table filled with PTE-mapped THPs, each of which is distinct. 
4748 */ 4749 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { 4750 struct vm_area_struct *vma = vma_lookup(mm, addr); 4751 struct folio_walk fw; 4752 struct folio *folio; 4753 struct address_space *mapping; 4754 unsigned int target_order = new_order; 4755 4756 if (!vma) 4757 break; 4758 4759 /* skip special VMA and hugetlb VMA */ 4760 if (vma_not_suitable_for_thp_split(vma)) { 4761 addr = vma->vm_end; 4762 continue; 4763 } 4764 4765 folio = folio_walk_start(&fw, vma, addr, 0); 4766 if (!folio) 4767 continue; 4768 4769 if (!is_transparent_hugepage(folio)) 4770 goto next; 4771 4772 if (!folio_test_anon(folio)) { 4773 mapping = folio->mapping; 4774 target_order = max(new_order, 4775 mapping_min_folio_order(mapping)); 4776 } 4777 4778 if (target_order >= folio_order(folio)) 4779 goto next; 4780 4781 total++; 4782 /* 4783 * For folios with private, split_huge_page_to_list_to_order() 4784 * will try to drop it before split and then check if the folio 4785 * can be split or not. So skip the check here. 
4786 */ 4787 if (!folio_test_private(folio) && 4788 folio_expected_ref_count(folio) != folio_ref_count(folio)) 4789 goto next; 4790 4791 if (!folio_trylock(folio)) 4792 goto next; 4793 folio_get(folio); 4794 folio_walk_end(&fw, vma); 4795 4796 if (!folio_test_anon(folio) && folio->mapping != mapping) 4797 goto unlock; 4798 4799 if (in_folio_offset < 0 || 4800 in_folio_offset >= folio_nr_pages(folio)) { 4801 if (!split_folio_to_order(folio, target_order)) 4802 split++; 4803 } else { 4804 struct page *split_at = folio_page(folio, 4805 in_folio_offset); 4806 if (!folio_split(folio, target_order, split_at, NULL)) 4807 split++; 4808 } 4809 4810 unlock: 4811 4812 folio_unlock(folio); 4813 folio_put(folio); 4814 4815 cond_resched(); 4816 continue; 4817 next: 4818 folio_walk_end(&fw, vma); 4819 cond_resched(); 4820 } 4821 mmap_read_unlock(mm); 4822 mmput(mm); 4823 4824 pr_debug("%lu of %lu THP split\n", split, total); 4825 4826 out: 4827 return ret; 4828 } 4829 4830 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, 4831 pgoff_t off_end, unsigned int new_order, 4832 long in_folio_offset) 4833 { 4834 struct file *candidate; 4835 struct address_space *mapping; 4836 pgoff_t index; 4837 int nr_pages = 1; 4838 unsigned long total = 0, split = 0; 4839 unsigned int min_order; 4840 unsigned int target_order; 4841 4842 CLASS(filename_kernel, file)(file_path); 4843 candidate = file_open_name(file, O_RDONLY, 0); 4844 if (IS_ERR(candidate)) 4845 return -EINVAL; 4846 4847 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", 4848 file_path, off_start, off_end, new_order, in_folio_offset); 4849 4850 mapping = candidate->f_mapping; 4851 min_order = mapping_min_folio_order(mapping); 4852 target_order = max(new_order, min_order); 4853 4854 for (index = off_start; index < off_end; index += nr_pages) { 4855 struct folio *folio = filemap_get_folio(mapping, index); 4856 4857 nr_pages = 1; 4858 if 
(IS_ERR(folio)) 4859 continue; 4860 4861 if (!folio_test_large(folio)) 4862 goto next; 4863 4864 total++; 4865 nr_pages = folio_nr_pages(folio); 4866 4867 if (target_order >= folio_order(folio)) 4868 goto next; 4869 4870 if (!folio_trylock(folio)) 4871 goto next; 4872 4873 if (folio->mapping != mapping) 4874 goto unlock; 4875 4876 if (in_folio_offset < 0 || in_folio_offset >= nr_pages) { 4877 if (!split_folio_to_order(folio, target_order)) 4878 split++; 4879 } else { 4880 struct page *split_at = folio_page(folio, 4881 in_folio_offset); 4882 if (!folio_split(folio, target_order, split_at, NULL)) 4883 split++; 4884 } 4885 4886 unlock: 4887 folio_unlock(folio); 4888 next: 4889 folio_put(folio); 4890 cond_resched(); 4891 } 4892 4893 filp_close(candidate, NULL); 4894 pr_debug("%lu of %lu file-backed THP split\n", split, total); 4895 return 0; 4896 } 4897 4898 #define MAX_INPUT_BUF_SZ 255 4899 4900 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, 4901 size_t count, loff_t *ppops) 4902 { 4903 static DEFINE_MUTEX(split_debug_mutex); 4904 ssize_t ret; 4905 /* 4906 * hold pid, start_vaddr, end_vaddr, new_order or 4907 * file_path, off_start, off_end, new_order 4908 */ 4909 char input_buf[MAX_INPUT_BUF_SZ]; 4910 int pid; 4911 unsigned long vaddr_start, vaddr_end; 4912 unsigned int new_order = 0; 4913 long in_folio_offset = -1; 4914 4915 ret = mutex_lock_interruptible(&split_debug_mutex); 4916 if (ret) 4917 return ret; 4918 4919 ret = -EFAULT; 4920 4921 memset(input_buf, 0, MAX_INPUT_BUF_SZ); 4922 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) 4923 goto out; 4924 4925 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; 4926 4927 if (input_buf[0] == '/') { 4928 char *tok; 4929 char *tok_buf = input_buf; 4930 char file_path[MAX_INPUT_BUF_SZ]; 4931 pgoff_t off_start = 0, off_end = 0; 4932 size_t input_len = strlen(input_buf); 4933 4934 tok = strsep(&tok_buf, ","); 4935 if (tok && tok_buf) { 4936 strscpy(file_path, tok); 4937 } else 
{ 4938 ret = -EINVAL; 4939 goto out; 4940 } 4941 4942 ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end, 4943 &new_order, &in_folio_offset); 4944 if (ret != 2 && ret != 3 && ret != 4) { 4945 ret = -EINVAL; 4946 goto out; 4947 } 4948 ret = split_huge_pages_in_file(file_path, off_start, off_end, 4949 new_order, in_folio_offset); 4950 if (!ret) 4951 ret = input_len; 4952 4953 goto out; 4954 } 4955 4956 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start, 4957 &vaddr_end, &new_order, &in_folio_offset); 4958 if (ret == 1 && pid == 1) { 4959 split_huge_pages_all(); 4960 ret = strlen(input_buf); 4961 goto out; 4962 } else if (ret != 3 && ret != 4 && ret != 5) { 4963 ret = -EINVAL; 4964 goto out; 4965 } 4966 4967 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order, 4968 in_folio_offset); 4969 if (!ret) 4970 ret = strlen(input_buf); 4971 out: 4972 mutex_unlock(&split_debug_mutex); 4973 return ret; 4974 4975 } 4976 4977 static const struct file_operations split_huge_pages_fops = { 4978 .owner = THIS_MODULE, 4979 .write = split_huge_pages_write, 4980 }; 4981 4982 static int __init split_huge_pages_debugfs(void) 4983 { 4984 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 4985 &split_huge_pages_fops); 4986 return 0; 4987 } 4988 late_initcall(split_huge_pages_debugfs); 4989 #endif 4990 4991 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 4992 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 4993 struct page *page) 4994 { 4995 struct folio *folio = page_folio(page); 4996 struct vm_area_struct *vma = pvmw->vma; 4997 struct mm_struct *mm = vma->vm_mm; 4998 unsigned long address = pvmw->address; 4999 bool anon_exclusive; 5000 pmd_t pmdval; 5001 swp_entry_t entry; 5002 pmd_t pmdswp; 5003 5004 if (!(pvmw->pmd && !pvmw->pte)) 5005 return 0; 5006 5007 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 5008 if (unlikely(!pmd_present(*pvmw->pmd))) 5009 pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd); 
5010 else 5011 pmdval = pmdp_invalidate(vma, address, pvmw->pmd); 5012 5013 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ 5014 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); 5015 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { 5016 set_pmd_at(mm, address, pvmw->pmd, pmdval); 5017 return -EBUSY; 5018 } 5019 5020 if (pmd_dirty(pmdval)) 5021 folio_mark_dirty(folio); 5022 if (pmd_write(pmdval)) 5023 entry = make_writable_migration_entry(page_to_pfn(page)); 5024 else if (anon_exclusive) 5025 entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); 5026 else 5027 entry = make_readable_migration_entry(page_to_pfn(page)); 5028 if (pmd_young(pmdval)) 5029 entry = make_migration_entry_young(entry); 5030 if (pmd_dirty(pmdval)) 5031 entry = make_migration_entry_dirty(entry); 5032 pmdswp = swp_entry_to_pmd(entry); 5033 if (pmd_soft_dirty(pmdval)) 5034 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 5035 if (pmd_uffd_wp(pmdval)) 5036 pmdswp = pmd_swp_mkuffd_wp(pmdswp); 5037 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 5038 folio_remove_rmap_pmd(folio, page, vma); 5039 folio_put(folio); 5040 trace_set_migration_pmd(address, pmd_val(pmdswp)); 5041 5042 return 0; 5043 } 5044 5045 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 5046 { 5047 struct folio *folio = page_folio(new); 5048 struct vm_area_struct *vma = pvmw->vma; 5049 struct mm_struct *mm = vma->vm_mm; 5050 unsigned long address = pvmw->address; 5051 unsigned long haddr = address & HPAGE_PMD_MASK; 5052 pmd_t pmde; 5053 softleaf_t entry; 5054 5055 if (!(pvmw->pmd && !pvmw->pte)) 5056 return; 5057 5058 entry = softleaf_from_pmd(*pvmw->pmd); 5059 folio_get(folio); 5060 pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); 5061 5062 if (pmd_swp_soft_dirty(*pvmw->pmd)) 5063 pmde = pmd_mksoft_dirty(pmde); 5064 if (softleaf_is_migration_write(entry)) 5065 pmde = pmd_mkwrite(pmde, vma); 5066 if (pmd_swp_uffd_wp(*pvmw->pmd)) 5067 pmde = 
pmd_mkuffd_wp(pmde); 5068 if (!softleaf_is_migration_young(entry)) 5069 pmde = pmd_mkold(pmde); 5070 /* NOTE: this may contain setting soft-dirty on some archs */ 5071 if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) 5072 pmde = pmd_mkdirty(pmde); 5073 5074 if (folio_is_device_private(folio)) { 5075 swp_entry_t entry; 5076 5077 if (pmd_write(pmde)) 5078 entry = make_writable_device_private_entry( 5079 page_to_pfn(new)); 5080 else 5081 entry = make_readable_device_private_entry( 5082 page_to_pfn(new)); 5083 pmde = swp_entry_to_pmd(entry); 5084 5085 if (pmd_swp_soft_dirty(*pvmw->pmd)) 5086 pmde = pmd_swp_mksoft_dirty(pmde); 5087 if (pmd_swp_uffd_wp(*pvmw->pmd)) 5088 pmde = pmd_swp_mkuffd_wp(pmde); 5089 } 5090 5091 if (folio_test_anon(folio)) { 5092 rmap_t rmap_flags = RMAP_NONE; 5093 5094 if (!softleaf_is_migration_read(entry)) 5095 rmap_flags |= RMAP_EXCLUSIVE; 5096 5097 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); 5098 } else { 5099 folio_add_file_rmap_pmd(folio, new, vma); 5100 } 5101 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); 5102 set_pmd_at(mm, haddr, pvmw->pmd, pmde); 5103 5104 /* No need to invalidate - it was non-present before */ 5105 update_mmu_cache_pmd(vma, address, pvmw->pmd); 5106 trace_remove_migration_pmd(address, pmd_val(pmde)); 5107 } 5108 #endif 5109