1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2009 Red Hat, Inc. 4 */ 5 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/mm.h> 9 #include <linux/sched.h> 10 #include <linux/sched/mm.h> 11 #include <linux/sched/numa_balancing.h> 12 #include <linux/highmem.h> 13 #include <linux/hugetlb.h> 14 #include <linux/mmu_notifier.h> 15 #include <linux/rmap.h> 16 #include <linux/swap.h> 17 #include <linux/shrinker.h> 18 #include <linux/mm_inline.h> 19 #include <linux/swapops.h> 20 #include <linux/backing-dev.h> 21 #include <linux/dax.h> 22 #include <linux/mm_types.h> 23 #include <linux/khugepaged.h> 24 #include <linux/freezer.h> 25 #include <linux/mman.h> 26 #include <linux/memremap.h> 27 #include <linux/pagemap.h> 28 #include <linux/debugfs.h> 29 #include <linux/migrate.h> 30 #include <linux/hashtable.h> 31 #include <linux/userfaultfd_k.h> 32 #include <linux/page_idle.h> 33 #include <linux/shmem_fs.h> 34 #include <linux/oom.h> 35 #include <linux/numa.h> 36 #include <linux/page_owner.h> 37 #include <linux/sched/sysctl.h> 38 #include <linux/memory-tiers.h> 39 #include <linux/compat.h> 40 #include <linux/pgalloc_tag.h> 41 #include <linux/pagewalk.h> 42 43 #include <asm/tlb.h> 44 #include <asm/pgalloc.h> 45 #include "internal.h" 46 #include "swap.h" 47 48 #define CREATE_TRACE_POINTS 49 #include <trace/events/thp.h> 50 51 /* 52 * By default, transparent hugepage support is disabled in order to avoid 53 * risking an increased memory footprint for applications that are not 54 * guaranteed to benefit from it. When transparent hugepage support is 55 * enabled, it is for all mappings, and khugepaged scans all mappings. 56 * Defrag is invoked by khugepaged hugepage allocations and by page faults 57 * for all hugepage allocations. 58 */ 59 unsigned long transparent_hugepage_flags __read_mostly = 60 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 61 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 62 #endif 63 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 64 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 65 #endif 66 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| 67 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 68 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 69 70 static struct shrinker *deferred_split_shrinker; 71 static unsigned long deferred_split_count(struct shrinker *shrink, 72 struct shrink_control *sc); 73 static unsigned long deferred_split_scan(struct shrinker *shrink, 74 struct shrink_control *sc); 75 static bool split_underused_thp = true; 76 77 static atomic_t huge_zero_refcount; 78 struct folio *huge_zero_folio __read_mostly; 79 unsigned long huge_zero_pfn __read_mostly = ~0UL; 80 unsigned long huge_anon_orders_always __read_mostly; 81 unsigned long huge_anon_orders_madvise __read_mostly; 82 unsigned long huge_anon_orders_inherit __read_mostly; 83 static bool anon_orders_configured __initdata; 84 85 static inline bool file_thp_enabled(struct vm_area_struct *vma) 86 { 87 struct inode *inode; 88 89 if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) 90 return false; 91 92 if (!vma->vm_file) 93 return false; 94 95 inode = file_inode(vma->vm_file); 96 97 return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); 98 } 99 100 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, 101 vm_flags_t vm_flags, 102 unsigned long tva_flags, 103 unsigned long orders) 104 { 105 bool smaps = tva_flags & TVA_SMAPS; 106 bool in_pf = tva_flags & TVA_IN_PF; 107 bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; 108 unsigned long supported_orders; 109 110 /* Check the intersection of requested 
and supported orders. */ 111 if (vma_is_anonymous(vma)) 112 supported_orders = THP_ORDERS_ALL_ANON; 113 else if (vma_is_special_huge(vma)) 114 supported_orders = THP_ORDERS_ALL_SPECIAL; 115 else 116 supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; 117 118 orders &= supported_orders; 119 if (!orders) 120 return 0; 121 122 if (!vma->vm_mm) /* vdso */ 123 return 0; 124 125 if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) 126 return 0; 127 128 /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ 129 if (vma_is_dax(vma)) 130 return in_pf ? orders : 0; 131 132 /* 133 * khugepaged special VMA and hugetlb VMA. 134 * Must be checked after dax since some dax mappings may have 135 * VM_MIXEDMAP set. 136 */ 137 if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) 138 return 0; 139 140 /* 141 * Check alignment for file vma and size for both file and anon vma by 142 * filtering out the unsuitable orders. 143 * 144 * Skip the check for page fault. Huge fault does the check in fault 145 * handlers. 146 */ 147 if (!in_pf) { 148 int order = highest_order(orders); 149 unsigned long addr; 150 151 while (orders) { 152 addr = vma->vm_end - (PAGE_SIZE << order); 153 if (thp_vma_suitable_order(vma, addr, order)) 154 break; 155 order = next_order(&orders, order); 156 } 157 158 if (!orders) 159 return 0; 160 } 161 162 /* 163 * Enabled via shmem mount options or sysfs settings. 164 * Must be done before hugepage flags check since shmem has its 165 * own flags. 166 */ 167 if (!in_pf && shmem_file(vma->vm_file)) 168 return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), 169 vma, vma->vm_pgoff, 0, 170 !enforce_sysfs); 171 172 if (!vma_is_anonymous(vma)) { 173 /* 174 * Enforce sysfs THP requirements as necessary. Anonymous vmas 175 * were already handled in thp_vma_allowable_orders(). 176 */ 177 if (enforce_sysfs && 178 (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && 179 !hugepage_global_always()))) 180 return 0; 181 182 /* 183 * Trust that ->huge_fault() handlers know what they are doing 184 * in fault path. 185 */ 186 if (((in_pf || smaps)) && vma->vm_ops->huge_fault) 187 return orders; 188 /* Only regular file is valid in collapse path */ 189 if (((!in_pf || smaps)) && file_thp_enabled(vma)) 190 return orders; 191 return 0; 192 } 193 194 if (vma_is_temporary_stack(vma)) 195 return 0; 196 197 /* 198 * THPeligible bit of smaps should show 1 for proper VMAs even 199 * though anon_vma is not initialized yet. 200 * 201 * Allow page fault since anon_vma may be not initialized until 202 * the first page fault. 203 */ 204 if (!vma->anon_vma) 205 return (smaps || in_pf) ? orders : 0; 206 207 return orders; 208 } 209 210 static bool get_huge_zero_page(void) 211 { 212 struct folio *zero_folio; 213 retry: 214 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 215 return true; 216 217 zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 218 HPAGE_PMD_ORDER); 219 if (!zero_folio) { 220 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 221 return false; 222 } 223 /* Ensure zero folio won't have large_rmappable flag set. */ 224 folio_clear_large_rmappable(zero_folio); 225 preempt_disable(); 226 if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { 227 preempt_enable(); 228 folio_put(zero_folio); 229 goto retry; 230 } 231 WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio)); 232 233 /* We take additional reference here. 
It will be put back by shrinker */ 234 atomic_set(&huge_zero_refcount, 2); 235 preempt_enable(); 236 count_vm_event(THP_ZERO_PAGE_ALLOC); 237 return true; 238 } 239 240 static void put_huge_zero_page(void) 241 { 242 /* 243 * Counter should never go to zero here. Only shrinker can put 244 * last reference. 245 */ 246 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 247 } 248 249 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) 250 { 251 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 252 return READ_ONCE(huge_zero_folio); 253 254 if (!get_huge_zero_page()) 255 return NULL; 256 257 if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 258 put_huge_zero_page(); 259 260 return READ_ONCE(huge_zero_folio); 261 } 262 263 void mm_put_huge_zero_folio(struct mm_struct *mm) 264 { 265 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 266 put_huge_zero_page(); 267 } 268 269 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, 270 struct shrink_control *sc) 271 { 272 /* we can free zero page only if last reference remains */ 273 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 274 } 275 276 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, 277 struct shrink_control *sc) 278 { 279 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 280 struct folio *zero_folio = xchg(&huge_zero_folio, NULL); 281 BUG_ON(zero_folio == NULL); 282 WRITE_ONCE(huge_zero_pfn, ~0UL); 283 folio_put(zero_folio); 284 return HPAGE_PMD_NR; 285 } 286 287 return 0; 288 } 289 290 static struct shrinker *huge_zero_page_shrinker; 291 292 #ifdef CONFIG_SYSFS 293 static ssize_t enabled_show(struct kobject *kobj, 294 struct kobj_attribute *attr, char *buf) 295 { 296 const char *output; 297 298 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) 299 output = "[always] madvise never"; 300 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 301 &transparent_hugepage_flags)) 302 output = "always [madvise] never"; 303 else 304 output = "always madvise [never]"; 305 306 return sysfs_emit(buf, "%s\n", output); 307 } 308 309 static ssize_t enabled_store(struct kobject *kobj, 310 struct kobj_attribute *attr, 311 const char *buf, size_t count) 312 { 313 ssize_t ret = count; 314 315 if (sysfs_streq(buf, "always")) { 316 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 317 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 318 } else if (sysfs_streq(buf, "madvise")) { 319 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 320 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 321 } else if (sysfs_streq(buf, "never")) { 322 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 323 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 324 } else 325 ret = -EINVAL; 326 327 if (ret > 0) { 328 int err = start_stop_khugepaged(); 329 if (err) 330 ret = err; 331 } 332 return ret; 333 } 334 335 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); 336 337 ssize_t single_hugepage_flag_show(struct kobject *kobj, 338 struct kobj_attribute *attr, char *buf, 339 enum transparent_hugepage_flag flag) 340 { 341 return sysfs_emit(buf, "%d\n", 342 !!test_bit(flag, &transparent_hugepage_flags)); 343 } 344 345 ssize_t single_hugepage_flag_store(struct kobject *kobj, 346 struct kobj_attribute *attr, 347 const char *buf, size_t count, 348 enum transparent_hugepage_flag flag) 349 { 350 unsigned long value; 351 int ret; 352 353 ret = kstrtoul(buf, 10, &value); 354 if 
(ret < 0) 355 return ret; 356 if (value > 1) 357 return -EINVAL; 358 359 if (value) 360 set_bit(flag, &transparent_hugepage_flags); 361 else 362 clear_bit(flag, &transparent_hugepage_flags); 363 364 return count; 365 } 366 367 static ssize_t defrag_show(struct kobject *kobj, 368 struct kobj_attribute *attr, char *buf) 369 { 370 const char *output; 371 372 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 373 &transparent_hugepage_flags)) 374 output = "[always] defer defer+madvise madvise never"; 375 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 376 &transparent_hugepage_flags)) 377 output = "always [defer] defer+madvise madvise never"; 378 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, 379 &transparent_hugepage_flags)) 380 output = "always defer [defer+madvise] madvise never"; 381 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 382 &transparent_hugepage_flags)) 383 output = "always defer defer+madvise [madvise] never"; 384 else 385 output = "always defer defer+madvise madvise [never]"; 386 387 return sysfs_emit(buf, "%s\n", output); 388 } 389 390 static ssize_t defrag_store(struct kobject *kobj, 391 struct kobj_attribute *attr, 392 const char *buf, size_t count) 393 { 394 if (sysfs_streq(buf, "always")) { 395 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 396 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 397 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 398 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 399 } else if (sysfs_streq(buf, "defer+madvise")) { 400 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 401 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 402 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 403 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 404 } else if (sysfs_streq(buf, "defer")) { 405 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 406 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 407 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 408 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 409 } else if (sysfs_streq(buf, "madvise")) { 410 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 411 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 412 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 413 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 414 } else if (sysfs_streq(buf, "never")) { 415 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 416 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 417 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 418 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 419 } else 420 return -EINVAL; 421 422 return count; 423 } 424 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); 425 426 static ssize_t use_zero_page_show(struct kobject *kobj, 427 struct kobj_attribute *attr, char *buf) 428 { 429 return single_hugepage_flag_show(kobj, attr, buf, 430 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 431 } 432 
static ssize_t use_zero_page_store(struct kobject *kobj, 433 struct kobj_attribute *attr, const char *buf, size_t count) 434 { 435 return single_hugepage_flag_store(kobj, attr, buf, count, 436 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 437 } 438 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); 439 440 static ssize_t hpage_pmd_size_show(struct kobject *kobj, 441 struct kobj_attribute *attr, char *buf) 442 { 443 return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); 444 } 445 static struct kobj_attribute hpage_pmd_size_attr = 446 __ATTR_RO(hpage_pmd_size); 447 448 static ssize_t split_underused_thp_show(struct kobject *kobj, 449 struct kobj_attribute *attr, char *buf) 450 { 451 return sysfs_emit(buf, "%d\n", split_underused_thp); 452 } 453 454 static ssize_t split_underused_thp_store(struct kobject *kobj, 455 struct kobj_attribute *attr, 456 const char *buf, size_t count) 457 { 458 int err = kstrtobool(buf, &split_underused_thp); 459 460 if (err < 0) 461 return err; 462 463 return count; 464 } 465 466 static struct kobj_attribute split_underused_thp_attr = __ATTR( 467 shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); 468 469 static struct attribute *hugepage_attr[] = { 470 &enabled_attr.attr, 471 &defrag_attr.attr, 472 &use_zero_page_attr.attr, 473 &hpage_pmd_size_attr.attr, 474 #ifdef CONFIG_SHMEM 475 &shmem_enabled_attr.attr, 476 #endif 477 &split_underused_thp_attr.attr, 478 NULL, 479 }; 480 481 static const struct attribute_group hugepage_attr_group = { 482 .attrs = hugepage_attr, 483 }; 484 485 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); 486 static void thpsize_release(struct kobject *kobj); 487 static DEFINE_SPINLOCK(huge_anon_orders_lock); 488 static LIST_HEAD(thpsize_list); 489 490 static ssize_t anon_enabled_show(struct kobject *kobj, 491 struct kobj_attribute *attr, char *buf) 492 { 493 int order = to_thpsize(kobj)->order; 494 const char *output; 495 496 if (test_bit(order, &huge_anon_orders_always)) 497 output = "[always] inherit madvise never"; 498 else if (test_bit(order, &huge_anon_orders_inherit)) 499 output = "always [inherit] madvise never"; 500 else if (test_bit(order, &huge_anon_orders_madvise)) 501 output = "always inherit [madvise] never"; 502 else 503 output = "always inherit madvise [never]"; 504 505 return sysfs_emit(buf, "%s\n", output); 506 } 507 508 static ssize_t anon_enabled_store(struct kobject *kobj, 509 struct kobj_attribute *attr, 510 const char *buf, size_t count) 511 { 512 int order = to_thpsize(kobj)->order; 513 ssize_t ret = count; 514 515 if (sysfs_streq(buf, "always")) { 516 spin_lock(&huge_anon_orders_lock); 517 clear_bit(order, &huge_anon_orders_inherit); 518 clear_bit(order, &huge_anon_orders_madvise); 519 set_bit(order, &huge_anon_orders_always); 520 spin_unlock(&huge_anon_orders_lock); 521 } else if (sysfs_streq(buf, "inherit")) { 522 spin_lock(&huge_anon_orders_lock); 523 clear_bit(order, &huge_anon_orders_always); 524 clear_bit(order, &huge_anon_orders_madvise); 525 set_bit(order, &huge_anon_orders_inherit); 526 spin_unlock(&huge_anon_orders_lock); 527 } else if (sysfs_streq(buf, "madvise")) { 528 spin_lock(&huge_anon_orders_lock); 529 clear_bit(order, &huge_anon_orders_always); 530 clear_bit(order, &huge_anon_orders_inherit); 531 set_bit(order, &huge_anon_orders_madvise); 532 spin_unlock(&huge_anon_orders_lock); 533 } else if (sysfs_streq(buf, "never")) { 534 spin_lock(&huge_anon_orders_lock); 535 clear_bit(order, &huge_anon_orders_always); 536 clear_bit(order, 
&huge_anon_orders_inherit); 537 clear_bit(order, &huge_anon_orders_madvise); 538 spin_unlock(&huge_anon_orders_lock); 539 } else 540 ret = -EINVAL; 541 542 if (ret > 0) { 543 int err; 544 545 err = start_stop_khugepaged(); 546 if (err) 547 ret = err; 548 } 549 return ret; 550 } 551 552 static struct kobj_attribute anon_enabled_attr = 553 __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); 554 555 static struct attribute *anon_ctrl_attrs[] = { 556 &anon_enabled_attr.attr, 557 NULL, 558 }; 559 560 static const struct attribute_group anon_ctrl_attr_grp = { 561 .attrs = anon_ctrl_attrs, 562 }; 563 564 static struct attribute *file_ctrl_attrs[] = { 565 #ifdef CONFIG_SHMEM 566 &thpsize_shmem_enabled_attr.attr, 567 #endif 568 NULL, 569 }; 570 571 static const struct attribute_group file_ctrl_attr_grp = { 572 .attrs = file_ctrl_attrs, 573 }; 574 575 static struct attribute *any_ctrl_attrs[] = { 576 NULL, 577 }; 578 579 static const struct attribute_group any_ctrl_attr_grp = { 580 .attrs = any_ctrl_attrs, 581 }; 582 583 static const struct kobj_type thpsize_ktype = { 584 .release = &thpsize_release, 585 .sysfs_ops = &kobj_sysfs_ops, 586 }; 587 588 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; 589 590 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) 591 { 592 unsigned long sum = 0; 593 int cpu; 594 595 for_each_possible_cpu(cpu) { 596 struct mthp_stat *this = &per_cpu(mthp_stats, cpu); 597 598 sum += this->stats[order][item]; 599 } 600 601 return sum; 602 } 603 604 #define DEFINE_MTHP_STAT_ATTR(_name, _index) \ 605 static ssize_t _name##_show(struct kobject *kobj, \ 606 struct kobj_attribute *attr, char *buf) \ 607 { \ 608 int order = to_thpsize(kobj)->order; \ 609 \ 610 return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ 611 } \ 612 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 613 614 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); 615 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); 616 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); 617 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); 618 DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); 619 DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); 620 DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); 621 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); 622 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); 623 #ifdef CONFIG_SHMEM 624 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); 625 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); 626 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); 627 #endif 628 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); 629 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); 630 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); 631 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); 632 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); 633 634 static struct attribute *anon_stats_attrs[] = { 635 &anon_fault_alloc_attr.attr, 636 &anon_fault_fallback_attr.attr, 637 &anon_fault_fallback_charge_attr.attr, 638 #ifndef CONFIG_SHMEM 639 &zswpout_attr.attr, 640 &swpin_attr.attr, 641 &swpin_fallback_attr.attr, 642 &swpin_fallback_charge_attr.attr, 643 &swpout_attr.attr, 644 &swpout_fallback_attr.attr, 645 #endif 646 &split_deferred_attr.attr, 647 &nr_anon_attr.attr, 648 &nr_anon_partially_mapped_attr.attr, 649 
NULL, 650 }; 651 652 static struct attribute_group anon_stats_attr_grp = { 653 .name = "stats", 654 .attrs = anon_stats_attrs, 655 }; 656 657 static struct attribute *file_stats_attrs[] = { 658 #ifdef CONFIG_SHMEM 659 &shmem_alloc_attr.attr, 660 &shmem_fallback_attr.attr, 661 &shmem_fallback_charge_attr.attr, 662 #endif 663 NULL, 664 }; 665 666 static struct attribute_group file_stats_attr_grp = { 667 .name = "stats", 668 .attrs = file_stats_attrs, 669 }; 670 671 static struct attribute *any_stats_attrs[] = { 672 #ifdef CONFIG_SHMEM 673 &zswpout_attr.attr, 674 &swpin_attr.attr, 675 &swpin_fallback_attr.attr, 676 &swpin_fallback_charge_attr.attr, 677 &swpout_attr.attr, 678 &swpout_fallback_attr.attr, 679 #endif 680 &split_attr.attr, 681 &split_failed_attr.attr, 682 NULL, 683 }; 684 685 static struct attribute_group any_stats_attr_grp = { 686 .name = "stats", 687 .attrs = any_stats_attrs, 688 }; 689 690 static int sysfs_add_group(struct kobject *kobj, 691 const struct attribute_group *grp) 692 { 693 int ret = -ENOENT; 694 695 /* 696 * If the group is named, try to merge first, assuming the subdirectory 697 * was already created. This avoids the warning emitted by 698 * sysfs_create_group() if the directory already exists. 699 */ 700 if (grp->name) 701 ret = sysfs_merge_group(kobj, grp); 702 if (ret) 703 ret = sysfs_create_group(kobj, grp); 704 705 return ret; 706 } 707 708 static struct thpsize *thpsize_create(int order, struct kobject *parent) 709 { 710 unsigned long size = (PAGE_SIZE << order) / SZ_1K; 711 struct thpsize *thpsize; 712 int ret = -ENOMEM; 713 714 thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); 715 if (!thpsize) 716 goto err; 717 718 thpsize->order = order; 719 720 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, 721 "hugepages-%lukB", size); 722 if (ret) { 723 kfree(thpsize); 724 goto err; 725 } 726 727 728 ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); 729 if (ret) 730 goto err_put; 731 732 ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); 733 if (ret) 734 goto err_put; 735 736 if (BIT(order) & THP_ORDERS_ALL_ANON) { 737 ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); 738 if (ret) 739 goto err_put; 740 741 ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); 742 if (ret) 743 goto err_put; 744 } 745 746 if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { 747 ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); 748 if (ret) 749 goto err_put; 750 751 ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); 752 if (ret) 753 goto err_put; 754 } 755 756 return thpsize; 757 err_put: 758 kobject_put(&thpsize->kobj); 759 err: 760 return ERR_PTR(ret); 761 } 762 763 static void thpsize_release(struct kobject *kobj) 764 { 765 kfree(to_thpsize(kobj)); 766 } 767 768 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 769 { 770 int err; 771 struct thpsize *thpsize; 772 unsigned long orders; 773 int order; 774 775 /* 776 * Default to setting PMD-sized THP to inherit the global setting and 777 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time 778 * constant so we have to do this here. 
779 */ 780 if (!anon_orders_configured) 781 huge_anon_orders_inherit = BIT(PMD_ORDER); 782 783 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 784 if (unlikely(!*hugepage_kobj)) { 785 pr_err("failed to create transparent hugepage kobject\n"); 786 return -ENOMEM; 787 } 788 789 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 790 if (err) { 791 pr_err("failed to register transparent hugepage group\n"); 792 goto delete_obj; 793 } 794 795 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 796 if (err) { 797 pr_err("failed to register transparent hugepage group\n"); 798 goto remove_hp_group; 799 } 800 801 orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; 802 order = highest_order(orders); 803 while (orders) { 804 thpsize = thpsize_create(order, *hugepage_kobj); 805 if (IS_ERR(thpsize)) { 806 pr_err("failed to create thpsize for order %d\n", order); 807 err = PTR_ERR(thpsize); 808 goto remove_all; 809 } 810 list_add(&thpsize->node, &thpsize_list); 811 order = next_order(&orders, order); 812 } 813 814 return 0; 815 816 remove_all: 817 hugepage_exit_sysfs(*hugepage_kobj); 818 return err; 819 remove_hp_group: 820 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 821 delete_obj: 822 kobject_put(*hugepage_kobj); 823 return err; 824 } 825 826 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 827 { 828 struct thpsize *thpsize, *tmp; 829 830 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { 831 list_del(&thpsize->node); 832 kobject_put(&thpsize->kobj); 833 } 834 835 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 836 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 837 kobject_put(hugepage_kobj); 838 } 839 #else 840 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 841 { 842 return 0; 843 } 844 845 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 846 { 847 } 848 #endif /* CONFIG_SYSFS */ 849 850 static int __init thp_shrinker_init(void) 851 { 852 huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); 853 if (!huge_zero_page_shrinker) 854 return -ENOMEM; 855 856 deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | 857 SHRINKER_MEMCG_AWARE | 858 SHRINKER_NONSLAB, 859 "thp-deferred_split"); 860 if (!deferred_split_shrinker) { 861 shrinker_free(huge_zero_page_shrinker); 862 return -ENOMEM; 863 } 864 865 huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; 866 huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; 867 shrinker_register(huge_zero_page_shrinker); 868 869 deferred_split_shrinker->count_objects = deferred_split_count; 870 deferred_split_shrinker->scan_objects = deferred_split_scan; 871 shrinker_register(deferred_split_shrinker); 872 873 return 0; 874 } 875 876 static void __init thp_shrinker_exit(void) 877 { 878 shrinker_free(huge_zero_page_shrinker); 879 shrinker_free(deferred_split_shrinker); 880 } 881 882 static int __init hugepage_init(void) 883 { 884 int err; 885 struct kobject *hugepage_kobj; 886 887 if (!has_transparent_hugepage()) { 888 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; 889 return -EINVAL; 890 } 891 892 /* 893 * hugepages can't be allocated by the buddy allocator 894 */ 895 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); 896 897 err = hugepage_init_sysfs(&hugepage_kobj); 898 if (err) 899 goto err_sysfs; 900 901 err = khugepaged_init(); 902 if (err) 903 goto err_slab; 904 905 err = thp_shrinker_init(); 906 if (err) 907 goto err_shrinker; 908 909 
/* 910 * By default disable transparent hugepages on smaller systems, 911 * where the extra memory used could hurt more than TLB overhead 912 * is likely to save. The admin can still enable it through /sys. 913 */ 914 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { 915 transparent_hugepage_flags = 0; 916 return 0; 917 } 918 919 err = start_stop_khugepaged(); 920 if (err) 921 goto err_khugepaged; 922 923 return 0; 924 err_khugepaged: 925 thp_shrinker_exit(); 926 err_shrinker: 927 khugepaged_destroy(); 928 err_slab: 929 hugepage_exit_sysfs(hugepage_kobj); 930 err_sysfs: 931 return err; 932 } 933 subsys_initcall(hugepage_init); 934 935 static int __init setup_transparent_hugepage(char *str) 936 { 937 int ret = 0; 938 if (!str) 939 goto out; 940 if (!strcmp(str, "always")) { 941 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 942 &transparent_hugepage_flags); 943 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 944 &transparent_hugepage_flags); 945 ret = 1; 946 } else if (!strcmp(str, "madvise")) { 947 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 948 &transparent_hugepage_flags); 949 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 950 &transparent_hugepage_flags); 951 ret = 1; 952 } else if (!strcmp(str, "never")) { 953 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 954 &transparent_hugepage_flags); 955 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 956 &transparent_hugepage_flags); 957 ret = 1; 958 } 959 out: 960 if (!ret) 961 pr_warn("transparent_hugepage= cannot parse, ignored\n"); 962 return ret; 963 } 964 __setup("transparent_hugepage=", setup_transparent_hugepage); 965 966 static char str_dup[PAGE_SIZE] __initdata; 967 static int __init setup_thp_anon(char *str) 968 { 969 char *token, *range, *policy, *subtoken; 970 unsigned long always, inherit, madvise; 971 char *start_size, *end_size; 972 int start, end, nr; 973 char *p; 974 975 if (!str || strlen(str) + 1 > PAGE_SIZE) 976 goto err; 977 strscpy(str_dup, str); 978 979 always = huge_anon_orders_always; 980 madvise = huge_anon_orders_madvise; 981 inherit = huge_anon_orders_inherit; 982 p = str_dup; 983 while ((token = strsep(&p, ";")) != NULL) { 984 range = strsep(&token, ":"); 985 policy = token; 986 987 if (!policy) 988 goto err; 989 990 while ((subtoken = strsep(&range, ",")) != NULL) { 991 if (strchr(subtoken, '-')) { 992 start_size = strsep(&subtoken, "-"); 993 end_size = subtoken; 994 995 start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON); 996 end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON); 997 } else { 998 start_size = end_size = subtoken; 999 start = end = get_order_from_str(subtoken, 1000 THP_ORDERS_ALL_ANON); 1001 } 1002 1003 if (start == -EINVAL) { 1004 pr_err("invalid size %s in thp_anon boot parameter\n", start_size); 1005 goto err; 1006 } 1007 1008 if (end == -EINVAL) { 1009 pr_err("invalid size %s in thp_anon boot parameter\n", end_size); 1010 goto err; 1011 } 1012 1013 if (start < 0 || end < 0 || start > end) 1014 goto err; 1015 1016 nr = end - start + 1; 1017 if (!strcmp(policy, "always")) { 1018 bitmap_set(&always, start, nr); 1019 bitmap_clear(&inherit, start, nr); 1020 bitmap_clear(&madvise, start, nr); 1021 } else if (!strcmp(policy, "madvise")) { 1022 bitmap_set(&madvise, start, nr); 1023 bitmap_clear(&inherit, start, nr); 1024 bitmap_clear(&always, start, nr); 1025 } else if (!strcmp(policy, "inherit")) { 1026 bitmap_set(&inherit, start, nr); 1027 bitmap_clear(&madvise, start, nr); 1028 bitmap_clear(&always, start, nr); 1029 } else if (!strcmp(policy, "never")) { 1030 bitmap_clear(&inherit, start, nr); 1031 bitmap_clear(&madvise, 
start, nr); 1032 bitmap_clear(&always, start, nr); 1033 } else { 1034 pr_err("invalid policy %s in thp_anon boot parameter\n", policy); 1035 goto err; 1036 } 1037 } 1038 } 1039 1040 huge_anon_orders_always = always; 1041 huge_anon_orders_madvise = madvise; 1042 huge_anon_orders_inherit = inherit; 1043 anon_orders_configured = true; 1044 return 1; 1045 1046 err: 1047 pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str); 1048 return 0; 1049 } 1050 __setup("thp_anon=", setup_thp_anon); 1051 1052 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 1053 { 1054 if (likely(vma->vm_flags & VM_WRITE)) 1055 pmd = pmd_mkwrite(pmd, vma); 1056 return pmd; 1057 } 1058 1059 #ifdef CONFIG_MEMCG 1060 static inline 1061 struct deferred_split *get_deferred_split_queue(struct folio *folio) 1062 { 1063 struct mem_cgroup *memcg = folio_memcg(folio); 1064 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); 1065 1066 if (memcg) 1067 return &memcg->deferred_split_queue; 1068 else 1069 return &pgdat->deferred_split_queue; 1070 } 1071 #else 1072 static inline 1073 struct deferred_split *get_deferred_split_queue(struct folio *folio) 1074 { 1075 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); 1076 1077 return &pgdat->deferred_split_queue; 1078 } 1079 #endif 1080 1081 static inline bool is_transparent_hugepage(const struct folio *folio) 1082 { 1083 if (!folio_test_large(folio)) 1084 return false; 1085 1086 return is_huge_zero_folio(folio) || 1087 folio_test_large_rmappable(folio); 1088 } 1089 1090 static unsigned long __thp_get_unmapped_area(struct file *filp, 1091 unsigned long addr, unsigned long len, 1092 loff_t off, unsigned long flags, unsigned long size, 1093 vm_flags_t vm_flags) 1094 { 1095 loff_t off_end = off + len; 1096 loff_t off_align = round_up(off, size); 1097 unsigned long len_pad, ret, off_sub; 1098 1099 if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) 1100 return 0; 1101 1102 if (off_end <= off_align || (off_end - off_align) < size) 1103 return 0; 1104 1105 len_pad = len + size; 1106 if (len_pad < len || (off + len_pad) < off) 1107 return 0; 1108 1109 ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, 1110 off >> PAGE_SHIFT, flags, vm_flags); 1111 1112 /* 1113 * The failure might be due to length padding. The caller will retry 1114 * without the padding. 1115 */ 1116 if (IS_ERR_VALUE(ret)) 1117 return 0; 1118 1119 /* 1120 * Do not try to align to THP boundary if allocation at the address 1121 * hint succeeds. 
1122 */ 1123 if (ret == addr) 1124 return addr; 1125 1126 off_sub = (off - ret) & (size - 1); 1127 1128 if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) 1129 return ret + size; 1130 1131 ret += off_sub; 1132 return ret; 1133 } 1134 1135 unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, 1136 unsigned long len, unsigned long pgoff, unsigned long flags, 1137 vm_flags_t vm_flags) 1138 { 1139 unsigned long ret; 1140 loff_t off = (loff_t)pgoff << PAGE_SHIFT; 1141 1142 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags); 1143 if (ret) 1144 return ret; 1145 1146 return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, 1147 vm_flags); 1148 } 1149 1150 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, 1151 unsigned long len, unsigned long pgoff, unsigned long flags) 1152 { 1153 return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); 1154 } 1155 EXPORT_SYMBOL_GPL(thp_get_unmapped_area); 1156 1157 static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, 1158 unsigned long addr) 1159 { 1160 gfp_t gfp = vma_thp_gfp_mask(vma); 1161 const int order = HPAGE_PMD_ORDER; 1162 struct folio *folio; 1163 1164 folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK); 1165 1166 if (unlikely(!folio)) { 1167 count_vm_event(THP_FAULT_FALLBACK); 1168 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); 1169 return NULL; 1170 } 1171 1172 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 1173 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { 1174 folio_put(folio); 1175 count_vm_event(THP_FAULT_FALLBACK); 1176 count_vm_event(THP_FAULT_FALLBACK_CHARGE); 1177 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); 1178 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); 1179 return NULL; 1180 } 1181 folio_throttle_swaprate(folio, gfp); 1182 1183 /* 1184 * When a folio is not zeroed during allocation (__GFP_ZERO not used) 1185 * or user folios require special handling, folio_zero_user() is used to 1186 * make sure that the page corresponding to the faulting address will be 1187 * hot in the cache after zeroing. 1188 */ 1189 if (user_alloc_needs_zeroing()) 1190 folio_zero_user(folio, addr); 1191 /* 1192 * The memory barrier inside __folio_mark_uptodate makes sure that 1193 * folio_zero_user writes become visible before the set_pmd_at() 1194 * write. 
1195 */ 1196 __folio_mark_uptodate(folio); 1197 return folio; 1198 } 1199 1200 static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, 1201 struct vm_area_struct *vma, unsigned long haddr) 1202 { 1203 pmd_t entry; 1204 1205 entry = folio_mk_pmd(folio, vma->vm_page_prot); 1206 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1207 folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); 1208 folio_add_lru_vma(folio, vma); 1209 set_pmd_at(vma->vm_mm, haddr, pmd, entry); 1210 update_mmu_cache_pmd(vma, haddr, pmd); 1211 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1212 count_vm_event(THP_FAULT_ALLOC); 1213 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); 1214 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); 1215 } 1216 1217 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) 1218 { 1219 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1220 struct vm_area_struct *vma = vmf->vma; 1221 struct folio *folio; 1222 pgtable_t pgtable; 1223 vm_fault_t ret = 0; 1224 1225 folio = vma_alloc_anon_folio_pmd(vma, vmf->address); 1226 if (unlikely(!folio)) 1227 return VM_FAULT_FALLBACK; 1228 1229 pgtable = pte_alloc_one(vma->vm_mm); 1230 if (unlikely(!pgtable)) { 1231 ret = VM_FAULT_OOM; 1232 goto release; 1233 } 1234 1235 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1236 if (unlikely(!pmd_none(*vmf->pmd))) { 1237 goto unlock_release; 1238 } else { 1239 ret = check_stable_address_space(vma->vm_mm); 1240 if (ret) 1241 goto unlock_release; 1242 1243 /* Deliver the page fault to userland */ 1244 if (userfaultfd_missing(vma)) { 1245 spin_unlock(vmf->ptl); 1246 folio_put(folio); 1247 pte_free(vma->vm_mm, pgtable); 1248 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1249 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1250 return ret; 1251 } 1252 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 1253 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); 1254 mm_inc_nr_ptes(vma->vm_mm); 1255 deferred_split_folio(folio, false); 1256 spin_unlock(vmf->ptl); 1257 } 1258 1259 return 0; 1260 unlock_release: 1261 spin_unlock(vmf->ptl); 1262 release: 1263 if (pgtable) 1264 pte_free(vma->vm_mm, pgtable); 1265 folio_put(folio); 1266 return ret; 1267 1268 } 1269 1270 /* 1271 * always: directly stall for all thp allocations 1272 * defer: wake kswapd and fail if not immediately available 1273 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise 1274 * fail if not immediately available 1275 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately 1276 * available 1277 * never: never stall for any thp allocation 1278 */ 1279 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) 1280 { 1281 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); 1282 1283 /* Always do synchronous compaction */ 1284 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 1285 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 1286 1287 /* Kick kcompactd and fail quickly */ 1288 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 1289 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 1290 1291 /* Synchronous compaction if madvised, otherwise kick kcompactd */ 1292 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 1293 return GFP_TRANSHUGE_LIGHT | 1294 (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 1295 __GFP_KSWAPD_RECLAIM); 1296 1297 /* Only do synchronous compaction if madvised */ 1298 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 1299 return GFP_TRANSHUGE_LIGHT | 1300 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); 1301 1302 return GFP_TRANSHUGE_LIGHT; 1303 } 1304 1305 /* Caller must hold page table lock. */ 1306 static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, 1307 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 1308 struct folio *zero_folio) 1309 { 1310 pmd_t entry; 1311 entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); 1312 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1313 set_pmd_at(mm, haddr, pmd, entry); 1314 mm_inc_nr_ptes(mm); 1315 } 1316 1317 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) 1318 { 1319 struct vm_area_struct *vma = vmf->vma; 1320 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1321 vm_fault_t ret; 1322 1323 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) 1324 return VM_FAULT_FALLBACK; 1325 ret = vmf_anon_prepare(vmf); 1326 if (ret) 1327 return ret; 1328 khugepaged_enter_vma(vma, vma->vm_flags); 1329 1330 if (!(vmf->flags & FAULT_FLAG_WRITE) && 1331 !mm_forbids_zeropage(vma->vm_mm) && 1332 transparent_hugepage_use_zero_page()) { 1333 pgtable_t pgtable; 1334 struct folio *zero_folio; 1335 vm_fault_t ret; 1336 1337 pgtable = pte_alloc_one(vma->vm_mm); 1338 if (unlikely(!pgtable)) 1339 return VM_FAULT_OOM; 1340 zero_folio = mm_get_huge_zero_folio(vma->vm_mm); 1341 if (unlikely(!zero_folio)) { 1342 pte_free(vma->vm_mm, pgtable); 1343 count_vm_event(THP_FAULT_FALLBACK); 1344 return VM_FAULT_FALLBACK; 1345 } 1346 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1347 ret = 0; 1348 if (pmd_none(*vmf->pmd)) { 1349 ret = check_stable_address_space(vma->vm_mm); 1350 if (ret) { 1351 spin_unlock(vmf->ptl); 1352 pte_free(vma->vm_mm, pgtable); 1353 } else if (userfaultfd_missing(vma)) { 1354 spin_unlock(vmf->ptl); 1355 pte_free(vma->vm_mm, pgtable); 1356 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1357 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1358 } else { 1359 set_huge_zero_folio(pgtable, vma->vm_mm, vma, 1360 haddr, vmf->pmd, zero_folio); 1361 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1362 spin_unlock(vmf->ptl); 1363 } 1364 } else { 1365 spin_unlock(vmf->ptl); 1366 pte_free(vma->vm_mm, pgtable); 1367 } 1368 return ret; 1369 } 1370 1371 return __do_huge_pmd_anonymous_page(vmf); 1372 } 1373 1374 struct folio_or_pfn { 1375 union { 1376 struct folio *folio; 1377 unsigned long pfn; 1378 }; 1379 bool is_folio; 1380 }; 1381 1382 static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, 1383 pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, 1384 bool write, pgtable_t pgtable) 1385 { 1386 struct mm_struct *mm = vma->vm_mm; 1387 pmd_t entry; 1388 1389 lockdep_assert_held(pmd_lockptr(mm, pmd)); 1390 1391 if (!pmd_none(*pmd)) { 1392 const unsigned long pfn = fop.is_folio ? 
folio_pfn(fop.folio) : 1393 fop.pfn; 1394 1395 if (write) { 1396 if (pmd_pfn(*pmd) != pfn) { 1397 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); 1398 return -EEXIST; 1399 } 1400 entry = pmd_mkyoung(*pmd); 1401 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1402 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) 1403 update_mmu_cache_pmd(vma, addr, pmd); 1404 } 1405 1406 return -EEXIST; 1407 } 1408 1409 if (fop.is_folio) { 1410 entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); 1411 1412 folio_get(fop.folio); 1413 folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); 1414 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); 1415 } else { 1416 entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); 1417 entry = pmd_mkspecial(entry); 1418 } 1419 if (write) { 1420 entry = pmd_mkyoung(pmd_mkdirty(entry)); 1421 entry = maybe_pmd_mkwrite(entry, vma); 1422 } 1423 1424 if (pgtable) { 1425 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1426 mm_inc_nr_ptes(mm); 1427 } 1428 1429 set_pmd_at(mm, addr, pmd, entry); 1430 update_mmu_cache_pmd(vma, addr, pmd); 1431 return 0; 1432 } 1433 1434 /** 1435 * vmf_insert_pfn_pmd - insert a pmd size pfn 1436 * @vmf: Structure describing the fault 1437 * @pfn: pfn to insert 1438 * @write: whether it's a write fault 1439 * 1440 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. 1441 * 1442 * Return: vm_fault_t value. 1443 */ 1444 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, 1445 bool write) 1446 { 1447 unsigned long addr = vmf->address & PMD_MASK; 1448 struct vm_area_struct *vma = vmf->vma; 1449 pgprot_t pgprot = vma->vm_page_prot; 1450 struct folio_or_pfn fop = { 1451 .pfn = pfn, 1452 }; 1453 pgtable_t pgtable = NULL; 1454 spinlock_t *ptl; 1455 int error; 1456 1457 /* 1458 * If we had pmd_special, we could avoid all these restrictions, 1459 * but we need to be consistent with PTEs and architectures that 1460 * can't support a 'special' bit. 
1461 */ 1462 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1463 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1464 (VM_PFNMAP|VM_MIXEDMAP)); 1465 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1466 1467 if (addr < vma->vm_start || addr >= vma->vm_end) 1468 return VM_FAULT_SIGBUS; 1469 1470 if (arch_needs_pgtable_deposit()) { 1471 pgtable = pte_alloc_one(vma->vm_mm); 1472 if (!pgtable) 1473 return VM_FAULT_OOM; 1474 } 1475 1476 pfnmap_setup_cachemode_pfn(pfn, &pgprot); 1477 1478 ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1479 error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write, 1480 pgtable); 1481 spin_unlock(ptl); 1482 if (error && pgtable) 1483 pte_free(vma->vm_mm, pgtable); 1484 1485 return VM_FAULT_NOPAGE; 1486 } 1487 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 1488 1489 vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, 1490 bool write) 1491 { 1492 struct vm_area_struct *vma = vmf->vma; 1493 unsigned long addr = vmf->address & PMD_MASK; 1494 struct mm_struct *mm = vma->vm_mm; 1495 struct folio_or_pfn fop = { 1496 .folio = folio, 1497 .is_folio = true, 1498 }; 1499 spinlock_t *ptl; 1500 pgtable_t pgtable = NULL; 1501 int error; 1502 1503 if (addr < vma->vm_start || addr >= vma->vm_end) 1504 return VM_FAULT_SIGBUS; 1505 1506 if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) 1507 return VM_FAULT_SIGBUS; 1508 1509 if (arch_needs_pgtable_deposit()) { 1510 pgtable = pte_alloc_one(vma->vm_mm); 1511 if (!pgtable) 1512 return VM_FAULT_OOM; 1513 } 1514 1515 ptl = pmd_lock(mm, vmf->pmd); 1516 error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, 1517 write, pgtable); 1518 spin_unlock(ptl); 1519 if (error && pgtable) 1520 pte_free(mm, pgtable); 1521 1522 return VM_FAULT_NOPAGE; 1523 } 1524 EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); 1525 1526 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1527 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) 1528 { 1529 if (likely(vma->vm_flags & VM_WRITE)) 1530 pud = pud_mkwrite(pud); 1531 return pud; 1532 } 1533 1534 static void insert_pud(struct vm_area_struct *vma, unsigned long addr, 1535 pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) 1536 { 1537 struct mm_struct *mm = vma->vm_mm; 1538 pud_t entry; 1539 1540 if (!pud_none(*pud)) { 1541 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : 1542 fop.pfn; 1543 1544 if (write) { 1545 if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) 1546 return; 1547 entry = pud_mkyoung(*pud); 1548 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); 1549 if (pudp_set_access_flags(vma, addr, pud, entry, 1)) 1550 update_mmu_cache_pud(vma, addr, pud); 1551 } 1552 return; 1553 } 1554 1555 if (fop.is_folio) { 1556 entry = folio_mk_pud(fop.folio, vma->vm_page_prot); 1557 1558 folio_get(fop.folio); 1559 folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma); 1560 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR); 1561 } else { 1562 entry = pud_mkhuge(pfn_pud(fop.pfn, prot)); 1563 entry = pud_mkspecial(entry); 1564 } 1565 if (write) { 1566 entry = pud_mkyoung(pud_mkdirty(entry)); 1567 entry = maybe_pud_mkwrite(entry, vma); 1568 } 1569 set_pud_at(mm, addr, pud, entry); 1570 update_mmu_cache_pud(vma, addr, pud); 1571 } 1572 1573 /** 1574 * vmf_insert_pfn_pud - insert a pud size pfn 1575 * @vmf: Structure describing the fault 1576 * @pfn: pfn to insert 1577 * @write: whether it's a write fault 1578 * 1579 * Insert a pud size pfn. See vmf_insert_pfn() for additional info. 1580 * 1581 * Return: vm_fault_t value. 
1582 */ 1583 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, 1584 bool write) 1585 { 1586 unsigned long addr = vmf->address & PUD_MASK; 1587 struct vm_area_struct *vma = vmf->vma; 1588 pgprot_t pgprot = vma->vm_page_prot; 1589 struct folio_or_pfn fop = { 1590 .pfn = pfn, 1591 }; 1592 spinlock_t *ptl; 1593 1594 /* 1595 * If we had pud_special, we could avoid all these restrictions, 1596 * but we need to be consistent with PTEs and architectures that 1597 * can't support a 'special' bit. 1598 */ 1599 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1600 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1601 (VM_PFNMAP|VM_MIXEDMAP)); 1602 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1603 1604 if (addr < vma->vm_start || addr >= vma->vm_end) 1605 return VM_FAULT_SIGBUS; 1606 1607 pfnmap_setup_cachemode_pfn(pfn, &pgprot); 1608 1609 ptl = pud_lock(vma->vm_mm, vmf->pud); 1610 insert_pud(vma, addr, vmf->pud, fop, pgprot, write); 1611 spin_unlock(ptl); 1612 1613 return VM_FAULT_NOPAGE; 1614 } 1615 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 1616 1617 /** 1618 * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry 1619 * @vmf: Structure describing the fault 1620 * @folio: folio to insert 1621 * @write: whether it's a write fault 1622 * 1623 * Return: vm_fault_t value. 1624 */ 1625 vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, 1626 bool write) 1627 { 1628 struct vm_area_struct *vma = vmf->vma; 1629 unsigned long addr = vmf->address & PUD_MASK; 1630 pud_t *pud = vmf->pud; 1631 struct mm_struct *mm = vma->vm_mm; 1632 struct folio_or_pfn fop = { 1633 .folio = folio, 1634 .is_folio = true, 1635 }; 1636 spinlock_t *ptl; 1637 1638 if (addr < vma->vm_start || addr >= vma->vm_end) 1639 return VM_FAULT_SIGBUS; 1640 1641 if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) 1642 return VM_FAULT_SIGBUS; 1643 1644 ptl = pud_lock(mm, pud); 1645 insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); 1646 spin_unlock(ptl); 1647 1648 return VM_FAULT_NOPAGE; 1649 } 1650 EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); 1651 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1652 1653 void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1654 pmd_t *pmd, bool write) 1655 { 1656 pmd_t _pmd; 1657 1658 _pmd = pmd_mkyoung(*pmd); 1659 if (write) 1660 _pmd = pmd_mkdirty(_pmd); 1661 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1662 pmd, _pmd, write)) 1663 update_mmu_cache_pmd(vma, addr, pmd); 1664 } 1665 1666 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1667 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1668 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) 1669 { 1670 spinlock_t *dst_ptl, *src_ptl; 1671 struct page *src_page; 1672 struct folio *src_folio; 1673 pmd_t pmd; 1674 pgtable_t pgtable = NULL; 1675 int ret = -ENOMEM; 1676 1677 pmd = pmdp_get_lockless(src_pmd); 1678 if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { 1679 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1680 src_ptl = pmd_lockptr(src_mm, src_pmd); 1681 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1682 /* 1683 * No need to recheck the pmd, it can't change with write 1684 * mmap lock held here. 1685 * 1686 * Meanwhile, making sure it's not a CoW VMA with writable 1687 * mapping, otherwise it means either the anon page wrongly 1688 * applied special bit, or we made the PRIVATE mapping be 1689 * able to wrongly write to the backend MMIO. 
1690 */ 1691 VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); 1692 goto set_pmd; 1693 } 1694 1695 /* Skip if can be re-fill on fault */ 1696 if (!vma_is_anonymous(dst_vma)) 1697 return 0; 1698 1699 pgtable = pte_alloc_one(dst_mm); 1700 if (unlikely(!pgtable)) 1701 goto out; 1702 1703 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1704 src_ptl = pmd_lockptr(src_mm, src_pmd); 1705 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1706 1707 ret = -EAGAIN; 1708 pmd = *src_pmd; 1709 1710 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1711 if (unlikely(is_swap_pmd(pmd))) { 1712 swp_entry_t entry = pmd_to_swp_entry(pmd); 1713 1714 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1715 if (!is_readable_migration_entry(entry)) { 1716 entry = make_readable_migration_entry( 1717 swp_offset(entry)); 1718 pmd = swp_entry_to_pmd(entry); 1719 if (pmd_swp_soft_dirty(*src_pmd)) 1720 pmd = pmd_swp_mksoft_dirty(pmd); 1721 if (pmd_swp_uffd_wp(*src_pmd)) 1722 pmd = pmd_swp_mkuffd_wp(pmd); 1723 set_pmd_at(src_mm, addr, src_pmd, pmd); 1724 } 1725 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1726 mm_inc_nr_ptes(dst_mm); 1727 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1728 if (!userfaultfd_wp(dst_vma)) 1729 pmd = pmd_swp_clear_uffd_wp(pmd); 1730 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1731 ret = 0; 1732 goto out_unlock; 1733 } 1734 #endif 1735 1736 if (unlikely(!pmd_trans_huge(pmd))) { 1737 pte_free(dst_mm, pgtable); 1738 goto out_unlock; 1739 } 1740 /* 1741 * When page table lock is held, the huge zero pmd should not be 1742 * under splitting since we don't split the page itself, only pmd to 1743 * a page table. 1744 */ 1745 if (is_huge_zero_pmd(pmd)) { 1746 /* 1747 * mm_get_huge_zero_folio() will never allocate a new 1748 * folio here, since we already have a zero page to 1749 * copy. It just takes a reference. 1750 */ 1751 mm_get_huge_zero_folio(dst_mm); 1752 goto out_zero_page; 1753 } 1754 1755 src_page = pmd_page(pmd); 1756 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 1757 src_folio = page_folio(src_page); 1758 1759 folio_get(src_folio); 1760 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) { 1761 /* Page maybe pinned: split and retry the fault on PTEs. 
*/ 1762 folio_put(src_folio); 1763 pte_free(dst_mm, pgtable); 1764 spin_unlock(src_ptl); 1765 spin_unlock(dst_ptl); 1766 __split_huge_pmd(src_vma, src_pmd, addr, false); 1767 return -EAGAIN; 1768 } 1769 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1770 out_zero_page: 1771 mm_inc_nr_ptes(dst_mm); 1772 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1773 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1774 if (!userfaultfd_wp(dst_vma)) 1775 pmd = pmd_clear_uffd_wp(pmd); 1776 pmd = pmd_wrprotect(pmd); 1777 set_pmd: 1778 pmd = pmd_mkold(pmd); 1779 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1780 1781 ret = 0; 1782 out_unlock: 1783 spin_unlock(src_ptl); 1784 spin_unlock(dst_ptl); 1785 out: 1786 return ret; 1787 } 1788 1789 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1790 void touch_pud(struct vm_area_struct *vma, unsigned long addr, 1791 pud_t *pud, bool write) 1792 { 1793 pud_t _pud; 1794 1795 _pud = pud_mkyoung(*pud); 1796 if (write) 1797 _pud = pud_mkdirty(_pud); 1798 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, 1799 pud, _pud, write)) 1800 update_mmu_cache_pud(vma, addr, pud); 1801 } 1802 1803 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1804 pud_t *dst_pud, pud_t *src_pud, unsigned long addr, 1805 struct vm_area_struct *vma) 1806 { 1807 spinlock_t *dst_ptl, *src_ptl; 1808 pud_t pud; 1809 int ret; 1810 1811 dst_ptl = pud_lock(dst_mm, dst_pud); 1812 src_ptl = pud_lockptr(src_mm, src_pud); 1813 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1814 1815 ret = -EAGAIN; 1816 pud = *src_pud; 1817 if (unlikely(!pud_trans_huge(pud))) 1818 goto out_unlock; 1819 1820 /* 1821 * TODO: once we support anonymous pages, use 1822 * folio_try_dup_anon_rmap_*() and split if duplicating fails. 1823 */ 1824 if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { 1825 pudp_set_wrprotect(src_mm, addr, src_pud); 1826 pud = pud_wrprotect(pud); 1827 } 1828 pud = pud_mkold(pud); 1829 set_pud_at(dst_mm, addr, dst_pud, pud); 1830 1831 ret = 0; 1832 out_unlock: 1833 spin_unlock(src_ptl); 1834 spin_unlock(dst_ptl); 1835 return ret; 1836 } 1837 1838 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) 1839 { 1840 bool write = vmf->flags & FAULT_FLAG_WRITE; 1841 1842 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); 1843 if (unlikely(!pud_same(*vmf->pud, orig_pud))) 1844 goto unlock; 1845 1846 touch_pud(vmf->vma, vmf->address, vmf->pud, write); 1847 unlock: 1848 spin_unlock(vmf->ptl); 1849 } 1850 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1851 1852 void huge_pmd_set_accessed(struct vm_fault *vmf) 1853 { 1854 bool write = vmf->flags & FAULT_FLAG_WRITE; 1855 1856 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1857 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) 1858 goto unlock; 1859 1860 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); 1861 1862 unlock: 1863 spin_unlock(vmf->ptl); 1864 } 1865 1866 static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) 1867 { 1868 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1869 struct vm_area_struct *vma = vmf->vma; 1870 struct mmu_notifier_range range; 1871 struct folio *folio; 1872 vm_fault_t ret = 0; 1873 1874 folio = vma_alloc_anon_folio_pmd(vma, vmf->address); 1875 if (unlikely(!folio)) 1876 return VM_FAULT_FALLBACK; 1877 1878 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, 1879 haddr + HPAGE_PMD_SIZE); 1880 mmu_notifier_invalidate_range_start(&range); 1881 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1882 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) 1883 goto 
release; 1884 ret = check_stable_address_space(vma->vm_mm); 1885 if (ret) 1886 goto release; 1887 (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); 1888 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); 1889 goto unlock; 1890 release: 1891 folio_put(folio); 1892 unlock: 1893 spin_unlock(vmf->ptl); 1894 mmu_notifier_invalidate_range_end(&range); 1895 return ret; 1896 } 1897 1898 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) 1899 { 1900 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; 1901 struct vm_area_struct *vma = vmf->vma; 1902 struct folio *folio; 1903 struct page *page; 1904 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1905 pmd_t orig_pmd = vmf->orig_pmd; 1906 1907 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); 1908 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1909 1910 if (is_huge_zero_pmd(orig_pmd)) { 1911 vm_fault_t ret = do_huge_zero_wp_pmd(vmf); 1912 1913 if (!(ret & VM_FAULT_FALLBACK)) 1914 return ret; 1915 1916 /* Fallback to splitting PMD if THP cannot be allocated */ 1917 goto fallback; 1918 } 1919 1920 spin_lock(vmf->ptl); 1921 1922 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1923 spin_unlock(vmf->ptl); 1924 return 0; 1925 } 1926 1927 page = pmd_page(orig_pmd); 1928 folio = page_folio(page); 1929 VM_BUG_ON_PAGE(!PageHead(page), page); 1930 1931 /* Early check when only holding the PT lock. */ 1932 if (PageAnonExclusive(page)) 1933 goto reuse; 1934 1935 if (!folio_trylock(folio)) { 1936 folio_get(folio); 1937 spin_unlock(vmf->ptl); 1938 folio_lock(folio); 1939 spin_lock(vmf->ptl); 1940 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1941 spin_unlock(vmf->ptl); 1942 folio_unlock(folio); 1943 folio_put(folio); 1944 return 0; 1945 } 1946 folio_put(folio); 1947 } 1948 1949 /* Recheck after temporarily dropping the PT lock. */ 1950 if (PageAnonExclusive(page)) { 1951 folio_unlock(folio); 1952 goto reuse; 1953 } 1954 1955 /* 1956 * See do_wp_page(): we can only reuse the folio exclusively if 1957 * there are no additional references. Note that we always drain 1958 * the LRU cache immediately after adding a THP. 1959 */ 1960 if (folio_ref_count(folio) > 1961 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) 1962 goto unlock_fallback; 1963 if (folio_test_swapcache(folio)) 1964 folio_free_swap(folio); 1965 if (folio_ref_count(folio) == 1) { 1966 pmd_t entry; 1967 1968 folio_move_anon_rmap(folio, vma); 1969 SetPageAnonExclusive(page); 1970 folio_unlock(folio); 1971 reuse: 1972 if (unlikely(unshare)) { 1973 spin_unlock(vmf->ptl); 1974 return 0; 1975 } 1976 entry = pmd_mkyoung(orig_pmd); 1977 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1978 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1979 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1980 spin_unlock(vmf->ptl); 1981 return 0; 1982 } 1983 1984 unlock_fallback: 1985 folio_unlock(folio); 1986 spin_unlock(vmf->ptl); 1987 fallback: 1988 __split_huge_pmd(vma, vmf->pmd, vmf->address, false); 1989 return VM_FAULT_FALLBACK; 1990 } 1991 1992 static inline bool can_change_pmd_writable(struct vm_area_struct *vma, 1993 unsigned long addr, pmd_t pmd) 1994 { 1995 struct page *page; 1996 1997 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) 1998 return false; 1999 2000 /* Don't touch entries that are not even readable (NUMA hinting). */ 2001 if (pmd_protnone(pmd)) 2002 return false; 2003 2004 /* Do we need write faults for softdirty tracking? */ 2005 if (pmd_needs_soft_dirty_wp(vma, pmd)) 2006 return false; 2007 2008 /* Do we need write faults for uffd-wp tracking? 
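 * If uffd-wp is armed on this PMD we must keep it write-protected so
 * that the next write still faults and gets reported to userspace.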
*/ 2009 if (userfaultfd_huge_pmd_wp(vma, pmd)) 2010 return false; 2011 2012 if (!(vma->vm_flags & VM_SHARED)) { 2013 /* See can_change_pte_writable(). */ 2014 page = vm_normal_page_pmd(vma, addr, pmd); 2015 return page && PageAnon(page) && PageAnonExclusive(page); 2016 } 2017 2018 /* See can_change_pte_writable(). */ 2019 return pmd_dirty(pmd); 2020 } 2021 2022 /* NUMA hinting page fault entry point for trans huge pmds */ 2023 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) 2024 { 2025 struct vm_area_struct *vma = vmf->vma; 2026 struct folio *folio; 2027 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 2028 int nid = NUMA_NO_NODE; 2029 int target_nid, last_cpupid; 2030 pmd_t pmd, old_pmd; 2031 bool writable = false; 2032 int flags = 0; 2033 2034 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 2035 old_pmd = pmdp_get(vmf->pmd); 2036 2037 if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) { 2038 spin_unlock(vmf->ptl); 2039 return 0; 2040 } 2041 2042 pmd = pmd_modify(old_pmd, vma->vm_page_prot); 2043 2044 /* 2045 * Detect now whether the PMD could be writable; this information 2046 * is only valid while holding the PT lock. 2047 */ 2048 writable = pmd_write(pmd); 2049 if (!writable && vma_wants_manual_pte_write_upgrade(vma) && 2050 can_change_pmd_writable(vma, vmf->address, pmd)) 2051 writable = true; 2052 2053 folio = vm_normal_folio_pmd(vma, haddr, pmd); 2054 if (!folio) 2055 goto out_map; 2056 2057 nid = folio_nid(folio); 2058 2059 target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable, 2060 &last_cpupid); 2061 if (target_nid == NUMA_NO_NODE) 2062 goto out_map; 2063 if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { 2064 flags |= TNF_MIGRATE_FAIL; 2065 goto out_map; 2066 } 2067 /* The folio is isolated and isolation code holds a folio reference. */ 2068 spin_unlock(vmf->ptl); 2069 writable = false; 2070 2071 if (!migrate_misplaced_folio(folio, target_nid)) { 2072 flags |= TNF_MIGRATED; 2073 nid = target_nid; 2074 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); 2075 return 0; 2076 } 2077 2078 flags |= TNF_MIGRATE_FAIL; 2079 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 2080 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) { 2081 spin_unlock(vmf->ptl); 2082 return 0; 2083 } 2084 out_map: 2085 /* Restore the PMD */ 2086 pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot); 2087 pmd = pmd_mkyoung(pmd); 2088 if (writable) 2089 pmd = pmd_mkwrite(pmd, vma); 2090 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); 2091 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 2092 spin_unlock(vmf->ptl); 2093 2094 if (nid != NUMA_NO_NODE) 2095 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); 2096 return 0; 2097 } 2098 2099 /* 2100 * Return true if we do MADV_FREE successfully on entire pmd page. 2101 * Otherwise, return false. 
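 *
 * Callers in the MADV_FREE page walk are expected to use the return
 * value roughly like this (illustrative sketch, not the exact caller):
 *
 *	if (pmd_trans_huge(*pmd))
 *		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 *			return 0;	// whole PMD handled, skip the PTE walk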
2102 */ 2103 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2104 pmd_t *pmd, unsigned long addr, unsigned long next) 2105 { 2106 spinlock_t *ptl; 2107 pmd_t orig_pmd; 2108 struct folio *folio; 2109 struct mm_struct *mm = tlb->mm; 2110 bool ret = false; 2111 2112 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2113 2114 ptl = pmd_trans_huge_lock(pmd, vma); 2115 if (!ptl) 2116 goto out_unlocked; 2117 2118 orig_pmd = *pmd; 2119 if (is_huge_zero_pmd(orig_pmd)) 2120 goto out; 2121 2122 if (unlikely(!pmd_present(orig_pmd))) { 2123 VM_BUG_ON(thp_migration_supported() && 2124 !is_pmd_migration_entry(orig_pmd)); 2125 goto out; 2126 } 2127 2128 folio = pmd_folio(orig_pmd); 2129 /* 2130 * If other processes are mapping this folio, we couldn't discard 2131 * the folio unless they all do MADV_FREE so let's skip the folio. 2132 */ 2133 if (folio_maybe_mapped_shared(folio)) 2134 goto out; 2135 2136 if (!folio_trylock(folio)) 2137 goto out; 2138 2139 /* 2140 * If user want to discard part-pages of THP, split it so MADV_FREE 2141 * will deactivate only them. 2142 */ 2143 if (next - addr != HPAGE_PMD_SIZE) { 2144 folio_get(folio); 2145 spin_unlock(ptl); 2146 split_folio(folio); 2147 folio_unlock(folio); 2148 folio_put(folio); 2149 goto out_unlocked; 2150 } 2151 2152 if (folio_test_dirty(folio)) 2153 folio_clear_dirty(folio); 2154 folio_unlock(folio); 2155 2156 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 2157 pmdp_invalidate(vma, addr, pmd); 2158 orig_pmd = pmd_mkold(orig_pmd); 2159 orig_pmd = pmd_mkclean(orig_pmd); 2160 2161 set_pmd_at(mm, addr, pmd, orig_pmd); 2162 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2163 } 2164 2165 folio_mark_lazyfree(folio); 2166 ret = true; 2167 out: 2168 spin_unlock(ptl); 2169 out_unlocked: 2170 return ret; 2171 } 2172 2173 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 2174 { 2175 pgtable_t pgtable; 2176 2177 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2178 pte_free(mm, pgtable); 2179 mm_dec_nr_ptes(mm); 2180 } 2181 2182 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2183 pmd_t *pmd, unsigned long addr) 2184 { 2185 pmd_t orig_pmd; 2186 spinlock_t *ptl; 2187 2188 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2189 2190 ptl = __pmd_trans_huge_lock(pmd, vma); 2191 if (!ptl) 2192 return 0; 2193 /* 2194 * For architectures like ppc64 we look at deposited pgtable 2195 * when calling pmdp_huge_get_and_clear. So do the 2196 * pgtable_trans_huge_withdraw after finishing pmdp related 2197 * operations. 
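 * In other words: clear the huge PMD first, and only then withdraw and
 * free the deposited page table via zap_deposited_table().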
2198 */ 2199 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, 2200 tlb->fullmm); 2201 arch_check_zapped_pmd(vma, orig_pmd); 2202 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2203 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { 2204 if (arch_needs_pgtable_deposit()) 2205 zap_deposited_table(tlb->mm, pmd); 2206 spin_unlock(ptl); 2207 } else if (is_huge_zero_pmd(orig_pmd)) { 2208 if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) 2209 zap_deposited_table(tlb->mm, pmd); 2210 spin_unlock(ptl); 2211 } else { 2212 struct folio *folio = NULL; 2213 int flush_needed = 1; 2214 2215 if (pmd_present(orig_pmd)) { 2216 struct page *page = pmd_page(orig_pmd); 2217 2218 folio = page_folio(page); 2219 folio_remove_rmap_pmd(folio, page, vma); 2220 WARN_ON_ONCE(folio_mapcount(folio) < 0); 2221 VM_BUG_ON_PAGE(!PageHead(page), page); 2222 } else if (thp_migration_supported()) { 2223 swp_entry_t entry; 2224 2225 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); 2226 entry = pmd_to_swp_entry(orig_pmd); 2227 folio = pfn_swap_entry_folio(entry); 2228 flush_needed = 0; 2229 } else 2230 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 2231 2232 if (folio_test_anon(folio)) { 2233 zap_deposited_table(tlb->mm, pmd); 2234 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 2235 } else { 2236 if (arch_needs_pgtable_deposit()) 2237 zap_deposited_table(tlb->mm, pmd); 2238 add_mm_counter(tlb->mm, mm_counter_file(folio), 2239 -HPAGE_PMD_NR); 2240 2241 /* 2242 * Use flush_needed to indicate whether the PMD entry 2243 * is present, instead of checking pmd_present() again. 2244 */ 2245 if (flush_needed && pmd_young(orig_pmd) && 2246 likely(vma_has_recency(vma))) 2247 folio_mark_accessed(folio); 2248 } 2249 2250 spin_unlock(ptl); 2251 if (flush_needed) 2252 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); 2253 } 2254 return 1; 2255 } 2256 2257 #ifndef pmd_move_must_withdraw 2258 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 2259 spinlock_t *old_pmd_ptl, 2260 struct vm_area_struct *vma) 2261 { 2262 /* 2263 * With split pmd lock we also need to move preallocated 2264 * PTE page table if new_pmd is on different PMD page table. 2265 * 2266 * We also don't deposit and withdraw tables for file pages. 2267 */ 2268 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 2269 } 2270 #endif 2271 2272 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 2273 { 2274 #ifdef CONFIG_MEM_SOFT_DIRTY 2275 if (unlikely(is_pmd_migration_entry(pmd))) 2276 pmd = pmd_swp_mksoft_dirty(pmd); 2277 else if (pmd_present(pmd)) 2278 pmd = pmd_mksoft_dirty(pmd); 2279 #endif 2280 return pmd; 2281 } 2282 2283 static pmd_t clear_uffd_wp_pmd(pmd_t pmd) 2284 { 2285 if (pmd_present(pmd)) 2286 pmd = pmd_clear_uffd_wp(pmd); 2287 else if (is_swap_pmd(pmd)) 2288 pmd = pmd_swp_clear_uffd_wp(pmd); 2289 2290 return pmd; 2291 } 2292 2293 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 2294 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) 2295 { 2296 spinlock_t *old_ptl, *new_ptl; 2297 pmd_t pmd; 2298 struct mm_struct *mm = vma->vm_mm; 2299 bool force_flush = false; 2300 2301 /* 2302 * The destination pmd shouldn't be established, free_pgtables() 2303 * should have released it; but move_page_tables() might have already 2304 * inserted a page table, if racing against shmem/file collapse. 
2305 */ 2306 if (!pmd_none(*new_pmd)) { 2307 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 2308 return false; 2309 } 2310 2311 /* 2312 * We don't have to worry about the ordering of src and dst 2313 * ptlocks because exclusive mmap_lock prevents deadlock. 2314 */ 2315 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 2316 if (old_ptl) { 2317 new_ptl = pmd_lockptr(mm, new_pmd); 2318 if (new_ptl != old_ptl) 2319 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 2320 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 2321 if (pmd_present(pmd)) 2322 force_flush = true; 2323 VM_BUG_ON(!pmd_none(*new_pmd)); 2324 2325 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 2326 pgtable_t pgtable; 2327 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 2328 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 2329 } 2330 pmd = move_soft_dirty_pmd(pmd); 2331 if (vma_has_uffd_without_event_remap(vma)) 2332 pmd = clear_uffd_wp_pmd(pmd); 2333 set_pmd_at(mm, new_addr, new_pmd, pmd); 2334 if (force_flush) 2335 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 2336 if (new_ptl != old_ptl) 2337 spin_unlock(new_ptl); 2338 spin_unlock(old_ptl); 2339 return true; 2340 } 2341 return false; 2342 } 2343 2344 /* 2345 * Returns 2346 * - 0 if PMD could not be locked 2347 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary 2348 * or if prot_numa but THP migration is not supported 2349 * - HPAGE_PMD_NR if protections changed and TLB flush necessary 2350 */ 2351 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2352 pmd_t *pmd, unsigned long addr, pgprot_t newprot, 2353 unsigned long cp_flags) 2354 { 2355 struct mm_struct *mm = vma->vm_mm; 2356 spinlock_t *ptl; 2357 pmd_t oldpmd, entry; 2358 bool prot_numa = cp_flags & MM_CP_PROT_NUMA; 2359 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 2360 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 2361 int ret = 1; 2362 2363 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2364 2365 if (prot_numa && !thp_migration_supported()) 2366 return 1; 2367 2368 ptl = __pmd_trans_huge_lock(pmd, vma); 2369 if (!ptl) 2370 return 0; 2371 2372 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2373 if (is_swap_pmd(*pmd)) { 2374 swp_entry_t entry = pmd_to_swp_entry(*pmd); 2375 struct folio *folio = pfn_swap_entry_folio(entry); 2376 pmd_t newpmd; 2377 2378 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 2379 if (is_writable_migration_entry(entry)) { 2380 /* 2381 * A protection check is difficult so 2382 * just be safe and disable write 2383 */ 2384 if (folio_test_anon(folio)) 2385 entry = make_readable_exclusive_migration_entry(swp_offset(entry)); 2386 else 2387 entry = make_readable_migration_entry(swp_offset(entry)); 2388 newpmd = swp_entry_to_pmd(entry); 2389 if (pmd_swp_soft_dirty(*pmd)) 2390 newpmd = pmd_swp_mksoft_dirty(newpmd); 2391 } else { 2392 newpmd = *pmd; 2393 } 2394 2395 if (uffd_wp) 2396 newpmd = pmd_swp_mkuffd_wp(newpmd); 2397 else if (uffd_wp_resolve) 2398 newpmd = pmd_swp_clear_uffd_wp(newpmd); 2399 if (!pmd_same(*pmd, newpmd)) 2400 set_pmd_at(mm, addr, pmd, newpmd); 2401 goto unlock; 2402 } 2403 #endif 2404 2405 if (prot_numa) { 2406 struct folio *folio; 2407 bool toptier; 2408 /* 2409 * Avoid trapping faults against the zero page. The read-only 2410 * data is likely to be read-cached on the local CPU and 2411 * local/remote hits to the zero page are not interesting. 
2412 */ 2413 if (is_huge_zero_pmd(*pmd)) 2414 goto unlock; 2415 2416 if (pmd_protnone(*pmd)) 2417 goto unlock; 2418 2419 folio = pmd_folio(*pmd); 2420 toptier = node_is_toptier(folio_nid(folio)); 2421 /* 2422 * Skip scanning top tier node if normal numa 2423 * balancing is disabled 2424 */ 2425 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 2426 toptier) 2427 goto unlock; 2428 2429 if (folio_use_access_time(folio)) 2430 folio_xchg_access_time(folio, 2431 jiffies_to_msecs(jiffies)); 2432 } 2433 /* 2434 * In case prot_numa, we are under mmap_read_lock(mm). It's critical 2435 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 2436 * which is also under mmap_read_lock(mm): 2437 * 2438 * CPU0: CPU1: 2439 * change_huge_pmd(prot_numa=1) 2440 * pmdp_huge_get_and_clear_notify() 2441 * madvise_dontneed() 2442 * zap_pmd_range() 2443 * pmd_trans_huge(*pmd) == 0 (without ptl) 2444 * // skip the pmd 2445 * set_pmd_at(); 2446 * // pmd is re-established 2447 * 2448 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 2449 * which may break userspace. 2450 * 2451 * pmdp_invalidate_ad() is required to make sure we don't miss 2452 * dirty/young flags set by hardware. 2453 */ 2454 oldpmd = pmdp_invalidate_ad(vma, addr, pmd); 2455 2456 entry = pmd_modify(oldpmd, newprot); 2457 if (uffd_wp) 2458 entry = pmd_mkuffd_wp(entry); 2459 else if (uffd_wp_resolve) 2460 /* 2461 * Leave the write bit to be handled by PF interrupt 2462 * handler, then things like COW could be properly 2463 * handled. 2464 */ 2465 entry = pmd_clear_uffd_wp(entry); 2466 2467 /* See change_pte_range(). */ 2468 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && 2469 can_change_pmd_writable(vma, addr, entry)) 2470 entry = pmd_mkwrite(entry, vma); 2471 2472 ret = HPAGE_PMD_NR; 2473 set_pmd_at(mm, addr, pmd, entry); 2474 2475 if (huge_pmd_needs_flush(oldpmd, entry)) 2476 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); 2477 unlock: 2478 spin_unlock(ptl); 2479 return ret; 2480 } 2481 2482 /* 2483 * Returns: 2484 * 2485 * - 0: if pud leaf changed from under us 2486 * - 1: if pud can be skipped 2487 * - HPAGE_PUD_NR: if pud was successfully processed 2488 */ 2489 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2490 int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2491 pud_t *pudp, unsigned long addr, pgprot_t newprot, 2492 unsigned long cp_flags) 2493 { 2494 struct mm_struct *mm = vma->vm_mm; 2495 pud_t oldpud, entry; 2496 spinlock_t *ptl; 2497 2498 tlb_change_page_size(tlb, HPAGE_PUD_SIZE); 2499 2500 /* NUMA balancing doesn't apply to dax */ 2501 if (cp_flags & MM_CP_PROT_NUMA) 2502 return 1; 2503 2504 /* 2505 * Huge entries on userfault-wp only works with anonymous, while we 2506 * don't have anonymous PUDs yet. 2507 */ 2508 if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL)) 2509 return 1; 2510 2511 ptl = __pud_trans_huge_lock(pudp, vma); 2512 if (!ptl) 2513 return 0; 2514 2515 /* 2516 * Can't clear PUD or it can race with concurrent zapping. See 2517 * change_huge_pmd(). 2518 */ 2519 oldpud = pudp_invalidate(vma, addr, pudp); 2520 entry = pud_modify(oldpud, newprot); 2521 set_pud_at(mm, addr, pudp, entry); 2522 tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE); 2523 2524 spin_unlock(ptl); 2525 return HPAGE_PUD_NR; 2526 } 2527 #endif 2528 2529 #ifdef CONFIG_USERFAULTFD 2530 /* 2531 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by 2532 * the caller, but it must return after releasing the page_table_lock. 
2533 * Just move the page from src_pmd to dst_pmd if possible. 2534 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be 2535 * repeated by the caller, or other errors in case of failure. 2536 */ 2537 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 2538 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 2539 unsigned long dst_addr, unsigned long src_addr) 2540 { 2541 pmd_t _dst_pmd, src_pmdval; 2542 struct page *src_page; 2543 struct folio *src_folio; 2544 struct anon_vma *src_anon_vma; 2545 spinlock_t *src_ptl, *dst_ptl; 2546 pgtable_t src_pgtable; 2547 struct mmu_notifier_range range; 2548 int err = 0; 2549 2550 src_pmdval = *src_pmd; 2551 src_ptl = pmd_lockptr(mm, src_pmd); 2552 2553 lockdep_assert_held(src_ptl); 2554 vma_assert_locked(src_vma); 2555 vma_assert_locked(dst_vma); 2556 2557 /* Sanity checks before the operation */ 2558 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || 2559 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { 2560 spin_unlock(src_ptl); 2561 return -EINVAL; 2562 } 2563 2564 if (!pmd_trans_huge(src_pmdval)) { 2565 spin_unlock(src_ptl); 2566 if (is_pmd_migration_entry(src_pmdval)) { 2567 pmd_migration_entry_wait(mm, &src_pmdval); 2568 return -EAGAIN; 2569 } 2570 return -ENOENT; 2571 } 2572 2573 src_page = pmd_page(src_pmdval); 2574 2575 if (!is_huge_zero_pmd(src_pmdval)) { 2576 if (unlikely(!PageAnonExclusive(src_page))) { 2577 spin_unlock(src_ptl); 2578 return -EBUSY; 2579 } 2580 2581 src_folio = page_folio(src_page); 2582 folio_get(src_folio); 2583 } else 2584 src_folio = NULL; 2585 2586 spin_unlock(src_ptl); 2587 2588 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); 2589 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, 2590 src_addr + HPAGE_PMD_SIZE); 2591 mmu_notifier_invalidate_range_start(&range); 2592 2593 if (src_folio) { 2594 folio_lock(src_folio); 2595 2596 /* 2597 * split_huge_page walks the anon_vma chain without the page 2598 * lock. Serialize against it with the anon_vma lock, the page 2599 * lock is not enough. 2600 */ 2601 src_anon_vma = folio_get_anon_vma(src_folio); 2602 if (!src_anon_vma) { 2603 err = -EAGAIN; 2604 goto unlock_folio; 2605 } 2606 anon_vma_lock_write(src_anon_vma); 2607 } else 2608 src_anon_vma = NULL; 2609 2610 dst_ptl = pmd_lockptr(mm, dst_pmd); 2611 double_pt_lock(src_ptl, dst_ptl); 2612 if (unlikely(!pmd_same(*src_pmd, src_pmdval) || 2613 !pmd_same(*dst_pmd, dst_pmdval))) { 2614 err = -EAGAIN; 2615 goto unlock_ptls; 2616 } 2617 if (src_folio) { 2618 if (folio_maybe_dma_pinned(src_folio) || 2619 !PageAnonExclusive(&src_folio->page)) { 2620 err = -EBUSY; 2621 goto unlock_ptls; 2622 } 2623 2624 if (WARN_ON_ONCE(!folio_test_head(src_folio)) || 2625 WARN_ON_ONCE(!folio_test_anon(src_folio))) { 2626 err = -EBUSY; 2627 goto unlock_ptls; 2628 } 2629 2630 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2631 /* Folio got pinned from under us. Put it back and fail the move. 
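 * The pin check is repeated here because GUP-fast may have taken a
 * reference at any point up until the PMD was cleared and the TLB
 * flushed by pmdp_huge_clear_flush() above.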
*/ 2632 if (folio_maybe_dma_pinned(src_folio)) { 2633 set_pmd_at(mm, src_addr, src_pmd, src_pmdval); 2634 err = -EBUSY; 2635 goto unlock_ptls; 2636 } 2637 2638 folio_move_anon_rmap(src_folio, dst_vma); 2639 src_folio->index = linear_page_index(dst_vma, dst_addr); 2640 2641 _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); 2642 /* Follow mremap() behavior and treat the entry dirty after the move */ 2643 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); 2644 } else { 2645 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2646 _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); 2647 } 2648 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); 2649 2650 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); 2651 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2652 unlock_ptls: 2653 double_pt_unlock(src_ptl, dst_ptl); 2654 if (src_anon_vma) { 2655 anon_vma_unlock_write(src_anon_vma); 2656 put_anon_vma(src_anon_vma); 2657 } 2658 unlock_folio: 2659 /* unblock rmap walks */ 2660 if (src_folio) 2661 folio_unlock(src_folio); 2662 mmu_notifier_invalidate_range_end(&range); 2663 if (src_folio) 2664 folio_put(src_folio); 2665 return err; 2666 } 2667 #endif /* CONFIG_USERFAULTFD */ 2668 2669 /* 2670 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2671 * 2672 * Note that if it returns page table lock pointer, this routine returns without 2673 * unlocking page table lock. So callers must unlock it. 2674 */ 2675 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2676 { 2677 spinlock_t *ptl; 2678 ptl = pmd_lock(vma->vm_mm, pmd); 2679 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) 2680 return ptl; 2681 spin_unlock(ptl); 2682 return NULL; 2683 } 2684 2685 /* 2686 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 2687 * 2688 * Note that if it returns page table lock pointer, this routine returns without 2689 * unlocking page table lock. So callers must unlock it. 
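 *
 * Typical usage (sketch, mirroring zap_huge_pud() below):
 *
 *	ptl = __pud_trans_huge_lock(pud, vma);
 *	if (!ptl)
 *		return 0;
 *	// ... operate on the huge pud ...
 *	spin_unlock(ptl);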
2690 */ 2691 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2692 { 2693 spinlock_t *ptl; 2694 2695 ptl = pud_lock(vma->vm_mm, pud); 2696 if (likely(pud_trans_huge(*pud))) 2697 return ptl; 2698 spin_unlock(ptl); 2699 return NULL; 2700 } 2701 2702 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2703 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2704 pud_t *pud, unsigned long addr) 2705 { 2706 spinlock_t *ptl; 2707 pud_t orig_pud; 2708 2709 ptl = __pud_trans_huge_lock(pud, vma); 2710 if (!ptl) 2711 return 0; 2712 2713 orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 2714 arch_check_zapped_pud(vma, orig_pud); 2715 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2716 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { 2717 spin_unlock(ptl); 2718 /* No zero page support yet */ 2719 } else { 2720 struct page *page = NULL; 2721 struct folio *folio; 2722 2723 /* No support for anonymous PUD pages or migration yet */ 2724 VM_WARN_ON_ONCE(vma_is_anonymous(vma) || 2725 !pud_present(orig_pud)); 2726 2727 page = pud_page(orig_pud); 2728 folio = page_folio(page); 2729 folio_remove_rmap_pud(folio, page, vma); 2730 add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR); 2731 2732 spin_unlock(ptl); 2733 tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE); 2734 } 2735 return 1; 2736 } 2737 2738 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2739 unsigned long haddr) 2740 { 2741 struct folio *folio; 2742 struct page *page; 2743 pud_t old_pud; 2744 2745 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2746 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2747 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2748 VM_BUG_ON(!pud_trans_huge(*pud)); 2749 2750 count_vm_event(THP_SPLIT_PUD); 2751 2752 old_pud = pudp_huge_clear_flush(vma, haddr, pud); 2753 2754 if (!vma_is_dax(vma)) 2755 return; 2756 2757 page = pud_page(old_pud); 2758 folio = page_folio(page); 2759 2760 if (!folio_test_dirty(folio) && pud_dirty(old_pud)) 2761 folio_mark_dirty(folio); 2762 if (!folio_test_referenced(folio) && pud_young(old_pud)) 2763 folio_set_referenced(folio); 2764 folio_remove_rmap_pud(folio, page, vma); 2765 folio_put(folio); 2766 add_mm_counter(vma->vm_mm, mm_counter_file(folio), 2767 -HPAGE_PUD_NR); 2768 } 2769 2770 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2771 unsigned long address) 2772 { 2773 spinlock_t *ptl; 2774 struct mmu_notifier_range range; 2775 2776 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2777 address & HPAGE_PUD_MASK, 2778 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2779 mmu_notifier_invalidate_range_start(&range); 2780 ptl = pud_lock(vma->vm_mm, pud); 2781 if (unlikely(!pud_trans_huge(*pud))) 2782 goto out; 2783 __split_huge_pud_locked(vma, pud, range.start); 2784 2785 out: 2786 spin_unlock(ptl); 2787 mmu_notifier_invalidate_range_end(&range); 2788 } 2789 #else 2790 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2791 unsigned long address) 2792 { 2793 } 2794 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2795 2796 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2797 unsigned long haddr, pmd_t *pmd) 2798 { 2799 struct mm_struct *mm = vma->vm_mm; 2800 pgtable_t pgtable; 2801 pmd_t _pmd, old_pmd; 2802 unsigned long addr; 2803 pte_t *pte; 2804 int i; 2805 2806 /* 2807 * Leave pmd empty until pte is filled note that it is fine to delay 2808 * notification until mmu_notifier_invalidate_range_end() as we are 2809 * replacing a zero pmd write 
protected page with a zero pte write 2810 * protected page. 2811 * 2812 * See Documentation/mm/mmu_notifier.rst 2813 */ 2814 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2815 2816 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2817 pmd_populate(mm, &_pmd, pgtable); 2818 2819 pte = pte_offset_map(&_pmd, haddr); 2820 VM_BUG_ON(!pte); 2821 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2822 pte_t entry; 2823 2824 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); 2825 entry = pte_mkspecial(entry); 2826 if (pmd_uffd_wp(old_pmd)) 2827 entry = pte_mkuffd_wp(entry); 2828 VM_BUG_ON(!pte_none(ptep_get(pte))); 2829 set_pte_at(mm, addr, pte, entry); 2830 pte++; 2831 } 2832 pte_unmap(pte - 1); 2833 smp_wmb(); /* make pte visible before pmd */ 2834 pmd_populate(mm, pmd, pgtable); 2835 } 2836 2837 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2838 unsigned long haddr, bool freeze) 2839 { 2840 struct mm_struct *mm = vma->vm_mm; 2841 struct folio *folio; 2842 struct page *page; 2843 pgtable_t pgtable; 2844 pmd_t old_pmd, _pmd; 2845 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; 2846 bool anon_exclusive = false, dirty = false; 2847 unsigned long addr; 2848 pte_t *pte; 2849 int i; 2850 2851 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2852 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2853 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2854 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); 2855 2856 count_vm_event(THP_SPLIT_PMD); 2857 2858 if (!vma_is_anonymous(vma)) { 2859 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2860 /* 2861 * We are going to unmap this huge page. So 2862 * just go ahead and zap it 2863 */ 2864 if (arch_needs_pgtable_deposit()) 2865 zap_deposited_table(mm, pmd); 2866 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) 2867 return; 2868 if (unlikely(is_pmd_migration_entry(old_pmd))) { 2869 swp_entry_t entry; 2870 2871 entry = pmd_to_swp_entry(old_pmd); 2872 folio = pfn_swap_entry_folio(entry); 2873 } else if (is_huge_zero_pmd(old_pmd)) { 2874 return; 2875 } else { 2876 page = pmd_page(old_pmd); 2877 folio = page_folio(page); 2878 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) 2879 folio_mark_dirty(folio); 2880 if (!folio_test_referenced(folio) && pmd_young(old_pmd)) 2881 folio_set_referenced(folio); 2882 folio_remove_rmap_pmd(folio, page, vma); 2883 folio_put(folio); 2884 } 2885 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 2886 return; 2887 } 2888 2889 if (is_huge_zero_pmd(*pmd)) { 2890 /* 2891 * FIXME: Do we want to invalidate secondary mmu by calling 2892 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below 2893 * inside __split_huge_pmd() ? 2894 * 2895 * We are going from a zero huge page write protected to zero 2896 * small page also write protected so it does not seems useful 2897 * to invalidate secondary mmu at this time. 
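 * Either way the huge zero PMD is simply replaced by a page table whose
 * PTEs all point at the (order-0) zero page, see
 * __split_huge_zero_page_pmd() above.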
2898 */ 2899 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2900 } 2901 2902 pmd_migration = is_pmd_migration_entry(*pmd); 2903 if (unlikely(pmd_migration)) { 2904 swp_entry_t entry; 2905 2906 old_pmd = *pmd; 2907 entry = pmd_to_swp_entry(old_pmd); 2908 page = pfn_swap_entry_to_page(entry); 2909 write = is_writable_migration_entry(entry); 2910 if (PageAnon(page)) 2911 anon_exclusive = is_readable_exclusive_migration_entry(entry); 2912 young = is_migration_entry_young(entry); 2913 dirty = is_migration_entry_dirty(entry); 2914 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2915 uffd_wp = pmd_swp_uffd_wp(old_pmd); 2916 } else { 2917 /* 2918 * Up to this point the pmd is present and huge and userland has 2919 * the whole access to the hugepage during the split (which 2920 * happens in place). If we overwrite the pmd with the not-huge 2921 * version pointing to the pte here (which of course we could if 2922 * all CPUs were bug free), userland could trigger a small page 2923 * size TLB miss on the small sized TLB while the hugepage TLB 2924 * entry is still established in the huge TLB. Some CPU doesn't 2925 * like that. See 2926 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum 2927 * 383 on page 105. Intel should be safe but is also warns that 2928 * it's only safe if the permission and cache attributes of the 2929 * two entries loaded in the two TLB is identical (which should 2930 * be the case here). But it is generally safer to never allow 2931 * small and huge TLB entries for the same virtual address to be 2932 * loaded simultaneously. So instead of doing "pmd_populate(); 2933 * flush_pmd_tlb_range();" we first mark the current pmd 2934 * notpresent (atomically because here the pmd_trans_huge must 2935 * remain set at all times on the pmd until the split is 2936 * complete for this pmd), then we flush the SMP TLB and finally 2937 * we write the non-huge version of the pmd entry with 2938 * pmd_populate. 2939 */ 2940 old_pmd = pmdp_invalidate(vma, haddr, pmd); 2941 page = pmd_page(old_pmd); 2942 folio = page_folio(page); 2943 if (pmd_dirty(old_pmd)) { 2944 dirty = true; 2945 folio_set_dirty(folio); 2946 } 2947 write = pmd_write(old_pmd); 2948 young = pmd_young(old_pmd); 2949 soft_dirty = pmd_soft_dirty(old_pmd); 2950 uffd_wp = pmd_uffd_wp(old_pmd); 2951 2952 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); 2953 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 2954 2955 /* 2956 * Without "freeze", we'll simply split the PMD, propagating the 2957 * PageAnonExclusive() flag for each PTE by setting it for 2958 * each subpage -- no need to (temporarily) clear. 2959 * 2960 * With "freeze" we want to replace mapped pages by 2961 * migration entries right away. This is only possible if we 2962 * managed to clear PageAnonExclusive() -- see 2963 * set_pmd_migration_entry(). 2964 * 2965 * In case we cannot clear PageAnonExclusive(), split the PMD 2966 * only and let try_to_migrate_one() fail later. 2967 * 2968 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 2969 */ 2970 anon_exclusive = PageAnonExclusive(page); 2971 if (freeze && anon_exclusive && 2972 folio_try_share_anon_rmap_pmd(folio, page)) 2973 freeze = false; 2974 if (!freeze) { 2975 rmap_t rmap_flags = RMAP_NONE; 2976 2977 folio_ref_add(folio, HPAGE_PMD_NR - 1); 2978 if (anon_exclusive) 2979 rmap_flags |= RMAP_EXCLUSIVE; 2980 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, 2981 vma, haddr, rmap_flags); 2982 } 2983 } 2984 2985 /* 2986 * Withdraw the table only after we mark the pmd entry invalid. 
* This is critical for some architectures (Power). 2988 */ 2989 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2990 pmd_populate(mm, &_pmd, pgtable); 2991 2992 pte = pte_offset_map(&_pmd, haddr); 2993 VM_BUG_ON(!pte); 2994 2995 /* 2996 * Note that NUMA hinting access restrictions are not transferred to 2997 * avoid any possibility of altering permissions across VMAs. 2998 */ 2999 if (freeze || pmd_migration) { 3000 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 3001 pte_t entry; 3002 swp_entry_t swp_entry; 3003 3004 if (write) 3005 swp_entry = make_writable_migration_entry( 3006 page_to_pfn(page + i)); 3007 else if (anon_exclusive) 3008 swp_entry = make_readable_exclusive_migration_entry( 3009 page_to_pfn(page + i)); 3010 else 3011 swp_entry = make_readable_migration_entry( 3012 page_to_pfn(page + i)); 3013 if (young) 3014 swp_entry = make_migration_entry_young(swp_entry); 3015 if (dirty) 3016 swp_entry = make_migration_entry_dirty(swp_entry); 3017 entry = swp_entry_to_pte(swp_entry); 3018 if (soft_dirty) 3019 entry = pte_swp_mksoft_dirty(entry); 3020 if (uffd_wp) 3021 entry = pte_swp_mkuffd_wp(entry); 3022 3023 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 3024 set_pte_at(mm, addr, pte + i, entry); 3025 } 3026 } else { 3027 pte_t entry; 3028 3029 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); 3030 if (write) 3031 entry = pte_mkwrite(entry, vma); 3032 if (!young) 3033 entry = pte_mkold(entry); 3034 /* NOTE: this may set soft-dirty too on some archs */ 3035 if (dirty) 3036 entry = pte_mkdirty(entry); 3037 if (soft_dirty) 3038 entry = pte_mksoft_dirty(entry); 3039 if (uffd_wp) 3040 entry = pte_mkuffd_wp(entry); 3041 3042 for (i = 0; i < HPAGE_PMD_NR; i++) 3043 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 3044 3045 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); 3046 } 3047 pte_unmap(pte); 3048 3049 if (!pmd_migration) 3050 folio_remove_rmap_pmd(folio, page, vma); 3051 if (freeze) 3052 put_page(page); 3053 3054 smp_wmb(); /* make pte visible before pmd */ 3055 pmd_populate(mm, pmd, pgtable); 3056 } 3057 3058 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, 3059 pmd_t *pmd, bool freeze) 3060 { 3061 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); 3062 if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) 3063 __split_huge_pmd_locked(vma, pmd, address, freeze); 3064 } 3065 3066 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 3067 unsigned long address, bool freeze) 3068 { 3069 spinlock_t *ptl; 3070 struct mmu_notifier_range range; 3071 3072 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 3073 address & HPAGE_PMD_MASK, 3074 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 3075 mmu_notifier_invalidate_range_start(&range); 3076 ptl = pmd_lock(vma->vm_mm, pmd); 3077 split_huge_pmd_locked(vma, range.start, pmd, freeze); 3078 spin_unlock(ptl); 3079 mmu_notifier_invalidate_range_end(&range); 3080 } 3081 3082 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 3083 bool freeze) 3084 { 3085 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); 3086 3087 if (!pmd) 3088 return; 3089 3090 __split_huge_pmd(vma, pmd, address, freeze); 3091 } 3092 3093 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) 3094 { 3095 /* 3096 * If the new address isn't hpage aligned and it could previously 3097 * contain a hugepage: check if we need to split a huge pmd.
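 * For example, when a VMA boundary ends up in the middle of a range
 * that used to be PMD-mapped, the huge pmd has to be split so that the
 * partial range can be handled at PTE granularity.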
3098 */ 3099 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && 3100 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), 3101 ALIGN(address, HPAGE_PMD_SIZE))) 3102 split_huge_pmd_address(vma, address, false); 3103 } 3104 3105 void vma_adjust_trans_huge(struct vm_area_struct *vma, 3106 unsigned long start, 3107 unsigned long end, 3108 struct vm_area_struct *next) 3109 { 3110 /* Check if we need to split start first. */ 3111 split_huge_pmd_if_needed(vma, start); 3112 3113 /* Check if we need to split end next. */ 3114 split_huge_pmd_if_needed(vma, end); 3115 3116 /* If we're incrementing next->vm_start, we might need to split it. */ 3117 if (next) 3118 split_huge_pmd_if_needed(next, end); 3119 } 3120 3121 static void unmap_folio(struct folio *folio) 3122 { 3123 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | 3124 TTU_BATCH_FLUSH; 3125 3126 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3127 3128 if (folio_test_pmd_mappable(folio)) 3129 ttu_flags |= TTU_SPLIT_HUGE_PMD; 3130 3131 /* 3132 * Anon pages need migration entries to preserve them, but file 3133 * pages can simply be left unmapped, then faulted back on demand. 3134 * If that is ever changed (perhaps for mlock), update remap_page(). 3135 */ 3136 if (folio_test_anon(folio)) 3137 try_to_migrate(folio, ttu_flags); 3138 else 3139 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); 3140 3141 try_to_unmap_flush(); 3142 } 3143 3144 static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma, 3145 unsigned long addr, pmd_t *pmdp, 3146 struct folio *folio) 3147 { 3148 struct mm_struct *mm = vma->vm_mm; 3149 int ref_count, map_count; 3150 pmd_t orig_pmd = *pmdp; 3151 3152 if (pmd_dirty(orig_pmd)) 3153 folio_set_dirty(folio); 3154 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { 3155 folio_set_swapbacked(folio); 3156 return false; 3157 } 3158 3159 orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp); 3160 3161 /* 3162 * Syncing against concurrent GUP-fast: 3163 * - clear PMD; barrier; read refcount 3164 * - inc refcount; barrier; read PMD 3165 */ 3166 smp_mb(); 3167 3168 ref_count = folio_ref_count(folio); 3169 map_count = folio_mapcount(folio); 3170 3171 /* 3172 * Order reads for folio refcount and dirty flag 3173 * (see comments in __remove_mapping()). 3174 */ 3175 smp_rmb(); 3176 3177 /* 3178 * If the folio or its PMD is redirtied at this point, or if there 3179 * are unexpected references, we will give up to discard this folio 3180 * and remap it. 3181 * 3182 * The only folio refs must be one from isolation plus the rmap(s). 
3183 */ 3184 if (pmd_dirty(orig_pmd)) 3185 folio_set_dirty(folio); 3186 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { 3187 folio_set_swapbacked(folio); 3188 set_pmd_at(mm, addr, pmdp, orig_pmd); 3189 return false; 3190 } 3191 3192 if (ref_count != map_count + 1) { 3193 set_pmd_at(mm, addr, pmdp, orig_pmd); 3194 return false; 3195 } 3196 3197 folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma); 3198 zap_deposited_table(mm, pmdp); 3199 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); 3200 if (vma->vm_flags & VM_LOCKED) 3201 mlock_drain_local(); 3202 folio_put(folio); 3203 3204 return true; 3205 } 3206 3207 bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, 3208 pmd_t *pmdp, struct folio *folio) 3209 { 3210 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio); 3211 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 3212 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 3213 VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio); 3214 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE)); 3215 3216 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); 3217 } 3218 3219 static void remap_page(struct folio *folio, unsigned long nr, int flags) 3220 { 3221 int i = 0; 3222 3223 /* If unmap_folio() uses try_to_migrate() on file, remove this check */ 3224 if (!folio_test_anon(folio)) 3225 return; 3226 for (;;) { 3227 remove_migration_ptes(folio, folio, RMP_LOCKED | flags); 3228 i += folio_nr_pages(folio); 3229 if (i >= nr) 3230 break; 3231 folio = folio_next(folio); 3232 } 3233 } 3234 3235 static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, 3236 struct lruvec *lruvec, struct list_head *list) 3237 { 3238 VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); 3239 lockdep_assert_held(&lruvec->lru_lock); 3240 3241 if (list) { 3242 /* page reclaim is reclaiming a huge page */ 3243 VM_WARN_ON(folio_test_lru(folio)); 3244 folio_get(new_folio); 3245 list_add_tail(&new_folio->lru, list); 3246 } else { 3247 /* head is still on lru (and we have it frozen) */ 3248 VM_WARN_ON(!folio_test_lru(folio)); 3249 if (folio_test_unevictable(folio)) 3250 new_folio->mlock_count = 0; 3251 else 3252 list_add_tail(&new_folio->lru, &folio->lru); 3253 folio_set_lru(new_folio); 3254 } 3255 } 3256 3257 /* Racy check whether the huge page can be split */ 3258 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) 3259 { 3260 int extra_pins; 3261 3262 /* Additional pins from page cache */ 3263 if (folio_test_anon(folio)) 3264 extra_pins = folio_test_swapcache(folio) ? 3265 folio_nr_pages(folio) : 0; 3266 else 3267 extra_pins = folio_nr_pages(folio); 3268 if (pextra_pins) 3269 *pextra_pins = extra_pins; 3270 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 3271 caller_pins; 3272 } 3273 3274 /* 3275 * It splits @folio into @new_order folios and copies the @folio metadata to 3276 * all the resulting folios. 3277 */ 3278 static void __split_folio_to_order(struct folio *folio, int old_order, 3279 int new_order) 3280 { 3281 long new_nr_pages = 1 << new_order; 3282 long nr_pages = 1 << old_order; 3283 long i; 3284 3285 /* 3286 * Skip the first new_nr_pages, since the new folio from them have all 3287 * the flags from the original folio. 3288 */ 3289 for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { 3290 struct page *new_head = &folio->page + i; 3291 3292 /* 3293 * Careful: new_folio is not a "real" folio before we cleared PageTail. 3294 * Don't pass it around before clear_compound_head(). 
3295 */ 3296 struct folio *new_folio = (struct folio *)new_head; 3297 3298 VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head); 3299 3300 /* 3301 * Clone page flags before unfreezing refcount. 3302 * 3303 * After successful get_page_unless_zero() might follow flags change, 3304 * for example lock_page() which set PG_waiters. 3305 * 3306 * Note that for mapped sub-pages of an anonymous THP, 3307 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in 3308 * the migration entry instead from where remap_page() will restore it. 3309 * We can still have PG_anon_exclusive set on effectively unmapped and 3310 * unreferenced sub-pages of an anonymous THP: we can simply drop 3311 * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 3312 */ 3313 new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 3314 new_folio->flags |= (folio->flags & 3315 ((1L << PG_referenced) | 3316 (1L << PG_swapbacked) | 3317 (1L << PG_swapcache) | 3318 (1L << PG_mlocked) | 3319 (1L << PG_uptodate) | 3320 (1L << PG_active) | 3321 (1L << PG_workingset) | 3322 (1L << PG_locked) | 3323 (1L << PG_unevictable) | 3324 #ifdef CONFIG_ARCH_USES_PG_ARCH_2 3325 (1L << PG_arch_2) | 3326 #endif 3327 #ifdef CONFIG_ARCH_USES_PG_ARCH_3 3328 (1L << PG_arch_3) | 3329 #endif 3330 (1L << PG_dirty) | 3331 LRU_GEN_MASK | LRU_REFS_MASK)); 3332 3333 new_folio->mapping = folio->mapping; 3334 new_folio->index = folio->index + i; 3335 3336 /* 3337 * page->private should not be set in tail pages. Fix up and warn once 3338 * if private is unexpectedly set. 3339 */ 3340 if (unlikely(new_folio->private)) { 3341 VM_WARN_ON_ONCE_PAGE(true, new_head); 3342 new_folio->private = NULL; 3343 } 3344 3345 if (folio_test_swapcache(folio)) 3346 new_folio->swap.val = folio->swap.val + i; 3347 3348 /* Page flags must be visible before we make the page non-compound. */ 3349 smp_wmb(); 3350 3351 /* 3352 * Clear PageTail before unfreezing page refcount. 3353 * 3354 * After successful get_page_unless_zero() might follow put_page() 3355 * which needs correct compound_head(). 3356 */ 3357 clear_compound_head(new_head); 3358 if (new_order) { 3359 prep_compound_page(new_head, new_order); 3360 folio_set_large_rmappable(new_folio); 3361 } 3362 3363 if (folio_test_young(folio)) 3364 folio_set_young(new_folio); 3365 if (folio_test_idle(folio)) 3366 folio_set_idle(new_folio); 3367 #ifdef CONFIG_MEMCG 3368 new_folio->memcg_data = folio->memcg_data; 3369 #endif 3370 3371 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); 3372 } 3373 3374 if (new_order) 3375 folio_set_order(folio, new_order); 3376 else 3377 ClearPageCompound(&folio->page); 3378 } 3379 3380 /* 3381 * It splits an unmapped @folio to lower order smaller folios in two ways. 3382 * @folio: the to-be-split folio 3383 * @new_order: the smallest order of the after split folios (since buddy 3384 * allocator like split generates folios with orders from @folio's 3385 * order - 1 to new_order). 3386 * @split_at: in buddy allocator like split, the folio containing @split_at 3387 * will be split until its order becomes @new_order. 3388 * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller 3389 * @mapping: @folio->mapping 3390 * @uniform_split: if the split is uniform or not (buddy allocator like split) 3391 * 3392 * 3393 * 1. uniform split: the given @folio into multiple @new_order small folios, 3394 * where all small folios have the same order. This is done when 3395 * uniform_split is true. 3396 * 2. 
buddy allocator like (non-uniform) split: the given @folio is split into 3397 * half and one of the half (containing the given page) is split into half 3398 * until the given @page's order becomes @new_order. This is done when 3399 * uniform_split is false. 3400 * 3401 * The high level flow for these two methods are: 3402 * 1. uniform split: a single __split_folio_to_order() is called to split the 3403 * @folio into @new_order, then we traverse all the resulting folios one by 3404 * one in PFN ascending order and perform stats, unfreeze, adding to list, 3405 * and file mapping index operations. 3406 * 2. non-uniform split: in general, folio_order - @new_order calls to 3407 * __split_folio_to_order() are made in a for loop to split the @folio 3408 * to one lower order at a time. The resulting small folios are processed 3409 * like what is done during the traversal in 1, except the one containing 3410 * @page, which is split in next for loop. 3411 * 3412 * After splitting, the caller's folio reference will be transferred to the 3413 * folio containing @page. The caller needs to unlock and/or free after-split 3414 * folios if necessary. 3415 * 3416 * For !uniform_split, when -ENOMEM is returned, the original folio might be 3417 * split. The caller needs to check the input folio. 3418 */ 3419 static int __split_unmapped_folio(struct folio *folio, int new_order, 3420 struct page *split_at, struct xa_state *xas, 3421 struct address_space *mapping, bool uniform_split) 3422 { 3423 int order = folio_order(folio); 3424 int start_order = uniform_split ? new_order : order - 1; 3425 bool stop_split = false; 3426 struct folio *next; 3427 int split_order; 3428 int ret = 0; 3429 3430 if (folio_test_anon(folio)) 3431 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); 3432 3433 folio_clear_has_hwpoisoned(folio); 3434 3435 /* 3436 * split to new_order one order at a time. For uniform split, 3437 * folio is split to new_order directly. 3438 */ 3439 for (split_order = start_order; 3440 split_order >= new_order && !stop_split; 3441 split_order--) { 3442 struct folio *end_folio = folio_next(folio); 3443 int old_order = folio_order(folio); 3444 struct folio *new_folio; 3445 3446 /* order-1 anonymous folio is not supported */ 3447 if (folio_test_anon(folio) && split_order == 1) 3448 continue; 3449 if (uniform_split && split_order != new_order) 3450 continue; 3451 3452 if (mapping) { 3453 /* 3454 * uniform split has xas_split_alloc() called before 3455 * irq is disabled to allocate enough memory, whereas 3456 * non-uniform split can handle ENOMEM. 3457 */ 3458 if (uniform_split) 3459 xas_split(xas, folio, old_order); 3460 else { 3461 xas_set_order(xas, folio->index, split_order); 3462 xas_try_split(xas, folio, old_order); 3463 if (xas_error(xas)) { 3464 ret = xas_error(xas); 3465 stop_split = true; 3466 } 3467 } 3468 } 3469 3470 if (!stop_split) { 3471 folio_split_memcg_refs(folio, old_order, split_order); 3472 split_page_owner(&folio->page, old_order, split_order); 3473 pgalloc_tag_split(folio, old_order, split_order); 3474 3475 __split_folio_to_order(folio, old_order, split_order); 3476 } 3477 3478 /* 3479 * Iterate through after-split folios and update folio stats. 3480 * But in buddy allocator like split, the folio 3481 * containing the specified page is skipped until its order 3482 * is new_order, since the folio will be worked on in next 3483 * iteration. 
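 *
 * As an example, a non-uniform split of an order-9 pagecache folio all
 * the way down to order 0 leaves behind one folio of each order 8..1
 * plus two order-0 folios, one of which contains @split_at.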
3484 */ 3485 for (new_folio = folio; new_folio != end_folio; new_folio = next) { 3486 next = folio_next(new_folio); 3487 /* 3488 * for buddy allocator like split, new_folio containing 3489 * @split_at page could be split again, thus do not 3490 * change stats yet. Wait until new_folio's order is 3491 * @new_order or stop_split is set to true by the above 3492 * xas_split() failure. 3493 */ 3494 if (new_folio == page_folio(split_at)) { 3495 folio = new_folio; 3496 if (split_order != new_order && !stop_split) 3497 continue; 3498 } 3499 if (folio_test_anon(new_folio)) 3500 mod_mthp_stat(folio_order(new_folio), 3501 MTHP_STAT_NR_ANON, 1); 3502 } 3503 } 3504 3505 return ret; 3506 } 3507 3508 bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, 3509 bool warns) 3510 { 3511 if (folio_test_anon(folio)) { 3512 /* order-1 is not supported for anonymous THP. */ 3513 VM_WARN_ONCE(warns && new_order == 1, 3514 "Cannot split to order-1 folio"); 3515 return new_order != 1; 3516 } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3517 !mapping_large_folio_support(folio->mapping)) { 3518 /* 3519 * No split if the file system does not support large folio. 3520 * Note that we might still have THPs in such mappings due to 3521 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping 3522 * does not actually support large folios properly. 3523 */ 3524 VM_WARN_ONCE(warns, 3525 "Cannot split file folio to non-0 order"); 3526 return false; 3527 } 3528 3529 /* Only swapping a whole PMD-mapped folio is supported */ 3530 if (folio_test_swapcache(folio)) { 3531 VM_WARN_ONCE(warns, 3532 "Cannot split swapcache folio to non-0 order"); 3533 return false; 3534 } 3535 3536 return true; 3537 } 3538 3539 /* See comments in non_uniform_split_supported() */ 3540 bool uniform_split_supported(struct folio *folio, unsigned int new_order, 3541 bool warns) 3542 { 3543 if (folio_test_anon(folio)) { 3544 VM_WARN_ONCE(warns && new_order == 1, 3545 "Cannot split to order-1 folio"); 3546 return new_order != 1; 3547 } else if (new_order) { 3548 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3549 !mapping_large_folio_support(folio->mapping)) { 3550 VM_WARN_ONCE(warns, 3551 "Cannot split file folio to non-0 order"); 3552 return false; 3553 } 3554 } 3555 3556 if (new_order && folio_test_swapcache(folio)) { 3557 VM_WARN_ONCE(warns, 3558 "Cannot split swapcache folio to non-0 order"); 3559 return false; 3560 } 3561 3562 return true; 3563 } 3564 3565 /* 3566 * __folio_split: split a folio at @split_at to a @new_order folio 3567 * @folio: folio to split 3568 * @new_order: the order of the new folio 3569 * @split_at: a page within the new folio 3570 * @lock_at: a page within @folio to be left locked to caller 3571 * @list: after-split folios will be put on it if non NULL 3572 * @uniform_split: perform uniform split or not (non-uniform split) 3573 * 3574 * It calls __split_unmapped_folio() to perform uniform and non-uniform split. 3575 * It is in charge of checking whether the split is supported or not and 3576 * preparing @folio for __split_unmapped_folio(). 3577 * 3578 * After splitting, the after-split folio containing @lock_at remains locked 3579 * and others are unlocked: 3580 * 1. for uniform split, @lock_at points to one of @folio's subpages; 3581 * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. 
3582 * 3583 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be 3584 * split but not to @new_order, the caller needs to check) 3585 */ 3586 static int __folio_split(struct folio *folio, unsigned int new_order, 3587 struct page *split_at, struct page *lock_at, 3588 struct list_head *list, bool uniform_split) 3589 { 3590 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3591 XA_STATE(xas, &folio->mapping->i_pages, folio->index); 3592 struct folio *end_folio = folio_next(folio); 3593 bool is_anon = folio_test_anon(folio); 3594 struct address_space *mapping = NULL; 3595 struct anon_vma *anon_vma = NULL; 3596 int order = folio_order(folio); 3597 struct folio *new_folio, *next; 3598 int nr_shmem_dropped = 0; 3599 int remap_flags = 0; 3600 int extra_pins, ret; 3601 pgoff_t end; 3602 bool is_hzp; 3603 3604 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 3605 VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); 3606 3607 if (folio != page_folio(split_at) || folio != page_folio(lock_at)) 3608 return -EINVAL; 3609 3610 if (new_order >= folio_order(folio)) 3611 return -EINVAL; 3612 3613 if (uniform_split && !uniform_split_supported(folio, new_order, true)) 3614 return -EINVAL; 3615 3616 if (!uniform_split && 3617 !non_uniform_split_supported(folio, new_order, true)) 3618 return -EINVAL; 3619 3620 is_hzp = is_huge_zero_folio(folio); 3621 if (is_hzp) { 3622 pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); 3623 return -EBUSY; 3624 } 3625 3626 if (folio_test_writeback(folio)) 3627 return -EBUSY; 3628 3629 if (is_anon) { 3630 /* 3631 * The caller does not necessarily hold an mmap_lock that would 3632 * prevent the anon_vma disappearing so we first we take a 3633 * reference to it and then lock the anon_vma for write. This 3634 * is similar to folio_lock_anon_vma_read except the write lock 3635 * is taken to serialise against parallel split or collapse 3636 * operations. 3637 */ 3638 anon_vma = folio_get_anon_vma(folio); 3639 if (!anon_vma) { 3640 ret = -EBUSY; 3641 goto out; 3642 } 3643 mapping = NULL; 3644 anon_vma_lock_write(anon_vma); 3645 } else { 3646 unsigned int min_order; 3647 gfp_t gfp; 3648 3649 mapping = folio->mapping; 3650 3651 /* Truncated ? */ 3652 /* 3653 * TODO: add support for large shmem folio in swap cache. 3654 * When shmem is in swap cache, mapping is NULL and 3655 * folio_test_swapcache() is true. 3656 */ 3657 if (!mapping) { 3658 ret = -EBUSY; 3659 goto out; 3660 } 3661 3662 min_order = mapping_min_folio_order(folio->mapping); 3663 if (new_order < min_order) { 3664 VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", 3665 min_order); 3666 ret = -EINVAL; 3667 goto out; 3668 } 3669 3670 gfp = current_gfp_context(mapping_gfp_mask(mapping) & 3671 GFP_RECLAIM_MASK); 3672 3673 if (!filemap_release_folio(folio, gfp)) { 3674 ret = -EBUSY; 3675 goto out; 3676 } 3677 3678 if (uniform_split) { 3679 xas_set_order(&xas, folio->index, new_order); 3680 xas_split_alloc(&xas, folio, folio_order(folio), gfp); 3681 if (xas_error(&xas)) { 3682 ret = xas_error(&xas); 3683 goto out; 3684 } 3685 } 3686 3687 anon_vma = NULL; 3688 i_mmap_lock_read(mapping); 3689 3690 /* 3691 *__split_unmapped_folio() may need to trim off pages beyond 3692 * EOF: but on 32-bit, i_size_read() takes an irq-unsafe 3693 * seqlock, which cannot be nested inside the page tree lock. 3694 * So note end now: i_size itself may be changed at any moment, 3695 * but folio lock is good enough to serialize the trimming. 
3696 */ 3697 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3698 if (shmem_mapping(mapping)) 3699 end = shmem_fallocend(mapping->host, end); 3700 } 3701 3702 /* 3703 * Racy check if we can split the page, before unmap_folio() 3704 * splits the PMDs. 3705 */ 3706 if (!can_split_folio(folio, 1, &extra_pins)) { 3707 ret = -EAGAIN; 3708 goto out_unlock; 3709 } 3710 3711 unmap_folio(folio); 3712 3713 /* block interrupt reentry in xa_lock and spinlock */ 3714 local_irq_disable(); 3715 if (mapping) { 3716 /* 3717 * Check if the folio is present in the page cache. 3718 * We assume all tails are present too, if the folio is there. 3719 */ 3720 xas_lock(&xas); 3721 xas_reset(&xas); 3722 if (xas_load(&xas) != folio) { 3723 ret = -EAGAIN; 3724 goto fail; 3725 } 3726 } 3727 3728 /* Prevent deferred_split_scan() from touching ->_refcount */ 3729 spin_lock(&ds_queue->split_queue_lock); 3730 if (folio_ref_freeze(folio, 1 + extra_pins)) { 3731 struct address_space *swap_cache = NULL; 3732 struct lruvec *lruvec; 3733 int expected_refs; 3734 3735 if (folio_order(folio) > 1 && 3736 !list_empty(&folio->_deferred_list)) { 3737 ds_queue->split_queue_len--; 3738 if (folio_test_partially_mapped(folio)) { 3739 folio_clear_partially_mapped(folio); 3740 mod_mthp_stat(folio_order(folio), 3741 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 3742 } 3743 /* 3744 * Reinitialize page_deferred_list after removing the 3745 * page from the split_queue, otherwise a subsequent 3746 * split will see list corruption when checking the 3747 * page_deferred_list. 3748 */ 3749 list_del_init(&folio->_deferred_list); 3750 } 3751 spin_unlock(&ds_queue->split_queue_lock); 3752 if (mapping) { 3753 int nr = folio_nr_pages(folio); 3754 3755 if (folio_test_pmd_mappable(folio) && 3756 new_order < HPAGE_PMD_ORDER) { 3757 if (folio_test_swapbacked(folio)) { 3758 __lruvec_stat_mod_folio(folio, 3759 NR_SHMEM_THPS, -nr); 3760 } else { 3761 __lruvec_stat_mod_folio(folio, 3762 NR_FILE_THPS, -nr); 3763 filemap_nr_thps_dec(mapping); 3764 } 3765 } 3766 } 3767 3768 if (folio_test_swapcache(folio)) { 3769 if (mapping) { 3770 VM_WARN_ON_ONCE_FOLIO(mapping, folio); 3771 ret = -EINVAL; 3772 goto fail; 3773 } 3774 3775 swap_cache = swap_address_space(folio->swap); 3776 xa_lock(&swap_cache->i_pages); 3777 } 3778 3779 /* lock lru list/PageCompound, ref frozen by folio_ref_freeze() */ 3780 lruvec = folio_lruvec_lock(folio); 3781 3782 ret = __split_unmapped_folio(folio, new_order, split_at, &xas, 3783 mapping, uniform_split); 3784 3785 /* 3786 * Unfreeze after-split folios and put them back to the right 3787 * list. @folio should be kept frozen until page cache 3788 * entries are updated with all the other after-split folios 3789 * to prevent others seeing stale page cache entries. 3790 * As a result, new_folio starts from the next folio of 3791 * @folio. 3792 */ 3793 for (new_folio = folio_next(folio); new_folio != end_folio; 3794 new_folio = next) { 3795 unsigned long nr_pages = folio_nr_pages(new_folio); 3796 3797 next = folio_next(new_folio); 3798 3799 expected_refs = folio_expected_ref_count(new_folio) + 1; 3800 folio_ref_unfreeze(new_folio, expected_refs); 3801 3802 lru_add_split_folio(folio, new_folio, lruvec, list); 3803 3804 /* 3805 * Anonymous folio with swap cache. 3806 * NOTE: shmem in swap cache is not supported yet.
3807 */ 3808 if (swap_cache) { 3809 __xa_store(&swap_cache->i_pages, 3810 swap_cache_index(new_folio->swap), 3811 new_folio, 0); 3812 continue; 3813 } 3814 3815 /* Anonymous folio without swap cache */ 3816 if (!mapping) 3817 continue; 3818 3819 /* Add the new folio to the page cache. */ 3820 if (new_folio->index < end) { 3821 __xa_store(&mapping->i_pages, new_folio->index, 3822 new_folio, 0); 3823 continue; 3824 } 3825 3826 /* Drop folio beyond EOF: ->index >= end */ 3827 if (shmem_mapping(mapping)) 3828 nr_shmem_dropped += nr_pages; 3829 else if (folio_test_clear_dirty(new_folio)) 3830 folio_account_cleaned( 3831 new_folio, inode_to_wb(mapping->host)); 3832 __filemap_remove_folio(new_folio, NULL); 3833 folio_put_refs(new_folio, nr_pages); 3834 } 3835 /* 3836 * Unfreeze @folio only after all page cache entries, which 3837 * used to point to it, have been updated with new folios. 3838 * Otherwise, a parallel folio_try_get() can grab @folio 3839 * and its caller can see stale page cache entries. 3840 */ 3841 expected_refs = folio_expected_ref_count(folio) + 1; 3842 folio_ref_unfreeze(folio, expected_refs); 3843 3844 unlock_page_lruvec(lruvec); 3845 3846 if (swap_cache) 3847 xa_unlock(&swap_cache->i_pages); 3848 } else { 3849 spin_unlock(&ds_queue->split_queue_lock); 3850 ret = -EAGAIN; 3851 } 3852 fail: 3853 if (mapping) 3854 xas_unlock(&xas); 3855 3856 local_irq_enable(); 3857 3858 if (nr_shmem_dropped) 3859 shmem_uncharge(mapping->host, nr_shmem_dropped); 3860 3861 if (!ret && is_anon) 3862 remap_flags = RMP_USE_SHARED_ZEROPAGE; 3863 remap_page(folio, 1 << order, remap_flags); 3864 3865 /* 3866 * Unlock all after-split folios except the one containing 3867 * @lock_at page. If @folio is not split, it will be kept locked. 3868 */ 3869 for (new_folio = folio; new_folio != end_folio; new_folio = next) { 3870 next = folio_next(new_folio); 3871 if (new_folio == page_folio(lock_at)) 3872 continue; 3873 3874 folio_unlock(new_folio); 3875 /* 3876 * Subpages may be freed if there wasn't any mapping 3877 * like if add_to_swap() is running on a lru page that 3878 * had its mapping zapped. And freeing these pages 3879 * requires taking the lru_lock so we do the put_page 3880 * of the tail pages after the split is complete. 3881 */ 3882 free_folio_and_swap_cache(new_folio); 3883 } 3884 3885 out_unlock: 3886 if (anon_vma) { 3887 anon_vma_unlock_write(anon_vma); 3888 put_anon_vma(anon_vma); 3889 } 3890 if (mapping) 3891 i_mmap_unlock_read(mapping); 3892 out: 3893 xas_destroy(&xas); 3894 if (order == HPAGE_PMD_ORDER) 3895 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3896 count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); 3897 return ret; 3898 } 3899 3900 /* 3901 * This function splits a large folio into smaller folios of order @new_order. 3902 * @page can point to any page of the large folio to split. The split operation 3903 * does not change the position of @page. 3904 * 3905 * Prerequisites: 3906 * 3907 * 1) The caller must hold a reference on the @page's owning folio, also known 3908 * as the large folio. 3909 * 3910 * 2) The large folio must be locked. 3911 * 3912 * 3) The folio must not be pinned. Any unexpected folio references, including 3913 * GUP pins, will result in the folio not getting split; instead, the caller 3914 * will receive an -EAGAIN. 3915 * 3916 * 4) @new_order > 1, usually. 
Splitting anonymous folios to order-1 is not 3917 * supported, because folio->_deferred_list, which 3918 * is used by partially mapped folios, is stored in subpage 2, but an order-1 3919 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, 3920 * since they do not use _deferred_list. 3921 * 3922 * After splitting, the caller's folio reference will be transferred to @page, 3923 * resulting in a raised refcount of @page after this call. The other pages may 3924 * be freed if they are not mapped. 3925 * 3926 * If @list is null, tail pages will be added to the LRU list; otherwise, to @list. 3927 * 3928 * The after-split folios of @new_order will inherit the mapping, flags, and so 3929 * on from the huge page. 3930 * 3931 * Returns 0 if the huge page was split successfully. 3932 * 3933 * Returns -EAGAIN if the folio has an unexpected reference (e.g., GUP) or if 3934 * the folio was concurrently removed from the page cache. 3935 * 3936 * Returns -EBUSY when trying to split the huge zero page, if the folio is 3937 * under writeback, if fs-specific folio metadata cannot currently be 3938 * released, or if some unexpected race happened (e.g., anon VMA disappeared, 3939 * truncation). 3940 * 3941 * Callers should ensure that the order respects the address space mapping 3942 * min-order if one is set for non-anonymous folios. 3943 * 3944 * Returns -EINVAL when trying to split to an order that is incompatible 3945 * with the folio. Splitting to order 0 is compatible with all folios. 3946 */ 3947 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 3948 unsigned int new_order) 3949 { 3950 struct folio *folio = page_folio(page); 3951 3952 return __folio_split(folio, new_order, &folio->page, page, list, true); 3953 } 3954 3955 /* 3956 * folio_split: split a folio at @split_at to a @new_order folio 3957 * @folio: folio to split 3958 * @new_order: the order of the new folio 3959 * @split_at: a page within the new folio 3960 * 3961 * return: 0: successful, <0: failed (if -ENOMEM is returned, @folio might be 3962 * split, but not to @new_order; the caller needs to check) 3963 * 3964 * It has the same prerequisites and return values as 3965 * split_huge_page_to_list_to_order(). 3966 * 3967 * Split a folio at @split_at to a @new_order folio, leaving the 3968 * remaining subpages of the original folio as large as possible. For example, 3969 * when splitting an order-9 folio at its third order-3 subpage to 3970 * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio. 3971 * After the split, there will be a group of folios with different orders and 3972 * the new folio containing @split_at is marked in braces: 3973 * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. 3974 * 3975 * After the split, the after-split folio containing @folio's first page is left locked for the caller.
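 *
 * An illustrative sketch of the example above (not taken from this file;
 * "folio" and "err" are assumed local variables), with the caller already
 * holding a folio reference and the folio lock. Page index 16 is the first
 * page of the third order-3 chunk:
 *
 *	struct page *split_at = folio_page(folio, 16);
 *	int err;
 *
 *	err = folio_split(folio, 3, split_at, NULL);
 *	if (err)
 *		pr_debug("folio_split() failed: %d\n", err);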
3976 */ 3977 int folio_split(struct folio *folio, unsigned int new_order, 3978 struct page *split_at, struct list_head *list) 3979 { 3980 return __folio_split(folio, new_order, split_at, &folio->page, list, 3981 false); 3982 } 3983 3984 int min_order_for_split(struct folio *folio) 3985 { 3986 if (folio_test_anon(folio)) 3987 return 0; 3988 3989 if (!folio->mapping) { 3990 if (folio_test_pmd_mappable(folio)) 3991 count_vm_event(THP_SPLIT_PAGE_FAILED); 3992 return -EBUSY; 3993 } 3994 3995 return mapping_min_folio_order(folio->mapping); 3996 } 3997 3998 int split_folio_to_list(struct folio *folio, struct list_head *list) 3999 { 4000 int ret = min_order_for_split(folio); 4001 4002 if (ret < 0) 4003 return ret; 4004 4005 return split_huge_page_to_list_to_order(&folio->page, list, ret); 4006 } 4007 4008 /* 4009 * __folio_unqueue_deferred_split() is not to be called directly: 4010 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h 4011 * limits its calls to those folios which may have a _deferred_list for 4012 * queueing THP splits, and that list is (racily observed to be) non-empty. 4013 * 4014 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is 4015 * zero: because even when split_queue_lock is held, a non-empty _deferred_list 4016 * might be in use on deferred_split_scan()'s unlocked on-stack list. 4017 * 4018 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is 4019 * therefore important to unqueue deferred split before changing folio memcg. 4020 */ 4021 bool __folio_unqueue_deferred_split(struct folio *folio) 4022 { 4023 struct deferred_split *ds_queue; 4024 unsigned long flags; 4025 bool unqueued = false; 4026 4027 WARN_ON_ONCE(folio_ref_count(folio)); 4028 WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); 4029 4030 ds_queue = get_deferred_split_queue(folio); 4031 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 4032 if (!list_empty(&folio->_deferred_list)) { 4033 ds_queue->split_queue_len--; 4034 if (folio_test_partially_mapped(folio)) { 4035 folio_clear_partially_mapped(folio); 4036 mod_mthp_stat(folio_order(folio), 4037 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 4038 } 4039 list_del_init(&folio->_deferred_list); 4040 unqueued = true; 4041 } 4042 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 4043 4044 return unqueued; /* useful for debug warnings */ 4045 } 4046 4047 /* partially_mapped=false won't clear PG_partially_mapped folio flag */ 4048 void deferred_split_folio(struct folio *folio, bool partially_mapped) 4049 { 4050 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 4051 #ifdef CONFIG_MEMCG 4052 struct mem_cgroup *memcg = folio_memcg(folio); 4053 #endif 4054 unsigned long flags; 4055 4056 /* 4057 * Order 1 folios have no space for a deferred list, but we also 4058 * won't waste much memory by not adding them to the deferred list. 4059 */ 4060 if (folio_order(folio) <= 1) 4061 return; 4062 4063 if (!partially_mapped && !split_underused_thp) 4064 return; 4065 4066 /* 4067 * Exclude swapcache: originally to avoid a corrupt deferred split 4068 * queue. Nowadays that is fully prevented by memcg1_swapout(); 4069 * but if page reclaim is already handling the same folio, it is 4070 * unnecessary to handle it again in the shrinker, so excluding 4071 * swapcache here may still be a useful optimization. 
4072 */ 4073 if (folio_test_swapcache(folio)) 4074 return; 4075 4076 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 4077 if (partially_mapped) { 4078 if (!folio_test_partially_mapped(folio)) { 4079 folio_set_partially_mapped(folio); 4080 if (folio_test_pmd_mappable(folio)) 4081 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 4082 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); 4083 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); 4084 4085 } 4086 } else { 4087 /* partially mapped folios cannot become non-partially mapped */ 4088 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); 4089 } 4090 if (list_empty(&folio->_deferred_list)) { 4091 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); 4092 ds_queue->split_queue_len++; 4093 #ifdef CONFIG_MEMCG 4094 if (memcg) 4095 set_shrinker_bit(memcg, folio_nid(folio), 4096 deferred_split_shrinker->id); 4097 #endif 4098 } 4099 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 4100 } 4101 4102 static unsigned long deferred_split_count(struct shrinker *shrink, 4103 struct shrink_control *sc) 4104 { 4105 struct pglist_data *pgdata = NODE_DATA(sc->nid); 4106 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 4107 4108 #ifdef CONFIG_MEMCG 4109 if (sc->memcg) 4110 ds_queue = &sc->memcg->deferred_split_queue; 4111 #endif 4112 return READ_ONCE(ds_queue->split_queue_len); 4113 } 4114 4115 static bool thp_underused(struct folio *folio) 4116 { 4117 int num_zero_pages = 0, num_filled_pages = 0; 4118 void *kaddr; 4119 int i; 4120 4121 if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) 4122 return false; 4123 4124 for (i = 0; i < folio_nr_pages(folio); i++) { 4125 kaddr = kmap_local_folio(folio, i * PAGE_SIZE); 4126 if (!memchr_inv(kaddr, 0, PAGE_SIZE)) { 4127 num_zero_pages++; 4128 if (num_zero_pages > khugepaged_max_ptes_none) { 4129 kunmap_local(kaddr); 4130 return true; 4131 } 4132 } else { 4133 /* 4134 * Another path for early exit once the number 4135 * of non-zero filled pages exceeds threshold. 
4136 */ 4137 num_filled_pages++; 4138 if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) { 4139 kunmap_local(kaddr); 4140 return false; 4141 } 4142 } 4143 kunmap_local(kaddr); 4144 } 4145 return false; 4146 } 4147 4148 static unsigned long deferred_split_scan(struct shrinker *shrink, 4149 struct shrink_control *sc) 4150 { 4151 struct pglist_data *pgdata = NODE_DATA(sc->nid); 4152 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 4153 unsigned long flags; 4154 LIST_HEAD(list); 4155 struct folio *folio, *next, *prev = NULL; 4156 int split = 0, removed = 0; 4157 4158 #ifdef CONFIG_MEMCG 4159 if (sc->memcg) 4160 ds_queue = &sc->memcg->deferred_split_queue; 4161 #endif 4162 4163 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 4164 /* Take pin on all head pages to avoid freeing them under us */ 4165 list_for_each_entry_safe(folio, next, &ds_queue->split_queue, 4166 _deferred_list) { 4167 if (folio_try_get(folio)) { 4168 list_move(&folio->_deferred_list, &list); 4169 } else { 4170 /* We lost race with folio_put() */ 4171 if (folio_test_partially_mapped(folio)) { 4172 folio_clear_partially_mapped(folio); 4173 mod_mthp_stat(folio_order(folio), 4174 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 4175 } 4176 list_del_init(&folio->_deferred_list); 4177 ds_queue->split_queue_len--; 4178 } 4179 if (!--sc->nr_to_scan) 4180 break; 4181 } 4182 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 4183 4184 list_for_each_entry_safe(folio, next, &list, _deferred_list) { 4185 bool did_split = false; 4186 bool underused = false; 4187 4188 if (!folio_test_partially_mapped(folio)) { 4189 underused = thp_underused(folio); 4190 if (!underused) 4191 goto next; 4192 } 4193 if (!folio_trylock(folio)) 4194 goto next; 4195 if (!split_folio(folio)) { 4196 did_split = true; 4197 if (underused) 4198 count_vm_event(THP_UNDERUSED_SPLIT_PAGE); 4199 split++; 4200 } 4201 folio_unlock(folio); 4202 next: 4203 /* 4204 * split_folio() removes folio from list on success. 4205 * Only add back to the queue if folio is partially mapped. 4206 * If thp_underused returns false, or if split_folio fails 4207 * in the case it was underused, then consider it used and 4208 * don't add it back to split_queue. 4209 */ 4210 if (did_split) { 4211 ; /* folio already removed from list */ 4212 } else if (!folio_test_partially_mapped(folio)) { 4213 list_del_init(&folio->_deferred_list); 4214 removed++; 4215 } else { 4216 /* 4217 * That unlocked list_del_init() above would be unsafe, 4218 * unless its folio is separated from any earlier folios 4219 * left on the list (which may be concurrently unqueued) 4220 * by one safe folio with refcount still raised. 4221 */ 4222 swap(folio, prev); 4223 } 4224 if (folio) 4225 folio_put(folio); 4226 } 4227 4228 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 4229 list_splice_tail(&list, &ds_queue->split_queue); 4230 ds_queue->split_queue_len -= removed; 4231 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 4232 4233 if (prev) 4234 folio_put(prev); 4235 4236 /* 4237 * Stop shrinker if we didn't split any page, but the queue is empty. 4238 * This can happen if pages were freed under us. 
4239 */ 4240 if (!split && list_empty(&ds_queue->split_queue)) 4241 return SHRINK_STOP; 4242 return split; 4243 } 4244 4245 #ifdef CONFIG_DEBUG_FS 4246 static void split_huge_pages_all(void) 4247 { 4248 struct zone *zone; 4249 struct page *page; 4250 struct folio *folio; 4251 unsigned long pfn, max_zone_pfn; 4252 unsigned long total = 0, split = 0; 4253 4254 pr_debug("Split all THPs\n"); 4255 for_each_zone(zone) { 4256 if (!managed_zone(zone)) 4257 continue; 4258 max_zone_pfn = zone_end_pfn(zone); 4259 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 4260 int nr_pages; 4261 4262 page = pfn_to_online_page(pfn); 4263 if (!page || PageTail(page)) 4264 continue; 4265 folio = page_folio(page); 4266 if (!folio_try_get(folio)) 4267 continue; 4268 4269 if (unlikely(page_folio(page) != folio)) 4270 goto next; 4271 4272 if (zone != folio_zone(folio)) 4273 goto next; 4274 4275 if (!folio_test_large(folio) 4276 || folio_test_hugetlb(folio) 4277 || !folio_test_lru(folio)) 4278 goto next; 4279 4280 total++; 4281 folio_lock(folio); 4282 nr_pages = folio_nr_pages(folio); 4283 if (!split_folio(folio)) 4284 split++; 4285 pfn += nr_pages - 1; 4286 folio_unlock(folio); 4287 next: 4288 folio_put(folio); 4289 cond_resched(); 4290 } 4291 } 4292 4293 pr_debug("%lu of %lu THP split\n", split, total); 4294 } 4295 4296 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) 4297 { 4298 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || 4299 is_vm_hugetlb_page(vma); 4300 } 4301 4302 static int split_huge_pages_pid(int pid, unsigned long vaddr_start, 4303 unsigned long vaddr_end, unsigned int new_order, 4304 long in_folio_offset) 4305 { 4306 int ret = 0; 4307 struct task_struct *task; 4308 struct mm_struct *mm; 4309 unsigned long total = 0, split = 0; 4310 unsigned long addr; 4311 4312 vaddr_start &= PAGE_MASK; 4313 vaddr_end &= PAGE_MASK; 4314 4315 task = find_get_task_by_vpid(pid); 4316 if (!task) { 4317 ret = -ESRCH; 4318 goto out; 4319 } 4320 4321 /* Find the mm_struct */ 4322 mm = get_task_mm(task); 4323 put_task_struct(task); 4324 4325 if (!mm) { 4326 ret = -EINVAL; 4327 goto out; 4328 } 4329 4330 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", 4331 pid, vaddr_start, vaddr_end); 4332 4333 mmap_read_lock(mm); 4334 /* 4335 * always increase addr by PAGE_SIZE, since we could have a PTE page 4336 * table filled with PTE-mapped THPs, each of which is distinct. 4337 */ 4338 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { 4339 struct vm_area_struct *vma = vma_lookup(mm, addr); 4340 struct folio_walk fw; 4341 struct folio *folio; 4342 struct address_space *mapping; 4343 unsigned int target_order = new_order; 4344 4345 if (!vma) 4346 break; 4347 4348 /* skip special VMA and hugetlb VMA */ 4349 if (vma_not_suitable_for_thp_split(vma)) { 4350 addr = vma->vm_end; 4351 continue; 4352 } 4353 4354 folio = folio_walk_start(&fw, vma, addr, 0); 4355 if (!folio) 4356 continue; 4357 4358 if (!is_transparent_hugepage(folio)) 4359 goto next; 4360 4361 if (!folio_test_anon(folio)) { 4362 mapping = folio->mapping; 4363 target_order = max(new_order, 4364 mapping_min_folio_order(mapping)); 4365 } 4366 4367 if (target_order >= folio_order(folio)) 4368 goto next; 4369 4370 total++; 4371 /* 4372 * For folios with private, split_huge_page_to_list_to_order() 4373 * will try to drop it before split and then check if the folio 4374 * can be split or not. So skip the check here. 
4375 */ 4376 if (!folio_test_private(folio) && 4377 !can_split_folio(folio, 0, NULL)) 4378 goto next; 4379 4380 if (!folio_trylock(folio)) 4381 goto next; 4382 folio_get(folio); 4383 folio_walk_end(&fw, vma); 4384 4385 if (!folio_test_anon(folio) && folio->mapping != mapping) 4386 goto unlock; 4387 4388 if (in_folio_offset < 0 || 4389 in_folio_offset >= folio_nr_pages(folio)) { 4390 if (!split_folio_to_order(folio, target_order)) 4391 split++; 4392 } else { 4393 struct page *split_at = folio_page(folio, 4394 in_folio_offset); 4395 if (!folio_split(folio, target_order, split_at, NULL)) 4396 split++; 4397 } 4398 4399 unlock: 4400 4401 folio_unlock(folio); 4402 folio_put(folio); 4403 4404 cond_resched(); 4405 continue; 4406 next: 4407 folio_walk_end(&fw, vma); 4408 cond_resched(); 4409 } 4410 mmap_read_unlock(mm); 4411 mmput(mm); 4412 4413 pr_debug("%lu of %lu THP split\n", split, total); 4414 4415 out: 4416 return ret; 4417 } 4418 4419 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, 4420 pgoff_t off_end, unsigned int new_order, 4421 long in_folio_offset) 4422 { 4423 struct filename *file; 4424 struct file *candidate; 4425 struct address_space *mapping; 4426 int ret = -EINVAL; 4427 pgoff_t index; 4428 int nr_pages = 1; 4429 unsigned long total = 0, split = 0; 4430 unsigned int min_order; 4431 unsigned int target_order; 4432 4433 file = getname_kernel(file_path); 4434 if (IS_ERR(file)) 4435 return ret; 4436 4437 candidate = file_open_name(file, O_RDONLY, 0); 4438 if (IS_ERR(candidate)) 4439 goto out; 4440 4441 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", 4442 file_path, off_start, off_end); 4443 4444 mapping = candidate->f_mapping; 4445 min_order = mapping_min_folio_order(mapping); 4446 target_order = max(new_order, min_order); 4447 4448 for (index = off_start; index < off_end; index += nr_pages) { 4449 struct folio *folio = filemap_get_folio(mapping, index); 4450 4451 nr_pages = 1; 4452 if (IS_ERR(folio)) 4453 continue; 4454 4455 if (!folio_test_large(folio)) 4456 goto next; 4457 4458 total++; 4459 nr_pages = folio_nr_pages(folio); 4460 4461 if (target_order >= folio_order(folio)) 4462 goto next; 4463 4464 if (!folio_trylock(folio)) 4465 goto next; 4466 4467 if (folio->mapping != mapping) 4468 goto unlock; 4469 4470 if (in_folio_offset < 0 || in_folio_offset >= nr_pages) { 4471 if (!split_folio_to_order(folio, target_order)) 4472 split++; 4473 } else { 4474 struct page *split_at = folio_page(folio, 4475 in_folio_offset); 4476 if (!folio_split(folio, target_order, split_at, NULL)) 4477 split++; 4478 } 4479 4480 unlock: 4481 folio_unlock(folio); 4482 next: 4483 folio_put(folio); 4484 cond_resched(); 4485 } 4486 4487 filp_close(candidate, NULL); 4488 ret = 0; 4489 4490 pr_debug("%lu of %lu file-backed THP split\n", split, total); 4491 out: 4492 putname(file); 4493 return ret; 4494 } 4495 4496 #define MAX_INPUT_BUF_SZ 255 4497 4498 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, 4499 size_t count, loff_t *ppops) 4500 { 4501 static DEFINE_MUTEX(split_debug_mutex); 4502 ssize_t ret; 4503 /* 4504 * hold pid, start_vaddr, end_vaddr, new_order or 4505 * file_path, off_start, off_end, new_order 4506 */ 4507 char input_buf[MAX_INPUT_BUF_SZ]; 4508 int pid; 4509 unsigned long vaddr_start, vaddr_end; 4510 unsigned int new_order = 0; 4511 long in_folio_offset = -1; 4512 4513 ret = mutex_lock_interruptible(&split_debug_mutex); 4514 if (ret) 4515 return ret; 4516 4517 ret = -EFAULT; 4518 4519 memset(input_buf, 0, 
MAX_INPUT_BUF_SZ); 4520 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) 4521 goto out; 4522 4523 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; 4524 4525 if (input_buf[0] == '/') { 4526 char *tok; 4527 char *tok_buf = input_buf; 4528 char file_path[MAX_INPUT_BUF_SZ]; 4529 pgoff_t off_start = 0, off_end = 0; 4530 size_t input_len = strlen(input_buf); 4531 4532 tok = strsep(&tok_buf, ","); 4533 if (tok && tok_buf) { 4534 strscpy(file_path, tok); 4535 } else { 4536 ret = -EINVAL; 4537 goto out; 4538 } 4539 4540 ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end, 4541 &new_order, &in_folio_offset); 4542 if (ret != 2 && ret != 3 && ret != 4) { 4543 ret = -EINVAL; 4544 goto out; 4545 } 4546 ret = split_huge_pages_in_file(file_path, off_start, off_end, 4547 new_order, in_folio_offset); 4548 if (!ret) 4549 ret = input_len; 4550 4551 goto out; 4552 } 4553 4554 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start, 4555 &vaddr_end, &new_order, &in_folio_offset); 4556 if (ret == 1 && pid == 1) { 4557 split_huge_pages_all(); 4558 ret = strlen(input_buf); 4559 goto out; 4560 } else if (ret != 3 && ret != 4 && ret != 5) { 4561 ret = -EINVAL; 4562 goto out; 4563 } 4564 4565 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order, 4566 in_folio_offset); 4567 if (!ret) 4568 ret = strlen(input_buf); 4569 out: 4570 mutex_unlock(&split_debug_mutex); 4571 return ret; 4572 4573 } 4574 4575 static const struct file_operations split_huge_pages_fops = { 4576 .owner = THIS_MODULE, 4577 .write = split_huge_pages_write, 4578 }; 4579 4580 static int __init split_huge_pages_debugfs(void) 4581 { 4582 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 4583 &split_huge_pages_fops); 4584 return 0; 4585 } 4586 late_initcall(split_huge_pages_debugfs); 4587 #endif 4588 4589 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 4590 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 4591 struct page *page) 4592 { 4593 struct folio *folio = page_folio(page); 4594 struct vm_area_struct *vma = pvmw->vma; 4595 struct mm_struct *mm = vma->vm_mm; 4596 unsigned long address = pvmw->address; 4597 bool anon_exclusive; 4598 pmd_t pmdval; 4599 swp_entry_t entry; 4600 pmd_t pmdswp; 4601 4602 if (!(pvmw->pmd && !pvmw->pte)) 4603 return 0; 4604 4605 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 4606 pmdval = pmdp_invalidate(vma, address, pvmw->pmd); 4607 4608 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 
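 * This ordering matters because GUP-fast runs without the page table
 * lock: with the PMD invalidated first, GUP-fast cannot take a new pin
 * on the page while PageAnonExclusive is being cleared.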
*/ 4609 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); 4610 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { 4611 set_pmd_at(mm, address, pvmw->pmd, pmdval); 4612 return -EBUSY; 4613 } 4614 4615 if (pmd_dirty(pmdval)) 4616 folio_mark_dirty(folio); 4617 if (pmd_write(pmdval)) 4618 entry = make_writable_migration_entry(page_to_pfn(page)); 4619 else if (anon_exclusive) 4620 entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); 4621 else 4622 entry = make_readable_migration_entry(page_to_pfn(page)); 4623 if (pmd_young(pmdval)) 4624 entry = make_migration_entry_young(entry); 4625 if (pmd_dirty(pmdval)) 4626 entry = make_migration_entry_dirty(entry); 4627 pmdswp = swp_entry_to_pmd(entry); 4628 if (pmd_soft_dirty(pmdval)) 4629 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 4630 if (pmd_uffd_wp(pmdval)) 4631 pmdswp = pmd_swp_mkuffd_wp(pmdswp); 4632 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 4633 folio_remove_rmap_pmd(folio, page, vma); 4634 folio_put(folio); 4635 trace_set_migration_pmd(address, pmd_val(pmdswp)); 4636 4637 return 0; 4638 } 4639 4640 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 4641 { 4642 struct folio *folio = page_folio(new); 4643 struct vm_area_struct *vma = pvmw->vma; 4644 struct mm_struct *mm = vma->vm_mm; 4645 unsigned long address = pvmw->address; 4646 unsigned long haddr = address & HPAGE_PMD_MASK; 4647 pmd_t pmde; 4648 swp_entry_t entry; 4649 4650 if (!(pvmw->pmd && !pvmw->pte)) 4651 return; 4652 4653 entry = pmd_to_swp_entry(*pvmw->pmd); 4654 folio_get(folio); 4655 pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); 4656 if (pmd_swp_soft_dirty(*pvmw->pmd)) 4657 pmde = pmd_mksoft_dirty(pmde); 4658 if (is_writable_migration_entry(entry)) 4659 pmde = pmd_mkwrite(pmde, vma); 4660 if (pmd_swp_uffd_wp(*pvmw->pmd)) 4661 pmde = pmd_mkuffd_wp(pmde); 4662 if (!is_migration_entry_young(entry)) 4663 pmde = pmd_mkold(pmde); 4664 /* NOTE: this may contain setting soft-dirty on some archs */ 4665 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) 4666 pmde = pmd_mkdirty(pmde); 4667 4668 if (folio_test_anon(folio)) { 4669 rmap_t rmap_flags = RMAP_NONE; 4670 4671 if (!is_readable_migration_entry(entry)) 4672 rmap_flags |= RMAP_EXCLUSIVE; 4673 4674 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); 4675 } else { 4676 folio_add_file_rmap_pmd(folio, new, vma); 4677 } 4678 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); 4679 set_pmd_at(mm, haddr, pvmw->pmd, pmde); 4680 4681 /* No need to invalidate - it was non-present before */ 4682 update_mmu_cache_pmd(vma, address, pvmw->pmd); 4683 trace_remove_migration_pmd(address, pmd_val(pmde)); 4684 } 4685 #endif 4686
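
/*
 * Illustrative usage of the split_huge_pages debugfs interface created by
 * split_huge_pages_debugfs() above (a sketch, assuming debugfs is mounted
 * at the conventional /sys/kernel/debug). Per split_huge_pages_write(), the
 * accepted input formats are:
 *
 *	<pid>,<vaddr_start>,<vaddr_end>[,<new_order>[,<in_folio_offset>]]
 *	<file_path>,<off_start>,<off_end>[,<new_order>[,<in_folio_offset>]]
 *	1	(split all THPs system-wide)
 *
 * with addresses and file offsets given as 0x-prefixed hex values, e.g.:
 *
 *	echo '1234,0x7f0000000000,0x7f0000200000,0' > \
 *		/sys/kernel/debug/split_huge_pages
 */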