// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags, bool smaps,
					 bool in_pf, bool enforce_sysfs,
					 unsigned long orders)
{
	/* Check the intersection of requested and supported orders. */
	orders &= vma_is_anonymous(vma) ?
			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	/*
	 * Explicitly disabled through madvise or prctl, or some
	 * architectures may disable THP for some mappings, for
	 * example, s390 kvm.
	 */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return 0;
	/*
	 * If the hardware/firmware marked hugepage support disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
				     !enforce_sysfs, vma->vm_mm, vm_flags)
			? orders : 0;

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}
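/*
 * Illustrative usage of the global "enabled" knob above (a sketch, assuming
 * sysfs is mounted at /sys; the file is created under the
 * "transparent_hugepage" kobject by hugepage_init_sysfs() below):
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/enabled
 *	always [madvise] never		<- brackets mark the current mode
 *	$ echo never > /sys/kernel/mm/transparent_hugepage/enabled
 *
 * Writes funnel through enabled_store(), which also starts or stops
 * khugepaged via start_stop_khugepaged().
 */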
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
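/*
 * Illustrative mapping of the "defrag" values accepted above to allocation
 * behaviour (a sketch; the authoritative translation is vma_thp_gfp_mask()
 * further down in this file):
 *
 *	echo always        > /sys/kernel/mm/transparent_hugepage/defrag
 *		-> direct compaction/reclaim on every THP fault
 *	echo defer         > ...
 *		-> only wake kswapd/kcompactd, never stall the faulting task
 *	echo defer+madvise > ...
 *		-> stall only for MADV_HUGEPAGE regions, defer for the rest
 *	echo madvise       > ...
 *		-> stall only for MADV_HUGEPAGE regions, plain fallback otherwise
 *	echo never         > ...
 *		-> no direct or deferred compaction on behalf of THP faults
 */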
static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

struct thpsize {
	struct kobject kobj;
	struct list_head node;
	int order;
};

#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)

static ssize_t thpsize_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t thpsize_enabled_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	return ret;
}

static struct kobj_attribute thpsize_enabled_attr =
	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);

static struct attribute *thpsize_attrs[] = {
	&thpsize_enabled_attr.attr,
	NULL,
};

static const struct attribute_group thpsize_attr_group = {
	.attrs = thpsize_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};
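/*
 * Illustrative usage of the per-size knobs registered below (a sketch; the
 * directory names come from thpsize_create()'s "hugepages-%lukB" format and
 * the available orders from THP_ORDERS_ALL_ANON, so the exact entries depend
 * on the architecture's page size):
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
 *	always [inherit] madvise never
 *	$ echo madvise > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
 *
 * "inherit" makes that size follow the top-level "enabled" setting; the other
 * values override it per order via the huge_anon_orders_* bitmaps above.
 */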
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		return ERR_PTR(-ENOMEM);

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		return ERR_PTR(ret);
	}

	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
	if (ret) {
		kobject_put(&thpsize->kobj);
		return ERR_PTR(ret);
	}

	thpsize->order = order;
	return thpsize;
}

static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
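/*
 * Illustrative boot-time equivalent of the sysfs "enabled" knob (a sketch;
 * the accepted strings are exactly the ones parsed above):
 *
 *	transparent_hugepage=always
 *	transparent_hugepage=madvise
 *	transparent_hugepage=never
 *
 * added to the kernel command line selects the initial mode before userspace
 * is up; anything else is rejected with the pr_warn() above.
 */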
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
		folio_test_large_rmappable(folio);
}

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
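/*
 * Worked example for the padding logic in __thp_get_unmapped_area() above
 * (illustrative numbers only, assuming size == PMD_SIZE == 2M): for a 6M
 * request at file offset off = 0x201000, the search is widened to
 * len_pad = 8M. If the padded search returns a 2M-aligned ret such as
 * 0x7f0000000000, then off_sub = (off - ret) & (2M - 1) = 0x1000 and the
 * function returns ret + 0x1000, i.e. an address congruent to the file
 * offset modulo 2M, so any 2M-aligned chunk of the file can later be
 * mapped with a PMD.
 */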
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct folio *zero_folio;
		vm_fault_t ret;

		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
		if (unlikely(!zero_folio)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_folio(pgtable, vma->vm_mm, vma,
						    haddr, vmf->pmd, zero_folio);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
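/*
 * Illustrative caller (a sketch, not taken from this file): a DAX-style
 * driver's ->huge_fault() handler typically looks up the pfn_t backing the
 * PMD-sized range around vmf->address and then does something like
 *
 *	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 *
 * relying on the VM_PFNMAP/VM_MIXEDMAP checks above to catch misuse.
 */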
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pgprot_t prot = vma->vm_page_prot;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
	       pmd_t *pmd, bool write)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (write)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				  pmd, _pmd, write))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	ret = try_grab_page(page, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * mm_get_huge_zero_folio() will never allocate a new
		 * folio here, since we already have a zero page to
		 * copy. It just takes a reference.
		 */
		mm_get_huge_zero_folio(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	src_folio = page_folio(src_page);

	folio_get(src_folio);
	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
		/* Page maybe pinned: split and retry the fault on PTEs. */
		folio_put(src_folio);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
	       pud_t *pud, bool write)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (write)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				  pud, _pud, write))
		update_mmu_cache_pud(vma, addr, pud);
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting since we don't split the page itself, only pud to
	 * a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	/*
	 * TODO: once we support anonymous pages, use
	 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
	 */
	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
		goto unlock;

	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
	spin_unlock(vmf->ptl);
}

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		folio_move_anon_rmap(folio, vma);
		SetPageAnonExclusive(page);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
}

static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
					   unsigned long addr, pmd_t pmd)
{
	struct page *page;

	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
		return false;

	/* Don't touch entries that are not even readable (NUMA hinting). */
	if (pmd_protnone(pmd))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_huge_pmd_wp(vma, pmd))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/* See can_change_pte_writable(). */
		page = vm_normal_page_pmd(vma, addr, pmd);
		return page && PageAnon(page) && PageAnonExclusive(page);
	}

	/* See can_change_pte_writable(). */
	return pmd_dirty(pmd);
}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pmd_t oldpmd = vmf->orig_pmd;
	pmd_t pmd;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
	bool migrated = false, writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
		goto out;
	}

	pmd = pmd_modify(oldpmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out_map;

	/* See similar comment in do_numa_page for explanation */
	if (!writable)
		flags |= TNF_NO_GROUP;

	nid = folio_nid(folio);
	/*
	 * For memory tiering mode, cpupid of slow memory page is used
	 * to record page access time. So use default value.
	 */
1630 */ 1631 if (node_is_toptier(nid)) 1632 last_cpupid = folio_last_cpupid(folio); 1633 target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags); 1634 if (target_nid == NUMA_NO_NODE) { 1635 folio_put(folio); 1636 goto out_map; 1637 } 1638 1639 spin_unlock(vmf->ptl); 1640 writable = false; 1641 1642 migrated = migrate_misplaced_folio(folio, vma, target_nid); 1643 if (migrated) { 1644 flags |= TNF_MIGRATED; 1645 nid = target_nid; 1646 } else { 1647 flags |= TNF_MIGRATE_FAIL; 1648 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1649 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { 1650 spin_unlock(vmf->ptl); 1651 goto out; 1652 } 1653 goto out_map; 1654 } 1655 1656 out: 1657 if (nid != NUMA_NO_NODE) 1658 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); 1659 1660 return 0; 1661 1662 out_map: 1663 /* Restore the PMD */ 1664 pmd = pmd_modify(oldpmd, vma->vm_page_prot); 1665 pmd = pmd_mkyoung(pmd); 1666 if (writable) 1667 pmd = pmd_mkwrite(pmd, vma); 1668 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); 1669 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1670 spin_unlock(vmf->ptl); 1671 goto out; 1672 } 1673 1674 /* 1675 * Return true if we do MADV_FREE successfully on entire pmd page. 1676 * Otherwise, return false. 1677 */ 1678 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1679 pmd_t *pmd, unsigned long addr, unsigned long next) 1680 { 1681 spinlock_t *ptl; 1682 pmd_t orig_pmd; 1683 struct folio *folio; 1684 struct mm_struct *mm = tlb->mm; 1685 bool ret = false; 1686 1687 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1688 1689 ptl = pmd_trans_huge_lock(pmd, vma); 1690 if (!ptl) 1691 goto out_unlocked; 1692 1693 orig_pmd = *pmd; 1694 if (is_huge_zero_pmd(orig_pmd)) 1695 goto out; 1696 1697 if (unlikely(!pmd_present(orig_pmd))) { 1698 VM_BUG_ON(thp_migration_supported() && 1699 !is_pmd_migration_entry(orig_pmd)); 1700 goto out; 1701 } 1702 1703 folio = pmd_folio(orig_pmd); 1704 /* 1705 * If other processes are mapping this folio, we couldn't discard 1706 * the folio unless they all do MADV_FREE so let's skip the folio. 1707 */ 1708 if (folio_likely_mapped_shared(folio)) 1709 goto out; 1710 1711 if (!folio_trylock(folio)) 1712 goto out; 1713 1714 /* 1715 * If user want to discard part-pages of THP, split it so MADV_FREE 1716 * will deactivate only them. 
1717 */ 1718 if (next - addr != HPAGE_PMD_SIZE) { 1719 folio_get(folio); 1720 spin_unlock(ptl); 1721 split_folio(folio); 1722 folio_unlock(folio); 1723 folio_put(folio); 1724 goto out_unlocked; 1725 } 1726 1727 if (folio_test_dirty(folio)) 1728 folio_clear_dirty(folio); 1729 folio_unlock(folio); 1730 1731 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 1732 pmdp_invalidate(vma, addr, pmd); 1733 orig_pmd = pmd_mkold(orig_pmd); 1734 orig_pmd = pmd_mkclean(orig_pmd); 1735 1736 set_pmd_at(mm, addr, pmd, orig_pmd); 1737 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1738 } 1739 1740 folio_mark_lazyfree(folio); 1741 ret = true; 1742 out: 1743 spin_unlock(ptl); 1744 out_unlocked: 1745 return ret; 1746 } 1747 1748 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 1749 { 1750 pgtable_t pgtable; 1751 1752 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1753 pte_free(mm, pgtable); 1754 mm_dec_nr_ptes(mm); 1755 } 1756 1757 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1758 pmd_t *pmd, unsigned long addr) 1759 { 1760 pmd_t orig_pmd; 1761 spinlock_t *ptl; 1762 1763 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1764 1765 ptl = __pmd_trans_huge_lock(pmd, vma); 1766 if (!ptl) 1767 return 0; 1768 /* 1769 * For architectures like ppc64 we look at deposited pgtable 1770 * when calling pmdp_huge_get_and_clear. So do the 1771 * pgtable_trans_huge_withdraw after finishing pmdp related 1772 * operations. 1773 */ 1774 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, 1775 tlb->fullmm); 1776 arch_check_zapped_pmd(vma, orig_pmd); 1777 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1778 if (vma_is_special_huge(vma)) { 1779 if (arch_needs_pgtable_deposit()) 1780 zap_deposited_table(tlb->mm, pmd); 1781 spin_unlock(ptl); 1782 } else if (is_huge_zero_pmd(orig_pmd)) { 1783 zap_deposited_table(tlb->mm, pmd); 1784 spin_unlock(ptl); 1785 } else { 1786 struct folio *folio = NULL; 1787 int flush_needed = 1; 1788 1789 if (pmd_present(orig_pmd)) { 1790 struct page *page = pmd_page(orig_pmd); 1791 1792 folio = page_folio(page); 1793 folio_remove_rmap_pmd(folio, page, vma); 1794 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1795 VM_BUG_ON_PAGE(!PageHead(page), page); 1796 } else if (thp_migration_supported()) { 1797 swp_entry_t entry; 1798 1799 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); 1800 entry = pmd_to_swp_entry(orig_pmd); 1801 folio = pfn_swap_entry_folio(entry); 1802 flush_needed = 0; 1803 } else 1804 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 1805 1806 if (folio_test_anon(folio)) { 1807 zap_deposited_table(tlb->mm, pmd); 1808 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1809 } else { 1810 if (arch_needs_pgtable_deposit()) 1811 zap_deposited_table(tlb->mm, pmd); 1812 add_mm_counter(tlb->mm, mm_counter_file(folio), 1813 -HPAGE_PMD_NR); 1814 } 1815 1816 spin_unlock(ptl); 1817 if (flush_needed) 1818 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); 1819 } 1820 return 1; 1821 } 1822 1823 #ifndef pmd_move_must_withdraw 1824 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 1825 spinlock_t *old_pmd_ptl, 1826 struct vm_area_struct *vma) 1827 { 1828 /* 1829 * With split pmd lock we also need to move preallocated 1830 * PTE page table if new_pmd is on different PMD page table. 1831 * 1832 * We also don't deposit and withdraw tables for file pages. 
1833 */ 1834 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 1835 } 1836 #endif 1837 1838 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 1839 { 1840 #ifdef CONFIG_MEM_SOFT_DIRTY 1841 if (unlikely(is_pmd_migration_entry(pmd))) 1842 pmd = pmd_swp_mksoft_dirty(pmd); 1843 else if (pmd_present(pmd)) 1844 pmd = pmd_mksoft_dirty(pmd); 1845 #endif 1846 return pmd; 1847 } 1848 1849 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 1850 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) 1851 { 1852 spinlock_t *old_ptl, *new_ptl; 1853 pmd_t pmd; 1854 struct mm_struct *mm = vma->vm_mm; 1855 bool force_flush = false; 1856 1857 /* 1858 * The destination pmd shouldn't be established, free_pgtables() 1859 * should have released it; but move_page_tables() might have already 1860 * inserted a page table, if racing against shmem/file collapse. 1861 */ 1862 if (!pmd_none(*new_pmd)) { 1863 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1864 return false; 1865 } 1866 1867 /* 1868 * We don't have to worry about the ordering of src and dst 1869 * ptlocks because exclusive mmap_lock prevents deadlock. 1870 */ 1871 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 1872 if (old_ptl) { 1873 new_ptl = pmd_lockptr(mm, new_pmd); 1874 if (new_ptl != old_ptl) 1875 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1876 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 1877 if (pmd_present(pmd)) 1878 force_flush = true; 1879 VM_BUG_ON(!pmd_none(*new_pmd)); 1880 1881 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 1882 pgtable_t pgtable; 1883 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1884 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1885 } 1886 pmd = move_soft_dirty_pmd(pmd); 1887 set_pmd_at(mm, new_addr, new_pmd, pmd); 1888 if (force_flush) 1889 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 1890 if (new_ptl != old_ptl) 1891 spin_unlock(new_ptl); 1892 spin_unlock(old_ptl); 1893 return true; 1894 } 1895 return false; 1896 } 1897 1898 /* 1899 * Returns 1900 * - 0 if PMD could not be locked 1901 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary 1902 * or if prot_numa but THP migration is not supported 1903 * - HPAGE_PMD_NR if protections changed and TLB flush necessary 1904 */ 1905 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1906 pmd_t *pmd, unsigned long addr, pgprot_t newprot, 1907 unsigned long cp_flags) 1908 { 1909 struct mm_struct *mm = vma->vm_mm; 1910 spinlock_t *ptl; 1911 pmd_t oldpmd, entry; 1912 bool prot_numa = cp_flags & MM_CP_PROT_NUMA; 1913 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 1914 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 1915 int ret = 1; 1916 1917 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1918 1919 if (prot_numa && !thp_migration_supported()) 1920 return 1; 1921 1922 ptl = __pmd_trans_huge_lock(pmd, vma); 1923 if (!ptl) 1924 return 0; 1925 1926 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1927 if (is_swap_pmd(*pmd)) { 1928 swp_entry_t entry = pmd_to_swp_entry(*pmd); 1929 struct folio *folio = pfn_swap_entry_folio(entry); 1930 pmd_t newpmd; 1931 1932 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 1933 if (is_writable_migration_entry(entry)) { 1934 /* 1935 * A protection check is difficult so 1936 * just be safe and disable write 1937 */ 1938 if (folio_test_anon(folio)) 1939 entry = make_readable_exclusive_migration_entry(swp_offset(entry)); 1940 else 1941 entry = make_readable_migration_entry(swp_offset(entry)); 1942 newpmd = swp_entry_to_pmd(entry); 1943 if (pmd_swp_soft_dirty(*pmd)) 1944 
newpmd = pmd_swp_mksoft_dirty(newpmd); 1945 } else { 1946 newpmd = *pmd; 1947 } 1948 1949 if (uffd_wp) 1950 newpmd = pmd_swp_mkuffd_wp(newpmd); 1951 else if (uffd_wp_resolve) 1952 newpmd = pmd_swp_clear_uffd_wp(newpmd); 1953 if (!pmd_same(*pmd, newpmd)) 1954 set_pmd_at(mm, addr, pmd, newpmd); 1955 goto unlock; 1956 } 1957 #endif 1958 1959 if (prot_numa) { 1960 struct folio *folio; 1961 bool toptier; 1962 /* 1963 * Avoid trapping faults against the zero page. The read-only 1964 * data is likely to be read-cached on the local CPU and 1965 * local/remote hits to the zero page are not interesting. 1966 */ 1967 if (is_huge_zero_pmd(*pmd)) 1968 goto unlock; 1969 1970 if (pmd_protnone(*pmd)) 1971 goto unlock; 1972 1973 folio = pmd_folio(*pmd); 1974 toptier = node_is_toptier(folio_nid(folio)); 1975 /* 1976 * Skip scanning top tier node if normal numa 1977 * balancing is disabled 1978 */ 1979 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 1980 toptier) 1981 goto unlock; 1982 1983 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && 1984 !toptier) 1985 folio_xchg_access_time(folio, 1986 jiffies_to_msecs(jiffies)); 1987 } 1988 /* 1989 * In case prot_numa, we are under mmap_read_lock(mm). It's critical 1990 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 1991 * which is also under mmap_read_lock(mm): 1992 * 1993 * CPU0: CPU1: 1994 * change_huge_pmd(prot_numa=1) 1995 * pmdp_huge_get_and_clear_notify() 1996 * madvise_dontneed() 1997 * zap_pmd_range() 1998 * pmd_trans_huge(*pmd) == 0 (without ptl) 1999 * // skip the pmd 2000 * set_pmd_at(); 2001 * // pmd is re-established 2002 * 2003 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 2004 * which may break userspace. 2005 * 2006 * pmdp_invalidate_ad() is required to make sure we don't miss 2007 * dirty/young flags set by hardware. 2008 */ 2009 oldpmd = pmdp_invalidate_ad(vma, addr, pmd); 2010 2011 entry = pmd_modify(oldpmd, newprot); 2012 if (uffd_wp) 2013 entry = pmd_mkuffd_wp(entry); 2014 else if (uffd_wp_resolve) 2015 /* 2016 * Leave the write bit to be handled by PF interrupt 2017 * handler, then things like COW could be properly 2018 * handled. 2019 */ 2020 entry = pmd_clear_uffd_wp(entry); 2021 2022 /* See change_pte_range(). */ 2023 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && 2024 can_change_pmd_writable(vma, addr, entry)) 2025 entry = pmd_mkwrite(entry, vma); 2026 2027 ret = HPAGE_PMD_NR; 2028 set_pmd_at(mm, addr, pmd, entry); 2029 2030 if (huge_pmd_needs_flush(oldpmd, entry)) 2031 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); 2032 unlock: 2033 spin_unlock(ptl); 2034 return ret; 2035 } 2036 2037 #ifdef CONFIG_USERFAULTFD 2038 /* 2039 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by 2040 * the caller, but it must return after releasing the page_table_lock. 2041 * Just move the page from src_pmd to dst_pmd if possible. 2042 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be 2043 * repeated by the caller, or other errors in case of failure. 
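 *
 * A minimal caller sketch (hypothetical; the real caller is the userfaultfd
 * MOVE path, and pmd lookup, VMA locking and the re-read of dst_pmdval are
 * elided here):
 *
 *	src_ptl = pmd_lock(mm, src_pmd);
 *	err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, dst_pmdval,
 *				  dst_vma, src_vma, dst_addr, src_addr);
 *	if (err == -EAGAIN)
 *		goto retry;
 *
 * where "retry" re-validates the pmds before calling again. Note that the
 * src PT lock taken above is always dropped by this function before it
 * returns.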
2044 */ 2045 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 2046 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 2047 unsigned long dst_addr, unsigned long src_addr) 2048 { 2049 pmd_t _dst_pmd, src_pmdval; 2050 struct page *src_page; 2051 struct folio *src_folio; 2052 struct anon_vma *src_anon_vma; 2053 spinlock_t *src_ptl, *dst_ptl; 2054 pgtable_t src_pgtable; 2055 struct mmu_notifier_range range; 2056 int err = 0; 2057 2058 src_pmdval = *src_pmd; 2059 src_ptl = pmd_lockptr(mm, src_pmd); 2060 2061 lockdep_assert_held(src_ptl); 2062 vma_assert_locked(src_vma); 2063 vma_assert_locked(dst_vma); 2064 2065 /* Sanity checks before the operation */ 2066 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || 2067 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { 2068 spin_unlock(src_ptl); 2069 return -EINVAL; 2070 } 2071 2072 if (!pmd_trans_huge(src_pmdval)) { 2073 spin_unlock(src_ptl); 2074 if (is_pmd_migration_entry(src_pmdval)) { 2075 pmd_migration_entry_wait(mm, &src_pmdval); 2076 return -EAGAIN; 2077 } 2078 return -ENOENT; 2079 } 2080 2081 src_page = pmd_page(src_pmdval); 2082 2083 if (!is_huge_zero_pmd(src_pmdval)) { 2084 if (unlikely(!PageAnonExclusive(src_page))) { 2085 spin_unlock(src_ptl); 2086 return -EBUSY; 2087 } 2088 2089 src_folio = page_folio(src_page); 2090 folio_get(src_folio); 2091 } else 2092 src_folio = NULL; 2093 2094 spin_unlock(src_ptl); 2095 2096 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); 2097 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, 2098 src_addr + HPAGE_PMD_SIZE); 2099 mmu_notifier_invalidate_range_start(&range); 2100 2101 if (src_folio) { 2102 folio_lock(src_folio); 2103 2104 /* 2105 * split_huge_page walks the anon_vma chain without the page 2106 * lock. Serialize against it with the anon_vma lock, the page 2107 * lock is not enough. 2108 */ 2109 src_anon_vma = folio_get_anon_vma(src_folio); 2110 if (!src_anon_vma) { 2111 err = -EAGAIN; 2112 goto unlock_folio; 2113 } 2114 anon_vma_lock_write(src_anon_vma); 2115 } else 2116 src_anon_vma = NULL; 2117 2118 dst_ptl = pmd_lockptr(mm, dst_pmd); 2119 double_pt_lock(src_ptl, dst_ptl); 2120 if (unlikely(!pmd_same(*src_pmd, src_pmdval) || 2121 !pmd_same(*dst_pmd, dst_pmdval))) { 2122 err = -EAGAIN; 2123 goto unlock_ptls; 2124 } 2125 if (src_folio) { 2126 if (folio_maybe_dma_pinned(src_folio) || 2127 !PageAnonExclusive(&src_folio->page)) { 2128 err = -EBUSY; 2129 goto unlock_ptls; 2130 } 2131 2132 if (WARN_ON_ONCE(!folio_test_head(src_folio)) || 2133 WARN_ON_ONCE(!folio_test_anon(src_folio))) { 2134 err = -EBUSY; 2135 goto unlock_ptls; 2136 } 2137 2138 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2139 /* Folio got pinned from under us. Put it back and fail the move. 
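 * The pin check is repeated here on purpose: a concurrent GUP-fast walker
 * may have taken a pin after the earlier check, but once the PMD has been
 * cleared and flushed by pmdp_huge_clear_flush() above, any such walker has
 * either already raised the refcount (which folio_maybe_dma_pinned() will
 * now observe) or will see the cleared PMD and back off.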
*/ 2140 if (folio_maybe_dma_pinned(src_folio)) { 2141 set_pmd_at(mm, src_addr, src_pmd, src_pmdval); 2142 err = -EBUSY; 2143 goto unlock_ptls; 2144 } 2145 2146 folio_move_anon_rmap(src_folio, dst_vma); 2147 WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); 2148 2149 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); 2150 /* Follow mremap() behavior and treat the entry dirty after the move */ 2151 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); 2152 } else { 2153 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2154 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); 2155 } 2156 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); 2157 2158 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); 2159 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2160 unlock_ptls: 2161 double_pt_unlock(src_ptl, dst_ptl); 2162 if (src_anon_vma) { 2163 anon_vma_unlock_write(src_anon_vma); 2164 put_anon_vma(src_anon_vma); 2165 } 2166 unlock_folio: 2167 /* unblock rmap walks */ 2168 if (src_folio) 2169 folio_unlock(src_folio); 2170 mmu_notifier_invalidate_range_end(&range); 2171 if (src_folio) 2172 folio_put(src_folio); 2173 return err; 2174 } 2175 #endif /* CONFIG_USERFAULTFD */ 2176 2177 /* 2178 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2179 * 2180 * Note that if it returns page table lock pointer, this routine returns without 2181 * unlocking page table lock. So callers must unlock it. 2182 */ 2183 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2184 { 2185 spinlock_t *ptl; 2186 ptl = pmd_lock(vma->vm_mm, pmd); 2187 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2188 pmd_devmap(*pmd))) 2189 return ptl; 2190 spin_unlock(ptl); 2191 return NULL; 2192 } 2193 2194 /* 2195 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 2196 * 2197 * Note that if it returns page table lock pointer, this routine returns without 2198 * unlocking page table lock. So callers must unlock it. 
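 *
 * Typical usage (a sketch; it mirrors zap_huge_pud() below):
 *
 *	ptl = __pud_trans_huge_lock(pud, vma);
 *	if (!ptl)
 *		return 0;
 *	... operate on the huge pud ...
 *	spin_unlock(ptl);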
2199 */ 2200 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2201 { 2202 spinlock_t *ptl; 2203 2204 ptl = pud_lock(vma->vm_mm, pud); 2205 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2206 return ptl; 2207 spin_unlock(ptl); 2208 return NULL; 2209 } 2210 2211 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2212 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2213 pud_t *pud, unsigned long addr) 2214 { 2215 spinlock_t *ptl; 2216 2217 ptl = __pud_trans_huge_lock(pud, vma); 2218 if (!ptl) 2219 return 0; 2220 2221 pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 2222 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2223 if (vma_is_special_huge(vma)) { 2224 spin_unlock(ptl); 2225 /* No zero page support yet */ 2226 } else { 2227 /* No support for anonymous PUD pages yet */ 2228 BUG(); 2229 } 2230 return 1; 2231 } 2232 2233 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2234 unsigned long haddr) 2235 { 2236 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2237 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2238 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2239 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2240 2241 count_vm_event(THP_SPLIT_PUD); 2242 2243 pudp_huge_clear_flush(vma, haddr, pud); 2244 } 2245 2246 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2247 unsigned long address) 2248 { 2249 spinlock_t *ptl; 2250 struct mmu_notifier_range range; 2251 2252 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2253 address & HPAGE_PUD_MASK, 2254 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2255 mmu_notifier_invalidate_range_start(&range); 2256 ptl = pud_lock(vma->vm_mm, pud); 2257 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2258 goto out; 2259 __split_huge_pud_locked(vma, pud, range.start); 2260 2261 out: 2262 spin_unlock(ptl); 2263 mmu_notifier_invalidate_range_end(&range); 2264 } 2265 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2266 2267 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2268 unsigned long haddr, pmd_t *pmd) 2269 { 2270 struct mm_struct *mm = vma->vm_mm; 2271 pgtable_t pgtable; 2272 pmd_t _pmd, old_pmd; 2273 unsigned long addr; 2274 pte_t *pte; 2275 int i; 2276 2277 /* 2278 * Leave pmd empty until pte is filled note that it is fine to delay 2279 * notification until mmu_notifier_invalidate_range_end() as we are 2280 * replacing a zero pmd write protected page with a zero pte write 2281 * protected page. 
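 * As a concrete illustration (assuming 4 KiB base pages and a 2 MiB PMD,
 * i.e. HPAGE_PMD_NR == 512): the single huge zero PMD is torn down and the
 * loop below installs 512 write-protected special PTEs, each pointing at
 * the small zero page returned by my_zero_pfn().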
2282 * 2283 * See Documentation/mm/mmu_notifier.rst 2284 */ 2285 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2286 2287 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2288 pmd_populate(mm, &_pmd, pgtable); 2289 2290 pte = pte_offset_map(&_pmd, haddr); 2291 VM_BUG_ON(!pte); 2292 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2293 pte_t entry; 2294 2295 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); 2296 entry = pte_mkspecial(entry); 2297 if (pmd_uffd_wp(old_pmd)) 2298 entry = pte_mkuffd_wp(entry); 2299 VM_BUG_ON(!pte_none(ptep_get(pte))); 2300 set_pte_at(mm, addr, pte, entry); 2301 pte++; 2302 } 2303 pte_unmap(pte - 1); 2304 smp_wmb(); /* make pte visible before pmd */ 2305 pmd_populate(mm, pmd, pgtable); 2306 } 2307 2308 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2309 unsigned long haddr, bool freeze) 2310 { 2311 struct mm_struct *mm = vma->vm_mm; 2312 struct folio *folio; 2313 struct page *page; 2314 pgtable_t pgtable; 2315 pmd_t old_pmd, _pmd; 2316 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; 2317 bool anon_exclusive = false, dirty = false; 2318 unsigned long addr; 2319 pte_t *pte; 2320 int i; 2321 2322 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2323 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2324 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2325 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2326 && !pmd_devmap(*pmd)); 2327 2328 count_vm_event(THP_SPLIT_PMD); 2329 2330 if (!vma_is_anonymous(vma)) { 2331 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2332 /* 2333 * We are going to unmap this huge page. So 2334 * just go ahead and zap it 2335 */ 2336 if (arch_needs_pgtable_deposit()) 2337 zap_deposited_table(mm, pmd); 2338 if (vma_is_special_huge(vma)) 2339 return; 2340 if (unlikely(is_pmd_migration_entry(old_pmd))) { 2341 swp_entry_t entry; 2342 2343 entry = pmd_to_swp_entry(old_pmd); 2344 folio = pfn_swap_entry_folio(entry); 2345 } else { 2346 page = pmd_page(old_pmd); 2347 folio = page_folio(page); 2348 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) 2349 folio_mark_dirty(folio); 2350 if (!folio_test_referenced(folio) && pmd_young(old_pmd)) 2351 folio_set_referenced(folio); 2352 folio_remove_rmap_pmd(folio, page, vma); 2353 folio_put(folio); 2354 } 2355 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 2356 return; 2357 } 2358 2359 if (is_huge_zero_pmd(*pmd)) { 2360 /* 2361 * FIXME: Do we want to invalidate secondary mmu by calling 2362 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below 2363 * inside __split_huge_pmd() ? 2364 * 2365 * We are going from a zero huge page write protected to zero 2366 * small page also write protected so it does not seems useful 2367 * to invalidate secondary mmu at this time. 2368 */ 2369 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2370 } 2371 2372 /* 2373 * Up to this point the pmd is present and huge and userland has the 2374 * whole access to the hugepage during the split (which happens in 2375 * place). If we overwrite the pmd with the not-huge version pointing 2376 * to the pte here (which of course we could if all CPUs were bug 2377 * free), userland could trigger a small page size TLB miss on the 2378 * small sized TLB while the hugepage TLB entry is still established in 2379 * the huge TLB. Some CPU doesn't like that. 2380 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum 2381 * 383 on page 105. 
Intel should be safe, but it also warns that it is 2382 * only safe if the permission and cache attributes of the two entries 2383 * loaded into the TLB are identical (which should be the case here). 2384 * But it is generally safer to never allow small and huge TLB entries 2385 * for the same virtual address to be loaded simultaneously. So instead 2386 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the 2387 * current pmd notpresent (atomically because here the pmd_trans_huge 2388 * must remain set at all times on the pmd until the split is complete 2389 * for this pmd), then we flush the SMP TLB and finally we write the 2390 * non-huge version of the pmd entry with pmd_populate. 2391 */ 2392 old_pmd = pmdp_invalidate(vma, haddr, pmd); 2393 2394 pmd_migration = is_pmd_migration_entry(old_pmd); 2395 if (unlikely(pmd_migration)) { 2396 swp_entry_t entry; 2397 2398 entry = pmd_to_swp_entry(old_pmd); 2399 page = pfn_swap_entry_to_page(entry); 2400 write = is_writable_migration_entry(entry); 2401 if (PageAnon(page)) 2402 anon_exclusive = is_readable_exclusive_migration_entry(entry); 2403 young = is_migration_entry_young(entry); 2404 dirty = is_migration_entry_dirty(entry); 2405 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2406 uffd_wp = pmd_swp_uffd_wp(old_pmd); 2407 } else { 2408 page = pmd_page(old_pmd); 2409 folio = page_folio(page); 2410 if (pmd_dirty(old_pmd)) { 2411 dirty = true; 2412 folio_set_dirty(folio); 2413 } 2414 write = pmd_write(old_pmd); 2415 young = pmd_young(old_pmd); 2416 soft_dirty = pmd_soft_dirty(old_pmd); 2417 uffd_wp = pmd_uffd_wp(old_pmd); 2418 2419 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); 2420 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 2421 2422 /* 2423 * Without "freeze", we'll simply split the PMD, propagating the 2424 * PageAnonExclusive() flag for each PTE by setting it for 2425 * each subpage -- no need to (temporarily) clear. 2426 * 2427 * With "freeze" we want to replace mapped pages by 2428 * migration entries right away. This is only possible if we 2429 * managed to clear PageAnonExclusive() -- see 2430 * set_pmd_migration_entry(). 2431 * 2432 * In case we cannot clear PageAnonExclusive(), split the PMD 2433 * only and let try_to_migrate_one() fail later. 2434 * 2435 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 2436 */ 2437 anon_exclusive = PageAnonExclusive(page); 2438 if (freeze && anon_exclusive && 2439 folio_try_share_anon_rmap_pmd(folio, page)) 2440 freeze = false; 2441 if (!freeze) { 2442 rmap_t rmap_flags = RMAP_NONE; 2443 2444 folio_ref_add(folio, HPAGE_PMD_NR - 1); 2445 if (anon_exclusive) 2446 rmap_flags |= RMAP_EXCLUSIVE; 2447 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, 2448 vma, haddr, rmap_flags); 2449 } 2450 } 2451 2452 /* 2453 * Withdraw the table only after we mark the pmd entry invalid. 2454 * This is critical for some architectures (Power). 2455 */ 2456 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2457 pmd_populate(mm, &_pmd, pgtable); 2458 2459 pte = pte_offset_map(&_pmd, haddr); 2460 VM_BUG_ON(!pte); 2461 2462 /* 2463 * Note that NUMA hinting access restrictions are not transferred to 2464 * avoid any possibility of altering permissions across VMAs.
2465 */ 2466 if (freeze || pmd_migration) { 2467 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2468 pte_t entry; 2469 swp_entry_t swp_entry; 2470 2471 if (write) 2472 swp_entry = make_writable_migration_entry( 2473 page_to_pfn(page + i)); 2474 else if (anon_exclusive) 2475 swp_entry = make_readable_exclusive_migration_entry( 2476 page_to_pfn(page + i)); 2477 else 2478 swp_entry = make_readable_migration_entry( 2479 page_to_pfn(page + i)); 2480 if (young) 2481 swp_entry = make_migration_entry_young(swp_entry); 2482 if (dirty) 2483 swp_entry = make_migration_entry_dirty(swp_entry); 2484 entry = swp_entry_to_pte(swp_entry); 2485 if (soft_dirty) 2486 entry = pte_swp_mksoft_dirty(entry); 2487 if (uffd_wp) 2488 entry = pte_swp_mkuffd_wp(entry); 2489 2490 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2491 set_pte_at(mm, addr, pte + i, entry); 2492 } 2493 } else { 2494 pte_t entry; 2495 2496 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); 2497 if (write) 2498 entry = pte_mkwrite(entry, vma); 2499 if (!young) 2500 entry = pte_mkold(entry); 2501 /* NOTE: this may set soft-dirty too on some archs */ 2502 if (dirty) 2503 entry = pte_mkdirty(entry); 2504 if (soft_dirty) 2505 entry = pte_mksoft_dirty(entry); 2506 if (uffd_wp) 2507 entry = pte_mkuffd_wp(entry); 2508 2509 for (i = 0; i < HPAGE_PMD_NR; i++) 2510 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2511 2512 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); 2513 } 2514 pte_unmap(pte); 2515 2516 if (!pmd_migration) 2517 folio_remove_rmap_pmd(folio, page, vma); 2518 if (freeze) 2519 put_page(page); 2520 2521 smp_wmb(); /* make pte visible before pmd */ 2522 pmd_populate(mm, pmd, pgtable); 2523 } 2524 2525 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2526 unsigned long address, bool freeze, struct folio *folio) 2527 { 2528 spinlock_t *ptl; 2529 struct mmu_notifier_range range; 2530 2531 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2532 address & HPAGE_PMD_MASK, 2533 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2534 mmu_notifier_invalidate_range_start(&range); 2535 ptl = pmd_lock(vma->vm_mm, pmd); 2536 2537 /* 2538 * If caller asks to setup a migration entry, we need a folio to check 2539 * pmd against. Otherwise we can end up replacing wrong folio. 2540 */ 2541 VM_BUG_ON(freeze && !folio); 2542 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); 2543 2544 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || 2545 is_pmd_migration_entry(*pmd)) { 2546 /* 2547 * It's safe to call pmd_page when folio is set because it's 2548 * guaranteed that pmd is present. 2549 */ 2550 if (folio && folio != pmd_folio(*pmd)) 2551 goto out; 2552 __split_huge_pmd_locked(vma, pmd, range.start, freeze); 2553 } 2554 2555 out: 2556 spin_unlock(ptl); 2557 mmu_notifier_invalidate_range_end(&range); 2558 } 2559 2560 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2561 bool freeze, struct folio *folio) 2562 { 2563 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); 2564 2565 if (!pmd) 2566 return; 2567 2568 __split_huge_pmd(vma, pmd, address, freeze, folio); 2569 } 2570 2571 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) 2572 { 2573 /* 2574 * If the new address isn't hpage aligned and it could previously 2575 * contain an hugepage: check if we need to split an huge pmd. 
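 * For example (assuming HPAGE_PMD_SIZE is 2 MiB): a new boundary at
 * address 0x201000 is not PMD aligned, so if the 2 MiB range
 * [0x200000, 0x400000) lies entirely within the VMA, the huge pmd
 * covering it (if any) is split below.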
2576 */ 2577 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && 2578 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), 2579 ALIGN(address, HPAGE_PMD_SIZE))) 2580 split_huge_pmd_address(vma, address, false, NULL); 2581 } 2582 2583 void vma_adjust_trans_huge(struct vm_area_struct *vma, 2584 unsigned long start, 2585 unsigned long end, 2586 long adjust_next) 2587 { 2588 /* Check if we need to split start first. */ 2589 split_huge_pmd_if_needed(vma, start); 2590 2591 /* Check if we need to split end next. */ 2592 split_huge_pmd_if_needed(vma, end); 2593 2594 /* 2595 * If we're also updating the next vma vm_start, 2596 * check if we need to split it. 2597 */ 2598 if (adjust_next > 0) { 2599 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); 2600 unsigned long nstart = next->vm_start; 2601 nstart += adjust_next; 2602 split_huge_pmd_if_needed(next, nstart); 2603 } 2604 } 2605 2606 static void unmap_folio(struct folio *folio) 2607 { 2608 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | 2609 TTU_BATCH_FLUSH; 2610 2611 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 2612 2613 if (folio_test_pmd_mappable(folio)) 2614 ttu_flags |= TTU_SPLIT_HUGE_PMD; 2615 2616 /* 2617 * Anon pages need migration entries to preserve them, but file 2618 * pages can simply be left unmapped, then faulted back on demand. 2619 * If that is ever changed (perhaps for mlock), update remap_page(). 2620 */ 2621 if (folio_test_anon(folio)) 2622 try_to_migrate(folio, ttu_flags); 2623 else 2624 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); 2625 2626 try_to_unmap_flush(); 2627 } 2628 2629 static void remap_page(struct folio *folio, unsigned long nr) 2630 { 2631 int i = 0; 2632 2633 /* If unmap_folio() uses try_to_migrate() on file, remove this check */ 2634 if (!folio_test_anon(folio)) 2635 return; 2636 for (;;) { 2637 remove_migration_ptes(folio, folio, true); 2638 i += folio_nr_pages(folio); 2639 if (i >= nr) 2640 break; 2641 folio = folio_next(folio); 2642 } 2643 } 2644 2645 static void lru_add_page_tail(struct page *head, struct page *tail, 2646 struct lruvec *lruvec, struct list_head *list) 2647 { 2648 VM_BUG_ON_PAGE(!PageHead(head), head); 2649 VM_BUG_ON_PAGE(PageLRU(tail), head); 2650 lockdep_assert_held(&lruvec->lru_lock); 2651 2652 if (list) { 2653 /* page reclaim is reclaiming a huge page */ 2654 VM_WARN_ON(PageLRU(head)); 2655 get_page(tail); 2656 list_add_tail(&tail->lru, list); 2657 } else { 2658 /* head is still on lru (and we have it frozen) */ 2659 VM_WARN_ON(!PageLRU(head)); 2660 if (PageUnevictable(tail)) 2661 tail->mlock_count = 0; 2662 else 2663 list_add_tail(&tail->lru, &head->lru); 2664 SetPageLRU(tail); 2665 } 2666 } 2667 2668 static void __split_huge_page_tail(struct folio *folio, int tail, 2669 struct lruvec *lruvec, struct list_head *list, 2670 unsigned int new_order) 2671 { 2672 struct page *head = &folio->page; 2673 struct page *page_tail = head + tail; 2674 /* 2675 * Careful: new_folio is not a "real" folio before we cleared PageTail. 2676 * Don't pass it around before clear_compound_head(). 2677 */ 2678 struct folio *new_folio = (struct folio *)page_tail; 2679 2680 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 2681 2682 /* 2683 * Clone page flags before unfreezing refcount. 2684 * 2685 * After successful get_page_unless_zero() might follow flags change, 2686 * for example lock_page() which set PG_waiters. 
2687 * 2688 * Note that for mapped sub-pages of an anonymous THP, 2689 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in 2690 * the migration entry instead from where remap_page() will restore it. 2691 * We can still have PG_anon_exclusive set on effectively unmapped and 2692 * unreferenced sub-pages of an anonymous THP: we can simply drop 2693 * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 2694 */ 2695 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 2696 page_tail->flags |= (head->flags & 2697 ((1L << PG_referenced) | 2698 (1L << PG_swapbacked) | 2699 (1L << PG_swapcache) | 2700 (1L << PG_mlocked) | 2701 (1L << PG_uptodate) | 2702 (1L << PG_active) | 2703 (1L << PG_workingset) | 2704 (1L << PG_locked) | 2705 (1L << PG_unevictable) | 2706 #ifdef CONFIG_ARCH_USES_PG_ARCH_X 2707 (1L << PG_arch_2) | 2708 (1L << PG_arch_3) | 2709 #endif 2710 (1L << PG_dirty) | 2711 LRU_GEN_MASK | LRU_REFS_MASK)); 2712 2713 /* ->mapping in first and second tail page is replaced by other uses */ 2714 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 2715 page_tail); 2716 page_tail->mapping = head->mapping; 2717 page_tail->index = head->index + tail; 2718 2719 /* 2720 * page->private should not be set in tail pages. Fix up and warn once 2721 * if private is unexpectedly set. 2722 */ 2723 if (unlikely(page_tail->private)) { 2724 VM_WARN_ON_ONCE_PAGE(true, page_tail); 2725 page_tail->private = 0; 2726 } 2727 if (folio_test_swapcache(folio)) 2728 new_folio->swap.val = folio->swap.val + tail; 2729 2730 /* Page flags must be visible before we make the page non-compound. */ 2731 smp_wmb(); 2732 2733 /* 2734 * Clear PageTail before unfreezing page refcount. 2735 * 2736 * After successful get_page_unless_zero() might follow put_page() 2737 * which needs correct compound_head(). 2738 */ 2739 clear_compound_head(page_tail); 2740 if (new_order) { 2741 prep_compound_page(page_tail, new_order); 2742 folio_set_large_rmappable(new_folio); 2743 } 2744 2745 /* Finally unfreeze refcount. Additional reference from page cache. */ 2746 page_ref_unfreeze(page_tail, 2747 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? 2748 folio_nr_pages(new_folio) : 0)); 2749 2750 if (folio_test_young(folio)) 2751 folio_set_young(new_folio); 2752 if (folio_test_idle(folio)) 2753 folio_set_idle(new_folio); 2754 2755 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); 2756 2757 /* 2758 * always add to the tail because some iterators expect new 2759 * pages to show after the currently processed elements - e.g. 
2760 * migrate_pages 2761 */ 2762 lru_add_page_tail(head, page_tail, lruvec, list); 2763 } 2764 2765 static void __split_huge_page(struct page *page, struct list_head *list, 2766 pgoff_t end, unsigned int new_order) 2767 { 2768 struct folio *folio = page_folio(page); 2769 struct page *head = &folio->page; 2770 struct lruvec *lruvec; 2771 struct address_space *swap_cache = NULL; 2772 unsigned long offset = 0; 2773 int i, nr_dropped = 0; 2774 unsigned int new_nr = 1 << new_order; 2775 int order = folio_order(folio); 2776 unsigned int nr = 1 << order; 2777 2778 /* complete memcg works before add pages to LRU */ 2779 split_page_memcg(head, order, new_order); 2780 2781 if (folio_test_anon(folio) && folio_test_swapcache(folio)) { 2782 offset = swp_offset(folio->swap); 2783 swap_cache = swap_address_space(folio->swap); 2784 xa_lock(&swap_cache->i_pages); 2785 } 2786 2787 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 2788 lruvec = folio_lruvec_lock(folio); 2789 2790 ClearPageHasHWPoisoned(head); 2791 2792 for (i = nr - new_nr; i >= new_nr; i -= new_nr) { 2793 __split_huge_page_tail(folio, i, lruvec, list, new_order); 2794 /* Some pages can be beyond EOF: drop them from page cache */ 2795 if (head[i].index >= end) { 2796 struct folio *tail = page_folio(head + i); 2797 2798 if (shmem_mapping(folio->mapping)) 2799 nr_dropped++; 2800 else if (folio_test_clear_dirty(tail)) 2801 folio_account_cleaned(tail, 2802 inode_to_wb(folio->mapping->host)); 2803 __filemap_remove_folio(tail, NULL); 2804 folio_put(tail); 2805 } else if (!PageAnon(page)) { 2806 __xa_store(&folio->mapping->i_pages, head[i].index, 2807 head + i, 0); 2808 } else if (swap_cache) { 2809 __xa_store(&swap_cache->i_pages, offset + i, 2810 head + i, 0); 2811 } 2812 } 2813 2814 if (!new_order) 2815 ClearPageCompound(head); 2816 else { 2817 struct folio *new_folio = (struct folio *)head; 2818 2819 folio_set_order(new_folio, new_order); 2820 } 2821 unlock_page_lruvec(lruvec); 2822 /* Caller disabled irqs, so they are still disabled here */ 2823 2824 split_page_owner(head, order, new_order); 2825 pgalloc_tag_split(head, 1 << order); 2826 2827 /* See comment in __split_huge_page_tail() */ 2828 if (folio_test_anon(folio)) { 2829 /* Additional pin to swap cache */ 2830 if (folio_test_swapcache(folio)) { 2831 folio_ref_add(folio, 1 + new_nr); 2832 xa_unlock(&swap_cache->i_pages); 2833 } else { 2834 folio_ref_inc(folio); 2835 } 2836 } else { 2837 /* Additional pin to page cache */ 2838 folio_ref_add(folio, 1 + new_nr); 2839 xa_unlock(&folio->mapping->i_pages); 2840 } 2841 local_irq_enable(); 2842 2843 if (nr_dropped) 2844 shmem_uncharge(folio->mapping->host, nr_dropped); 2845 remap_page(folio, nr); 2846 2847 if (folio_test_swapcache(folio)) 2848 split_swap_cluster(folio->swap); 2849 2850 /* 2851 * set page to its compound_head when split to non order-0 pages, so 2852 * we can skip unlocking it below, since PG_locked is transferred to 2853 * the compound_head of the page and the caller will unlock it. 2854 */ 2855 if (new_order) 2856 page = compound_head(page); 2857 2858 for (i = 0; i < nr; i += new_nr) { 2859 struct page *subpage = head + i; 2860 struct folio *new_folio = page_folio(subpage); 2861 if (subpage == page) 2862 continue; 2863 folio_unlock(new_folio); 2864 2865 /* 2866 * Subpages may be freed if there wasn't any mapping 2867 * like if add_to_swap() is running on a lru page that 2868 * had its mapping zapped. 
And freeing these pages 2869 * requires taking the lru_lock, so we do the put_page 2870 * of the tail pages after the split is complete. 2871 */ 2872 free_page_and_swap_cache(subpage); 2873 } 2874 } 2875 2876 /* Racy check whether the huge page can be split */ 2877 bool can_split_folio(struct folio *folio, int *pextra_pins) 2878 { 2879 int extra_pins; 2880 2881 /* Additional pins from page cache */ 2882 if (folio_test_anon(folio)) 2883 extra_pins = folio_test_swapcache(folio) ? 2884 folio_nr_pages(folio) : 0; 2885 else 2886 extra_pins = folio_nr_pages(folio); 2887 if (pextra_pins) 2888 *pextra_pins = extra_pins; 2889 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; 2890 } 2891 2892 /* 2893 * This function splits a large folio into smaller folios of order @new_order. 2894 * @page can point to any page of the large folio to split. The split operation 2895 * does not change the position of @page. 2896 * 2897 * Prerequisites: 2898 * 2899 * 1) The caller must hold a reference on @page's owning folio, also known 2900 * as the large folio. 2901 * 2902 * 2) The large folio must be locked. 2903 * 2904 * 3) The folio must not be pinned. Any unexpected folio references, including 2905 * GUP pins, will result in the folio not getting split; instead, the caller 2906 * will receive an -EBUSY. 2907 * 2908 * 4) @new_order > 1, usually. Splitting anonymous folios to order-1 is not 2909 * supported, because folio->_deferred_list, which 2910 * is used by partially mapped folios, is stored in subpage 2, but an order-1 2911 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, 2912 * since they do not use _deferred_list. 2913 * 2914 * After splitting, the caller's folio reference will be transferred to @page, 2915 * resulting in a raised refcount of @page after this call. The other pages may 2916 * be freed if they are not mapped. 2917 * 2918 * If @list is null, tail pages will be added to the LRU list; otherwise, they are added to @list. 2919 * 2920 * The resulting folios of order @new_order will inherit the mapping, flags, and so on from the 2921 * huge page. 2922 * 2923 * Returns 0 if the huge page was split successfully. 2924 * 2925 * Returns -EBUSY if @page's folio is pinned, or if the anon_vma disappeared 2926 * from under us.
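 *
 * A minimal usage sketch (hypothetical caller; error handling trimmed):
 *
 *	folio_get(folio);
 *	folio_lock(folio);
 *	err = split_huge_page_to_list_to_order(&folio->page, NULL, 0);
 *	folio_unlock(folio);
 *	folio_put(folio);
 *
 * On success the caller is left holding its reference and the folio lock
 * on the (now smaller) folio that contains the page it passed in; a
 * negative return value (e.g. -EBUSY or -EAGAIN) means the folio was not
 * split.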
2927 */ 2928 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 2929 unsigned int new_order) 2930 { 2931 struct folio *folio = page_folio(page); 2932 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 2933 /* reset xarray order to new order after split */ 2934 XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); 2935 struct anon_vma *anon_vma = NULL; 2936 struct address_space *mapping = NULL; 2937 int extra_pins, ret; 2938 pgoff_t end; 2939 bool is_hzp; 2940 2941 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 2942 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 2943 2944 if (new_order >= folio_order(folio)) 2945 return -EINVAL; 2946 2947 /* Cannot split anonymous THP to order-1 */ 2948 if (new_order == 1 && folio_test_anon(folio)) { 2949 VM_WARN_ONCE(1, "Cannot split to order-1 folio"); 2950 return -EINVAL; 2951 } 2952 2953 if (new_order) { 2954 /* Only swapping a whole PMD-mapped folio is supported */ 2955 if (folio_test_swapcache(folio)) 2956 return -EINVAL; 2957 /* Split shmem folio to non-zero order not supported */ 2958 if (shmem_mapping(folio->mapping)) { 2959 VM_WARN_ONCE(1, 2960 "Cannot split shmem folio to non-0 order"); 2961 return -EINVAL; 2962 } 2963 /* No split if the file system does not support large folio */ 2964 if (!mapping_large_folio_support(folio->mapping)) { 2965 VM_WARN_ONCE(1, 2966 "Cannot split file folio to non-0 order"); 2967 return -EINVAL; 2968 } 2969 } 2970 2971 2972 is_hzp = is_huge_zero_folio(folio); 2973 if (is_hzp) { 2974 pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); 2975 return -EBUSY; 2976 } 2977 2978 if (folio_test_writeback(folio)) 2979 return -EBUSY; 2980 2981 if (folio_test_anon(folio)) { 2982 /* 2983 * The caller does not necessarily hold an mmap_lock that would 2984 * prevent the anon_vma disappearing so we first we take a 2985 * reference to it and then lock the anon_vma for write. This 2986 * is similar to folio_lock_anon_vma_read except the write lock 2987 * is taken to serialise against parallel split or collapse 2988 * operations. 2989 */ 2990 anon_vma = folio_get_anon_vma(folio); 2991 if (!anon_vma) { 2992 ret = -EBUSY; 2993 goto out; 2994 } 2995 end = -1; 2996 mapping = NULL; 2997 anon_vma_lock_write(anon_vma); 2998 } else { 2999 gfp_t gfp; 3000 3001 mapping = folio->mapping; 3002 3003 /* Truncated ? */ 3004 if (!mapping) { 3005 ret = -EBUSY; 3006 goto out; 3007 } 3008 3009 gfp = current_gfp_context(mapping_gfp_mask(mapping) & 3010 GFP_RECLAIM_MASK); 3011 3012 if (!filemap_release_folio(folio, gfp)) { 3013 ret = -EBUSY; 3014 goto out; 3015 } 3016 3017 xas_split_alloc(&xas, folio, folio_order(folio), gfp); 3018 if (xas_error(&xas)) { 3019 ret = xas_error(&xas); 3020 goto out; 3021 } 3022 3023 anon_vma = NULL; 3024 i_mmap_lock_read(mapping); 3025 3026 /* 3027 *__split_huge_page() may need to trim off pages beyond EOF: 3028 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 3029 * which cannot be nested inside the page tree lock. So note 3030 * end now: i_size itself may be changed at any moment, but 3031 * folio lock is good enough to serialize the trimming. 
3032 */ 3033 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3034 if (shmem_mapping(mapping)) 3035 end = shmem_fallocend(mapping->host, end); 3036 } 3037 3038 /* 3039 * Racy check if we can split the page, before unmap_folio() will 3040 * split PMDs 3041 */ 3042 if (!can_split_folio(folio, &extra_pins)) { 3043 ret = -EAGAIN; 3044 goto out_unlock; 3045 } 3046 3047 unmap_folio(folio); 3048 3049 /* block interrupt reentry in xa_lock and spinlock */ 3050 local_irq_disable(); 3051 if (mapping) { 3052 /* 3053 * Check if the folio is present in page cache. 3054 * We assume all tail are present too, if folio is there. 3055 */ 3056 xas_lock(&xas); 3057 xas_reset(&xas); 3058 if (xas_load(&xas) != folio) 3059 goto fail; 3060 } 3061 3062 /* Prevent deferred_split_scan() touching ->_refcount */ 3063 spin_lock(&ds_queue->split_queue_lock); 3064 if (folio_ref_freeze(folio, 1 + extra_pins)) { 3065 if (folio_order(folio) > 1 && 3066 !list_empty(&folio->_deferred_list)) { 3067 ds_queue->split_queue_len--; 3068 /* 3069 * Reinitialize page_deferred_list after removing the 3070 * page from the split_queue, otherwise a subsequent 3071 * split will see list corruption when checking the 3072 * page_deferred_list. 3073 */ 3074 list_del_init(&folio->_deferred_list); 3075 } 3076 spin_unlock(&ds_queue->split_queue_lock); 3077 if (mapping) { 3078 int nr = folio_nr_pages(folio); 3079 3080 xas_split(&xas, folio, folio_order(folio)); 3081 if (folio_test_pmd_mappable(folio) && 3082 new_order < HPAGE_PMD_ORDER) { 3083 if (folio_test_swapbacked(folio)) { 3084 __lruvec_stat_mod_folio(folio, 3085 NR_SHMEM_THPS, -nr); 3086 } else { 3087 __lruvec_stat_mod_folio(folio, 3088 NR_FILE_THPS, -nr); 3089 filemap_nr_thps_dec(mapping); 3090 } 3091 } 3092 } 3093 3094 __split_huge_page(page, list, end, new_order); 3095 ret = 0; 3096 } else { 3097 spin_unlock(&ds_queue->split_queue_lock); 3098 fail: 3099 if (mapping) 3100 xas_unlock(&xas); 3101 local_irq_enable(); 3102 remap_page(folio, folio_nr_pages(folio)); 3103 ret = -EAGAIN; 3104 } 3105 3106 out_unlock: 3107 if (anon_vma) { 3108 anon_vma_unlock_write(anon_vma); 3109 put_anon_vma(anon_vma); 3110 } 3111 if (mapping) 3112 i_mmap_unlock_read(mapping); 3113 out: 3114 xas_destroy(&xas); 3115 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3116 return ret; 3117 } 3118 3119 void folio_undo_large_rmappable(struct folio *folio) 3120 { 3121 struct deferred_split *ds_queue; 3122 unsigned long flags; 3123 3124 if (folio_order(folio) <= 1) 3125 return; 3126 3127 /* 3128 * At this point, there is no one trying to add the folio to 3129 * deferred_list. If folio is not in deferred_list, it's safe 3130 * to check without acquiring the split_queue_lock. 3131 */ 3132 if (data_race(list_empty(&folio->_deferred_list))) 3133 return; 3134 3135 ds_queue = get_deferred_split_queue(folio); 3136 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3137 if (!list_empty(&folio->_deferred_list)) { 3138 ds_queue->split_queue_len--; 3139 list_del_init(&folio->_deferred_list); 3140 } 3141 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3142 } 3143 3144 void deferred_split_folio(struct folio *folio) 3145 { 3146 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3147 #ifdef CONFIG_MEMCG 3148 struct mem_cgroup *memcg = folio_memcg(folio); 3149 #endif 3150 unsigned long flags; 3151 3152 /* 3153 * Order 1 folios have no space for a deferred list, but we also 3154 * won't waste much memory by not adding them to the deferred list. 
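 * (The deferred list is overlaid on subpage 2 of a large folio, as noted in
 * the comment above split_huge_page_to_list_to_order(), so an order-1
 * folio, which only has subpages 0 and 1, has nowhere to store it.)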
3155 */ 3156 if (folio_order(folio) <= 1) 3157 return; 3158 3159 /* 3160 * The try_to_unmap() in the page reclaim path might reach here too; 3161 * this may cause a race condition that corrupts the deferred split queue. 3162 * And if page reclaim is already handling the same folio, it is 3163 * unnecessary to handle it again in the shrinker. 3164 * 3165 * Check the swapcache flag to determine if the folio is being 3166 * handled by page reclaim since THP swap would add the folio into 3167 * swap cache before calling try_to_unmap(). 3168 */ 3169 if (folio_test_swapcache(folio)) 3170 return; 3171 3172 if (!list_empty(&folio->_deferred_list)) 3173 return; 3174 3175 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3176 if (list_empty(&folio->_deferred_list)) { 3177 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 3178 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); 3179 ds_queue->split_queue_len++; 3180 #ifdef CONFIG_MEMCG 3181 if (memcg) 3182 set_shrinker_bit(memcg, folio_nid(folio), 3183 deferred_split_shrinker->id); 3184 #endif 3185 } 3186 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3187 } 3188 3189 static unsigned long deferred_split_count(struct shrinker *shrink, 3190 struct shrink_control *sc) 3191 { 3192 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3193 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 3194 3195 #ifdef CONFIG_MEMCG 3196 if (sc->memcg) 3197 ds_queue = &sc->memcg->deferred_split_queue; 3198 #endif 3199 return READ_ONCE(ds_queue->split_queue_len); 3200 } 3201 3202 static unsigned long deferred_split_scan(struct shrinker *shrink, 3203 struct shrink_control *sc) 3204 { 3205 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3206 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 3207 unsigned long flags; 3208 LIST_HEAD(list); 3209 struct folio *folio, *next; 3210 int split = 0; 3211 3212 #ifdef CONFIG_MEMCG 3213 if (sc->memcg) 3214 ds_queue = &sc->memcg->deferred_split_queue; 3215 #endif 3216 3217 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3218 /* Take a pin on all head pages to avoid freeing them under us */ 3219 list_for_each_entry_safe(folio, next, &ds_queue->split_queue, 3220 _deferred_list) { 3221 if (folio_try_get(folio)) { 3222 list_move(&folio->_deferred_list, &list); 3223 } else { 3224 /* We lost the race with folio_put() */ 3225 list_del_init(&folio->_deferred_list); 3226 ds_queue->split_queue_len--; 3227 } 3228 if (!--sc->nr_to_scan) 3229 break; 3230 } 3231 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3232 3233 list_for_each_entry_safe(folio, next, &list, _deferred_list) { 3234 if (!folio_trylock(folio)) 3235 goto next; 3236 /* split_huge_page() removes page from list on success */ 3237 if (!split_folio(folio)) 3238 split++; 3239 folio_unlock(folio); 3240 next: 3241 folio_put(folio); 3242 } 3243 3244 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3245 list_splice_tail(&list, &ds_queue->split_queue); 3246 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3247 3248 /* 3249 * Stop the shrinker if we didn't split any page, but the queue is empty. 3250 * This can happen if the pages were freed under us.
3251 */ 3252 if (!split && list_empty(&ds_queue->split_queue)) 3253 return SHRINK_STOP; 3254 return split; 3255 } 3256 3257 #ifdef CONFIG_DEBUG_FS 3258 static void split_huge_pages_all(void) 3259 { 3260 struct zone *zone; 3261 struct page *page; 3262 struct folio *folio; 3263 unsigned long pfn, max_zone_pfn; 3264 unsigned long total = 0, split = 0; 3265 3266 pr_debug("Split all THPs\n"); 3267 for_each_zone(zone) { 3268 if (!managed_zone(zone)) 3269 continue; 3270 max_zone_pfn = zone_end_pfn(zone); 3271 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 3272 int nr_pages; 3273 3274 page = pfn_to_online_page(pfn); 3275 if (!page || PageTail(page)) 3276 continue; 3277 folio = page_folio(page); 3278 if (!folio_try_get(folio)) 3279 continue; 3280 3281 if (unlikely(page_folio(page) != folio)) 3282 goto next; 3283 3284 if (zone != folio_zone(folio)) 3285 goto next; 3286 3287 if (!folio_test_large(folio) 3288 || folio_test_hugetlb(folio) 3289 || !folio_test_lru(folio)) 3290 goto next; 3291 3292 total++; 3293 folio_lock(folio); 3294 nr_pages = folio_nr_pages(folio); 3295 if (!split_folio(folio)) 3296 split++; 3297 pfn += nr_pages - 1; 3298 folio_unlock(folio); 3299 next: 3300 folio_put(folio); 3301 cond_resched(); 3302 } 3303 } 3304 3305 pr_debug("%lu of %lu THP split\n", split, total); 3306 } 3307 3308 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) 3309 { 3310 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || 3311 is_vm_hugetlb_page(vma); 3312 } 3313 3314 static int split_huge_pages_pid(int pid, unsigned long vaddr_start, 3315 unsigned long vaddr_end, unsigned int new_order) 3316 { 3317 int ret = 0; 3318 struct task_struct *task; 3319 struct mm_struct *mm; 3320 unsigned long total = 0, split = 0; 3321 unsigned long addr; 3322 3323 vaddr_start &= PAGE_MASK; 3324 vaddr_end &= PAGE_MASK; 3325 3326 /* Find the task_struct from pid */ 3327 rcu_read_lock(); 3328 task = find_task_by_vpid(pid); 3329 if (!task) { 3330 rcu_read_unlock(); 3331 ret = -ESRCH; 3332 goto out; 3333 } 3334 get_task_struct(task); 3335 rcu_read_unlock(); 3336 3337 /* Find the mm_struct */ 3338 mm = get_task_mm(task); 3339 put_task_struct(task); 3340 3341 if (!mm) { 3342 ret = -EINVAL; 3343 goto out; 3344 } 3345 3346 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", 3347 pid, vaddr_start, vaddr_end); 3348 3349 mmap_read_lock(mm); 3350 /* 3351 * always increase addr by PAGE_SIZE, since we could have a PTE page 3352 * table filled with PTE-mapped THPs, each of which is distinct. 3353 */ 3354 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { 3355 struct vm_area_struct *vma = vma_lookup(mm, addr); 3356 struct page *page; 3357 struct folio *folio; 3358 3359 if (!vma) 3360 break; 3361 3362 /* skip special VMA and hugetlb VMA */ 3363 if (vma_not_suitable_for_thp_split(vma)) { 3364 addr = vma->vm_end; 3365 continue; 3366 } 3367 3368 /* FOLL_DUMP to ignore special (like zero) pages */ 3369 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 3370 3371 if (IS_ERR_OR_NULL(page)) 3372 continue; 3373 3374 folio = page_folio(page); 3375 if (!is_transparent_hugepage(folio)) 3376 goto next; 3377 3378 if (new_order >= folio_order(folio)) 3379 goto next; 3380 3381 total++; 3382 /* 3383 * For folios with private, split_huge_page_to_list_to_order() 3384 * will try to drop it before split and then check if the folio 3385 * can be split or not. So skip the check here. 
3386 */ 3387 if (!folio_test_private(folio) && 3388 !can_split_folio(folio, NULL)) 3389 goto next; 3390 3391 if (!folio_trylock(folio)) 3392 goto next; 3393 3394 if (!split_folio_to_order(folio, new_order)) 3395 split++; 3396 3397 folio_unlock(folio); 3398 next: 3399 folio_put(folio); 3400 cond_resched(); 3401 } 3402 mmap_read_unlock(mm); 3403 mmput(mm); 3404 3405 pr_debug("%lu of %lu THP split\n", split, total); 3406 3407 out: 3408 return ret; 3409 } 3410 3411 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, 3412 pgoff_t off_end, unsigned int new_order) 3413 { 3414 struct filename *file; 3415 struct file *candidate; 3416 struct address_space *mapping; 3417 int ret = -EINVAL; 3418 pgoff_t index; 3419 int nr_pages = 1; 3420 unsigned long total = 0, split = 0; 3421 3422 file = getname_kernel(file_path); 3423 if (IS_ERR(file)) 3424 return ret; 3425 3426 candidate = file_open_name(file, O_RDONLY, 0); 3427 if (IS_ERR(candidate)) 3428 goto out; 3429 3430 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", 3431 file_path, off_start, off_end); 3432 3433 mapping = candidate->f_mapping; 3434 3435 for (index = off_start; index < off_end; index += nr_pages) { 3436 struct folio *folio = filemap_get_folio(mapping, index); 3437 3438 nr_pages = 1; 3439 if (IS_ERR(folio)) 3440 continue; 3441 3442 if (!folio_test_large(folio)) 3443 goto next; 3444 3445 total++; 3446 nr_pages = folio_nr_pages(folio); 3447 3448 if (new_order >= folio_order(folio)) 3449 goto next; 3450 3451 if (!folio_trylock(folio)) 3452 goto next; 3453 3454 if (!split_folio_to_order(folio, new_order)) 3455 split++; 3456 3457 folio_unlock(folio); 3458 next: 3459 folio_put(folio); 3460 cond_resched(); 3461 } 3462 3463 filp_close(candidate, NULL); 3464 ret = 0; 3465 3466 pr_debug("%lu of %lu file-backed THP split\n", split, total); 3467 out: 3468 putname(file); 3469 return ret; 3470 } 3471 3472 #define MAX_INPUT_BUF_SZ 255 3473 3474 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, 3475 size_t count, loff_t *ppops) 3476 { 3477 static DEFINE_MUTEX(split_debug_mutex); 3478 ssize_t ret; 3479 /* 3480 * hold pid, start_vaddr, end_vaddr, new_order or 3481 * file_path, off_start, off_end, new_order 3482 */ 3483 char input_buf[MAX_INPUT_BUF_SZ]; 3484 int pid; 3485 unsigned long vaddr_start, vaddr_end; 3486 unsigned int new_order = 0; 3487 3488 ret = mutex_lock_interruptible(&split_debug_mutex); 3489 if (ret) 3490 return ret; 3491 3492 ret = -EFAULT; 3493 3494 memset(input_buf, 0, MAX_INPUT_BUF_SZ); 3495 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) 3496 goto out; 3497 3498 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; 3499 3500 if (input_buf[0] == '/') { 3501 char *tok; 3502 char *buf = input_buf; 3503 char file_path[MAX_INPUT_BUF_SZ]; 3504 pgoff_t off_start = 0, off_end = 0; 3505 size_t input_len = strlen(input_buf); 3506 3507 tok = strsep(&buf, ","); 3508 if (tok) { 3509 strcpy(file_path, tok); 3510 } else { 3511 ret = -EINVAL; 3512 goto out; 3513 } 3514 3515 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); 3516 if (ret != 2 && ret != 3) { 3517 ret = -EINVAL; 3518 goto out; 3519 } 3520 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); 3521 if (!ret) 3522 ret = input_len; 3523 3524 goto out; 3525 } 3526 3527 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); 3528 if (ret == 1 && pid == 1) { 3529 split_huge_pages_all(); 3530 ret = strlen(input_buf); 3531 goto 
out; 3532 } else if (ret != 3 && ret != 4) { 3533 ret = -EINVAL; 3534 goto out; 3535 } 3536 3537 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); 3538 if (!ret) 3539 ret = strlen(input_buf); 3540 out: 3541 mutex_unlock(&split_debug_mutex); 3542 return ret; 3543 3544 } 3545 3546 static const struct file_operations split_huge_pages_fops = { 3547 .owner = THIS_MODULE, 3548 .write = split_huge_pages_write, 3549 .llseek = no_llseek, 3550 }; 3551 3552 static int __init split_huge_pages_debugfs(void) 3553 { 3554 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 3555 &split_huge_pages_fops); 3556 return 0; 3557 } 3558 late_initcall(split_huge_pages_debugfs); 3559 #endif 3560 3561 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 3562 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 3563 struct page *page) 3564 { 3565 struct folio *folio = page_folio(page); 3566 struct vm_area_struct *vma = pvmw->vma; 3567 struct mm_struct *mm = vma->vm_mm; 3568 unsigned long address = pvmw->address; 3569 bool anon_exclusive; 3570 pmd_t pmdval; 3571 swp_entry_t entry; 3572 pmd_t pmdswp; 3573 3574 if (!(pvmw->pmd && !pvmw->pte)) 3575 return 0; 3576 3577 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 3578 pmdval = pmdp_invalidate(vma, address, pvmw->pmd); 3579 3580 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ 3581 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); 3582 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { 3583 set_pmd_at(mm, address, pvmw->pmd, pmdval); 3584 return -EBUSY; 3585 } 3586 3587 if (pmd_dirty(pmdval)) 3588 folio_mark_dirty(folio); 3589 if (pmd_write(pmdval)) 3590 entry = make_writable_migration_entry(page_to_pfn(page)); 3591 else if (anon_exclusive) 3592 entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); 3593 else 3594 entry = make_readable_migration_entry(page_to_pfn(page)); 3595 if (pmd_young(pmdval)) 3596 entry = make_migration_entry_young(entry); 3597 if (pmd_dirty(pmdval)) 3598 entry = make_migration_entry_dirty(entry); 3599 pmdswp = swp_entry_to_pmd(entry); 3600 if (pmd_soft_dirty(pmdval)) 3601 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 3602 if (pmd_uffd_wp(pmdval)) 3603 pmdswp = pmd_swp_mkuffd_wp(pmdswp); 3604 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 3605 folio_remove_rmap_pmd(folio, page, vma); 3606 folio_put(folio); 3607 trace_set_migration_pmd(address, pmd_val(pmdswp)); 3608 3609 return 0; 3610 } 3611 3612 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 3613 { 3614 struct folio *folio = page_folio(new); 3615 struct vm_area_struct *vma = pvmw->vma; 3616 struct mm_struct *mm = vma->vm_mm; 3617 unsigned long address = pvmw->address; 3618 unsigned long haddr = address & HPAGE_PMD_MASK; 3619 pmd_t pmde; 3620 swp_entry_t entry; 3621 3622 if (!(pvmw->pmd && !pvmw->pte)) 3623 return; 3624 3625 entry = pmd_to_swp_entry(*pvmw->pmd); 3626 folio_get(folio); 3627 pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); 3628 if (pmd_swp_soft_dirty(*pvmw->pmd)) 3629 pmde = pmd_mksoft_dirty(pmde); 3630 if (is_writable_migration_entry(entry)) 3631 pmde = pmd_mkwrite(pmde, vma); 3632 if (pmd_swp_uffd_wp(*pvmw->pmd)) 3633 pmde = pmd_mkuffd_wp(pmde); 3634 if (!is_migration_entry_young(entry)) 3635 pmde = pmd_mkold(pmde); 3636 /* NOTE: this may contain setting soft-dirty on some archs */ 3637 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) 3638 pmde = pmd_mkdirty(pmde); 3639 3640 if (folio_test_anon(folio)) { 3641 rmap_t rmap_flags = RMAP_NONE; 
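		/*
		 * Writable and readable-exclusive migration entries are only
		 * created for PageAnonExclusive() pages (see
		 * set_pmd_migration_entry() above), so anything other than a
		 * plain readable entry means the page must be restored as
		 * exclusive.
		 */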
3642 3643 if (!is_readable_migration_entry(entry)) 3644 rmap_flags |= RMAP_EXCLUSIVE; 3645 3646 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); 3647 } else { 3648 folio_add_file_rmap_pmd(folio, new, vma); 3649 } 3650 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); 3651 set_pmd_at(mm, haddr, pvmw->pmd, pmde); 3652 3653 /* No need to invalidate - it was non-present before */ 3654 update_mmu_cache_pmd(vma, address, pvmw->pmd); 3655 trace_remove_migration_pmd(address, pmd_val(pmde)); 3656 } 3657 #endif 3658