1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2009 Red Hat, Inc. 4 */ 5 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/mm.h> 9 #include <linux/sched.h> 10 #include <linux/sched/mm.h> 11 #include <linux/sched/coredump.h> 12 #include <linux/sched/numa_balancing.h> 13 #include <linux/highmem.h> 14 #include <linux/hugetlb.h> 15 #include <linux/mmu_notifier.h> 16 #include <linux/rmap.h> 17 #include <linux/swap.h> 18 #include <linux/shrinker.h> 19 #include <linux/mm_inline.h> 20 #include <linux/swapops.h> 21 #include <linux/backing-dev.h> 22 #include <linux/dax.h> 23 #include <linux/khugepaged.h> 24 #include <linux/freezer.h> 25 #include <linux/pfn_t.h> 26 #include <linux/mman.h> 27 #include <linux/memremap.h> 28 #include <linux/pagemap.h> 29 #include <linux/debugfs.h> 30 #include <linux/migrate.h> 31 #include <linux/hashtable.h> 32 #include <linux/userfaultfd_k.h> 33 #include <linux/page_idle.h> 34 #include <linux/shmem_fs.h> 35 #include <linux/oom.h> 36 #include <linux/numa.h> 37 #include <linux/page_owner.h> 38 #include <linux/sched/sysctl.h> 39 #include <linux/memory-tiers.h> 40 #include <linux/compat.h> 41 #include <linux/pgalloc_tag.h> 42 43 #include <asm/tlb.h> 44 #include <asm/pgalloc.h> 45 #include "internal.h" 46 #include "swap.h" 47 48 #define CREATE_TRACE_POINTS 49 #include <trace/events/thp.h> 50 51 /* 52 * By default, transparent hugepage support is disabled in order to avoid 53 * risking an increased memory footprint for applications that are not 54 * guaranteed to benefit from it. When transparent hugepage support is 55 * enabled, it is for all mappings, and khugepaged scans all mappings. 56 * Defrag is invoked by khugepaged hugepage allocations and by page faults 57 * for all hugepage allocations. 58 */ 59 unsigned long transparent_hugepage_flags __read_mostly = 60 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 61 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 62 #endif 63 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 64 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 65 #endif 66 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| 67 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 68 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 69 70 static struct shrinker *deferred_split_shrinker; 71 static unsigned long deferred_split_count(struct shrinker *shrink, 72 struct shrink_control *sc); 73 static unsigned long deferred_split_scan(struct shrinker *shrink, 74 struct shrink_control *sc); 75 76 static atomic_t huge_zero_refcount; 77 struct page *huge_zero_page __read_mostly; 78 unsigned long huge_zero_pfn __read_mostly = ~0UL; 79 unsigned long huge_anon_orders_always __read_mostly; 80 unsigned long huge_anon_orders_madvise __read_mostly; 81 unsigned long huge_anon_orders_inherit __read_mostly; 82 83 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, 84 unsigned long vm_flags, bool smaps, 85 bool in_pf, bool enforce_sysfs, 86 unsigned long orders) 87 { 88 /* Check the intersection of requested and supported orders. */ 89 orders &= vma_is_anonymous(vma) ? 90 THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; 91 if (!orders) 92 return 0; 93 94 if (!vma->vm_mm) /* vdso */ 95 return 0; 96 97 /* 98 * Explicitly disabled through madvise or prctl, or some 99 * architectures may disable THP for some mappings, for 100 * example, s390 kvm. 101 * */ 102 if ((vm_flags & VM_NOHUGEPAGE) || 103 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) 104 return 0; 105 /* 106 * If the hardware/firmware marked hugepage support disabled. 
107 */ 108 if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) 109 return 0; 110 111 /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ 112 if (vma_is_dax(vma)) 113 return in_pf ? orders : 0; 114 115 /* 116 * khugepaged special VMA and hugetlb VMA. 117 * Must be checked after dax since some dax mappings may have 118 * VM_MIXEDMAP set. 119 */ 120 if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) 121 return 0; 122 123 /* 124 * Check alignment for file vma and size for both file and anon vma by 125 * filtering out the unsuitable orders. 126 * 127 * Skip the check for page fault. Huge fault does the check in fault 128 * handlers. 129 */ 130 if (!in_pf) { 131 int order = highest_order(orders); 132 unsigned long addr; 133 134 while (orders) { 135 addr = vma->vm_end - (PAGE_SIZE << order); 136 if (thp_vma_suitable_order(vma, addr, order)) 137 break; 138 order = next_order(&orders, order); 139 } 140 141 if (!orders) 142 return 0; 143 } 144 145 /* 146 * Enabled via shmem mount options or sysfs settings. 147 * Must be done before hugepage flags check since shmem has its 148 * own flags. 149 */ 150 if (!in_pf && shmem_file(vma->vm_file)) 151 return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, 152 !enforce_sysfs, vma->vm_mm, vm_flags) 153 ? orders : 0; 154 155 if (!vma_is_anonymous(vma)) { 156 /* 157 * Enforce sysfs THP requirements as necessary. Anonymous vmas 158 * were already handled in thp_vma_allowable_orders(). 159 */ 160 if (enforce_sysfs && 161 (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && 162 !hugepage_global_always()))) 163 return 0; 164 165 /* 166 * Trust that ->huge_fault() handlers know what they are doing 167 * in fault path. 168 */ 169 if (((in_pf || smaps)) && vma->vm_ops->huge_fault) 170 return orders; 171 /* Only regular file is valid in collapse path */ 172 if (((!in_pf || smaps)) && file_thp_enabled(vma)) 173 return orders; 174 return 0; 175 } 176 177 if (vma_is_temporary_stack(vma)) 178 return 0; 179 180 /* 181 * THPeligible bit of smaps should show 1 for proper VMAs even 182 * though anon_vma is not initialized yet. 183 * 184 * Allow page fault since anon_vma may be not initialized until 185 * the first page fault. 186 */ 187 if (!vma->anon_vma) 188 return (smaps || in_pf) ? orders : 0; 189 190 return orders; 191 } 192 193 static bool get_huge_zero_page(void) 194 { 195 struct page *zero_page; 196 retry: 197 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 198 return true; 199 200 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 201 HPAGE_PMD_ORDER); 202 if (!zero_page) { 203 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 204 return false; 205 } 206 preempt_disable(); 207 if (cmpxchg(&huge_zero_page, NULL, zero_page)) { 208 preempt_enable(); 209 __free_pages(zero_page, compound_order(zero_page)); 210 goto retry; 211 } 212 WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); 213 214 /* We take additional reference here. It will be put back by shrinker */ 215 atomic_set(&huge_zero_refcount, 2); 216 preempt_enable(); 217 count_vm_event(THP_ZERO_PAGE_ALLOC); 218 return true; 219 } 220 221 static void put_huge_zero_page(void) 222 { 223 /* 224 * Counter should never go to zero here. Only shrinker can put 225 * last reference. 
226 */ 227 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 228 } 229 230 struct page *mm_get_huge_zero_page(struct mm_struct *mm) 231 { 232 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 233 return READ_ONCE(huge_zero_page); 234 235 if (!get_huge_zero_page()) 236 return NULL; 237 238 if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 239 put_huge_zero_page(); 240 241 return READ_ONCE(huge_zero_page); 242 } 243 244 void mm_put_huge_zero_page(struct mm_struct *mm) 245 { 246 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 247 put_huge_zero_page(); 248 } 249 250 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, 251 struct shrink_control *sc) 252 { 253 /* we can free zero page only if last reference remains */ 254 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 255 } 256 257 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, 258 struct shrink_control *sc) 259 { 260 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 261 struct page *zero_page = xchg(&huge_zero_page, NULL); 262 BUG_ON(zero_page == NULL); 263 WRITE_ONCE(huge_zero_pfn, ~0UL); 264 __free_pages(zero_page, compound_order(zero_page)); 265 return HPAGE_PMD_NR; 266 } 267 268 return 0; 269 } 270 271 static struct shrinker *huge_zero_page_shrinker; 272 273 #ifdef CONFIG_SYSFS 274 static ssize_t enabled_show(struct kobject *kobj, 275 struct kobj_attribute *attr, char *buf) 276 { 277 const char *output; 278 279 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) 280 output = "[always] madvise never"; 281 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 282 &transparent_hugepage_flags)) 283 output = "always [madvise] never"; 284 else 285 output = "always madvise [never]"; 286 287 return sysfs_emit(buf, "%s\n", output); 288 } 289 290 static ssize_t enabled_store(struct kobject *kobj, 291 struct kobj_attribute *attr, 292 const char *buf, size_t count) 293 { 294 ssize_t ret = count; 295 296 if (sysfs_streq(buf, "always")) { 297 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 298 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 299 } else if (sysfs_streq(buf, "madvise")) { 300 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 301 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 302 } else if (sysfs_streq(buf, "never")) { 303 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 304 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 305 } else 306 ret = -EINVAL; 307 308 if (ret > 0) { 309 int err = start_stop_khugepaged(); 310 if (err) 311 ret = err; 312 } 313 return ret; 314 } 315 316 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); 317 318 ssize_t single_hugepage_flag_show(struct kobject *kobj, 319 struct kobj_attribute *attr, char *buf, 320 enum transparent_hugepage_flag flag) 321 { 322 return sysfs_emit(buf, "%d\n", 323 !!test_bit(flag, &transparent_hugepage_flags)); 324 } 325 326 ssize_t single_hugepage_flag_store(struct kobject *kobj, 327 struct kobj_attribute *attr, 328 const char *buf, size_t count, 329 enum transparent_hugepage_flag flag) 330 { 331 unsigned long value; 332 int ret; 333 334 ret = kstrtoul(buf, 10, &value); 335 if (ret < 0) 336 return ret; 337 if (value > 1) 338 return -EINVAL; 339 340 if (value) 341 set_bit(flag, &transparent_hugepage_flags); 342 else 343 clear_bit(flag, &transparent_hugepage_flags); 344 345 return count; 346 } 347 348 static ssize_t defrag_show(struct kobject *kobj, 349 struct 
kobj_attribute *attr, char *buf) 350 { 351 const char *output; 352 353 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 354 &transparent_hugepage_flags)) 355 output = "[always] defer defer+madvise madvise never"; 356 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 357 &transparent_hugepage_flags)) 358 output = "always [defer] defer+madvise madvise never"; 359 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, 360 &transparent_hugepage_flags)) 361 output = "always defer [defer+madvise] madvise never"; 362 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 363 &transparent_hugepage_flags)) 364 output = "always defer defer+madvise [madvise] never"; 365 else 366 output = "always defer defer+madvise madvise [never]"; 367 368 return sysfs_emit(buf, "%s\n", output); 369 } 370 371 static ssize_t defrag_store(struct kobject *kobj, 372 struct kobj_attribute *attr, 373 const char *buf, size_t count) 374 { 375 if (sysfs_streq(buf, "always")) { 376 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 377 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 378 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 379 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 380 } else if (sysfs_streq(buf, "defer+madvise")) { 381 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 382 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 383 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 384 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 385 } else if (sysfs_streq(buf, "defer")) { 386 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 387 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 388 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 389 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 390 } else if (sysfs_streq(buf, "madvise")) { 391 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 392 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 393 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 394 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 395 } else if (sysfs_streq(buf, "never")) { 396 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 397 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 398 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 399 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 400 } else 401 return -EINVAL; 402 403 return count; 404 } 405 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); 406 407 static ssize_t use_zero_page_show(struct kobject *kobj, 408 struct kobj_attribute *attr, char *buf) 409 { 410 return single_hugepage_flag_show(kobj, attr, buf, 411 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 412 } 413 static ssize_t use_zero_page_store(struct kobject *kobj, 414 struct kobj_attribute *attr, const char *buf, size_t count) 415 { 416 return single_hugepage_flag_store(kobj, attr, buf, count, 417 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 418 } 419 static struct kobj_attribute 
use_zero_page_attr = __ATTR_RW(use_zero_page); 420 421 static ssize_t hpage_pmd_size_show(struct kobject *kobj, 422 struct kobj_attribute *attr, char *buf) 423 { 424 return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); 425 } 426 static struct kobj_attribute hpage_pmd_size_attr = 427 __ATTR_RO(hpage_pmd_size); 428 429 static struct attribute *hugepage_attr[] = { 430 &enabled_attr.attr, 431 &defrag_attr.attr, 432 &use_zero_page_attr.attr, 433 &hpage_pmd_size_attr.attr, 434 #ifdef CONFIG_SHMEM 435 &shmem_enabled_attr.attr, 436 #endif 437 NULL, 438 }; 439 440 static const struct attribute_group hugepage_attr_group = { 441 .attrs = hugepage_attr, 442 }; 443 444 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); 445 static void thpsize_release(struct kobject *kobj); 446 static DEFINE_SPINLOCK(huge_anon_orders_lock); 447 static LIST_HEAD(thpsize_list); 448 449 struct thpsize { 450 struct kobject kobj; 451 struct list_head node; 452 int order; 453 }; 454 455 #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) 456 457 static ssize_t thpsize_enabled_show(struct kobject *kobj, 458 struct kobj_attribute *attr, char *buf) 459 { 460 int order = to_thpsize(kobj)->order; 461 const char *output; 462 463 if (test_bit(order, &huge_anon_orders_always)) 464 output = "[always] inherit madvise never"; 465 else if (test_bit(order, &huge_anon_orders_inherit)) 466 output = "always [inherit] madvise never"; 467 else if (test_bit(order, &huge_anon_orders_madvise)) 468 output = "always inherit [madvise] never"; 469 else 470 output = "always inherit madvise [never]"; 471 472 return sysfs_emit(buf, "%s\n", output); 473 } 474 475 static ssize_t thpsize_enabled_store(struct kobject *kobj, 476 struct kobj_attribute *attr, 477 const char *buf, size_t count) 478 { 479 int order = to_thpsize(kobj)->order; 480 ssize_t ret = count; 481 482 if (sysfs_streq(buf, "always")) { 483 spin_lock(&huge_anon_orders_lock); 484 clear_bit(order, &huge_anon_orders_inherit); 485 clear_bit(order, &huge_anon_orders_madvise); 486 set_bit(order, &huge_anon_orders_always); 487 spin_unlock(&huge_anon_orders_lock); 488 } else if (sysfs_streq(buf, "inherit")) { 489 spin_lock(&huge_anon_orders_lock); 490 clear_bit(order, &huge_anon_orders_always); 491 clear_bit(order, &huge_anon_orders_madvise); 492 set_bit(order, &huge_anon_orders_inherit); 493 spin_unlock(&huge_anon_orders_lock); 494 } else if (sysfs_streq(buf, "madvise")) { 495 spin_lock(&huge_anon_orders_lock); 496 clear_bit(order, &huge_anon_orders_always); 497 clear_bit(order, &huge_anon_orders_inherit); 498 set_bit(order, &huge_anon_orders_madvise); 499 spin_unlock(&huge_anon_orders_lock); 500 } else if (sysfs_streq(buf, "never")) { 501 spin_lock(&huge_anon_orders_lock); 502 clear_bit(order, &huge_anon_orders_always); 503 clear_bit(order, &huge_anon_orders_inherit); 504 clear_bit(order, &huge_anon_orders_madvise); 505 spin_unlock(&huge_anon_orders_lock); 506 } else 507 ret = -EINVAL; 508 509 return ret; 510 } 511 512 static struct kobj_attribute thpsize_enabled_attr = 513 __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); 514 515 static struct attribute *thpsize_attrs[] = { 516 &thpsize_enabled_attr.attr, 517 NULL, 518 }; 519 520 static const struct attribute_group thpsize_attr_group = { 521 .attrs = thpsize_attrs, 522 }; 523 524 static const struct kobj_type thpsize_ktype = { 525 .release = &thpsize_release, 526 .sysfs_ops = &kobj_sysfs_ops, 527 }; 528 529 static struct thpsize *thpsize_create(int order, struct kobject *parent) 530 { 531 unsigned long 
size = (PAGE_SIZE << order) / SZ_1K; 532 struct thpsize *thpsize; 533 int ret; 534 535 thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); 536 if (!thpsize) 537 return ERR_PTR(-ENOMEM); 538 539 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, 540 "hugepages-%lukB", size); 541 if (ret) { 542 kfree(thpsize); 543 return ERR_PTR(ret); 544 } 545 546 ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); 547 if (ret) { 548 kobject_put(&thpsize->kobj); 549 return ERR_PTR(ret); 550 } 551 552 thpsize->order = order; 553 return thpsize; 554 } 555 556 static void thpsize_release(struct kobject *kobj) 557 { 558 kfree(to_thpsize(kobj)); 559 } 560 561 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 562 { 563 int err; 564 struct thpsize *thpsize; 565 unsigned long orders; 566 int order; 567 568 /* 569 * Default to setting PMD-sized THP to inherit the global setting and 570 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time 571 * constant so we have to do this here. 572 */ 573 huge_anon_orders_inherit = BIT(PMD_ORDER); 574 575 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 576 if (unlikely(!*hugepage_kobj)) { 577 pr_err("failed to create transparent hugepage kobject\n"); 578 return -ENOMEM; 579 } 580 581 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 582 if (err) { 583 pr_err("failed to register transparent hugepage group\n"); 584 goto delete_obj; 585 } 586 587 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 588 if (err) { 589 pr_err("failed to register transparent hugepage group\n"); 590 goto remove_hp_group; 591 } 592 593 orders = THP_ORDERS_ALL_ANON; 594 order = highest_order(orders); 595 while (orders) { 596 thpsize = thpsize_create(order, *hugepage_kobj); 597 if (IS_ERR(thpsize)) { 598 pr_err("failed to create thpsize for order %d\n", order); 599 err = PTR_ERR(thpsize); 600 goto remove_all; 601 } 602 list_add(&thpsize->node, &thpsize_list); 603 order = next_order(&orders, order); 604 } 605 606 return 0; 607 608 remove_all: 609 hugepage_exit_sysfs(*hugepage_kobj); 610 return err; 611 remove_hp_group: 612 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 613 delete_obj: 614 kobject_put(*hugepage_kobj); 615 return err; 616 } 617 618 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 619 { 620 struct thpsize *thpsize, *tmp; 621 622 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { 623 list_del(&thpsize->node); 624 kobject_put(&thpsize->kobj); 625 } 626 627 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 628 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 629 kobject_put(hugepage_kobj); 630 } 631 #else 632 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 633 { 634 return 0; 635 } 636 637 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 638 { 639 } 640 #endif /* CONFIG_SYSFS */ 641 642 static int __init thp_shrinker_init(void) 643 { 644 huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); 645 if (!huge_zero_page_shrinker) 646 return -ENOMEM; 647 648 deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | 649 SHRINKER_MEMCG_AWARE | 650 SHRINKER_NONSLAB, 651 "thp-deferred_split"); 652 if (!deferred_split_shrinker) { 653 shrinker_free(huge_zero_page_shrinker); 654 return -ENOMEM; 655 } 656 657 huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; 658 huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; 659 
shrinker_register(huge_zero_page_shrinker); 660 661 deferred_split_shrinker->count_objects = deferred_split_count; 662 deferred_split_shrinker->scan_objects = deferred_split_scan; 663 shrinker_register(deferred_split_shrinker); 664 665 return 0; 666 } 667 668 static void __init thp_shrinker_exit(void) 669 { 670 shrinker_free(huge_zero_page_shrinker); 671 shrinker_free(deferred_split_shrinker); 672 } 673 674 static int __init hugepage_init(void) 675 { 676 int err; 677 struct kobject *hugepage_kobj; 678 679 if (!has_transparent_hugepage()) { 680 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; 681 return -EINVAL; 682 } 683 684 /* 685 * hugepages can't be allocated by the buddy allocator 686 */ 687 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); 688 /* 689 * we use page->mapping and page->index in second tail page 690 * as list_head: assuming THP order >= 2 691 */ 692 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); 693 694 err = hugepage_init_sysfs(&hugepage_kobj); 695 if (err) 696 goto err_sysfs; 697 698 err = khugepaged_init(); 699 if (err) 700 goto err_slab; 701 702 err = thp_shrinker_init(); 703 if (err) 704 goto err_shrinker; 705 706 /* 707 * By default disable transparent hugepages on smaller systems, 708 * where the extra memory used could hurt more than TLB overhead 709 * is likely to save. The admin can still enable it through /sys. 710 */ 711 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { 712 transparent_hugepage_flags = 0; 713 return 0; 714 } 715 716 err = start_stop_khugepaged(); 717 if (err) 718 goto err_khugepaged; 719 720 return 0; 721 err_khugepaged: 722 thp_shrinker_exit(); 723 err_shrinker: 724 khugepaged_destroy(); 725 err_slab: 726 hugepage_exit_sysfs(hugepage_kobj); 727 err_sysfs: 728 return err; 729 } 730 subsys_initcall(hugepage_init); 731 732 static int __init setup_transparent_hugepage(char *str) 733 { 734 int ret = 0; 735 if (!str) 736 goto out; 737 if (!strcmp(str, "always")) { 738 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 739 &transparent_hugepage_flags); 740 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 741 &transparent_hugepage_flags); 742 ret = 1; 743 } else if (!strcmp(str, "madvise")) { 744 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 745 &transparent_hugepage_flags); 746 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 747 &transparent_hugepage_flags); 748 ret = 1; 749 } else if (!strcmp(str, "never")) { 750 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 751 &transparent_hugepage_flags); 752 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 753 &transparent_hugepage_flags); 754 ret = 1; 755 } 756 out: 757 if (!ret) 758 pr_warn("transparent_hugepage= cannot parse, ignored\n"); 759 return ret; 760 } 761 __setup("transparent_hugepage=", setup_transparent_hugepage); 762 763 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 764 { 765 if (likely(vma->vm_flags & VM_WRITE)) 766 pmd = pmd_mkwrite(pmd, vma); 767 return pmd; 768 } 769 770 #ifdef CONFIG_MEMCG 771 static inline 772 struct deferred_split *get_deferred_split_queue(struct folio *folio) 773 { 774 struct mem_cgroup *memcg = folio_memcg(folio); 775 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); 776 777 if (memcg) 778 return &memcg->deferred_split_queue; 779 else 780 return &pgdat->deferred_split_queue; 781 } 782 #else 783 static inline 784 struct deferred_split *get_deferred_split_queue(struct folio *folio) 785 { 786 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); 787 788 return &pgdat->deferred_split_queue; 789 } 790 #endif 791 792 void folio_prep_large_rmappable(struct folio *folio) 793 
{ 794 if (!folio || !folio_test_large(folio)) 795 return; 796 if (folio_order(folio) > 1) 797 INIT_LIST_HEAD(&folio->_deferred_list); 798 folio_set_large_rmappable(folio); 799 } 800 801 static inline bool is_transparent_hugepage(struct folio *folio) 802 { 803 if (!folio_test_large(folio)) 804 return false; 805 806 return is_huge_zero_page(&folio->page) || 807 folio_test_large_rmappable(folio); 808 } 809 810 static unsigned long __thp_get_unmapped_area(struct file *filp, 811 unsigned long addr, unsigned long len, 812 loff_t off, unsigned long flags, unsigned long size) 813 { 814 loff_t off_end = off + len; 815 loff_t off_align = round_up(off, size); 816 unsigned long len_pad, ret, off_sub; 817 818 if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) 819 return 0; 820 821 if (off_end <= off_align || (off_end - off_align) < size) 822 return 0; 823 824 len_pad = len + size; 825 if (len_pad < len || (off + len_pad) < off) 826 return 0; 827 828 ret = current->mm->get_unmapped_area(filp, addr, len_pad, 829 off >> PAGE_SHIFT, flags); 830 831 /* 832 * The failure might be due to length padding. The caller will retry 833 * without the padding. 834 */ 835 if (IS_ERR_VALUE(ret)) 836 return 0; 837 838 /* 839 * Do not try to align to THP boundary if allocation at the address 840 * hint succeeds. 841 */ 842 if (ret == addr) 843 return addr; 844 845 off_sub = (off - ret) & (size - 1); 846 847 if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && 848 !off_sub) 849 return ret + size; 850 851 ret += off_sub; 852 return ret; 853 } 854 855 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, 856 unsigned long len, unsigned long pgoff, unsigned long flags) 857 { 858 unsigned long ret; 859 loff_t off = (loff_t)pgoff << PAGE_SHIFT; 860 861 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); 862 if (ret) 863 return ret; 864 865 return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); 866 } 867 EXPORT_SYMBOL_GPL(thp_get_unmapped_area); 868 869 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, 870 struct page *page, gfp_t gfp) 871 { 872 struct vm_area_struct *vma = vmf->vma; 873 struct folio *folio = page_folio(page); 874 pgtable_t pgtable; 875 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 876 vm_fault_t ret = 0; 877 878 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 879 880 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { 881 folio_put(folio); 882 count_vm_event(THP_FAULT_FALLBACK); 883 count_vm_event(THP_FAULT_FALLBACK_CHARGE); 884 return VM_FAULT_FALLBACK; 885 } 886 folio_throttle_swaprate(folio, gfp); 887 888 pgtable = pte_alloc_one(vma->vm_mm); 889 if (unlikely(!pgtable)) { 890 ret = VM_FAULT_OOM; 891 goto release; 892 } 893 894 clear_huge_page(page, vmf->address, HPAGE_PMD_NR); 895 /* 896 * The memory barrier inside __folio_mark_uptodate makes sure that 897 * clear_huge_page writes become visible before the set_pmd_at() 898 * write. 
899 */ 900 __folio_mark_uptodate(folio); 901 902 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 903 if (unlikely(!pmd_none(*vmf->pmd))) { 904 goto unlock_release; 905 } else { 906 pmd_t entry; 907 908 ret = check_stable_address_space(vma->vm_mm); 909 if (ret) 910 goto unlock_release; 911 912 /* Deliver the page fault to userland */ 913 if (userfaultfd_missing(vma)) { 914 spin_unlock(vmf->ptl); 915 folio_put(folio); 916 pte_free(vma->vm_mm, pgtable); 917 ret = handle_userfault(vmf, VM_UFFD_MISSING); 918 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 919 return ret; 920 } 921 922 entry = mk_huge_pmd(page, vma->vm_page_prot); 923 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 924 folio_add_new_anon_rmap(folio, vma, haddr); 925 folio_add_lru_vma(folio, vma); 926 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 927 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); 928 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 929 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 930 mm_inc_nr_ptes(vma->vm_mm); 931 spin_unlock(vmf->ptl); 932 count_vm_event(THP_FAULT_ALLOC); 933 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); 934 } 935 936 return 0; 937 unlock_release: 938 spin_unlock(vmf->ptl); 939 release: 940 if (pgtable) 941 pte_free(vma->vm_mm, pgtable); 942 folio_put(folio); 943 return ret; 944 945 } 946 947 /* 948 * always: directly stall for all thp allocations 949 * defer: wake kswapd and fail if not immediately available 950 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise 951 * fail if not immediately available 952 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately 953 * available 954 * never: never stall for any thp allocation 955 */ 956 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) 957 { 958 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); 959 960 /* Always do synchronous compaction */ 961 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 962 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 963 964 /* Kick kcompactd and fail quickly */ 965 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 966 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 967 968 /* Synchronous compaction if madvised, otherwise kick kcompactd */ 969 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 970 return GFP_TRANSHUGE_LIGHT | 971 (vma_madvised ? __GFP_DIRECT_RECLAIM : 972 __GFP_KSWAPD_RECLAIM); 973 974 /* Only do synchronous compaction if madvised */ 975 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 976 return GFP_TRANSHUGE_LIGHT | 977 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); 978 979 return GFP_TRANSHUGE_LIGHT; 980 } 981 982 /* Caller must hold page table lock. 
*/ 983 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 984 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 985 struct page *zero_page) 986 { 987 pmd_t entry; 988 if (!pmd_none(*pmd)) 989 return; 990 entry = mk_pmd(zero_page, vma->vm_page_prot); 991 entry = pmd_mkhuge(entry); 992 pgtable_trans_huge_deposit(mm, pmd, pgtable); 993 set_pmd_at(mm, haddr, pmd, entry); 994 mm_inc_nr_ptes(mm); 995 } 996 997 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) 998 { 999 struct vm_area_struct *vma = vmf->vma; 1000 gfp_t gfp; 1001 struct folio *folio; 1002 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1003 1004 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) 1005 return VM_FAULT_FALLBACK; 1006 if (unlikely(anon_vma_prepare(vma))) 1007 return VM_FAULT_OOM; 1008 khugepaged_enter_vma(vma, vma->vm_flags); 1009 1010 if (!(vmf->flags & FAULT_FLAG_WRITE) && 1011 !mm_forbids_zeropage(vma->vm_mm) && 1012 transparent_hugepage_use_zero_page()) { 1013 pgtable_t pgtable; 1014 struct page *zero_page; 1015 vm_fault_t ret; 1016 pgtable = pte_alloc_one(vma->vm_mm); 1017 if (unlikely(!pgtable)) 1018 return VM_FAULT_OOM; 1019 zero_page = mm_get_huge_zero_page(vma->vm_mm); 1020 if (unlikely(!zero_page)) { 1021 pte_free(vma->vm_mm, pgtable); 1022 count_vm_event(THP_FAULT_FALLBACK); 1023 return VM_FAULT_FALLBACK; 1024 } 1025 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1026 ret = 0; 1027 if (pmd_none(*vmf->pmd)) { 1028 ret = check_stable_address_space(vma->vm_mm); 1029 if (ret) { 1030 spin_unlock(vmf->ptl); 1031 pte_free(vma->vm_mm, pgtable); 1032 } else if (userfaultfd_missing(vma)) { 1033 spin_unlock(vmf->ptl); 1034 pte_free(vma->vm_mm, pgtable); 1035 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1036 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1037 } else { 1038 set_huge_zero_page(pgtable, vma->vm_mm, vma, 1039 haddr, vmf->pmd, zero_page); 1040 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1041 spin_unlock(vmf->ptl); 1042 } 1043 } else { 1044 spin_unlock(vmf->ptl); 1045 pte_free(vma->vm_mm, pgtable); 1046 } 1047 return ret; 1048 } 1049 gfp = vma_thp_gfp_mask(vma); 1050 folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); 1051 if (unlikely(!folio)) { 1052 count_vm_event(THP_FAULT_FALLBACK); 1053 return VM_FAULT_FALLBACK; 1054 } 1055 return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); 1056 } 1057 1058 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 1059 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, 1060 pgtable_t pgtable) 1061 { 1062 struct mm_struct *mm = vma->vm_mm; 1063 pmd_t entry; 1064 spinlock_t *ptl; 1065 1066 ptl = pmd_lock(mm, pmd); 1067 if (!pmd_none(*pmd)) { 1068 if (write) { 1069 if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { 1070 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); 1071 goto out_unlock; 1072 } 1073 entry = pmd_mkyoung(*pmd); 1074 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1075 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) 1076 update_mmu_cache_pmd(vma, addr, pmd); 1077 } 1078 1079 goto out_unlock; 1080 } 1081 1082 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); 1083 if (pfn_t_devmap(pfn)) 1084 entry = pmd_mkdevmap(entry); 1085 if (write) { 1086 entry = pmd_mkyoung(pmd_mkdirty(entry)); 1087 entry = maybe_pmd_mkwrite(entry, vma); 1088 } 1089 1090 if (pgtable) { 1091 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1092 mm_inc_nr_ptes(mm); 1093 pgtable = NULL; 1094 } 1095 1096 set_pmd_at(mm, addr, pmd, entry); 1097 update_mmu_cache_pmd(vma, addr, pmd); 1098 1099 out_unlock: 1100 spin_unlock(ptl); 1101 if 
(pgtable) 1102 pte_free(mm, pgtable); 1103 } 1104 1105 /** 1106 * vmf_insert_pfn_pmd - insert a pmd size pfn 1107 * @vmf: Structure describing the fault 1108 * @pfn: pfn to insert 1109 * @write: whether it's a write fault 1110 * 1111 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. 1112 * 1113 * Return: vm_fault_t value. 1114 */ 1115 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) 1116 { 1117 unsigned long addr = vmf->address & PMD_MASK; 1118 struct vm_area_struct *vma = vmf->vma; 1119 pgprot_t pgprot = vma->vm_page_prot; 1120 pgtable_t pgtable = NULL; 1121 1122 /* 1123 * If we had pmd_special, we could avoid all these restrictions, 1124 * but we need to be consistent with PTEs and architectures that 1125 * can't support a 'special' bit. 1126 */ 1127 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 1128 !pfn_t_devmap(pfn)); 1129 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1130 (VM_PFNMAP|VM_MIXEDMAP)); 1131 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1132 1133 if (addr < vma->vm_start || addr >= vma->vm_end) 1134 return VM_FAULT_SIGBUS; 1135 1136 if (arch_needs_pgtable_deposit()) { 1137 pgtable = pte_alloc_one(vma->vm_mm); 1138 if (!pgtable) 1139 return VM_FAULT_OOM; 1140 } 1141 1142 track_pfn_insert(vma, &pgprot, pfn); 1143 1144 insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); 1145 return VM_FAULT_NOPAGE; 1146 } 1147 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 1148 1149 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1150 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) 1151 { 1152 if (likely(vma->vm_flags & VM_WRITE)) 1153 pud = pud_mkwrite(pud); 1154 return pud; 1155 } 1156 1157 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 1158 pud_t *pud, pfn_t pfn, bool write) 1159 { 1160 struct mm_struct *mm = vma->vm_mm; 1161 pgprot_t prot = vma->vm_page_prot; 1162 pud_t entry; 1163 spinlock_t *ptl; 1164 1165 ptl = pud_lock(mm, pud); 1166 if (!pud_none(*pud)) { 1167 if (write) { 1168 if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { 1169 WARN_ON_ONCE(!is_huge_zero_pud(*pud)); 1170 goto out_unlock; 1171 } 1172 entry = pud_mkyoung(*pud); 1173 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); 1174 if (pudp_set_access_flags(vma, addr, pud, entry, 1)) 1175 update_mmu_cache_pud(vma, addr, pud); 1176 } 1177 goto out_unlock; 1178 } 1179 1180 entry = pud_mkhuge(pfn_t_pud(pfn, prot)); 1181 if (pfn_t_devmap(pfn)) 1182 entry = pud_mkdevmap(entry); 1183 if (write) { 1184 entry = pud_mkyoung(pud_mkdirty(entry)); 1185 entry = maybe_pud_mkwrite(entry, vma); 1186 } 1187 set_pud_at(mm, addr, pud, entry); 1188 update_mmu_cache_pud(vma, addr, pud); 1189 1190 out_unlock: 1191 spin_unlock(ptl); 1192 } 1193 1194 /** 1195 * vmf_insert_pfn_pud - insert a pud size pfn 1196 * @vmf: Structure describing the fault 1197 * @pfn: pfn to insert 1198 * @write: whether it's a write fault 1199 * 1200 * Insert a pud size pfn. See vmf_insert_pfn() for additional info. 1201 * 1202 * Return: vm_fault_t value. 1203 */ 1204 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) 1205 { 1206 unsigned long addr = vmf->address & PUD_MASK; 1207 struct vm_area_struct *vma = vmf->vma; 1208 pgprot_t pgprot = vma->vm_page_prot; 1209 1210 /* 1211 * If we had pud_special, we could avoid all these restrictions, 1212 * but we need to be consistent with PTEs and architectures that 1213 * can't support a 'special' bit. 
1214 */ 1215 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 1216 !pfn_t_devmap(pfn)); 1217 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1218 (VM_PFNMAP|VM_MIXEDMAP)); 1219 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1220 1221 if (addr < vma->vm_start || addr >= vma->vm_end) 1222 return VM_FAULT_SIGBUS; 1223 1224 track_pfn_insert(vma, &pgprot, pfn); 1225 1226 insert_pfn_pud(vma, addr, vmf->pud, pfn, write); 1227 return VM_FAULT_NOPAGE; 1228 } 1229 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 1230 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1231 1232 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1233 pmd_t *pmd, bool write) 1234 { 1235 pmd_t _pmd; 1236 1237 _pmd = pmd_mkyoung(*pmd); 1238 if (write) 1239 _pmd = pmd_mkdirty(_pmd); 1240 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1241 pmd, _pmd, write)) 1242 update_mmu_cache_pmd(vma, addr, pmd); 1243 } 1244 1245 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 1246 pmd_t *pmd, int flags, struct dev_pagemap **pgmap) 1247 { 1248 unsigned long pfn = pmd_pfn(*pmd); 1249 struct mm_struct *mm = vma->vm_mm; 1250 struct page *page; 1251 int ret; 1252 1253 assert_spin_locked(pmd_lockptr(mm, pmd)); 1254 1255 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1256 return NULL; 1257 1258 if (pmd_present(*pmd) && pmd_devmap(*pmd)) 1259 /* pass */; 1260 else 1261 return NULL; 1262 1263 if (flags & FOLL_TOUCH) 1264 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); 1265 1266 /* 1267 * device mapped pages can only be returned if the 1268 * caller will manage the page reference count. 1269 */ 1270 if (!(flags & (FOLL_GET | FOLL_PIN))) 1271 return ERR_PTR(-EEXIST); 1272 1273 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; 1274 *pgmap = get_dev_pagemap(pfn, *pgmap); 1275 if (!*pgmap) 1276 return ERR_PTR(-EFAULT); 1277 page = pfn_to_page(pfn); 1278 ret = try_grab_page(page, flags); 1279 if (ret) 1280 page = ERR_PTR(ret); 1281 1282 return page; 1283 } 1284 1285 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1286 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1287 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) 1288 { 1289 spinlock_t *dst_ptl, *src_ptl; 1290 struct page *src_page; 1291 struct folio *src_folio; 1292 pmd_t pmd; 1293 pgtable_t pgtable = NULL; 1294 int ret = -ENOMEM; 1295 1296 /* Skip if can be re-fill on fault */ 1297 if (!vma_is_anonymous(dst_vma)) 1298 return 0; 1299 1300 pgtable = pte_alloc_one(dst_mm); 1301 if (unlikely(!pgtable)) 1302 goto out; 1303 1304 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1305 src_ptl = pmd_lockptr(src_mm, src_pmd); 1306 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1307 1308 ret = -EAGAIN; 1309 pmd = *src_pmd; 1310 1311 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1312 if (unlikely(is_swap_pmd(pmd))) { 1313 swp_entry_t entry = pmd_to_swp_entry(pmd); 1314 1315 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1316 if (!is_readable_migration_entry(entry)) { 1317 entry = make_readable_migration_entry( 1318 swp_offset(entry)); 1319 pmd = swp_entry_to_pmd(entry); 1320 if (pmd_swp_soft_dirty(*src_pmd)) 1321 pmd = pmd_swp_mksoft_dirty(pmd); 1322 if (pmd_swp_uffd_wp(*src_pmd)) 1323 pmd = pmd_swp_mkuffd_wp(pmd); 1324 set_pmd_at(src_mm, addr, src_pmd, pmd); 1325 } 1326 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1327 mm_inc_nr_ptes(dst_mm); 1328 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1329 if (!userfaultfd_wp(dst_vma)) 1330 pmd = pmd_swp_clear_uffd_wp(pmd); 1331 set_pmd_at(dst_mm, addr, dst_pmd, 
pmd); 1332 ret = 0; 1333 goto out_unlock; 1334 } 1335 #endif 1336 1337 if (unlikely(!pmd_trans_huge(pmd))) { 1338 pte_free(dst_mm, pgtable); 1339 goto out_unlock; 1340 } 1341 /* 1342 * When page table lock is held, the huge zero pmd should not be 1343 * under splitting since we don't split the page itself, only pmd to 1344 * a page table. 1345 */ 1346 if (is_huge_zero_pmd(pmd)) { 1347 /* 1348 * get_huge_zero_page() will never allocate a new page here, 1349 * since we already have a zero page to copy. It just takes a 1350 * reference. 1351 */ 1352 mm_get_huge_zero_page(dst_mm); 1353 goto out_zero_page; 1354 } 1355 1356 src_page = pmd_page(pmd); 1357 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 1358 src_folio = page_folio(src_page); 1359 1360 folio_get(src_folio); 1361 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) { 1362 /* Page maybe pinned: split and retry the fault on PTEs. */ 1363 folio_put(src_folio); 1364 pte_free(dst_mm, pgtable); 1365 spin_unlock(src_ptl); 1366 spin_unlock(dst_ptl); 1367 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); 1368 return -EAGAIN; 1369 } 1370 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1371 out_zero_page: 1372 mm_inc_nr_ptes(dst_mm); 1373 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1374 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1375 if (!userfaultfd_wp(dst_vma)) 1376 pmd = pmd_clear_uffd_wp(pmd); 1377 pmd = pmd_mkold(pmd_wrprotect(pmd)); 1378 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1379 1380 ret = 0; 1381 out_unlock: 1382 spin_unlock(src_ptl); 1383 spin_unlock(dst_ptl); 1384 out: 1385 return ret; 1386 } 1387 1388 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1389 static void touch_pud(struct vm_area_struct *vma, unsigned long addr, 1390 pud_t *pud, bool write) 1391 { 1392 pud_t _pud; 1393 1394 _pud = pud_mkyoung(*pud); 1395 if (write) 1396 _pud = pud_mkdirty(_pud); 1397 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, 1398 pud, _pud, write)) 1399 update_mmu_cache_pud(vma, addr, pud); 1400 } 1401 1402 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, 1403 pud_t *pud, int flags, struct dev_pagemap **pgmap) 1404 { 1405 unsigned long pfn = pud_pfn(*pud); 1406 struct mm_struct *mm = vma->vm_mm; 1407 struct page *page; 1408 int ret; 1409 1410 assert_spin_locked(pud_lockptr(mm, pud)); 1411 1412 if (flags & FOLL_WRITE && !pud_write(*pud)) 1413 return NULL; 1414 1415 if (pud_present(*pud) && pud_devmap(*pud)) 1416 /* pass */; 1417 else 1418 return NULL; 1419 1420 if (flags & FOLL_TOUCH) 1421 touch_pud(vma, addr, pud, flags & FOLL_WRITE); 1422 1423 /* 1424 * device mapped pages can only be returned if the 1425 * caller will manage the page reference count. 
1426 * 1427 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: 1428 */ 1429 if (!(flags & (FOLL_GET | FOLL_PIN))) 1430 return ERR_PTR(-EEXIST); 1431 1432 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; 1433 *pgmap = get_dev_pagemap(pfn, *pgmap); 1434 if (!*pgmap) 1435 return ERR_PTR(-EFAULT); 1436 page = pfn_to_page(pfn); 1437 1438 ret = try_grab_page(page, flags); 1439 if (ret) 1440 page = ERR_PTR(ret); 1441 1442 return page; 1443 } 1444 1445 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1446 pud_t *dst_pud, pud_t *src_pud, unsigned long addr, 1447 struct vm_area_struct *vma) 1448 { 1449 spinlock_t *dst_ptl, *src_ptl; 1450 pud_t pud; 1451 int ret; 1452 1453 dst_ptl = pud_lock(dst_mm, dst_pud); 1454 src_ptl = pud_lockptr(src_mm, src_pud); 1455 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1456 1457 ret = -EAGAIN; 1458 pud = *src_pud; 1459 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) 1460 goto out_unlock; 1461 1462 /* 1463 * When page table lock is held, the huge zero pud should not be 1464 * under splitting since we don't split the page itself, only pud to 1465 * a page table. 1466 */ 1467 if (is_huge_zero_pud(pud)) { 1468 /* No huge zero pud yet */ 1469 } 1470 1471 /* 1472 * TODO: once we support anonymous pages, use 1473 * folio_try_dup_anon_rmap_*() and split if duplicating fails. 1474 */ 1475 pudp_set_wrprotect(src_mm, addr, src_pud); 1476 pud = pud_mkold(pud_wrprotect(pud)); 1477 set_pud_at(dst_mm, addr, dst_pud, pud); 1478 1479 ret = 0; 1480 out_unlock: 1481 spin_unlock(src_ptl); 1482 spin_unlock(dst_ptl); 1483 return ret; 1484 } 1485 1486 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) 1487 { 1488 bool write = vmf->flags & FAULT_FLAG_WRITE; 1489 1490 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); 1491 if (unlikely(!pud_same(*vmf->pud, orig_pud))) 1492 goto unlock; 1493 1494 touch_pud(vmf->vma, vmf->address, vmf->pud, write); 1495 unlock: 1496 spin_unlock(vmf->ptl); 1497 } 1498 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1499 1500 void huge_pmd_set_accessed(struct vm_fault *vmf) 1501 { 1502 bool write = vmf->flags & FAULT_FLAG_WRITE; 1503 1504 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1505 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) 1506 goto unlock; 1507 1508 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); 1509 1510 unlock: 1511 spin_unlock(vmf->ptl); 1512 } 1513 1514 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) 1515 { 1516 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; 1517 struct vm_area_struct *vma = vmf->vma; 1518 struct folio *folio; 1519 struct page *page; 1520 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1521 pmd_t orig_pmd = vmf->orig_pmd; 1522 1523 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); 1524 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1525 1526 if (is_huge_zero_pmd(orig_pmd)) 1527 goto fallback; 1528 1529 spin_lock(vmf->ptl); 1530 1531 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1532 spin_unlock(vmf->ptl); 1533 return 0; 1534 } 1535 1536 page = pmd_page(orig_pmd); 1537 folio = page_folio(page); 1538 VM_BUG_ON_PAGE(!PageHead(page), page); 1539 1540 /* Early check when only holding the PT lock. 
*/ 1541 if (PageAnonExclusive(page)) 1542 goto reuse; 1543 1544 if (!folio_trylock(folio)) { 1545 folio_get(folio); 1546 spin_unlock(vmf->ptl); 1547 folio_lock(folio); 1548 spin_lock(vmf->ptl); 1549 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1550 spin_unlock(vmf->ptl); 1551 folio_unlock(folio); 1552 folio_put(folio); 1553 return 0; 1554 } 1555 folio_put(folio); 1556 } 1557 1558 /* Recheck after temporarily dropping the PT lock. */ 1559 if (PageAnonExclusive(page)) { 1560 folio_unlock(folio); 1561 goto reuse; 1562 } 1563 1564 /* 1565 * See do_wp_page(): we can only reuse the folio exclusively if 1566 * there are no additional references. Note that we always drain 1567 * the LRU cache immediately after adding a THP. 1568 */ 1569 if (folio_ref_count(folio) > 1570 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) 1571 goto unlock_fallback; 1572 if (folio_test_swapcache(folio)) 1573 folio_free_swap(folio); 1574 if (folio_ref_count(folio) == 1) { 1575 pmd_t entry; 1576 1577 folio_move_anon_rmap(folio, vma); 1578 SetPageAnonExclusive(page); 1579 folio_unlock(folio); 1580 reuse: 1581 if (unlikely(unshare)) { 1582 spin_unlock(vmf->ptl); 1583 return 0; 1584 } 1585 entry = pmd_mkyoung(orig_pmd); 1586 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1587 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1588 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1589 spin_unlock(vmf->ptl); 1590 return 0; 1591 } 1592 1593 unlock_fallback: 1594 folio_unlock(folio); 1595 spin_unlock(vmf->ptl); 1596 fallback: 1597 __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); 1598 return VM_FAULT_FALLBACK; 1599 } 1600 1601 static inline bool can_change_pmd_writable(struct vm_area_struct *vma, 1602 unsigned long addr, pmd_t pmd) 1603 { 1604 struct page *page; 1605 1606 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) 1607 return false; 1608 1609 /* Don't touch entries that are not even readable (NUMA hinting). */ 1610 if (pmd_protnone(pmd)) 1611 return false; 1612 1613 /* Do we need write faults for softdirty tracking? */ 1614 if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) 1615 return false; 1616 1617 /* Do we need write faults for uffd-wp tracking? */ 1618 if (userfaultfd_huge_pmd_wp(vma, pmd)) 1619 return false; 1620 1621 if (!(vma->vm_flags & VM_SHARED)) { 1622 /* See can_change_pte_writable(). */ 1623 page = vm_normal_page_pmd(vma, addr, pmd); 1624 return page && PageAnon(page) && PageAnonExclusive(page); 1625 } 1626 1627 /* See can_change_pte_writable(). */ 1628 return pmd_dirty(pmd); 1629 } 1630 1631 /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ 1632 static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, 1633 struct vm_area_struct *vma, 1634 unsigned int flags) 1635 { 1636 /* If the pmd is writable, we can write to the page. */ 1637 if (pmd_write(pmd)) 1638 return true; 1639 1640 /* Maybe FOLL_FORCE is set to override it? */ 1641 if (!(flags & FOLL_FORCE)) 1642 return false; 1643 1644 /* But FOLL_FORCE has no effect on shared mappings */ 1645 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) 1646 return false; 1647 1648 /* ... or read-only private ones */ 1649 if (!(vma->vm_flags & VM_MAYWRITE)) 1650 return false; 1651 1652 /* ... or already writable ones that just need to take a write fault */ 1653 if (vma->vm_flags & VM_WRITE) 1654 return false; 1655 1656 /* 1657 * See can_change_pte_writable(): we broke COW and could map the page 1658 * writable if we have an exclusive anonymous page ... 
1659 */ 1660 if (!page || !PageAnon(page) || !PageAnonExclusive(page)) 1661 return false; 1662 1663 /* ... and a write-fault isn't required for other reasons. */ 1664 if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) 1665 return false; 1666 return !userfaultfd_huge_pmd_wp(vma, pmd); 1667 } 1668 1669 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1670 unsigned long addr, 1671 pmd_t *pmd, 1672 unsigned int flags) 1673 { 1674 struct mm_struct *mm = vma->vm_mm; 1675 struct page *page; 1676 int ret; 1677 1678 assert_spin_locked(pmd_lockptr(mm, pmd)); 1679 1680 page = pmd_page(*pmd); 1681 VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); 1682 1683 if ((flags & FOLL_WRITE) && 1684 !can_follow_write_pmd(*pmd, page, vma, flags)) 1685 return NULL; 1686 1687 /* Avoid dumping huge zero page */ 1688 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1689 return ERR_PTR(-EFAULT); 1690 1691 if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) 1692 return NULL; 1693 1694 if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) 1695 return ERR_PTR(-EMLINK); 1696 1697 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && 1698 !PageAnonExclusive(page), page); 1699 1700 ret = try_grab_page(page, flags); 1701 if (ret) 1702 return ERR_PTR(ret); 1703 1704 if (flags & FOLL_TOUCH) 1705 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); 1706 1707 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1708 VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); 1709 1710 return page; 1711 } 1712 1713 /* NUMA hinting page fault entry point for trans huge pmds */ 1714 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) 1715 { 1716 struct vm_area_struct *vma = vmf->vma; 1717 pmd_t oldpmd = vmf->orig_pmd; 1718 pmd_t pmd; 1719 struct folio *folio; 1720 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1721 int nid = NUMA_NO_NODE; 1722 int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); 1723 bool migrated = false, writable = false; 1724 int flags = 0; 1725 1726 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1727 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { 1728 spin_unlock(vmf->ptl); 1729 goto out; 1730 } 1731 1732 pmd = pmd_modify(oldpmd, vma->vm_page_prot); 1733 1734 /* 1735 * Detect now whether the PMD could be writable; this information 1736 * is only valid while holding the PT lock. 1737 */ 1738 writable = pmd_write(pmd); 1739 if (!writable && vma_wants_manual_pte_write_upgrade(vma) && 1740 can_change_pmd_writable(vma, vmf->address, pmd)) 1741 writable = true; 1742 1743 folio = vm_normal_folio_pmd(vma, haddr, pmd); 1744 if (!folio) 1745 goto out_map; 1746 1747 /* See similar comment in do_numa_page for explanation */ 1748 if (!writable) 1749 flags |= TNF_NO_GROUP; 1750 1751 nid = folio_nid(folio); 1752 /* 1753 * For memory tiering mode, cpupid of slow memory page is used 1754 * to record page access time. So use default value. 
1755 */ 1756 if (node_is_toptier(nid)) 1757 last_cpupid = folio_last_cpupid(folio); 1758 target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags); 1759 if (target_nid == NUMA_NO_NODE) { 1760 folio_put(folio); 1761 goto out_map; 1762 } 1763 1764 spin_unlock(vmf->ptl); 1765 writable = false; 1766 1767 migrated = migrate_misplaced_folio(folio, vma, target_nid); 1768 if (migrated) { 1769 flags |= TNF_MIGRATED; 1770 nid = target_nid; 1771 } else { 1772 flags |= TNF_MIGRATE_FAIL; 1773 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1774 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { 1775 spin_unlock(vmf->ptl); 1776 goto out; 1777 } 1778 goto out_map; 1779 } 1780 1781 out: 1782 if (nid != NUMA_NO_NODE) 1783 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); 1784 1785 return 0; 1786 1787 out_map: 1788 /* Restore the PMD */ 1789 pmd = pmd_modify(oldpmd, vma->vm_page_prot); 1790 pmd = pmd_mkyoung(pmd); 1791 if (writable) 1792 pmd = pmd_mkwrite(pmd, vma); 1793 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); 1794 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1795 spin_unlock(vmf->ptl); 1796 goto out; 1797 } 1798 1799 /* 1800 * Return true if we do MADV_FREE successfully on entire pmd page. 1801 * Otherwise, return false. 1802 */ 1803 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1804 pmd_t *pmd, unsigned long addr, unsigned long next) 1805 { 1806 spinlock_t *ptl; 1807 pmd_t orig_pmd; 1808 struct folio *folio; 1809 struct mm_struct *mm = tlb->mm; 1810 bool ret = false; 1811 1812 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1813 1814 ptl = pmd_trans_huge_lock(pmd, vma); 1815 if (!ptl) 1816 goto out_unlocked; 1817 1818 orig_pmd = *pmd; 1819 if (is_huge_zero_pmd(orig_pmd)) 1820 goto out; 1821 1822 if (unlikely(!pmd_present(orig_pmd))) { 1823 VM_BUG_ON(thp_migration_supported() && 1824 !is_pmd_migration_entry(orig_pmd)); 1825 goto out; 1826 } 1827 1828 folio = pfn_folio(pmd_pfn(orig_pmd)); 1829 /* 1830 * If other processes are mapping this folio, we couldn't discard 1831 * the folio unless they all do MADV_FREE so let's skip the folio. 1832 */ 1833 if (folio_estimated_sharers(folio) != 1) 1834 goto out; 1835 1836 if (!folio_trylock(folio)) 1837 goto out; 1838 1839 /* 1840 * If user want to discard part-pages of THP, split it so MADV_FREE 1841 * will deactivate only them. 
1842 */ 1843 if (next - addr != HPAGE_PMD_SIZE) { 1844 folio_get(folio); 1845 spin_unlock(ptl); 1846 split_folio(folio); 1847 folio_unlock(folio); 1848 folio_put(folio); 1849 goto out_unlocked; 1850 } 1851 1852 if (folio_test_dirty(folio)) 1853 folio_clear_dirty(folio); 1854 folio_unlock(folio); 1855 1856 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 1857 pmdp_invalidate(vma, addr, pmd); 1858 orig_pmd = pmd_mkold(orig_pmd); 1859 orig_pmd = pmd_mkclean(orig_pmd); 1860 1861 set_pmd_at(mm, addr, pmd, orig_pmd); 1862 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1863 } 1864 1865 folio_mark_lazyfree(folio); 1866 ret = true; 1867 out: 1868 spin_unlock(ptl); 1869 out_unlocked: 1870 return ret; 1871 } 1872 1873 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 1874 { 1875 pgtable_t pgtable; 1876 1877 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1878 pte_free(mm, pgtable); 1879 mm_dec_nr_ptes(mm); 1880 } 1881 1882 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1883 pmd_t *pmd, unsigned long addr) 1884 { 1885 pmd_t orig_pmd; 1886 spinlock_t *ptl; 1887 1888 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1889 1890 ptl = __pmd_trans_huge_lock(pmd, vma); 1891 if (!ptl) 1892 return 0; 1893 /* 1894 * For architectures like ppc64 we look at deposited pgtable 1895 * when calling pmdp_huge_get_and_clear. So do the 1896 * pgtable_trans_huge_withdraw after finishing pmdp related 1897 * operations. 1898 */ 1899 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, 1900 tlb->fullmm); 1901 arch_check_zapped_pmd(vma, orig_pmd); 1902 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1903 if (vma_is_special_huge(vma)) { 1904 if (arch_needs_pgtable_deposit()) 1905 zap_deposited_table(tlb->mm, pmd); 1906 spin_unlock(ptl); 1907 } else if (is_huge_zero_pmd(orig_pmd)) { 1908 zap_deposited_table(tlb->mm, pmd); 1909 spin_unlock(ptl); 1910 } else { 1911 struct folio *folio = NULL; 1912 int flush_needed = 1; 1913 1914 if (pmd_present(orig_pmd)) { 1915 struct page *page = pmd_page(orig_pmd); 1916 1917 folio = page_folio(page); 1918 folio_remove_rmap_pmd(folio, page, vma); 1919 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1920 VM_BUG_ON_PAGE(!PageHead(page), page); 1921 } else if (thp_migration_supported()) { 1922 swp_entry_t entry; 1923 1924 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); 1925 entry = pmd_to_swp_entry(orig_pmd); 1926 folio = pfn_swap_entry_folio(entry); 1927 flush_needed = 0; 1928 } else 1929 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 1930 1931 if (folio_test_anon(folio)) { 1932 zap_deposited_table(tlb->mm, pmd); 1933 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1934 } else { 1935 if (arch_needs_pgtable_deposit()) 1936 zap_deposited_table(tlb->mm, pmd); 1937 add_mm_counter(tlb->mm, mm_counter_file(folio), 1938 -HPAGE_PMD_NR); 1939 } 1940 1941 spin_unlock(ptl); 1942 if (flush_needed) 1943 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); 1944 } 1945 return 1; 1946 } 1947 1948 #ifndef pmd_move_must_withdraw 1949 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 1950 spinlock_t *old_pmd_ptl, 1951 struct vm_area_struct *vma) 1952 { 1953 /* 1954 * With split pmd lock we also need to move preallocated 1955 * PTE page table if new_pmd is on different PMD page table. 1956 * 1957 * We also don't deposit and withdraw tables for file pages. 
1958 */ 1959 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 1960 } 1961 #endif 1962 1963 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 1964 { 1965 #ifdef CONFIG_MEM_SOFT_DIRTY 1966 if (unlikely(is_pmd_migration_entry(pmd))) 1967 pmd = pmd_swp_mksoft_dirty(pmd); 1968 else if (pmd_present(pmd)) 1969 pmd = pmd_mksoft_dirty(pmd); 1970 #endif 1971 return pmd; 1972 } 1973 1974 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 1975 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) 1976 { 1977 spinlock_t *old_ptl, *new_ptl; 1978 pmd_t pmd; 1979 struct mm_struct *mm = vma->vm_mm; 1980 bool force_flush = false; 1981 1982 /* 1983 * The destination pmd shouldn't be established, free_pgtables() 1984 * should have released it; but move_page_tables() might have already 1985 * inserted a page table, if racing against shmem/file collapse. 1986 */ 1987 if (!pmd_none(*new_pmd)) { 1988 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1989 return false; 1990 } 1991 1992 /* 1993 * We don't have to worry about the ordering of src and dst 1994 * ptlocks because exclusive mmap_lock prevents deadlock. 1995 */ 1996 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 1997 if (old_ptl) { 1998 new_ptl = pmd_lockptr(mm, new_pmd); 1999 if (new_ptl != old_ptl) 2000 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 2001 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 2002 if (pmd_present(pmd)) 2003 force_flush = true; 2004 VM_BUG_ON(!pmd_none(*new_pmd)); 2005 2006 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 2007 pgtable_t pgtable; 2008 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 2009 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 2010 } 2011 pmd = move_soft_dirty_pmd(pmd); 2012 set_pmd_at(mm, new_addr, new_pmd, pmd); 2013 if (force_flush) 2014 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 2015 if (new_ptl != old_ptl) 2016 spin_unlock(new_ptl); 2017 spin_unlock(old_ptl); 2018 return true; 2019 } 2020 return false; 2021 } 2022 2023 /* 2024 * Returns 2025 * - 0 if PMD could not be locked 2026 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary 2027 * or if prot_numa but THP migration is not supported 2028 * - HPAGE_PMD_NR if protections changed and TLB flush necessary 2029 */ 2030 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2031 pmd_t *pmd, unsigned long addr, pgprot_t newprot, 2032 unsigned long cp_flags) 2033 { 2034 struct mm_struct *mm = vma->vm_mm; 2035 spinlock_t *ptl; 2036 pmd_t oldpmd, entry; 2037 bool prot_numa = cp_flags & MM_CP_PROT_NUMA; 2038 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 2039 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 2040 int ret = 1; 2041 2042 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2043 2044 if (prot_numa && !thp_migration_supported()) 2045 return 1; 2046 2047 ptl = __pmd_trans_huge_lock(pmd, vma); 2048 if (!ptl) 2049 return 0; 2050 2051 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2052 if (is_swap_pmd(*pmd)) { 2053 swp_entry_t entry = pmd_to_swp_entry(*pmd); 2054 struct folio *folio = pfn_swap_entry_folio(entry); 2055 pmd_t newpmd; 2056 2057 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 2058 if (is_writable_migration_entry(entry)) { 2059 /* 2060 * A protection check is difficult so 2061 * just be safe and disable write 2062 */ 2063 if (folio_test_anon(folio)) 2064 entry = make_readable_exclusive_migration_entry(swp_offset(entry)); 2065 else 2066 entry = make_readable_migration_entry(swp_offset(entry)); 2067 newpmd = swp_entry_to_pmd(entry); 2068 if (pmd_swp_soft_dirty(*pmd)) 2069 
newpmd = pmd_swp_mksoft_dirty(newpmd); 2070 } else { 2071 newpmd = *pmd; 2072 } 2073 2074 if (uffd_wp) 2075 newpmd = pmd_swp_mkuffd_wp(newpmd); 2076 else if (uffd_wp_resolve) 2077 newpmd = pmd_swp_clear_uffd_wp(newpmd); 2078 if (!pmd_same(*pmd, newpmd)) 2079 set_pmd_at(mm, addr, pmd, newpmd); 2080 goto unlock; 2081 } 2082 #endif 2083 2084 if (prot_numa) { 2085 struct folio *folio; 2086 bool toptier; 2087 /* 2088 * Avoid trapping faults against the zero page. The read-only 2089 * data is likely to be read-cached on the local CPU and 2090 * local/remote hits to the zero page are not interesting. 2091 */ 2092 if (is_huge_zero_pmd(*pmd)) 2093 goto unlock; 2094 2095 if (pmd_protnone(*pmd)) 2096 goto unlock; 2097 2098 folio = page_folio(pmd_page(*pmd)); 2099 toptier = node_is_toptier(folio_nid(folio)); 2100 /* 2101 * Skip scanning top tier node if normal numa 2102 * balancing is disabled 2103 */ 2104 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 2105 toptier) 2106 goto unlock; 2107 2108 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && 2109 !toptier) 2110 folio_xchg_access_time(folio, 2111 jiffies_to_msecs(jiffies)); 2112 } 2113 /* 2114 * In case prot_numa, we are under mmap_read_lock(mm). It's critical 2115 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 2116 * which is also under mmap_read_lock(mm): 2117 * 2118 * CPU0: CPU1: 2119 * change_huge_pmd(prot_numa=1) 2120 * pmdp_huge_get_and_clear_notify() 2121 * madvise_dontneed() 2122 * zap_pmd_range() 2123 * pmd_trans_huge(*pmd) == 0 (without ptl) 2124 * // skip the pmd 2125 * set_pmd_at(); 2126 * // pmd is re-established 2127 * 2128 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 2129 * which may break userspace. 2130 * 2131 * pmdp_invalidate_ad() is required to make sure we don't miss 2132 * dirty/young flags set by hardware. 2133 */ 2134 oldpmd = pmdp_invalidate_ad(vma, addr, pmd); 2135 2136 entry = pmd_modify(oldpmd, newprot); 2137 if (uffd_wp) 2138 entry = pmd_mkuffd_wp(entry); 2139 else if (uffd_wp_resolve) 2140 /* 2141 * Leave the write bit to be handled by PF interrupt 2142 * handler, then things like COW could be properly 2143 * handled. 2144 */ 2145 entry = pmd_clear_uffd_wp(entry); 2146 2147 /* See change_pte_range(). */ 2148 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && 2149 can_change_pmd_writable(vma, addr, entry)) 2150 entry = pmd_mkwrite(entry, vma); 2151 2152 ret = HPAGE_PMD_NR; 2153 set_pmd_at(mm, addr, pmd, entry); 2154 2155 if (huge_pmd_needs_flush(oldpmd, entry)) 2156 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); 2157 unlock: 2158 spin_unlock(ptl); 2159 return ret; 2160 } 2161 2162 #ifdef CONFIG_USERFAULTFD 2163 /* 2164 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by 2165 * the caller, but it must return after releasing the page_table_lock. 2166 * Just move the page from src_pmd to dst_pmd if possible. 2167 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be 2168 * repeated by the caller, or other errors in case of failure. 
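 *
 * For an ordinary anonymous folio the folio lock, the anon_vma write
 * lock and both page table locks are taken before the source PMD is
 * cleared and a new PMD is installed at the destination; for the huge
 * zero page only the page table locks are needed.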
2169 */ 2170 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 2171 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 2172 unsigned long dst_addr, unsigned long src_addr) 2173 { 2174 pmd_t _dst_pmd, src_pmdval; 2175 struct page *src_page; 2176 struct folio *src_folio; 2177 struct anon_vma *src_anon_vma; 2178 spinlock_t *src_ptl, *dst_ptl; 2179 pgtable_t src_pgtable; 2180 struct mmu_notifier_range range; 2181 int err = 0; 2182 2183 src_pmdval = *src_pmd; 2184 src_ptl = pmd_lockptr(mm, src_pmd); 2185 2186 lockdep_assert_held(src_ptl); 2187 vma_assert_locked(src_vma); 2188 vma_assert_locked(dst_vma); 2189 2190 /* Sanity checks before the operation */ 2191 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || 2192 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { 2193 spin_unlock(src_ptl); 2194 return -EINVAL; 2195 } 2196 2197 if (!pmd_trans_huge(src_pmdval)) { 2198 spin_unlock(src_ptl); 2199 if (is_pmd_migration_entry(src_pmdval)) { 2200 pmd_migration_entry_wait(mm, &src_pmdval); 2201 return -EAGAIN; 2202 } 2203 return -ENOENT; 2204 } 2205 2206 src_page = pmd_page(src_pmdval); 2207 2208 if (!is_huge_zero_pmd(src_pmdval)) { 2209 if (unlikely(!PageAnonExclusive(src_page))) { 2210 spin_unlock(src_ptl); 2211 return -EBUSY; 2212 } 2213 2214 src_folio = page_folio(src_page); 2215 folio_get(src_folio); 2216 } else 2217 src_folio = NULL; 2218 2219 spin_unlock(src_ptl); 2220 2221 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); 2222 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, 2223 src_addr + HPAGE_PMD_SIZE); 2224 mmu_notifier_invalidate_range_start(&range); 2225 2226 if (src_folio) { 2227 folio_lock(src_folio); 2228 2229 /* 2230 * split_huge_page walks the anon_vma chain without the page 2231 * lock. Serialize against it with the anon_vma lock, the page 2232 * lock is not enough. 2233 */ 2234 src_anon_vma = folio_get_anon_vma(src_folio); 2235 if (!src_anon_vma) { 2236 err = -EAGAIN; 2237 goto unlock_folio; 2238 } 2239 anon_vma_lock_write(src_anon_vma); 2240 } else 2241 src_anon_vma = NULL; 2242 2243 dst_ptl = pmd_lockptr(mm, dst_pmd); 2244 double_pt_lock(src_ptl, dst_ptl); 2245 if (unlikely(!pmd_same(*src_pmd, src_pmdval) || 2246 !pmd_same(*dst_pmd, dst_pmdval))) { 2247 err = -EAGAIN; 2248 goto unlock_ptls; 2249 } 2250 if (src_folio) { 2251 if (folio_maybe_dma_pinned(src_folio) || 2252 !PageAnonExclusive(&src_folio->page)) { 2253 err = -EBUSY; 2254 goto unlock_ptls; 2255 } 2256 2257 if (WARN_ON_ONCE(!folio_test_head(src_folio)) || 2258 WARN_ON_ONCE(!folio_test_anon(src_folio))) { 2259 err = -EBUSY; 2260 goto unlock_ptls; 2261 } 2262 2263 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2264 /* Folio got pinned from under us. Put it back and fail the move. 
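 * The pin may have been taken by GUP-fast after our earlier check but
 * before the PMD was cleared and flushed; once the PMD is gone GUP-fast
 * can no longer succeed, so this recheck is final.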
*/ 2265 if (folio_maybe_dma_pinned(src_folio)) { 2266 set_pmd_at(mm, src_addr, src_pmd, src_pmdval); 2267 err = -EBUSY; 2268 goto unlock_ptls; 2269 } 2270 2271 folio_move_anon_rmap(src_folio, dst_vma); 2272 WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); 2273 2274 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); 2275 /* Follow mremap() behavior and treat the entry dirty after the move */ 2276 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); 2277 } else { 2278 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2279 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); 2280 } 2281 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); 2282 2283 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); 2284 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2285 unlock_ptls: 2286 double_pt_unlock(src_ptl, dst_ptl); 2287 if (src_anon_vma) { 2288 anon_vma_unlock_write(src_anon_vma); 2289 put_anon_vma(src_anon_vma); 2290 } 2291 unlock_folio: 2292 /* unblock rmap walks */ 2293 if (src_folio) 2294 folio_unlock(src_folio); 2295 mmu_notifier_invalidate_range_end(&range); 2296 if (src_folio) 2297 folio_put(src_folio); 2298 return err; 2299 } 2300 #endif /* CONFIG_USERFAULTFD */ 2301 2302 /* 2303 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2304 * 2305 * Note that if it returns page table lock pointer, this routine returns without 2306 * unlocking page table lock. So callers must unlock it. 2307 */ 2308 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2309 { 2310 spinlock_t *ptl; 2311 ptl = pmd_lock(vma->vm_mm, pmd); 2312 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2313 pmd_devmap(*pmd))) 2314 return ptl; 2315 spin_unlock(ptl); 2316 return NULL; 2317 } 2318 2319 /* 2320 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 2321 * 2322 * Note that if it returns page table lock pointer, this routine returns without 2323 * unlocking page table lock. So callers must unlock it. 
2324 */ 2325 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2326 { 2327 spinlock_t *ptl; 2328 2329 ptl = pud_lock(vma->vm_mm, pud); 2330 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2331 return ptl; 2332 spin_unlock(ptl); 2333 return NULL; 2334 } 2335 2336 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2337 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2338 pud_t *pud, unsigned long addr) 2339 { 2340 spinlock_t *ptl; 2341 2342 ptl = __pud_trans_huge_lock(pud, vma); 2343 if (!ptl) 2344 return 0; 2345 2346 pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 2347 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2348 if (vma_is_special_huge(vma)) { 2349 spin_unlock(ptl); 2350 /* No zero page support yet */ 2351 } else { 2352 /* No support for anonymous PUD pages yet */ 2353 BUG(); 2354 } 2355 return 1; 2356 } 2357 2358 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2359 unsigned long haddr) 2360 { 2361 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2362 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2363 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2364 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2365 2366 count_vm_event(THP_SPLIT_PUD); 2367 2368 pudp_huge_clear_flush(vma, haddr, pud); 2369 } 2370 2371 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2372 unsigned long address) 2373 { 2374 spinlock_t *ptl; 2375 struct mmu_notifier_range range; 2376 2377 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2378 address & HPAGE_PUD_MASK, 2379 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2380 mmu_notifier_invalidate_range_start(&range); 2381 ptl = pud_lock(vma->vm_mm, pud); 2382 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2383 goto out; 2384 __split_huge_pud_locked(vma, pud, range.start); 2385 2386 out: 2387 spin_unlock(ptl); 2388 mmu_notifier_invalidate_range_end(&range); 2389 } 2390 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2391 2392 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2393 unsigned long haddr, pmd_t *pmd) 2394 { 2395 struct mm_struct *mm = vma->vm_mm; 2396 pgtable_t pgtable; 2397 pmd_t _pmd, old_pmd; 2398 unsigned long addr; 2399 pte_t *pte; 2400 int i; 2401 2402 /* 2403 * Leave pmd empty until pte is filled note that it is fine to delay 2404 * notification until mmu_notifier_invalidate_range_end() as we are 2405 * replacing a zero pmd write protected page with a zero pte write 2406 * protected page. 
2407 * 2408 * See Documentation/mm/mmu_notifier.rst 2409 */ 2410 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2411 2412 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2413 pmd_populate(mm, &_pmd, pgtable); 2414 2415 pte = pte_offset_map(&_pmd, haddr); 2416 VM_BUG_ON(!pte); 2417 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2418 pte_t entry; 2419 2420 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); 2421 entry = pte_mkspecial(entry); 2422 if (pmd_uffd_wp(old_pmd)) 2423 entry = pte_mkuffd_wp(entry); 2424 VM_BUG_ON(!pte_none(ptep_get(pte))); 2425 set_pte_at(mm, addr, pte, entry); 2426 pte++; 2427 } 2428 pte_unmap(pte - 1); 2429 smp_wmb(); /* make pte visible before pmd */ 2430 pmd_populate(mm, pmd, pgtable); 2431 } 2432 2433 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2434 unsigned long haddr, bool freeze) 2435 { 2436 struct mm_struct *mm = vma->vm_mm; 2437 struct folio *folio; 2438 struct page *page; 2439 pgtable_t pgtable; 2440 pmd_t old_pmd, _pmd; 2441 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; 2442 bool anon_exclusive = false, dirty = false; 2443 unsigned long addr; 2444 pte_t *pte; 2445 int i; 2446 2447 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2448 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2449 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2450 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2451 && !pmd_devmap(*pmd)); 2452 2453 count_vm_event(THP_SPLIT_PMD); 2454 2455 if (!vma_is_anonymous(vma)) { 2456 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2457 /* 2458 * We are going to unmap this huge page. So 2459 * just go ahead and zap it 2460 */ 2461 if (arch_needs_pgtable_deposit()) 2462 zap_deposited_table(mm, pmd); 2463 if (vma_is_special_huge(vma)) 2464 return; 2465 if (unlikely(is_pmd_migration_entry(old_pmd))) { 2466 swp_entry_t entry; 2467 2468 entry = pmd_to_swp_entry(old_pmd); 2469 folio = pfn_swap_entry_folio(entry); 2470 } else { 2471 page = pmd_page(old_pmd); 2472 folio = page_folio(page); 2473 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) 2474 folio_mark_dirty(folio); 2475 if (!folio_test_referenced(folio) && pmd_young(old_pmd)) 2476 folio_set_referenced(folio); 2477 folio_remove_rmap_pmd(folio, page, vma); 2478 folio_put(folio); 2479 } 2480 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 2481 return; 2482 } 2483 2484 if (is_huge_zero_pmd(*pmd)) { 2485 /* 2486 * FIXME: Do we want to invalidate secondary mmu by calling 2487 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below 2488 * inside __split_huge_pmd() ? 2489 * 2490 * We are going from a zero huge page write protected to zero 2491 * small page also write protected so it does not seems useful 2492 * to invalidate secondary mmu at this time. 2493 */ 2494 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2495 } 2496 2497 /* 2498 * Up to this point the pmd is present and huge and userland has the 2499 * whole access to the hugepage during the split (which happens in 2500 * place). If we overwrite the pmd with the not-huge version pointing 2501 * to the pte here (which of course we could if all CPUs were bug 2502 * free), userland could trigger a small page size TLB miss on the 2503 * small sized TLB while the hugepage TLB entry is still established in 2504 * the huge TLB. Some CPU doesn't like that. 2505 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum 2506 * 383 on page 105. 
Intel should be safe but it also warns that it's 2507 * only safe if the permission and cache attributes of the two entries 2508 * loaded in the two TLBs are identical (which should be the case here). 2509 * But it is generally safer to never allow small and huge TLB entries 2510 * for the same virtual address to be loaded simultaneously. So instead 2511 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the 2512 * current pmd notpresent (atomically because here the pmd_trans_huge 2513 * must remain set at all times on the pmd until the split is complete 2514 * for this pmd), then we flush the SMP TLB and finally we write the 2515 * non-huge version of the pmd entry with pmd_populate. 2516 */ 2517 old_pmd = pmdp_invalidate(vma, haddr, pmd); 2518 2519 pmd_migration = is_pmd_migration_entry(old_pmd); 2520 if (unlikely(pmd_migration)) { 2521 swp_entry_t entry; 2522 2523 entry = pmd_to_swp_entry(old_pmd); 2524 page = pfn_swap_entry_to_page(entry); 2525 write = is_writable_migration_entry(entry); 2526 if (PageAnon(page)) 2527 anon_exclusive = is_readable_exclusive_migration_entry(entry); 2528 young = is_migration_entry_young(entry); 2529 dirty = is_migration_entry_dirty(entry); 2530 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2531 uffd_wp = pmd_swp_uffd_wp(old_pmd); 2532 } else { 2533 page = pmd_page(old_pmd); 2534 folio = page_folio(page); 2535 if (pmd_dirty(old_pmd)) { 2536 dirty = true; 2537 folio_set_dirty(folio); 2538 } 2539 write = pmd_write(old_pmd); 2540 young = pmd_young(old_pmd); 2541 soft_dirty = pmd_soft_dirty(old_pmd); 2542 uffd_wp = pmd_uffd_wp(old_pmd); 2543 2544 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); 2545 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 2546 2547 /* 2548 * Without "freeze", we'll simply split the PMD, propagating the 2549 * PageAnonExclusive() flag for each PTE by setting it for 2550 * each subpage -- no need to (temporarily) clear. 2551 * 2552 * With "freeze" we want to replace mapped pages by 2553 * migration entries right away. This is only possible if we 2554 * managed to clear PageAnonExclusive() -- see 2555 * set_pmd_migration_entry(). 2556 * 2557 * In case we cannot clear PageAnonExclusive(), split the PMD 2558 * only and let try_to_migrate_one() fail later. 2559 * 2560 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 2561 */ 2562 anon_exclusive = PageAnonExclusive(page); 2563 if (freeze && anon_exclusive && 2564 folio_try_share_anon_rmap_pmd(folio, page)) 2565 freeze = false; 2566 if (!freeze) { 2567 rmap_t rmap_flags = RMAP_NONE; 2568 2569 folio_ref_add(folio, HPAGE_PMD_NR - 1); 2570 if (anon_exclusive) 2571 rmap_flags |= RMAP_EXCLUSIVE; 2572 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, 2573 vma, haddr, rmap_flags); 2574 } 2575 } 2576 2577 /* 2578 * Withdraw the table only after we mark the pmd entry invalid. 2579 * This is critical for some architectures (Power). 2580 */ 2581 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2582 pmd_populate(mm, &_pmd, pgtable); 2583 2584 pte = pte_offset_map(&_pmd, haddr); 2585 VM_BUG_ON(!pte); 2586 2587 /* 2588 * Note that NUMA hinting access restrictions are not transferred to 2589 * avoid any possibility of altering permissions across VMAs.
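 *
 * Each PTE below is filled either with a migration entry (when freezing
 * or when the PMD already was one), carrying over the write, young,
 * dirty, soft-dirty and uffd-wp bits, or with a present PTE installed
 * for all subpages at once via set_ptes().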
2590 */ 2591 if (freeze || pmd_migration) { 2592 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2593 pte_t entry; 2594 swp_entry_t swp_entry; 2595 2596 if (write) 2597 swp_entry = make_writable_migration_entry( 2598 page_to_pfn(page + i)); 2599 else if (anon_exclusive) 2600 swp_entry = make_readable_exclusive_migration_entry( 2601 page_to_pfn(page + i)); 2602 else 2603 swp_entry = make_readable_migration_entry( 2604 page_to_pfn(page + i)); 2605 if (young) 2606 swp_entry = make_migration_entry_young(swp_entry); 2607 if (dirty) 2608 swp_entry = make_migration_entry_dirty(swp_entry); 2609 entry = swp_entry_to_pte(swp_entry); 2610 if (soft_dirty) 2611 entry = pte_swp_mksoft_dirty(entry); 2612 if (uffd_wp) 2613 entry = pte_swp_mkuffd_wp(entry); 2614 2615 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2616 set_pte_at(mm, addr, pte + i, entry); 2617 } 2618 } else { 2619 pte_t entry; 2620 2621 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); 2622 if (write) 2623 entry = pte_mkwrite(entry, vma); 2624 if (!young) 2625 entry = pte_mkold(entry); 2626 /* NOTE: this may set soft-dirty too on some archs */ 2627 if (dirty) 2628 entry = pte_mkdirty(entry); 2629 if (soft_dirty) 2630 entry = pte_mksoft_dirty(entry); 2631 if (uffd_wp) 2632 entry = pte_mkuffd_wp(entry); 2633 2634 for (i = 0; i < HPAGE_PMD_NR; i++) 2635 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2636 2637 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); 2638 } 2639 pte_unmap(pte); 2640 2641 if (!pmd_migration) 2642 folio_remove_rmap_pmd(folio, page, vma); 2643 if (freeze) 2644 put_page(page); 2645 2646 smp_wmb(); /* make pte visible before pmd */ 2647 pmd_populate(mm, pmd, pgtable); 2648 } 2649 2650 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2651 unsigned long address, bool freeze, struct folio *folio) 2652 { 2653 spinlock_t *ptl; 2654 struct mmu_notifier_range range; 2655 2656 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2657 address & HPAGE_PMD_MASK, 2658 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2659 mmu_notifier_invalidate_range_start(&range); 2660 ptl = pmd_lock(vma->vm_mm, pmd); 2661 2662 /* 2663 * If caller asks to setup a migration entry, we need a folio to check 2664 * pmd against. Otherwise we can end up replacing wrong folio. 2665 */ 2666 VM_BUG_ON(freeze && !folio); 2667 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); 2668 2669 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || 2670 is_pmd_migration_entry(*pmd)) { 2671 /* 2672 * It's safe to call pmd_page when folio is set because it's 2673 * guaranteed that pmd is present. 2674 */ 2675 if (folio && folio != page_folio(pmd_page(*pmd))) 2676 goto out; 2677 __split_huge_pmd_locked(vma, pmd, range.start, freeze); 2678 } 2679 2680 out: 2681 spin_unlock(ptl); 2682 mmu_notifier_invalidate_range_end(&range); 2683 } 2684 2685 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2686 bool freeze, struct folio *folio) 2687 { 2688 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); 2689 2690 if (!pmd) 2691 return; 2692 2693 __split_huge_pmd(vma, pmd, address, freeze, folio); 2694 } 2695 2696 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) 2697 { 2698 /* 2699 * If the new address isn't hpage aligned and it could previously 2700 * contain an hugepage: check if we need to split an huge pmd. 
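 * An aligned address cannot cut a huge mapping in half, and if the
 * aligned range does not fit inside the VMA no huge pmd can be mapped
 * there, so neither case needs a split.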
2701 */ 2702 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && 2703 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), 2704 ALIGN(address, HPAGE_PMD_SIZE))) 2705 split_huge_pmd_address(vma, address, false, NULL); 2706 } 2707 2708 void vma_adjust_trans_huge(struct vm_area_struct *vma, 2709 unsigned long start, 2710 unsigned long end, 2711 long adjust_next) 2712 { 2713 /* Check if we need to split start first. */ 2714 split_huge_pmd_if_needed(vma, start); 2715 2716 /* Check if we need to split end next. */ 2717 split_huge_pmd_if_needed(vma, end); 2718 2719 /* 2720 * If we're also updating the next vma vm_start, 2721 * check if we need to split it. 2722 */ 2723 if (adjust_next > 0) { 2724 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); 2725 unsigned long nstart = next->vm_start; 2726 nstart += adjust_next; 2727 split_huge_pmd_if_needed(next, nstart); 2728 } 2729 } 2730 2731 static void unmap_folio(struct folio *folio) 2732 { 2733 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | 2734 TTU_BATCH_FLUSH; 2735 2736 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 2737 2738 if (folio_test_pmd_mappable(folio)) 2739 ttu_flags |= TTU_SPLIT_HUGE_PMD; 2740 2741 /* 2742 * Anon pages need migration entries to preserve them, but file 2743 * pages can simply be left unmapped, then faulted back on demand. 2744 * If that is ever changed (perhaps for mlock), update remap_page(). 2745 */ 2746 if (folio_test_anon(folio)) 2747 try_to_migrate(folio, ttu_flags); 2748 else 2749 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); 2750 2751 try_to_unmap_flush(); 2752 } 2753 2754 static void remap_page(struct folio *folio, unsigned long nr) 2755 { 2756 int i = 0; 2757 2758 /* If unmap_folio() uses try_to_migrate() on file, remove this check */ 2759 if (!folio_test_anon(folio)) 2760 return; 2761 for (;;) { 2762 remove_migration_ptes(folio, folio, true); 2763 i += folio_nr_pages(folio); 2764 if (i >= nr) 2765 break; 2766 folio = folio_next(folio); 2767 } 2768 } 2769 2770 static void lru_add_page_tail(struct page *head, struct page *tail, 2771 struct lruvec *lruvec, struct list_head *list) 2772 { 2773 VM_BUG_ON_PAGE(!PageHead(head), head); 2774 VM_BUG_ON_PAGE(PageLRU(tail), head); 2775 lockdep_assert_held(&lruvec->lru_lock); 2776 2777 if (list) { 2778 /* page reclaim is reclaiming a huge page */ 2779 VM_WARN_ON(PageLRU(head)); 2780 get_page(tail); 2781 list_add_tail(&tail->lru, list); 2782 } else { 2783 /* head is still on lru (and we have it frozen) */ 2784 VM_WARN_ON(!PageLRU(head)); 2785 if (PageUnevictable(tail)) 2786 tail->mlock_count = 0; 2787 else 2788 list_add_tail(&tail->lru, &head->lru); 2789 SetPageLRU(tail); 2790 } 2791 } 2792 2793 static void __split_huge_page_tail(struct folio *folio, int tail, 2794 struct lruvec *lruvec, struct list_head *list, 2795 unsigned int new_order) 2796 { 2797 struct page *head = &folio->page; 2798 struct page *page_tail = head + tail; 2799 /* 2800 * Careful: new_folio is not a "real" folio before we cleared PageTail. 2801 * Don't pass it around before clear_compound_head(). 2802 */ 2803 struct folio *new_folio = (struct folio *)page_tail; 2804 2805 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 2806 2807 /* 2808 * Clone page flags before unfreezing refcount. 2809 * 2810 * After successful get_page_unless_zero() might follow flags change, 2811 * for example lock_page() which set PG_waiters. 
2812 * 2813 * Note that for mapped sub-pages of an anonymous THP, 2814 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in 2815 * the migration entry instead from where remap_page() will restore it. 2816 * We can still have PG_anon_exclusive set on effectively unmapped and 2817 * unreferenced sub-pages of an anonymous THP: we can simply drop 2818 * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 2819 */ 2820 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 2821 page_tail->flags |= (head->flags & 2822 ((1L << PG_referenced) | 2823 (1L << PG_swapbacked) | 2824 (1L << PG_swapcache) | 2825 (1L << PG_mlocked) | 2826 (1L << PG_uptodate) | 2827 (1L << PG_active) | 2828 (1L << PG_workingset) | 2829 (1L << PG_locked) | 2830 (1L << PG_unevictable) | 2831 #ifdef CONFIG_ARCH_USES_PG_ARCH_X 2832 (1L << PG_arch_2) | 2833 (1L << PG_arch_3) | 2834 #endif 2835 (1L << PG_dirty) | 2836 LRU_GEN_MASK | LRU_REFS_MASK)); 2837 2838 /* ->mapping in first and second tail page is replaced by other uses */ 2839 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 2840 page_tail); 2841 page_tail->mapping = head->mapping; 2842 page_tail->index = head->index + tail; 2843 2844 /* 2845 * page->private should not be set in tail pages. Fix up and warn once 2846 * if private is unexpectedly set. 2847 */ 2848 if (unlikely(page_tail->private)) { 2849 VM_WARN_ON_ONCE_PAGE(true, page_tail); 2850 page_tail->private = 0; 2851 } 2852 if (folio_test_swapcache(folio)) 2853 new_folio->swap.val = folio->swap.val + tail; 2854 2855 /* Page flags must be visible before we make the page non-compound. */ 2856 smp_wmb(); 2857 2858 /* 2859 * Clear PageTail before unfreezing page refcount. 2860 * 2861 * After successful get_page_unless_zero() might follow put_page() 2862 * which needs correct compound_head(). 2863 */ 2864 clear_compound_head(page_tail); 2865 if (new_order) { 2866 prep_compound_page(page_tail, new_order); 2867 folio_prep_large_rmappable(new_folio); 2868 } 2869 2870 /* Finally unfreeze refcount. Additional reference from page cache. */ 2871 page_ref_unfreeze(page_tail, 2872 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? 2873 folio_nr_pages(new_folio) : 0)); 2874 2875 if (folio_test_young(folio)) 2876 folio_set_young(new_folio); 2877 if (folio_test_idle(folio)) 2878 folio_set_idle(new_folio); 2879 2880 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); 2881 2882 /* 2883 * always add to the tail because some iterators expect new 2884 * pages to show after the currently processed elements - e.g. 
2885 * migrate_pages 2886 */ 2887 lru_add_page_tail(head, page_tail, lruvec, list); 2888 } 2889 2890 static void __split_huge_page(struct page *page, struct list_head *list, 2891 pgoff_t end, unsigned int new_order) 2892 { 2893 struct folio *folio = page_folio(page); 2894 struct page *head = &folio->page; 2895 struct lruvec *lruvec; 2896 struct address_space *swap_cache = NULL; 2897 unsigned long offset = 0; 2898 int i, nr_dropped = 0; 2899 unsigned int new_nr = 1 << new_order; 2900 int order = folio_order(folio); 2901 unsigned int nr = 1 << order; 2902 2903 /* complete memcg works before add pages to LRU */ 2904 split_page_memcg(head, order, new_order); 2905 2906 if (folio_test_anon(folio) && folio_test_swapcache(folio)) { 2907 offset = swp_offset(folio->swap); 2908 swap_cache = swap_address_space(folio->swap); 2909 xa_lock(&swap_cache->i_pages); 2910 } 2911 2912 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 2913 lruvec = folio_lruvec_lock(folio); 2914 2915 ClearPageHasHWPoisoned(head); 2916 2917 for (i = nr - new_nr; i >= new_nr; i -= new_nr) { 2918 __split_huge_page_tail(folio, i, lruvec, list, new_order); 2919 /* Some pages can be beyond EOF: drop them from page cache */ 2920 if (head[i].index >= end) { 2921 struct folio *tail = page_folio(head + i); 2922 2923 if (shmem_mapping(folio->mapping)) 2924 nr_dropped++; 2925 else if (folio_test_clear_dirty(tail)) 2926 folio_account_cleaned(tail, 2927 inode_to_wb(folio->mapping->host)); 2928 __filemap_remove_folio(tail, NULL); 2929 folio_put(tail); 2930 } else if (!PageAnon(page)) { 2931 __xa_store(&folio->mapping->i_pages, head[i].index, 2932 head + i, 0); 2933 } else if (swap_cache) { 2934 __xa_store(&swap_cache->i_pages, offset + i, 2935 head + i, 0); 2936 } 2937 } 2938 2939 if (!new_order) 2940 ClearPageCompound(head); 2941 else { 2942 struct folio *new_folio = (struct folio *)head; 2943 2944 folio_set_order(new_folio, new_order); 2945 } 2946 unlock_page_lruvec(lruvec); 2947 /* Caller disabled irqs, so they are still disabled here */ 2948 2949 split_page_owner(head, order, new_order); 2950 pgalloc_tag_split(head, 1 << order); 2951 2952 /* See comment in __split_huge_page_tail() */ 2953 if (folio_test_anon(folio)) { 2954 /* Additional pin to swap cache */ 2955 if (folio_test_swapcache(folio)) { 2956 folio_ref_add(folio, 1 + new_nr); 2957 xa_unlock(&swap_cache->i_pages); 2958 } else { 2959 folio_ref_inc(folio); 2960 } 2961 } else { 2962 /* Additional pin to page cache */ 2963 folio_ref_add(folio, 1 + new_nr); 2964 xa_unlock(&folio->mapping->i_pages); 2965 } 2966 local_irq_enable(); 2967 2968 if (nr_dropped) 2969 shmem_uncharge(folio->mapping->host, nr_dropped); 2970 remap_page(folio, nr); 2971 2972 if (folio_test_swapcache(folio)) 2973 split_swap_cluster(folio->swap); 2974 2975 /* 2976 * set page to its compound_head when split to non order-0 pages, so 2977 * we can skip unlocking it below, since PG_locked is transferred to 2978 * the compound_head of the page and the caller will unlock it. 2979 */ 2980 if (new_order) 2981 page = compound_head(page); 2982 2983 for (i = 0; i < nr; i += new_nr) { 2984 struct page *subpage = head + i; 2985 struct folio *new_folio = page_folio(subpage); 2986 if (subpage == page) 2987 continue; 2988 folio_unlock(new_folio); 2989 2990 /* 2991 * Subpages may be freed if there wasn't any mapping 2992 * like if add_to_swap() is running on a lru page that 2993 * had its mapping zapped. 
And freeing these pages 2994 requires taking the lru_lock, so we do the put_page 2995 of the tail pages after the split is complete. 2996 */ 2997 free_page_and_swap_cache(subpage); 2998 } 2999 } 3000 3001 /* Racy check whether the huge page can be split */ 3002 bool can_split_folio(struct folio *folio, int *pextra_pins) 3003 { 3004 int extra_pins; 3005 3006 /* Additional pins from page cache */ 3007 if (folio_test_anon(folio)) 3008 extra_pins = folio_test_swapcache(folio) ? 3009 folio_nr_pages(folio) : 0; 3010 else 3011 extra_pins = folio_nr_pages(folio); 3012 if (pextra_pins) 3013 *pextra_pins = extra_pins; 3014 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; 3015 } 3016 3017 /* 3018 * This function splits a huge page into pages of @new_order. @page can point to 3019 * any subpage of the huge page to split. The split doesn't change the position of 3020 * @page. 3021 * 3022 * NOTE: order-1 anonymous folios are not supported because _deferred_list, 3023 * which is used by partially mapped folios, is stored in subpage 2 and an 3024 * order-1 folio only has subpages 0 and 1. File-backed order-1 folios are OK, 3025 * since they do not use _deferred_list. 3026 * 3027 * Only the caller must hold a pin on the @page, otherwise the split fails with -EBUSY. 3028 * The huge page must be locked. 3029 * 3030 * If @list is null, tail pages will be added to the LRU list, otherwise to @list. 3031 * 3032 * Pages of @new_order will inherit the mapping, flags, and so on from the hugepage. 3033 * 3034 * The GUP pin and PG_locked are transferred to @page or the compound page @page belongs 3035 * to. The remaining subpages can be freed if they are not mapped. 3036 * 3037 * Returns 0 if the hugepage is split successfully. 3038 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under 3039 * us.
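 * Returns -EINVAL if @new_order is not smaller than the current order,
 * or if the folio cannot be split to the requested @new_order (order-1
 * anonymous, swap-cached, shmem, or a mapping without large folio
 * support).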
3040 */ 3041 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 3042 unsigned int new_order) 3043 { 3044 struct folio *folio = page_folio(page); 3045 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3046 /* reset xarray order to new order after split */ 3047 XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); 3048 struct anon_vma *anon_vma = NULL; 3049 struct address_space *mapping = NULL; 3050 int extra_pins, ret; 3051 pgoff_t end; 3052 bool is_hzp; 3053 3054 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 3055 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3056 3057 if (new_order >= folio_order(folio)) 3058 return -EINVAL; 3059 3060 /* Cannot split anonymous THP to order-1 */ 3061 if (new_order == 1 && folio_test_anon(folio)) { 3062 VM_WARN_ONCE(1, "Cannot split to order-1 folio"); 3063 return -EINVAL; 3064 } 3065 3066 if (new_order) { 3067 /* Only swapping a whole PMD-mapped folio is supported */ 3068 if (folio_test_swapcache(folio)) 3069 return -EINVAL; 3070 /* Split shmem folio to non-zero order not supported */ 3071 if (shmem_mapping(folio->mapping)) { 3072 VM_WARN_ONCE(1, 3073 "Cannot split shmem folio to non-0 order"); 3074 return -EINVAL; 3075 } 3076 /* No split if the file system does not support large folio */ 3077 if (!mapping_large_folio_support(folio->mapping)) { 3078 VM_WARN_ONCE(1, 3079 "Cannot split file folio to non-0 order"); 3080 return -EINVAL; 3081 } 3082 } 3083 3084 3085 is_hzp = is_huge_zero_page(&folio->page); 3086 if (is_hzp) { 3087 pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); 3088 return -EBUSY; 3089 } 3090 3091 if (folio_test_writeback(folio)) 3092 return -EBUSY; 3093 3094 if (folio_test_anon(folio)) { 3095 /* 3096 * The caller does not necessarily hold an mmap_lock that would 3097 * prevent the anon_vma disappearing so we first we take a 3098 * reference to it and then lock the anon_vma for write. This 3099 * is similar to folio_lock_anon_vma_read except the write lock 3100 * is taken to serialise against parallel split or collapse 3101 * operations. 3102 */ 3103 anon_vma = folio_get_anon_vma(folio); 3104 if (!anon_vma) { 3105 ret = -EBUSY; 3106 goto out; 3107 } 3108 end = -1; 3109 mapping = NULL; 3110 anon_vma_lock_write(anon_vma); 3111 } else { 3112 gfp_t gfp; 3113 3114 mapping = folio->mapping; 3115 3116 /* Truncated ? */ 3117 if (!mapping) { 3118 ret = -EBUSY; 3119 goto out; 3120 } 3121 3122 gfp = current_gfp_context(mapping_gfp_mask(mapping) & 3123 GFP_RECLAIM_MASK); 3124 3125 if (!filemap_release_folio(folio, gfp)) { 3126 ret = -EBUSY; 3127 goto out; 3128 } 3129 3130 xas_split_alloc(&xas, folio, folio_order(folio), gfp); 3131 if (xas_error(&xas)) { 3132 ret = xas_error(&xas); 3133 goto out; 3134 } 3135 3136 anon_vma = NULL; 3137 i_mmap_lock_read(mapping); 3138 3139 /* 3140 *__split_huge_page() may need to trim off pages beyond EOF: 3141 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 3142 * which cannot be nested inside the page tree lock. So note 3143 * end now: i_size itself may be changed at any moment, but 3144 * folio lock is good enough to serialize the trimming. 
3145 */ 3146 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3147 if (shmem_mapping(mapping)) 3148 end = shmem_fallocend(mapping->host, end); 3149 } 3150 3151 /* 3152 * Racy check if we can split the page, before unmap_folio() will 3153 * split PMDs 3154 */ 3155 if (!can_split_folio(folio, &extra_pins)) { 3156 ret = -EAGAIN; 3157 goto out_unlock; 3158 } 3159 3160 unmap_folio(folio); 3161 3162 /* block interrupt reentry in xa_lock and spinlock */ 3163 local_irq_disable(); 3164 if (mapping) { 3165 /* 3166 * Check if the folio is present in page cache. 3167 * We assume all tail are present too, if folio is there. 3168 */ 3169 xas_lock(&xas); 3170 xas_reset(&xas); 3171 if (xas_load(&xas) != folio) 3172 goto fail; 3173 } 3174 3175 /* Prevent deferred_split_scan() touching ->_refcount */ 3176 spin_lock(&ds_queue->split_queue_lock); 3177 if (folio_ref_freeze(folio, 1 + extra_pins)) { 3178 if (folio_order(folio) > 1 && 3179 !list_empty(&folio->_deferred_list)) { 3180 ds_queue->split_queue_len--; 3181 /* 3182 * Reinitialize page_deferred_list after removing the 3183 * page from the split_queue, otherwise a subsequent 3184 * split will see list corruption when checking the 3185 * page_deferred_list. 3186 */ 3187 list_del_init(&folio->_deferred_list); 3188 } 3189 spin_unlock(&ds_queue->split_queue_lock); 3190 if (mapping) { 3191 int nr = folio_nr_pages(folio); 3192 3193 xas_split(&xas, folio, folio_order(folio)); 3194 if (folio_test_pmd_mappable(folio) && 3195 new_order < HPAGE_PMD_ORDER) { 3196 if (folio_test_swapbacked(folio)) { 3197 __lruvec_stat_mod_folio(folio, 3198 NR_SHMEM_THPS, -nr); 3199 } else { 3200 __lruvec_stat_mod_folio(folio, 3201 NR_FILE_THPS, -nr); 3202 filemap_nr_thps_dec(mapping); 3203 } 3204 } 3205 } 3206 3207 __split_huge_page(page, list, end, new_order); 3208 ret = 0; 3209 } else { 3210 spin_unlock(&ds_queue->split_queue_lock); 3211 fail: 3212 if (mapping) 3213 xas_unlock(&xas); 3214 local_irq_enable(); 3215 remap_page(folio, folio_nr_pages(folio)); 3216 ret = -EAGAIN; 3217 } 3218 3219 out_unlock: 3220 if (anon_vma) { 3221 anon_vma_unlock_write(anon_vma); 3222 put_anon_vma(anon_vma); 3223 } 3224 if (mapping) 3225 i_mmap_unlock_read(mapping); 3226 out: 3227 xas_destroy(&xas); 3228 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3229 return ret; 3230 } 3231 3232 void folio_undo_large_rmappable(struct folio *folio) 3233 { 3234 struct deferred_split *ds_queue; 3235 unsigned long flags; 3236 3237 if (folio_order(folio) <= 1) 3238 return; 3239 3240 /* 3241 * At this point, there is no one trying to add the folio to 3242 * deferred_list. If folio is not in deferred_list, it's safe 3243 * to check without acquiring the split_queue_lock. 3244 */ 3245 if (data_race(list_empty(&folio->_deferred_list))) 3246 return; 3247 3248 ds_queue = get_deferred_split_queue(folio); 3249 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3250 if (!list_empty(&folio->_deferred_list)) { 3251 ds_queue->split_queue_len--; 3252 list_del_init(&folio->_deferred_list); 3253 } 3254 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3255 } 3256 3257 void deferred_split_folio(struct folio *folio) 3258 { 3259 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3260 #ifdef CONFIG_MEMCG 3261 struct mem_cgroup *memcg = folio_memcg(folio); 3262 #endif 3263 unsigned long flags; 3264 3265 /* 3266 * Order 1 folios have no space for a deferred list, but we also 3267 * won't waste much memory by not adding them to the deferred list. 
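 * (The _deferred_list field lives in the second tail page, which
 * order-0 and order-1 folios do not have.)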
3268 */ 3269 if (folio_order(folio) <= 1) 3270 return; 3271 3272 /* 3273 * The try_to_unmap() in page reclaim path might reach here too, 3274 * this may cause a race condition to corrupt deferred split queue. 3275 * And, if page reclaim is already handling the same folio, it is 3276 * unnecessary to handle it again in shrinker. 3277 * 3278 * Check the swapcache flag to determine if the folio is being 3279 * handled by page reclaim since THP swap would add the folio into 3280 * swap cache before calling try_to_unmap(). 3281 */ 3282 if (folio_test_swapcache(folio)) 3283 return; 3284 3285 if (!list_empty(&folio->_deferred_list)) 3286 return; 3287 3288 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3289 if (list_empty(&folio->_deferred_list)) { 3290 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 3291 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); 3292 ds_queue->split_queue_len++; 3293 #ifdef CONFIG_MEMCG 3294 if (memcg) 3295 set_shrinker_bit(memcg, folio_nid(folio), 3296 deferred_split_shrinker->id); 3297 #endif 3298 } 3299 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3300 } 3301 3302 static unsigned long deferred_split_count(struct shrinker *shrink, 3303 struct shrink_control *sc) 3304 { 3305 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3306 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 3307 3308 #ifdef CONFIG_MEMCG 3309 if (sc->memcg) 3310 ds_queue = &sc->memcg->deferred_split_queue; 3311 #endif 3312 return READ_ONCE(ds_queue->split_queue_len); 3313 } 3314 3315 static unsigned long deferred_split_scan(struct shrinker *shrink, 3316 struct shrink_control *sc) 3317 { 3318 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3319 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 3320 unsigned long flags; 3321 LIST_HEAD(list); 3322 struct folio *folio, *next; 3323 int split = 0; 3324 3325 #ifdef CONFIG_MEMCG 3326 if (sc->memcg) 3327 ds_queue = &sc->memcg->deferred_split_queue; 3328 #endif 3329 3330 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3331 /* Take pin on all head pages to avoid freeing them under us */ 3332 list_for_each_entry_safe(folio, next, &ds_queue->split_queue, 3333 _deferred_list) { 3334 if (folio_try_get(folio)) { 3335 list_move(&folio->_deferred_list, &list); 3336 } else { 3337 /* We lost race with folio_put() */ 3338 list_del_init(&folio->_deferred_list); 3339 ds_queue->split_queue_len--; 3340 } 3341 if (!--sc->nr_to_scan) 3342 break; 3343 } 3344 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3345 3346 list_for_each_entry_safe(folio, next, &list, _deferred_list) { 3347 if (!folio_trylock(folio)) 3348 goto next; 3349 /* split_huge_page() removes page from list on success */ 3350 if (!split_folio(folio)) 3351 split++; 3352 folio_unlock(folio); 3353 next: 3354 folio_put(folio); 3355 } 3356 3357 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3358 list_splice_tail(&list, &ds_queue->split_queue); 3359 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3360 3361 /* 3362 * Stop shrinker if we didn't split any page, but the queue is empty. 3363 * This can happen if pages were freed under us. 
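 * Returning SHRINK_STOP tells the shrinker core that there is nothing
 * left to reclaim from this queue during the current pass.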
3364 */ 3365 if (!split && list_empty(&ds_queue->split_queue)) 3366 return SHRINK_STOP; 3367 return split; 3368 } 3369 3370 #ifdef CONFIG_DEBUG_FS 3371 static void split_huge_pages_all(void) 3372 { 3373 struct zone *zone; 3374 struct page *page; 3375 struct folio *folio; 3376 unsigned long pfn, max_zone_pfn; 3377 unsigned long total = 0, split = 0; 3378 3379 pr_debug("Split all THPs\n"); 3380 for_each_zone(zone) { 3381 if (!managed_zone(zone)) 3382 continue; 3383 max_zone_pfn = zone_end_pfn(zone); 3384 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 3385 int nr_pages; 3386 3387 page = pfn_to_online_page(pfn); 3388 if (!page || PageTail(page)) 3389 continue; 3390 folio = page_folio(page); 3391 if (!folio_try_get(folio)) 3392 continue; 3393 3394 if (unlikely(page_folio(page) != folio)) 3395 goto next; 3396 3397 if (zone != folio_zone(folio)) 3398 goto next; 3399 3400 if (!folio_test_large(folio) 3401 || folio_test_hugetlb(folio) 3402 || !folio_test_lru(folio)) 3403 goto next; 3404 3405 total++; 3406 folio_lock(folio); 3407 nr_pages = folio_nr_pages(folio); 3408 if (!split_folio(folio)) 3409 split++; 3410 pfn += nr_pages - 1; 3411 folio_unlock(folio); 3412 next: 3413 folio_put(folio); 3414 cond_resched(); 3415 } 3416 } 3417 3418 pr_debug("%lu of %lu THP split\n", split, total); 3419 } 3420 3421 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) 3422 { 3423 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || 3424 is_vm_hugetlb_page(vma); 3425 } 3426 3427 static int split_huge_pages_pid(int pid, unsigned long vaddr_start, 3428 unsigned long vaddr_end, unsigned int new_order) 3429 { 3430 int ret = 0; 3431 struct task_struct *task; 3432 struct mm_struct *mm; 3433 unsigned long total = 0, split = 0; 3434 unsigned long addr; 3435 3436 vaddr_start &= PAGE_MASK; 3437 vaddr_end &= PAGE_MASK; 3438 3439 /* Find the task_struct from pid */ 3440 rcu_read_lock(); 3441 task = find_task_by_vpid(pid); 3442 if (!task) { 3443 rcu_read_unlock(); 3444 ret = -ESRCH; 3445 goto out; 3446 } 3447 get_task_struct(task); 3448 rcu_read_unlock(); 3449 3450 /* Find the mm_struct */ 3451 mm = get_task_mm(task); 3452 put_task_struct(task); 3453 3454 if (!mm) { 3455 ret = -EINVAL; 3456 goto out; 3457 } 3458 3459 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", 3460 pid, vaddr_start, vaddr_end); 3461 3462 mmap_read_lock(mm); 3463 /* 3464 * always increase addr by PAGE_SIZE, since we could have a PTE page 3465 * table filled with PTE-mapped THPs, each of which is distinct. 3466 */ 3467 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { 3468 struct vm_area_struct *vma = vma_lookup(mm, addr); 3469 struct page *page; 3470 struct folio *folio; 3471 3472 if (!vma) 3473 break; 3474 3475 /* skip special VMA and hugetlb VMA */ 3476 if (vma_not_suitable_for_thp_split(vma)) { 3477 addr = vma->vm_end; 3478 continue; 3479 } 3480 3481 /* FOLL_DUMP to ignore special (like zero) pages */ 3482 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 3483 3484 if (IS_ERR_OR_NULL(page)) 3485 continue; 3486 3487 folio = page_folio(page); 3488 if (!is_transparent_hugepage(folio)) 3489 goto next; 3490 3491 if (new_order >= folio_order(folio)) 3492 goto next; 3493 3494 total++; 3495 /* 3496 * For folios with private, split_huge_page_to_list_to_order() 3497 * will try to drop it before split and then check if the folio 3498 * can be split or not. So skip the check here. 
3499 */ 3500 if (!folio_test_private(folio) && 3501 !can_split_folio(folio, NULL)) 3502 goto next; 3503 3504 if (!folio_trylock(folio)) 3505 goto next; 3506 3507 if (!split_folio_to_order(folio, new_order)) 3508 split++; 3509 3510 folio_unlock(folio); 3511 next: 3512 folio_put(folio); 3513 cond_resched(); 3514 } 3515 mmap_read_unlock(mm); 3516 mmput(mm); 3517 3518 pr_debug("%lu of %lu THP split\n", split, total); 3519 3520 out: 3521 return ret; 3522 } 3523 3524 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, 3525 pgoff_t off_end, unsigned int new_order) 3526 { 3527 struct filename *file; 3528 struct file *candidate; 3529 struct address_space *mapping; 3530 int ret = -EINVAL; 3531 pgoff_t index; 3532 int nr_pages = 1; 3533 unsigned long total = 0, split = 0; 3534 3535 file = getname_kernel(file_path); 3536 if (IS_ERR(file)) 3537 return ret; 3538 3539 candidate = file_open_name(file, O_RDONLY, 0); 3540 if (IS_ERR(candidate)) 3541 goto out; 3542 3543 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", 3544 file_path, off_start, off_end); 3545 3546 mapping = candidate->f_mapping; 3547 3548 for (index = off_start; index < off_end; index += nr_pages) { 3549 struct folio *folio = filemap_get_folio(mapping, index); 3550 3551 nr_pages = 1; 3552 if (IS_ERR(folio)) 3553 continue; 3554 3555 if (!folio_test_large(folio)) 3556 goto next; 3557 3558 total++; 3559 nr_pages = folio_nr_pages(folio); 3560 3561 if (new_order >= folio_order(folio)) 3562 goto next; 3563 3564 if (!folio_trylock(folio)) 3565 goto next; 3566 3567 if (!split_folio_to_order(folio, new_order)) 3568 split++; 3569 3570 folio_unlock(folio); 3571 next: 3572 folio_put(folio); 3573 cond_resched(); 3574 } 3575 3576 filp_close(candidate, NULL); 3577 ret = 0; 3578 3579 pr_debug("%lu of %lu file-backed THP split\n", split, total); 3580 out: 3581 putname(file); 3582 return ret; 3583 } 3584 3585 #define MAX_INPUT_BUF_SZ 255 3586 3587 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, 3588 size_t count, loff_t *ppops) 3589 { 3590 static DEFINE_MUTEX(split_debug_mutex); 3591 ssize_t ret; 3592 /* 3593 * hold pid, start_vaddr, end_vaddr, new_order or 3594 * file_path, off_start, off_end, new_order 3595 */ 3596 char input_buf[MAX_INPUT_BUF_SZ]; 3597 int pid; 3598 unsigned long vaddr_start, vaddr_end; 3599 unsigned int new_order = 0; 3600 3601 ret = mutex_lock_interruptible(&split_debug_mutex); 3602 if (ret) 3603 return ret; 3604 3605 ret = -EFAULT; 3606 3607 memset(input_buf, 0, MAX_INPUT_BUF_SZ); 3608 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) 3609 goto out; 3610 3611 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; 3612 3613 if (input_buf[0] == '/') { 3614 char *tok; 3615 char *buf = input_buf; 3616 char file_path[MAX_INPUT_BUF_SZ]; 3617 pgoff_t off_start = 0, off_end = 0; 3618 size_t input_len = strlen(input_buf); 3619 3620 tok = strsep(&buf, ","); 3621 if (tok) { 3622 strcpy(file_path, tok); 3623 } else { 3624 ret = -EINVAL; 3625 goto out; 3626 } 3627 3628 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); 3629 if (ret != 2 && ret != 3) { 3630 ret = -EINVAL; 3631 goto out; 3632 } 3633 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); 3634 if (!ret) 3635 ret = input_len; 3636 3637 goto out; 3638 } 3639 3640 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); 3641 if (ret == 1 && pid == 1) { 3642 split_huge_pages_all(); 3643 ret = strlen(input_buf); 3644 goto 
out; 3645 } else if (ret != 3 && ret != 4) { 3646 ret = -EINVAL; 3647 goto out; 3648 } 3649 3650 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); 3651 if (!ret) 3652 ret = strlen(input_buf); 3653 out: 3654 mutex_unlock(&split_debug_mutex); 3655 return ret; 3656 3657 } 3658 3659 static const struct file_operations split_huge_pages_fops = { 3660 .owner = THIS_MODULE, 3661 .write = split_huge_pages_write, 3662 .llseek = no_llseek, 3663 }; 3664 3665 static int __init split_huge_pages_debugfs(void) 3666 { 3667 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 3668 &split_huge_pages_fops); 3669 return 0; 3670 } 3671 late_initcall(split_huge_pages_debugfs); 3672 #endif 3673 3674 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 3675 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 3676 struct page *page) 3677 { 3678 struct folio *folio = page_folio(page); 3679 struct vm_area_struct *vma = pvmw->vma; 3680 struct mm_struct *mm = vma->vm_mm; 3681 unsigned long address = pvmw->address; 3682 bool anon_exclusive; 3683 pmd_t pmdval; 3684 swp_entry_t entry; 3685 pmd_t pmdswp; 3686 3687 if (!(pvmw->pmd && !pvmw->pte)) 3688 return 0; 3689 3690 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 3691 pmdval = pmdp_invalidate(vma, address, pvmw->pmd); 3692 3693 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ 3694 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); 3695 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { 3696 set_pmd_at(mm, address, pvmw->pmd, pmdval); 3697 return -EBUSY; 3698 } 3699 3700 if (pmd_dirty(pmdval)) 3701 folio_mark_dirty(folio); 3702 if (pmd_write(pmdval)) 3703 entry = make_writable_migration_entry(page_to_pfn(page)); 3704 else if (anon_exclusive) 3705 entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); 3706 else 3707 entry = make_readable_migration_entry(page_to_pfn(page)); 3708 if (pmd_young(pmdval)) 3709 entry = make_migration_entry_young(entry); 3710 if (pmd_dirty(pmdval)) 3711 entry = make_migration_entry_dirty(entry); 3712 pmdswp = swp_entry_to_pmd(entry); 3713 if (pmd_soft_dirty(pmdval)) 3714 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 3715 if (pmd_uffd_wp(pmdval)) 3716 pmdswp = pmd_swp_mkuffd_wp(pmdswp); 3717 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 3718 folio_remove_rmap_pmd(folio, page, vma); 3719 folio_put(folio); 3720 trace_set_migration_pmd(address, pmd_val(pmdswp)); 3721 3722 return 0; 3723 } 3724 3725 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 3726 { 3727 struct folio *folio = page_folio(new); 3728 struct vm_area_struct *vma = pvmw->vma; 3729 struct mm_struct *mm = vma->vm_mm; 3730 unsigned long address = pvmw->address; 3731 unsigned long haddr = address & HPAGE_PMD_MASK; 3732 pmd_t pmde; 3733 swp_entry_t entry; 3734 3735 if (!(pvmw->pmd && !pvmw->pte)) 3736 return; 3737 3738 entry = pmd_to_swp_entry(*pvmw->pmd); 3739 folio_get(folio); 3740 pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); 3741 if (pmd_swp_soft_dirty(*pvmw->pmd)) 3742 pmde = pmd_mksoft_dirty(pmde); 3743 if (is_writable_migration_entry(entry)) 3744 pmde = pmd_mkwrite(pmde, vma); 3745 if (pmd_swp_uffd_wp(*pvmw->pmd)) 3746 pmde = pmd_mkuffd_wp(pmde); 3747 if (!is_migration_entry_young(entry)) 3748 pmde = pmd_mkold(pmde); 3749 /* NOTE: this may contain setting soft-dirty on some archs */ 3750 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) 3751 pmde = pmd_mkdirty(pmde); 3752 3753 if (folio_test_anon(folio)) { 3754 rmap_t rmap_flags = RMAP_NONE; 
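/*
 * Writable and readable-exclusive migration entries mean the page was
 * mapped exclusively before migration, so restore RMAP_EXCLUSIVE for
 * everything but a plain readable entry.
 */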
3755 3756 if (!is_readable_migration_entry(entry)) 3757 rmap_flags |= RMAP_EXCLUSIVE; 3758 3759 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); 3760 } else { 3761 folio_add_file_rmap_pmd(folio, new, vma); 3762 } 3763 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); 3764 set_pmd_at(mm, haddr, pvmw->pmd, pmde); 3765 3766 /* No need to invalidate - it was non-present before */ 3767 update_mmu_cache_pmd(vma, address, pvmw->pmd); 3768 trace_remove_migration_pmd(address, pmd_val(pmde)); 3769 } 3770 #endif 3771