// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);
static bool split_underused_thp = true;

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;

static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
	struct inode *inode;

	if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
		return false;

	if (!vma->vm_file)
		return false;

	inode = file_inode(vma->vm_file);

	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	unsigned long supported_orders;

	/* Check the intersection of requested and supported orders. */
	if (vma_is_anonymous(vma))
		supported_orders = THP_ORDERS_ALL_ANON;
	else if (vma_is_special_huge(vma))
		supported_orders = THP_ORDERS_ALL_SPECIAL;
	else
		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

	orders &= supported_orders;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
						   vma, vma->vm_pgoff, 0,
						   !enforce_sysfs);

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	/* Ensure zero folio won't have large_rmappable flag set. */
	folio_clear_large_rmappable(zero_folio);
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}
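/*
 * Illustrative example of the resulting sysfs knob (assuming the usual
 * mount point under /sys; the exact path is not taken from this file):
 *
 *   $ cat /sys/kernel/mm/transparent_hugepage/defrag
 *   always defer defer+madvise [madvise] never
 *   $ echo defer > /sys/kernel/mm/transparent_hugepage/defrag
 *
 * At most one of the four defrag mode flags (DIRECT, KSWAPD,
 * KSWAPD_OR_MADV, REQ_MADV) is set at any time; "never" leaves all of
 * them clear. defrag_store() below clears the other mode flags before
 * setting the requested one, and defrag_show() above prints the currently
 * selected mode in brackets.
 */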

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static ssize_t split_underused_thp_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", split_underused_thp);
}

static ssize_t split_underused_thp_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	int err = kstrtobool(buf, &split_underused_thp);

	if (err < 0)
		return err;

	return count;
}

static struct kobj_attribute split_underused_thp_attr = __ATTR(
shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); 469 470 static struct attribute *hugepage_attr[] = { 471 &enabled_attr.attr, 472 &defrag_attr.attr, 473 &use_zero_page_attr.attr, 474 &hpage_pmd_size_attr.attr, 475 #ifdef CONFIG_SHMEM 476 &shmem_enabled_attr.attr, 477 #endif 478 &split_underused_thp_attr.attr, 479 NULL, 480 }; 481 482 static const struct attribute_group hugepage_attr_group = { 483 .attrs = hugepage_attr, 484 }; 485 486 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); 487 static void thpsize_release(struct kobject *kobj); 488 static DEFINE_SPINLOCK(huge_anon_orders_lock); 489 static LIST_HEAD(thpsize_list); 490 491 static ssize_t anon_enabled_show(struct kobject *kobj, 492 struct kobj_attribute *attr, char *buf) 493 { 494 int order = to_thpsize(kobj)->order; 495 const char *output; 496 497 if (test_bit(order, &huge_anon_orders_always)) 498 output = "[always] inherit madvise never"; 499 else if (test_bit(order, &huge_anon_orders_inherit)) 500 output = "always [inherit] madvise never"; 501 else if (test_bit(order, &huge_anon_orders_madvise)) 502 output = "always inherit [madvise] never"; 503 else 504 output = "always inherit madvise [never]"; 505 506 return sysfs_emit(buf, "%s\n", output); 507 } 508 509 static ssize_t anon_enabled_store(struct kobject *kobj, 510 struct kobj_attribute *attr, 511 const char *buf, size_t count) 512 { 513 int order = to_thpsize(kobj)->order; 514 ssize_t ret = count; 515 516 if (sysfs_streq(buf, "always")) { 517 spin_lock(&huge_anon_orders_lock); 518 clear_bit(order, &huge_anon_orders_inherit); 519 clear_bit(order, &huge_anon_orders_madvise); 520 set_bit(order, &huge_anon_orders_always); 521 spin_unlock(&huge_anon_orders_lock); 522 } else if (sysfs_streq(buf, "inherit")) { 523 spin_lock(&huge_anon_orders_lock); 524 clear_bit(order, &huge_anon_orders_always); 525 clear_bit(order, &huge_anon_orders_madvise); 526 set_bit(order, &huge_anon_orders_inherit); 527 spin_unlock(&huge_anon_orders_lock); 528 } else if (sysfs_streq(buf, "madvise")) { 529 spin_lock(&huge_anon_orders_lock); 530 clear_bit(order, &huge_anon_orders_always); 531 clear_bit(order, &huge_anon_orders_inherit); 532 set_bit(order, &huge_anon_orders_madvise); 533 spin_unlock(&huge_anon_orders_lock); 534 } else if (sysfs_streq(buf, "never")) { 535 spin_lock(&huge_anon_orders_lock); 536 clear_bit(order, &huge_anon_orders_always); 537 clear_bit(order, &huge_anon_orders_inherit); 538 clear_bit(order, &huge_anon_orders_madvise); 539 spin_unlock(&huge_anon_orders_lock); 540 } else 541 ret = -EINVAL; 542 543 if (ret > 0) { 544 int err; 545 546 err = start_stop_khugepaged(); 547 if (err) 548 ret = err; 549 } 550 return ret; 551 } 552 553 static struct kobj_attribute anon_enabled_attr = 554 __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); 555 556 static struct attribute *anon_ctrl_attrs[] = { 557 &anon_enabled_attr.attr, 558 NULL, 559 }; 560 561 static const struct attribute_group anon_ctrl_attr_grp = { 562 .attrs = anon_ctrl_attrs, 563 }; 564 565 static struct attribute *file_ctrl_attrs[] = { 566 #ifdef CONFIG_SHMEM 567 &thpsize_shmem_enabled_attr.attr, 568 #endif 569 NULL, 570 }; 571 572 static const struct attribute_group file_ctrl_attr_grp = { 573 .attrs = file_ctrl_attrs, 574 }; 575 576 static struct attribute *any_ctrl_attrs[] = { 577 NULL, 578 }; 579 580 static const struct attribute_group any_ctrl_attr_grp = { 581 .attrs = any_ctrl_attrs, 582 }; 583 584 static const struct kobj_type thpsize_ktype = { 585 .release = &thpsize_release, 
586 .sysfs_ops = &kobj_sysfs_ops, 587 }; 588 589 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; 590 591 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) 592 { 593 unsigned long sum = 0; 594 int cpu; 595 596 for_each_possible_cpu(cpu) { 597 struct mthp_stat *this = &per_cpu(mthp_stats, cpu); 598 599 sum += this->stats[order][item]; 600 } 601 602 return sum; 603 } 604 605 #define DEFINE_MTHP_STAT_ATTR(_name, _index) \ 606 static ssize_t _name##_show(struct kobject *kobj, \ 607 struct kobj_attribute *attr, char *buf) \ 608 { \ 609 int order = to_thpsize(kobj)->order; \ 610 \ 611 return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ 612 } \ 613 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 614 615 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); 616 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); 617 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); 618 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); 619 DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); 620 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); 621 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); 622 #ifdef CONFIG_SHMEM 623 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); 624 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); 625 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); 626 #endif 627 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); 628 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); 629 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); 630 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); 631 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); 632 633 static struct attribute *anon_stats_attrs[] = { 634 &anon_fault_alloc_attr.attr, 635 &anon_fault_fallback_attr.attr, 636 &anon_fault_fallback_charge_attr.attr, 637 #ifndef CONFIG_SHMEM 638 &zswpout_attr.attr, 639 &swpin_attr.attr, 640 &swpout_attr.attr, 641 &swpout_fallback_attr.attr, 642 #endif 643 &split_deferred_attr.attr, 644 &nr_anon_attr.attr, 645 &nr_anon_partially_mapped_attr.attr, 646 NULL, 647 }; 648 649 static struct attribute_group anon_stats_attr_grp = { 650 .name = "stats", 651 .attrs = anon_stats_attrs, 652 }; 653 654 static struct attribute *file_stats_attrs[] = { 655 #ifdef CONFIG_SHMEM 656 &shmem_alloc_attr.attr, 657 &shmem_fallback_attr.attr, 658 &shmem_fallback_charge_attr.attr, 659 #endif 660 NULL, 661 }; 662 663 static struct attribute_group file_stats_attr_grp = { 664 .name = "stats", 665 .attrs = file_stats_attrs, 666 }; 667 668 static struct attribute *any_stats_attrs[] = { 669 #ifdef CONFIG_SHMEM 670 &zswpout_attr.attr, 671 &swpin_attr.attr, 672 &swpout_attr.attr, 673 &swpout_fallback_attr.attr, 674 #endif 675 &split_attr.attr, 676 &split_failed_attr.attr, 677 NULL, 678 }; 679 680 static struct attribute_group any_stats_attr_grp = { 681 .name = "stats", 682 .attrs = any_stats_attrs, 683 }; 684 685 static int sysfs_add_group(struct kobject *kobj, 686 const struct attribute_group *grp) 687 { 688 int ret = -ENOENT; 689 690 /* 691 * If the group is named, try to merge first, assuming the subdirectory 692 * was already created. This avoids the warning emitted by 693 * sysfs_create_group() if the directory already exists. 
694 */ 695 if (grp->name) 696 ret = sysfs_merge_group(kobj, grp); 697 if (ret) 698 ret = sysfs_create_group(kobj, grp); 699 700 return ret; 701 } 702 703 static struct thpsize *thpsize_create(int order, struct kobject *parent) 704 { 705 unsigned long size = (PAGE_SIZE << order) / SZ_1K; 706 struct thpsize *thpsize; 707 int ret = -ENOMEM; 708 709 thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); 710 if (!thpsize) 711 goto err; 712 713 thpsize->order = order; 714 715 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, 716 "hugepages-%lukB", size); 717 if (ret) { 718 kfree(thpsize); 719 goto err; 720 } 721 722 723 ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); 724 if (ret) 725 goto err_put; 726 727 ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); 728 if (ret) 729 goto err_put; 730 731 if (BIT(order) & THP_ORDERS_ALL_ANON) { 732 ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); 733 if (ret) 734 goto err_put; 735 736 ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); 737 if (ret) 738 goto err_put; 739 } 740 741 if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { 742 ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); 743 if (ret) 744 goto err_put; 745 746 ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); 747 if (ret) 748 goto err_put; 749 } 750 751 return thpsize; 752 err_put: 753 kobject_put(&thpsize->kobj); 754 err: 755 return ERR_PTR(ret); 756 } 757 758 static void thpsize_release(struct kobject *kobj) 759 { 760 kfree(to_thpsize(kobj)); 761 } 762 763 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 764 { 765 int err; 766 struct thpsize *thpsize; 767 unsigned long orders; 768 int order; 769 770 /* 771 * Default to setting PMD-sized THP to inherit the global setting and 772 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time 773 * constant so we have to do this here. 
774 */ 775 if (!anon_orders_configured) 776 huge_anon_orders_inherit = BIT(PMD_ORDER); 777 778 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 779 if (unlikely(!*hugepage_kobj)) { 780 pr_err("failed to create transparent hugepage kobject\n"); 781 return -ENOMEM; 782 } 783 784 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 785 if (err) { 786 pr_err("failed to register transparent hugepage group\n"); 787 goto delete_obj; 788 } 789 790 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 791 if (err) { 792 pr_err("failed to register transparent hugepage group\n"); 793 goto remove_hp_group; 794 } 795 796 orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; 797 order = highest_order(orders); 798 while (orders) { 799 thpsize = thpsize_create(order, *hugepage_kobj); 800 if (IS_ERR(thpsize)) { 801 pr_err("failed to create thpsize for order %d\n", order); 802 err = PTR_ERR(thpsize); 803 goto remove_all; 804 } 805 list_add(&thpsize->node, &thpsize_list); 806 order = next_order(&orders, order); 807 } 808 809 return 0; 810 811 remove_all: 812 hugepage_exit_sysfs(*hugepage_kobj); 813 return err; 814 remove_hp_group: 815 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 816 delete_obj: 817 kobject_put(*hugepage_kobj); 818 return err; 819 } 820 821 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 822 { 823 struct thpsize *thpsize, *tmp; 824 825 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { 826 list_del(&thpsize->node); 827 kobject_put(&thpsize->kobj); 828 } 829 830 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 831 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 832 kobject_put(hugepage_kobj); 833 } 834 #else 835 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 836 { 837 return 0; 838 } 839 840 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 841 { 842 } 843 #endif /* CONFIG_SYSFS */ 844 845 static int __init thp_shrinker_init(void) 846 { 847 huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); 848 if (!huge_zero_page_shrinker) 849 return -ENOMEM; 850 851 deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | 852 SHRINKER_MEMCG_AWARE | 853 SHRINKER_NONSLAB, 854 "thp-deferred_split"); 855 if (!deferred_split_shrinker) { 856 shrinker_free(huge_zero_page_shrinker); 857 return -ENOMEM; 858 } 859 860 huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; 861 huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; 862 shrinker_register(huge_zero_page_shrinker); 863 864 deferred_split_shrinker->count_objects = deferred_split_count; 865 deferred_split_shrinker->scan_objects = deferred_split_scan; 866 shrinker_register(deferred_split_shrinker); 867 868 return 0; 869 } 870 871 static void __init thp_shrinker_exit(void) 872 { 873 shrinker_free(huge_zero_page_shrinker); 874 shrinker_free(deferred_split_shrinker); 875 } 876 877 static int __init hugepage_init(void) 878 { 879 int err; 880 struct kobject *hugepage_kobj; 881 882 if (!has_transparent_hugepage()) { 883 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; 884 return -EINVAL; 885 } 886 887 /* 888 * hugepages can't be allocated by the buddy allocator 889 */ 890 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); 891 892 err = hugepage_init_sysfs(&hugepage_kobj); 893 if (err) 894 goto err_sysfs; 895 896 err = khugepaged_init(); 897 if (err) 898 goto err_slab; 899 900 err = thp_shrinker_init(); 901 if (err) 902 goto err_shrinker; 903 904 
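	/*
	 * Sysfs, khugepaged state and the THP shrinkers are all set up at
	 * this point; the error labels below unwind those steps in reverse
	 * order.
	 *
	 * Note on the threshold checked next: 512 << (20 - PAGE_SHIFT) is
	 * 512MB expressed as a page count, e.g. 512 << 8 == 131072 pages
	 * with 4KiB pages.
	 */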
/* 905 * By default disable transparent hugepages on smaller systems, 906 * where the extra memory used could hurt more than TLB overhead 907 * is likely to save. The admin can still enable it through /sys. 908 */ 909 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { 910 transparent_hugepage_flags = 0; 911 return 0; 912 } 913 914 err = start_stop_khugepaged(); 915 if (err) 916 goto err_khugepaged; 917 918 return 0; 919 err_khugepaged: 920 thp_shrinker_exit(); 921 err_shrinker: 922 khugepaged_destroy(); 923 err_slab: 924 hugepage_exit_sysfs(hugepage_kobj); 925 err_sysfs: 926 return err; 927 } 928 subsys_initcall(hugepage_init); 929 930 static int __init setup_transparent_hugepage(char *str) 931 { 932 int ret = 0; 933 if (!str) 934 goto out; 935 if (!strcmp(str, "always")) { 936 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 937 &transparent_hugepage_flags); 938 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 939 &transparent_hugepage_flags); 940 ret = 1; 941 } else if (!strcmp(str, "madvise")) { 942 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 943 &transparent_hugepage_flags); 944 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 945 &transparent_hugepage_flags); 946 ret = 1; 947 } else if (!strcmp(str, "never")) { 948 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 949 &transparent_hugepage_flags); 950 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 951 &transparent_hugepage_flags); 952 ret = 1; 953 } 954 out: 955 if (!ret) 956 pr_warn("transparent_hugepage= cannot parse, ignored\n"); 957 return ret; 958 } 959 __setup("transparent_hugepage=", setup_transparent_hugepage); 960 961 static char str_dup[PAGE_SIZE] __initdata; 962 static int __init setup_thp_anon(char *str) 963 { 964 char *token, *range, *policy, *subtoken; 965 unsigned long always, inherit, madvise; 966 char *start_size, *end_size; 967 int start, end, nr; 968 char *p; 969 970 if (!str || strlen(str) + 1 > PAGE_SIZE) 971 goto err; 972 strscpy(str_dup, str); 973 974 always = huge_anon_orders_always; 975 madvise = huge_anon_orders_madvise; 976 inherit = huge_anon_orders_inherit; 977 p = str_dup; 978 while ((token = strsep(&p, ";")) != NULL) { 979 range = strsep(&token, ":"); 980 policy = token; 981 982 if (!policy) 983 goto err; 984 985 while ((subtoken = strsep(&range, ",")) != NULL) { 986 if (strchr(subtoken, '-')) { 987 start_size = strsep(&subtoken, "-"); 988 end_size = subtoken; 989 990 start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON); 991 end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON); 992 } else { 993 start_size = end_size = subtoken; 994 start = end = get_order_from_str(subtoken, 995 THP_ORDERS_ALL_ANON); 996 } 997 998 if (start == -EINVAL) { 999 pr_err("invalid size %s in thp_anon boot parameter\n", start_size); 1000 goto err; 1001 } 1002 1003 if (end == -EINVAL) { 1004 pr_err("invalid size %s in thp_anon boot parameter\n", end_size); 1005 goto err; 1006 } 1007 1008 if (start < 0 || end < 0 || start > end) 1009 goto err; 1010 1011 nr = end - start + 1; 1012 if (!strcmp(policy, "always")) { 1013 bitmap_set(&always, start, nr); 1014 bitmap_clear(&inherit, start, nr); 1015 bitmap_clear(&madvise, start, nr); 1016 } else if (!strcmp(policy, "madvise")) { 1017 bitmap_set(&madvise, start, nr); 1018 bitmap_clear(&inherit, start, nr); 1019 bitmap_clear(&always, start, nr); 1020 } else if (!strcmp(policy, "inherit")) { 1021 bitmap_set(&inherit, start, nr); 1022 bitmap_clear(&madvise, start, nr); 1023 bitmap_clear(&always, start, nr); 1024 } else if (!strcmp(policy, "never")) { 1025 bitmap_clear(&inherit, start, nr); 1026 bitmap_clear(&madvise, 
start, nr); 1027 bitmap_clear(&always, start, nr); 1028 } else { 1029 pr_err("invalid policy %s in thp_anon boot parameter\n", policy); 1030 goto err; 1031 } 1032 } 1033 } 1034 1035 huge_anon_orders_always = always; 1036 huge_anon_orders_madvise = madvise; 1037 huge_anon_orders_inherit = inherit; 1038 anon_orders_configured = true; 1039 return 1; 1040 1041 err: 1042 pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str); 1043 return 0; 1044 } 1045 __setup("thp_anon=", setup_thp_anon); 1046 1047 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 1048 { 1049 if (likely(vma->vm_flags & VM_WRITE)) 1050 pmd = pmd_mkwrite(pmd, vma); 1051 return pmd; 1052 } 1053 1054 #ifdef CONFIG_MEMCG 1055 static inline 1056 struct deferred_split *get_deferred_split_queue(struct folio *folio) 1057 { 1058 struct mem_cgroup *memcg = folio_memcg(folio); 1059 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); 1060 1061 if (memcg) 1062 return &memcg->deferred_split_queue; 1063 else 1064 return &pgdat->deferred_split_queue; 1065 } 1066 #else 1067 static inline 1068 struct deferred_split *get_deferred_split_queue(struct folio *folio) 1069 { 1070 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); 1071 1072 return &pgdat->deferred_split_queue; 1073 } 1074 #endif 1075 1076 static inline bool is_transparent_hugepage(const struct folio *folio) 1077 { 1078 if (!folio_test_large(folio)) 1079 return false; 1080 1081 return is_huge_zero_folio(folio) || 1082 folio_test_large_rmappable(folio); 1083 } 1084 1085 static unsigned long __thp_get_unmapped_area(struct file *filp, 1086 unsigned long addr, unsigned long len, 1087 loff_t off, unsigned long flags, unsigned long size, 1088 vm_flags_t vm_flags) 1089 { 1090 loff_t off_end = off + len; 1091 loff_t off_align = round_up(off, size); 1092 unsigned long len_pad, ret, off_sub; 1093 1094 if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) 1095 return 0; 1096 1097 if (off_end <= off_align || (off_end - off_align) < size) 1098 return 0; 1099 1100 len_pad = len + size; 1101 if (len_pad < len || (off + len_pad) < off) 1102 return 0; 1103 1104 ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, 1105 off >> PAGE_SHIFT, flags, vm_flags); 1106 1107 /* 1108 * The failure might be due to length padding. The caller will retry 1109 * without the padding. 1110 */ 1111 if (IS_ERR_VALUE(ret)) 1112 return 0; 1113 1114 /* 1115 * Do not try to align to THP boundary if allocation at the address 1116 * hint succeeds. 
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
		unsigned long addr)
{
	gfp_t gfp = vma_thp_gfp_mask(vma);
	const int order = HPAGE_PMD_ORDER;
	struct folio *folio;

	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);

	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);

	/*
	 * When a folio is not zeroed during allocation (__GFP_ZERO not used)
	 * or user folios require special handling, folio_zero_user() is used to
	 * make sure that the page corresponding to the faulting address will be
	 * hot in the cache after zeroing.
	 */
	if (user_alloc_needs_zeroing())
		folio_zero_user(folio, addr);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
1190 */ 1191 __folio_mark_uptodate(folio); 1192 return folio; 1193 } 1194 1195 static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, 1196 struct vm_area_struct *vma, unsigned long haddr) 1197 { 1198 pmd_t entry; 1199 1200 entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); 1201 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1202 folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); 1203 folio_add_lru_vma(folio, vma); 1204 set_pmd_at(vma->vm_mm, haddr, pmd, entry); 1205 update_mmu_cache_pmd(vma, haddr, pmd); 1206 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1207 count_vm_event(THP_FAULT_ALLOC); 1208 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); 1209 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); 1210 } 1211 1212 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) 1213 { 1214 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1215 struct vm_area_struct *vma = vmf->vma; 1216 struct folio *folio; 1217 pgtable_t pgtable; 1218 vm_fault_t ret = 0; 1219 1220 folio = vma_alloc_anon_folio_pmd(vma, vmf->address); 1221 if (unlikely(!folio)) 1222 return VM_FAULT_FALLBACK; 1223 1224 pgtable = pte_alloc_one(vma->vm_mm); 1225 if (unlikely(!pgtable)) { 1226 ret = VM_FAULT_OOM; 1227 goto release; 1228 } 1229 1230 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1231 if (unlikely(!pmd_none(*vmf->pmd))) { 1232 goto unlock_release; 1233 } else { 1234 ret = check_stable_address_space(vma->vm_mm); 1235 if (ret) 1236 goto unlock_release; 1237 1238 /* Deliver the page fault to userland */ 1239 if (userfaultfd_missing(vma)) { 1240 spin_unlock(vmf->ptl); 1241 folio_put(folio); 1242 pte_free(vma->vm_mm, pgtable); 1243 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1244 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1245 return ret; 1246 } 1247 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 1248 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); 1249 mm_inc_nr_ptes(vma->vm_mm); 1250 deferred_split_folio(folio, false); 1251 spin_unlock(vmf->ptl); 1252 } 1253 1254 return 0; 1255 unlock_release: 1256 spin_unlock(vmf->ptl); 1257 release: 1258 if (pgtable) 1259 pte_free(vma->vm_mm, pgtable); 1260 folio_put(folio); 1261 return ret; 1262 1263 } 1264 1265 /* 1266 * always: directly stall for all thp allocations 1267 * defer: wake kswapd and fail if not immediately available 1268 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise 1269 * fail if not immediately available 1270 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately 1271 * available 1272 * never: never stall for any thp allocation 1273 */ 1274 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) 1275 { 1276 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); 1277 1278 /* Always do synchronous compaction */ 1279 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 1280 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 1281 1282 /* Kick kcompactd and fail quickly */ 1283 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 1284 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 1285 1286 /* Synchronous compaction if madvised, otherwise kick kcompactd */ 1287 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 1288 return GFP_TRANSHUGE_LIGHT | 1289 (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 1290 __GFP_KSWAPD_RECLAIM); 1291 1292 /* Only do synchronous compaction if madvised */ 1293 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 1294 return GFP_TRANSHUGE_LIGHT | 1295 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); 1296 1297 return GFP_TRANSHUGE_LIGHT; 1298 } 1299 1300 /* Caller must hold page table lock. */ 1301 static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, 1302 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 1303 struct folio *zero_folio) 1304 { 1305 pmd_t entry; 1306 if (!pmd_none(*pmd)) 1307 return; 1308 entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); 1309 entry = pmd_mkhuge(entry); 1310 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1311 set_pmd_at(mm, haddr, pmd, entry); 1312 mm_inc_nr_ptes(mm); 1313 } 1314 1315 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) 1316 { 1317 struct vm_area_struct *vma = vmf->vma; 1318 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1319 vm_fault_t ret; 1320 1321 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) 1322 return VM_FAULT_FALLBACK; 1323 ret = vmf_anon_prepare(vmf); 1324 if (ret) 1325 return ret; 1326 khugepaged_enter_vma(vma, vma->vm_flags); 1327 1328 if (!(vmf->flags & FAULT_FLAG_WRITE) && 1329 !mm_forbids_zeropage(vma->vm_mm) && 1330 transparent_hugepage_use_zero_page()) { 1331 pgtable_t pgtable; 1332 struct folio *zero_folio; 1333 vm_fault_t ret; 1334 1335 pgtable = pte_alloc_one(vma->vm_mm); 1336 if (unlikely(!pgtable)) 1337 return VM_FAULT_OOM; 1338 zero_folio = mm_get_huge_zero_folio(vma->vm_mm); 1339 if (unlikely(!zero_folio)) { 1340 pte_free(vma->vm_mm, pgtable); 1341 count_vm_event(THP_FAULT_FALLBACK); 1342 return VM_FAULT_FALLBACK; 1343 } 1344 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1345 ret = 0; 1346 if (pmd_none(*vmf->pmd)) { 1347 ret = check_stable_address_space(vma->vm_mm); 1348 if (ret) { 1349 spin_unlock(vmf->ptl); 1350 pte_free(vma->vm_mm, pgtable); 1351 } else if (userfaultfd_missing(vma)) { 1352 spin_unlock(vmf->ptl); 1353 pte_free(vma->vm_mm, pgtable); 1354 ret = handle_userfault(vmf, VM_UFFD_MISSING); 1355 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 1356 } else { 1357 set_huge_zero_folio(pgtable, vma->vm_mm, vma, 1358 haddr, vmf->pmd, zero_folio); 1359 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1360 spin_unlock(vmf->ptl); 1361 } 1362 } else { 1363 spin_unlock(vmf->ptl); 1364 pte_free(vma->vm_mm, pgtable); 1365 } 1366 return ret; 1367 } 1368 1369 return __do_huge_pmd_anonymous_page(vmf); 1370 } 1371 1372 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 1373 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, 1374 pgtable_t pgtable) 1375 { 1376 struct mm_struct *mm = vma->vm_mm; 1377 pmd_t entry; 1378 spinlock_t *ptl; 1379 1380 ptl = pmd_lock(mm, pmd); 1381 if (!pmd_none(*pmd)) { 1382 if (write) { 1383 if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { 1384 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); 1385 goto out_unlock; 1386 } 1387 entry = pmd_mkyoung(*pmd); 1388 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1389 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) 1390 update_mmu_cache_pmd(vma, addr, pmd); 1391 } 1392 1393 goto out_unlock; 1394 } 1395 1396 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); 1397 if (pfn_t_devmap(pfn)) 1398 entry = pmd_mkdevmap(entry); 1399 else 1400 entry = pmd_mkspecial(entry); 1401 if (write) { 1402 entry = pmd_mkyoung(pmd_mkdirty(entry)); 1403 entry = maybe_pmd_mkwrite(entry, vma); 1404 } 1405 1406 if (pgtable) { 1407 pgtable_trans_huge_deposit(mm, 
pmd, pgtable); 1408 mm_inc_nr_ptes(mm); 1409 pgtable = NULL; 1410 } 1411 1412 set_pmd_at(mm, addr, pmd, entry); 1413 update_mmu_cache_pmd(vma, addr, pmd); 1414 1415 out_unlock: 1416 spin_unlock(ptl); 1417 if (pgtable) 1418 pte_free(mm, pgtable); 1419 } 1420 1421 /** 1422 * vmf_insert_pfn_pmd - insert a pmd size pfn 1423 * @vmf: Structure describing the fault 1424 * @pfn: pfn to insert 1425 * @write: whether it's a write fault 1426 * 1427 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. 1428 * 1429 * Return: vm_fault_t value. 1430 */ 1431 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) 1432 { 1433 unsigned long addr = vmf->address & PMD_MASK; 1434 struct vm_area_struct *vma = vmf->vma; 1435 pgprot_t pgprot = vma->vm_page_prot; 1436 pgtable_t pgtable = NULL; 1437 1438 /* 1439 * If we had pmd_special, we could avoid all these restrictions, 1440 * but we need to be consistent with PTEs and architectures that 1441 * can't support a 'special' bit. 1442 */ 1443 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 1444 !pfn_t_devmap(pfn)); 1445 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1446 (VM_PFNMAP|VM_MIXEDMAP)); 1447 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1448 1449 if (addr < vma->vm_start || addr >= vma->vm_end) 1450 return VM_FAULT_SIGBUS; 1451 1452 if (arch_needs_pgtable_deposit()) { 1453 pgtable = pte_alloc_one(vma->vm_mm); 1454 if (!pgtable) 1455 return VM_FAULT_OOM; 1456 } 1457 1458 track_pfn_insert(vma, &pgprot, pfn); 1459 1460 insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); 1461 return VM_FAULT_NOPAGE; 1462 } 1463 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 1464 1465 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1466 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) 1467 { 1468 if (likely(vma->vm_flags & VM_WRITE)) 1469 pud = pud_mkwrite(pud); 1470 return pud; 1471 } 1472 1473 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 1474 pud_t *pud, pfn_t pfn, bool write) 1475 { 1476 struct mm_struct *mm = vma->vm_mm; 1477 pgprot_t prot = vma->vm_page_prot; 1478 pud_t entry; 1479 spinlock_t *ptl; 1480 1481 ptl = pud_lock(mm, pud); 1482 if (!pud_none(*pud)) { 1483 if (write) { 1484 if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) 1485 goto out_unlock; 1486 entry = pud_mkyoung(*pud); 1487 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); 1488 if (pudp_set_access_flags(vma, addr, pud, entry, 1)) 1489 update_mmu_cache_pud(vma, addr, pud); 1490 } 1491 goto out_unlock; 1492 } 1493 1494 entry = pud_mkhuge(pfn_t_pud(pfn, prot)); 1495 if (pfn_t_devmap(pfn)) 1496 entry = pud_mkdevmap(entry); 1497 else 1498 entry = pud_mkspecial(entry); 1499 if (write) { 1500 entry = pud_mkyoung(pud_mkdirty(entry)); 1501 entry = maybe_pud_mkwrite(entry, vma); 1502 } 1503 set_pud_at(mm, addr, pud, entry); 1504 update_mmu_cache_pud(vma, addr, pud); 1505 1506 out_unlock: 1507 spin_unlock(ptl); 1508 } 1509 1510 /** 1511 * vmf_insert_pfn_pud - insert a pud size pfn 1512 * @vmf: Structure describing the fault 1513 * @pfn: pfn to insert 1514 * @write: whether it's a write fault 1515 * 1516 * Insert a pud size pfn. See vmf_insert_pfn() for additional info. 1517 * 1518 * Return: vm_fault_t value. 
1519 */ 1520 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) 1521 { 1522 unsigned long addr = vmf->address & PUD_MASK; 1523 struct vm_area_struct *vma = vmf->vma; 1524 pgprot_t pgprot = vma->vm_page_prot; 1525 1526 /* 1527 * If we had pud_special, we could avoid all these restrictions, 1528 * but we need to be consistent with PTEs and architectures that 1529 * can't support a 'special' bit. 1530 */ 1531 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 1532 !pfn_t_devmap(pfn)); 1533 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1534 (VM_PFNMAP|VM_MIXEDMAP)); 1535 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1536 1537 if (addr < vma->vm_start || addr >= vma->vm_end) 1538 return VM_FAULT_SIGBUS; 1539 1540 track_pfn_insert(vma, &pgprot, pfn); 1541 1542 insert_pfn_pud(vma, addr, vmf->pud, pfn, write); 1543 return VM_FAULT_NOPAGE; 1544 } 1545 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 1546 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1547 1548 void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1549 pmd_t *pmd, bool write) 1550 { 1551 pmd_t _pmd; 1552 1553 _pmd = pmd_mkyoung(*pmd); 1554 if (write) 1555 _pmd = pmd_mkdirty(_pmd); 1556 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1557 pmd, _pmd, write)) 1558 update_mmu_cache_pmd(vma, addr, pmd); 1559 } 1560 1561 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 1562 pmd_t *pmd, int flags, struct dev_pagemap **pgmap) 1563 { 1564 unsigned long pfn = pmd_pfn(*pmd); 1565 struct mm_struct *mm = vma->vm_mm; 1566 struct page *page; 1567 int ret; 1568 1569 assert_spin_locked(pmd_lockptr(mm, pmd)); 1570 1571 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1572 return NULL; 1573 1574 if (pmd_present(*pmd) && pmd_devmap(*pmd)) 1575 /* pass */; 1576 else 1577 return NULL; 1578 1579 if (flags & FOLL_TOUCH) 1580 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); 1581 1582 /* 1583 * device mapped pages can only be returned if the 1584 * caller will manage the page reference count. 1585 */ 1586 if (!(flags & (FOLL_GET | FOLL_PIN))) 1587 return ERR_PTR(-EEXIST); 1588 1589 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; 1590 *pgmap = get_dev_pagemap(pfn, *pgmap); 1591 if (!*pgmap) 1592 return ERR_PTR(-EFAULT); 1593 page = pfn_to_page(pfn); 1594 ret = try_grab_folio(page_folio(page), 1, flags); 1595 if (ret) 1596 page = ERR_PTR(ret); 1597 1598 return page; 1599 } 1600 1601 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1602 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1603 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) 1604 { 1605 spinlock_t *dst_ptl, *src_ptl; 1606 struct page *src_page; 1607 struct folio *src_folio; 1608 pmd_t pmd; 1609 pgtable_t pgtable = NULL; 1610 int ret = -ENOMEM; 1611 1612 pmd = pmdp_get_lockless(src_pmd); 1613 if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { 1614 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1615 src_ptl = pmd_lockptr(src_mm, src_pmd); 1616 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1617 /* 1618 * No need to recheck the pmd, it can't change with write 1619 * mmap lock held here. 1620 * 1621 * Meanwhile, making sure it's not a CoW VMA with writable 1622 * mapping, otherwise it means either the anon page wrongly 1623 * applied special bit, or we made the PRIVATE mapping be 1624 * able to wrongly write to the backend MMIO. 
1625 */ 1626 VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); 1627 goto set_pmd; 1628 } 1629 1630 /* Skip if can be re-fill on fault */ 1631 if (!vma_is_anonymous(dst_vma)) 1632 return 0; 1633 1634 pgtable = pte_alloc_one(dst_mm); 1635 if (unlikely(!pgtable)) 1636 goto out; 1637 1638 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1639 src_ptl = pmd_lockptr(src_mm, src_pmd); 1640 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1641 1642 ret = -EAGAIN; 1643 pmd = *src_pmd; 1644 1645 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1646 if (unlikely(is_swap_pmd(pmd))) { 1647 swp_entry_t entry = pmd_to_swp_entry(pmd); 1648 1649 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1650 if (!is_readable_migration_entry(entry)) { 1651 entry = make_readable_migration_entry( 1652 swp_offset(entry)); 1653 pmd = swp_entry_to_pmd(entry); 1654 if (pmd_swp_soft_dirty(*src_pmd)) 1655 pmd = pmd_swp_mksoft_dirty(pmd); 1656 if (pmd_swp_uffd_wp(*src_pmd)) 1657 pmd = pmd_swp_mkuffd_wp(pmd); 1658 set_pmd_at(src_mm, addr, src_pmd, pmd); 1659 } 1660 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1661 mm_inc_nr_ptes(dst_mm); 1662 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1663 if (!userfaultfd_wp(dst_vma)) 1664 pmd = pmd_swp_clear_uffd_wp(pmd); 1665 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1666 ret = 0; 1667 goto out_unlock; 1668 } 1669 #endif 1670 1671 if (unlikely(!pmd_trans_huge(pmd))) { 1672 pte_free(dst_mm, pgtable); 1673 goto out_unlock; 1674 } 1675 /* 1676 * When page table lock is held, the huge zero pmd should not be 1677 * under splitting since we don't split the page itself, only pmd to 1678 * a page table. 1679 */ 1680 if (is_huge_zero_pmd(pmd)) { 1681 /* 1682 * mm_get_huge_zero_folio() will never allocate a new 1683 * folio here, since we already have a zero page to 1684 * copy. It just takes a reference. 1685 */ 1686 mm_get_huge_zero_folio(dst_mm); 1687 goto out_zero_page; 1688 } 1689 1690 src_page = pmd_page(pmd); 1691 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 1692 src_folio = page_folio(src_page); 1693 1694 folio_get(src_folio); 1695 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) { 1696 /* Page maybe pinned: split and retry the fault on PTEs. 
*/ 1697 folio_put(src_folio); 1698 pte_free(dst_mm, pgtable); 1699 spin_unlock(src_ptl); 1700 spin_unlock(dst_ptl); 1701 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); 1702 return -EAGAIN; 1703 } 1704 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1705 out_zero_page: 1706 mm_inc_nr_ptes(dst_mm); 1707 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1708 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1709 if (!userfaultfd_wp(dst_vma)) 1710 pmd = pmd_clear_uffd_wp(pmd); 1711 pmd = pmd_wrprotect(pmd); 1712 set_pmd: 1713 pmd = pmd_mkold(pmd); 1714 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1715 1716 ret = 0; 1717 out_unlock: 1718 spin_unlock(src_ptl); 1719 spin_unlock(dst_ptl); 1720 out: 1721 return ret; 1722 } 1723 1724 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1725 void touch_pud(struct vm_area_struct *vma, unsigned long addr, 1726 pud_t *pud, bool write) 1727 { 1728 pud_t _pud; 1729 1730 _pud = pud_mkyoung(*pud); 1731 if (write) 1732 _pud = pud_mkdirty(_pud); 1733 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, 1734 pud, _pud, write)) 1735 update_mmu_cache_pud(vma, addr, pud); 1736 } 1737 1738 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1739 pud_t *dst_pud, pud_t *src_pud, unsigned long addr, 1740 struct vm_area_struct *vma) 1741 { 1742 spinlock_t *dst_ptl, *src_ptl; 1743 pud_t pud; 1744 int ret; 1745 1746 dst_ptl = pud_lock(dst_mm, dst_pud); 1747 src_ptl = pud_lockptr(src_mm, src_pud); 1748 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1749 1750 ret = -EAGAIN; 1751 pud = *src_pud; 1752 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) 1753 goto out_unlock; 1754 1755 /* 1756 * TODO: once we support anonymous pages, use 1757 * folio_try_dup_anon_rmap_*() and split if duplicating fails. 1758 */ 1759 if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { 1760 pudp_set_wrprotect(src_mm, addr, src_pud); 1761 pud = pud_wrprotect(pud); 1762 } 1763 pud = pud_mkold(pud); 1764 set_pud_at(dst_mm, addr, dst_pud, pud); 1765 1766 ret = 0; 1767 out_unlock: 1768 spin_unlock(src_ptl); 1769 spin_unlock(dst_ptl); 1770 return ret; 1771 } 1772 1773 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) 1774 { 1775 bool write = vmf->flags & FAULT_FLAG_WRITE; 1776 1777 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); 1778 if (unlikely(!pud_same(*vmf->pud, orig_pud))) 1779 goto unlock; 1780 1781 touch_pud(vmf->vma, vmf->address, vmf->pud, write); 1782 unlock: 1783 spin_unlock(vmf->ptl); 1784 } 1785 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1786 1787 void huge_pmd_set_accessed(struct vm_fault *vmf) 1788 { 1789 bool write = vmf->flags & FAULT_FLAG_WRITE; 1790 1791 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1792 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) 1793 goto unlock; 1794 1795 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); 1796 1797 unlock: 1798 spin_unlock(vmf->ptl); 1799 } 1800 1801 static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) 1802 { 1803 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1804 struct vm_area_struct *vma = vmf->vma; 1805 struct mmu_notifier_range range; 1806 struct folio *folio; 1807 vm_fault_t ret = 0; 1808 1809 folio = vma_alloc_anon_folio_pmd(vma, vmf->address); 1810 if (unlikely(!folio)) 1811 return VM_FAULT_FALLBACK; 1812 1813 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, 1814 haddr + HPAGE_PMD_SIZE); 1815 mmu_notifier_invalidate_range_start(&range); 1816 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1817 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), 
vmf->orig_pmd))) 1818 goto release; 1819 ret = check_stable_address_space(vma->vm_mm); 1820 if (ret) 1821 goto release; 1822 (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); 1823 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); 1824 goto unlock; 1825 release: 1826 folio_put(folio); 1827 unlock: 1828 spin_unlock(vmf->ptl); 1829 mmu_notifier_invalidate_range_end(&range); 1830 return ret; 1831 } 1832 1833 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) 1834 { 1835 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; 1836 struct vm_area_struct *vma = vmf->vma; 1837 struct folio *folio; 1838 struct page *page; 1839 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1840 pmd_t orig_pmd = vmf->orig_pmd; 1841 1842 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); 1843 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1844 1845 if (is_huge_zero_pmd(orig_pmd)) { 1846 vm_fault_t ret = do_huge_zero_wp_pmd(vmf); 1847 1848 if (!(ret & VM_FAULT_FALLBACK)) 1849 return ret; 1850 1851 /* Fallback to splitting PMD if THP cannot be allocated */ 1852 goto fallback; 1853 } 1854 1855 spin_lock(vmf->ptl); 1856 1857 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1858 spin_unlock(vmf->ptl); 1859 return 0; 1860 } 1861 1862 page = pmd_page(orig_pmd); 1863 folio = page_folio(page); 1864 VM_BUG_ON_PAGE(!PageHead(page), page); 1865 1866 /* Early check when only holding the PT lock. */ 1867 if (PageAnonExclusive(page)) 1868 goto reuse; 1869 1870 if (!folio_trylock(folio)) { 1871 folio_get(folio); 1872 spin_unlock(vmf->ptl); 1873 folio_lock(folio); 1874 spin_lock(vmf->ptl); 1875 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1876 spin_unlock(vmf->ptl); 1877 folio_unlock(folio); 1878 folio_put(folio); 1879 return 0; 1880 } 1881 folio_put(folio); 1882 } 1883 1884 /* Recheck after temporarily dropping the PT lock. */ 1885 if (PageAnonExclusive(page)) { 1886 folio_unlock(folio); 1887 goto reuse; 1888 } 1889 1890 /* 1891 * See do_wp_page(): we can only reuse the folio exclusively if 1892 * there are no additional references. Note that we always drain 1893 * the LRU cache immediately after adding a THP. 1894 */ 1895 if (folio_ref_count(folio) > 1896 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) 1897 goto unlock_fallback; 1898 if (folio_test_swapcache(folio)) 1899 folio_free_swap(folio); 1900 if (folio_ref_count(folio) == 1) { 1901 pmd_t entry; 1902 1903 folio_move_anon_rmap(folio, vma); 1904 SetPageAnonExclusive(page); 1905 folio_unlock(folio); 1906 reuse: 1907 if (unlikely(unshare)) { 1908 spin_unlock(vmf->ptl); 1909 return 0; 1910 } 1911 entry = pmd_mkyoung(orig_pmd); 1912 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1913 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1914 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1915 spin_unlock(vmf->ptl); 1916 return 0; 1917 } 1918 1919 unlock_fallback: 1920 folio_unlock(folio); 1921 spin_unlock(vmf->ptl); 1922 fallback: 1923 __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); 1924 return VM_FAULT_FALLBACK; 1925 } 1926 1927 static inline bool can_change_pmd_writable(struct vm_area_struct *vma, 1928 unsigned long addr, pmd_t pmd) 1929 { 1930 struct page *page; 1931 1932 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) 1933 return false; 1934 1935 /* Don't touch entries that are not even readable (NUMA hinting). */ 1936 if (pmd_protnone(pmd)) 1937 return false; 1938 1939 /* Do we need write faults for softdirty tracking? */ 1940 if (pmd_needs_soft_dirty_wp(vma, pmd)) 1941 return false; 1942 1943 /* Do we need write faults for uffd-wp tracking? 
*/ 1944 if (userfaultfd_huge_pmd_wp(vma, pmd)) 1945 return false; 1946 1947 if (!(vma->vm_flags & VM_SHARED)) { 1948 /* See can_change_pte_writable(). */ 1949 page = vm_normal_page_pmd(vma, addr, pmd); 1950 return page && PageAnon(page) && PageAnonExclusive(page); 1951 } 1952 1953 /* See can_change_pte_writable(). */ 1954 return pmd_dirty(pmd); 1955 } 1956 1957 /* NUMA hinting page fault entry point for trans huge pmds */ 1958 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) 1959 { 1960 struct vm_area_struct *vma = vmf->vma; 1961 struct folio *folio; 1962 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1963 int nid = NUMA_NO_NODE; 1964 int target_nid, last_cpupid; 1965 pmd_t pmd, old_pmd; 1966 bool writable = false; 1967 int flags = 0; 1968 1969 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1970 old_pmd = pmdp_get(vmf->pmd); 1971 1972 if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) { 1973 spin_unlock(vmf->ptl); 1974 return 0; 1975 } 1976 1977 pmd = pmd_modify(old_pmd, vma->vm_page_prot); 1978 1979 /* 1980 * Detect now whether the PMD could be writable; this information 1981 * is only valid while holding the PT lock. 1982 */ 1983 writable = pmd_write(pmd); 1984 if (!writable && vma_wants_manual_pte_write_upgrade(vma) && 1985 can_change_pmd_writable(vma, vmf->address, pmd)) 1986 writable = true; 1987 1988 folio = vm_normal_folio_pmd(vma, haddr, pmd); 1989 if (!folio) 1990 goto out_map; 1991 1992 nid = folio_nid(folio); 1993 1994 target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable, 1995 &last_cpupid); 1996 if (target_nid == NUMA_NO_NODE) 1997 goto out_map; 1998 if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { 1999 flags |= TNF_MIGRATE_FAIL; 2000 goto out_map; 2001 } 2002 /* The folio is isolated and isolation code holds a folio reference. */ 2003 spin_unlock(vmf->ptl); 2004 writable = false; 2005 2006 if (!migrate_misplaced_folio(folio, vma, target_nid)) { 2007 flags |= TNF_MIGRATED; 2008 nid = target_nid; 2009 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); 2010 return 0; 2011 } 2012 2013 flags |= TNF_MIGRATE_FAIL; 2014 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 2015 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) { 2016 spin_unlock(vmf->ptl); 2017 return 0; 2018 } 2019 out_map: 2020 /* Restore the PMD */ 2021 pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot); 2022 pmd = pmd_mkyoung(pmd); 2023 if (writable) 2024 pmd = pmd_mkwrite(pmd, vma); 2025 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); 2026 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 2027 spin_unlock(vmf->ptl); 2028 2029 if (nid != NUMA_NO_NODE) 2030 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); 2031 return 0; 2032 } 2033 2034 /* 2035 * Return true if we do MADV_FREE successfully on entire pmd page. 2036 * Otherwise, return false. 
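 *
 * Illustrative sketch (not a verbatim caller) of how the madvise() path is
 * expected to consume the return value: when the whole PMD range was handled
 * here, the per-PTE scan of the same range can be skipped:
 *
 *	if (pmd_trans_huge(*pmd) &&
 *	    madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 *		return 0;	// whole PMD range handled, no PTE scan needed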
2037 */ 2038 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2039 pmd_t *pmd, unsigned long addr, unsigned long next) 2040 { 2041 spinlock_t *ptl; 2042 pmd_t orig_pmd; 2043 struct folio *folio; 2044 struct mm_struct *mm = tlb->mm; 2045 bool ret = false; 2046 2047 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2048 2049 ptl = pmd_trans_huge_lock(pmd, vma); 2050 if (!ptl) 2051 goto out_unlocked; 2052 2053 orig_pmd = *pmd; 2054 if (is_huge_zero_pmd(orig_pmd)) 2055 goto out; 2056 2057 if (unlikely(!pmd_present(orig_pmd))) { 2058 VM_BUG_ON(thp_migration_supported() && 2059 !is_pmd_migration_entry(orig_pmd)); 2060 goto out; 2061 } 2062 2063 folio = pmd_folio(orig_pmd); 2064 /* 2065 * If other processes are mapping this folio, we couldn't discard 2066 * the folio unless they all do MADV_FREE so let's skip the folio. 2067 */ 2068 if (folio_likely_mapped_shared(folio)) 2069 goto out; 2070 2071 if (!folio_trylock(folio)) 2072 goto out; 2073 2074 /* 2075 * If user want to discard part-pages of THP, split it so MADV_FREE 2076 * will deactivate only them. 2077 */ 2078 if (next - addr != HPAGE_PMD_SIZE) { 2079 folio_get(folio); 2080 spin_unlock(ptl); 2081 split_folio(folio); 2082 folio_unlock(folio); 2083 folio_put(folio); 2084 goto out_unlocked; 2085 } 2086 2087 if (folio_test_dirty(folio)) 2088 folio_clear_dirty(folio); 2089 folio_unlock(folio); 2090 2091 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 2092 pmdp_invalidate(vma, addr, pmd); 2093 orig_pmd = pmd_mkold(orig_pmd); 2094 orig_pmd = pmd_mkclean(orig_pmd); 2095 2096 set_pmd_at(mm, addr, pmd, orig_pmd); 2097 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2098 } 2099 2100 folio_mark_lazyfree(folio); 2101 ret = true; 2102 out: 2103 spin_unlock(ptl); 2104 out_unlocked: 2105 return ret; 2106 } 2107 2108 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 2109 { 2110 pgtable_t pgtable; 2111 2112 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2113 pte_free(mm, pgtable); 2114 mm_dec_nr_ptes(mm); 2115 } 2116 2117 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2118 pmd_t *pmd, unsigned long addr) 2119 { 2120 pmd_t orig_pmd; 2121 spinlock_t *ptl; 2122 2123 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2124 2125 ptl = __pmd_trans_huge_lock(pmd, vma); 2126 if (!ptl) 2127 return 0; 2128 /* 2129 * For architectures like ppc64 we look at deposited pgtable 2130 * when calling pmdp_huge_get_and_clear. So do the 2131 * pgtable_trans_huge_withdraw after finishing pmdp related 2132 * operations. 
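	 *
	 * In other words, the ordering below is deliberate (descriptive
	 * sketch of the steps that follow, not an additional requirement):
	 *
	 *	orig_pmd = pmdp_huge_get_and_clear_full(...);	// pmd ops first
	 *	tlb_remove_pmd_tlb_entry(...);
	 *	zap_deposited_table(...);	// withdraw/free only afterwards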
2133 */ 2134 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, 2135 tlb->fullmm); 2136 arch_check_zapped_pmd(vma, orig_pmd); 2137 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2138 if (vma_is_special_huge(vma)) { 2139 if (arch_needs_pgtable_deposit()) 2140 zap_deposited_table(tlb->mm, pmd); 2141 spin_unlock(ptl); 2142 } else if (is_huge_zero_pmd(orig_pmd)) { 2143 zap_deposited_table(tlb->mm, pmd); 2144 spin_unlock(ptl); 2145 } else { 2146 struct folio *folio = NULL; 2147 int flush_needed = 1; 2148 2149 if (pmd_present(orig_pmd)) { 2150 struct page *page = pmd_page(orig_pmd); 2151 2152 folio = page_folio(page); 2153 folio_remove_rmap_pmd(folio, page, vma); 2154 WARN_ON_ONCE(folio_mapcount(folio) < 0); 2155 VM_BUG_ON_PAGE(!PageHead(page), page); 2156 } else if (thp_migration_supported()) { 2157 swp_entry_t entry; 2158 2159 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); 2160 entry = pmd_to_swp_entry(orig_pmd); 2161 folio = pfn_swap_entry_folio(entry); 2162 flush_needed = 0; 2163 } else 2164 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 2165 2166 if (folio_test_anon(folio)) { 2167 zap_deposited_table(tlb->mm, pmd); 2168 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 2169 } else { 2170 if (arch_needs_pgtable_deposit()) 2171 zap_deposited_table(tlb->mm, pmd); 2172 add_mm_counter(tlb->mm, mm_counter_file(folio), 2173 -HPAGE_PMD_NR); 2174 } 2175 2176 spin_unlock(ptl); 2177 if (flush_needed) 2178 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); 2179 } 2180 return 1; 2181 } 2182 2183 #ifndef pmd_move_must_withdraw 2184 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 2185 spinlock_t *old_pmd_ptl, 2186 struct vm_area_struct *vma) 2187 { 2188 /* 2189 * With split pmd lock we also need to move preallocated 2190 * PTE page table if new_pmd is on different PMD page table. 2191 * 2192 * We also don't deposit and withdraw tables for file pages. 2193 */ 2194 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 2195 } 2196 #endif 2197 2198 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 2199 { 2200 #ifdef CONFIG_MEM_SOFT_DIRTY 2201 if (unlikely(is_pmd_migration_entry(pmd))) 2202 pmd = pmd_swp_mksoft_dirty(pmd); 2203 else if (pmd_present(pmd)) 2204 pmd = pmd_mksoft_dirty(pmd); 2205 #endif 2206 return pmd; 2207 } 2208 2209 static pmd_t clear_uffd_wp_pmd(pmd_t pmd) 2210 { 2211 if (pmd_present(pmd)) 2212 pmd = pmd_clear_uffd_wp(pmd); 2213 else if (is_swap_pmd(pmd)) 2214 pmd = pmd_swp_clear_uffd_wp(pmd); 2215 2216 return pmd; 2217 } 2218 2219 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 2220 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) 2221 { 2222 spinlock_t *old_ptl, *new_ptl; 2223 pmd_t pmd; 2224 struct mm_struct *mm = vma->vm_mm; 2225 bool force_flush = false; 2226 2227 /* 2228 * The destination pmd shouldn't be established, free_pgtables() 2229 * should have released it; but move_page_tables() might have already 2230 * inserted a page table, if racing against shmem/file collapse. 2231 */ 2232 if (!pmd_none(*new_pmd)) { 2233 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 2234 return false; 2235 } 2236 2237 /* 2238 * We don't have to worry about the ordering of src and dst 2239 * ptlocks because exclusive mmap_lock prevents deadlock. 
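	 *
	 * Note that with split page table locks the two pmds may still share
	 * one spinlock (and they always do when split locks are not in use),
	 * which is why the code below only takes new_ptl when it differs from
	 * old_ptl, and then with SINGLE_DEPTH_NESTING to keep lockdep happy.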
2240 */ 2241 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 2242 if (old_ptl) { 2243 new_ptl = pmd_lockptr(mm, new_pmd); 2244 if (new_ptl != old_ptl) 2245 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 2246 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 2247 if (pmd_present(pmd)) 2248 force_flush = true; 2249 VM_BUG_ON(!pmd_none(*new_pmd)); 2250 2251 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 2252 pgtable_t pgtable; 2253 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 2254 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 2255 } 2256 pmd = move_soft_dirty_pmd(pmd); 2257 if (vma_has_uffd_without_event_remap(vma)) 2258 pmd = clear_uffd_wp_pmd(pmd); 2259 set_pmd_at(mm, new_addr, new_pmd, pmd); 2260 if (force_flush) 2261 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 2262 if (new_ptl != old_ptl) 2263 spin_unlock(new_ptl); 2264 spin_unlock(old_ptl); 2265 return true; 2266 } 2267 return false; 2268 } 2269 2270 /* 2271 * Returns 2272 * - 0 if PMD could not be locked 2273 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary 2274 * or if prot_numa but THP migration is not supported 2275 * - HPAGE_PMD_NR if protections changed and TLB flush necessary 2276 */ 2277 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2278 pmd_t *pmd, unsigned long addr, pgprot_t newprot, 2279 unsigned long cp_flags) 2280 { 2281 struct mm_struct *mm = vma->vm_mm; 2282 spinlock_t *ptl; 2283 pmd_t oldpmd, entry; 2284 bool prot_numa = cp_flags & MM_CP_PROT_NUMA; 2285 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 2286 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 2287 int ret = 1; 2288 2289 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 2290 2291 if (prot_numa && !thp_migration_supported()) 2292 return 1; 2293 2294 ptl = __pmd_trans_huge_lock(pmd, vma); 2295 if (!ptl) 2296 return 0; 2297 2298 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2299 if (is_swap_pmd(*pmd)) { 2300 swp_entry_t entry = pmd_to_swp_entry(*pmd); 2301 struct folio *folio = pfn_swap_entry_folio(entry); 2302 pmd_t newpmd; 2303 2304 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 2305 if (is_writable_migration_entry(entry)) { 2306 /* 2307 * A protection check is difficult so 2308 * just be safe and disable write 2309 */ 2310 if (folio_test_anon(folio)) 2311 entry = make_readable_exclusive_migration_entry(swp_offset(entry)); 2312 else 2313 entry = make_readable_migration_entry(swp_offset(entry)); 2314 newpmd = swp_entry_to_pmd(entry); 2315 if (pmd_swp_soft_dirty(*pmd)) 2316 newpmd = pmd_swp_mksoft_dirty(newpmd); 2317 } else { 2318 newpmd = *pmd; 2319 } 2320 2321 if (uffd_wp) 2322 newpmd = pmd_swp_mkuffd_wp(newpmd); 2323 else if (uffd_wp_resolve) 2324 newpmd = pmd_swp_clear_uffd_wp(newpmd); 2325 if (!pmd_same(*pmd, newpmd)) 2326 set_pmd_at(mm, addr, pmd, newpmd); 2327 goto unlock; 2328 } 2329 #endif 2330 2331 if (prot_numa) { 2332 struct folio *folio; 2333 bool toptier; 2334 /* 2335 * Avoid trapping faults against the zero page. The read-only 2336 * data is likely to be read-cached on the local CPU and 2337 * local/remote hits to the zero page are not interesting. 
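		 *
		 * Pmds that are already PROT_NONE are likewise skipped below.
		 * When normal NUMA balancing is disabled and only memory
		 * tiering is active, top-tier folios are not scanned at all;
		 * for folios that are still scanned in tiering mode, the
		 * current time is recorded as their access time.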
2338 */ 2339 if (is_huge_zero_pmd(*pmd)) 2340 goto unlock; 2341 2342 if (pmd_protnone(*pmd)) 2343 goto unlock; 2344 2345 folio = pmd_folio(*pmd); 2346 toptier = node_is_toptier(folio_nid(folio)); 2347 /* 2348 * Skip scanning top tier node if normal numa 2349 * balancing is disabled 2350 */ 2351 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 2352 toptier) 2353 goto unlock; 2354 2355 if (folio_use_access_time(folio)) 2356 folio_xchg_access_time(folio, 2357 jiffies_to_msecs(jiffies)); 2358 } 2359 /* 2360 * In case prot_numa, we are under mmap_read_lock(mm). It's critical 2361 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 2362 * which is also under mmap_read_lock(mm): 2363 * 2364 * CPU0: CPU1: 2365 * change_huge_pmd(prot_numa=1) 2366 * pmdp_huge_get_and_clear_notify() 2367 * madvise_dontneed() 2368 * zap_pmd_range() 2369 * pmd_trans_huge(*pmd) == 0 (without ptl) 2370 * // skip the pmd 2371 * set_pmd_at(); 2372 * // pmd is re-established 2373 * 2374 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 2375 * which may break userspace. 2376 * 2377 * pmdp_invalidate_ad() is required to make sure we don't miss 2378 * dirty/young flags set by hardware. 2379 */ 2380 oldpmd = pmdp_invalidate_ad(vma, addr, pmd); 2381 2382 entry = pmd_modify(oldpmd, newprot); 2383 if (uffd_wp) 2384 entry = pmd_mkuffd_wp(entry); 2385 else if (uffd_wp_resolve) 2386 /* 2387 * Leave the write bit to be handled by PF interrupt 2388 * handler, then things like COW could be properly 2389 * handled. 2390 */ 2391 entry = pmd_clear_uffd_wp(entry); 2392 2393 /* See change_pte_range(). */ 2394 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && 2395 can_change_pmd_writable(vma, addr, entry)) 2396 entry = pmd_mkwrite(entry, vma); 2397 2398 ret = HPAGE_PMD_NR; 2399 set_pmd_at(mm, addr, pmd, entry); 2400 2401 if (huge_pmd_needs_flush(oldpmd, entry)) 2402 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); 2403 unlock: 2404 spin_unlock(ptl); 2405 return ret; 2406 } 2407 2408 /* 2409 * Returns: 2410 * 2411 * - 0: if pud leaf changed from under us 2412 * - 1: if pud can be skipped 2413 * - HPAGE_PUD_NR: if pud was successfully processed 2414 */ 2415 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2416 int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2417 pud_t *pudp, unsigned long addr, pgprot_t newprot, 2418 unsigned long cp_flags) 2419 { 2420 struct mm_struct *mm = vma->vm_mm; 2421 pud_t oldpud, entry; 2422 spinlock_t *ptl; 2423 2424 tlb_change_page_size(tlb, HPAGE_PUD_SIZE); 2425 2426 /* NUMA balancing doesn't apply to dax */ 2427 if (cp_flags & MM_CP_PROT_NUMA) 2428 return 1; 2429 2430 /* 2431 * Huge entries on userfault-wp only works with anonymous, while we 2432 * don't have anonymous PUDs yet. 2433 */ 2434 if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL)) 2435 return 1; 2436 2437 ptl = __pud_trans_huge_lock(pudp, vma); 2438 if (!ptl) 2439 return 0; 2440 2441 /* 2442 * Can't clear PUD or it can race with concurrent zapping. See 2443 * change_huge_pmd(). 2444 */ 2445 oldpud = pudp_invalidate(vma, addr, pudp); 2446 entry = pud_modify(oldpud, newprot); 2447 set_pud_at(mm, addr, pudp, entry); 2448 tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE); 2449 2450 spin_unlock(ptl); 2451 return HPAGE_PUD_NR; 2452 } 2453 #endif 2454 2455 #ifdef CONFIG_USERFAULTFD 2456 /* 2457 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by 2458 * the caller, but it must return after releasing the page_table_lock. 
2459 * Just move the page from src_pmd to dst_pmd if possible. 2460 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be 2461 * repeated by the caller, or other errors in case of failure. 2462 */ 2463 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 2464 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 2465 unsigned long dst_addr, unsigned long src_addr) 2466 { 2467 pmd_t _dst_pmd, src_pmdval; 2468 struct page *src_page; 2469 struct folio *src_folio; 2470 struct anon_vma *src_anon_vma; 2471 spinlock_t *src_ptl, *dst_ptl; 2472 pgtable_t src_pgtable; 2473 struct mmu_notifier_range range; 2474 int err = 0; 2475 2476 src_pmdval = *src_pmd; 2477 src_ptl = pmd_lockptr(mm, src_pmd); 2478 2479 lockdep_assert_held(src_ptl); 2480 vma_assert_locked(src_vma); 2481 vma_assert_locked(dst_vma); 2482 2483 /* Sanity checks before the operation */ 2484 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || 2485 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { 2486 spin_unlock(src_ptl); 2487 return -EINVAL; 2488 } 2489 2490 if (!pmd_trans_huge(src_pmdval)) { 2491 spin_unlock(src_ptl); 2492 if (is_pmd_migration_entry(src_pmdval)) { 2493 pmd_migration_entry_wait(mm, &src_pmdval); 2494 return -EAGAIN; 2495 } 2496 return -ENOENT; 2497 } 2498 2499 src_page = pmd_page(src_pmdval); 2500 2501 if (!is_huge_zero_pmd(src_pmdval)) { 2502 if (unlikely(!PageAnonExclusive(src_page))) { 2503 spin_unlock(src_ptl); 2504 return -EBUSY; 2505 } 2506 2507 src_folio = page_folio(src_page); 2508 folio_get(src_folio); 2509 } else 2510 src_folio = NULL; 2511 2512 spin_unlock(src_ptl); 2513 2514 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); 2515 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, 2516 src_addr + HPAGE_PMD_SIZE); 2517 mmu_notifier_invalidate_range_start(&range); 2518 2519 if (src_folio) { 2520 folio_lock(src_folio); 2521 2522 /* 2523 * split_huge_page walks the anon_vma chain without the page 2524 * lock. Serialize against it with the anon_vma lock, the page 2525 * lock is not enough. 2526 */ 2527 src_anon_vma = folio_get_anon_vma(src_folio); 2528 if (!src_anon_vma) { 2529 err = -EAGAIN; 2530 goto unlock_folio; 2531 } 2532 anon_vma_lock_write(src_anon_vma); 2533 } else 2534 src_anon_vma = NULL; 2535 2536 dst_ptl = pmd_lockptr(mm, dst_pmd); 2537 double_pt_lock(src_ptl, dst_ptl); 2538 if (unlikely(!pmd_same(*src_pmd, src_pmdval) || 2539 !pmd_same(*dst_pmd, dst_pmdval))) { 2540 err = -EAGAIN; 2541 goto unlock_ptls; 2542 } 2543 if (src_folio) { 2544 if (folio_maybe_dma_pinned(src_folio) || 2545 !PageAnonExclusive(&src_folio->page)) { 2546 err = -EBUSY; 2547 goto unlock_ptls; 2548 } 2549 2550 if (WARN_ON_ONCE(!folio_test_head(src_folio)) || 2551 WARN_ON_ONCE(!folio_test_anon(src_folio))) { 2552 err = -EBUSY; 2553 goto unlock_ptls; 2554 } 2555 2556 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2557 /* Folio got pinned from under us. Put it back and fail the move. 
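		 * (The pin check is repeated here because GUP-fast takes no
		 * page table lock: clearing the pmd above forces any later
		 * GUP-fast walk to fail, so a pin observed at this point must
		 * predate the clear and the move has to be aborted.)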
*/ 2558 if (folio_maybe_dma_pinned(src_folio)) { 2559 set_pmd_at(mm, src_addr, src_pmd, src_pmdval); 2560 err = -EBUSY; 2561 goto unlock_ptls; 2562 } 2563 2564 folio_move_anon_rmap(src_folio, dst_vma); 2565 src_folio->index = linear_page_index(dst_vma, dst_addr); 2566 2567 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); 2568 /* Follow mremap() behavior and treat the entry dirty after the move */ 2569 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); 2570 } else { 2571 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2572 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); 2573 } 2574 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); 2575 2576 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); 2577 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2578 unlock_ptls: 2579 double_pt_unlock(src_ptl, dst_ptl); 2580 if (src_anon_vma) { 2581 anon_vma_unlock_write(src_anon_vma); 2582 put_anon_vma(src_anon_vma); 2583 } 2584 unlock_folio: 2585 /* unblock rmap walks */ 2586 if (src_folio) 2587 folio_unlock(src_folio); 2588 mmu_notifier_invalidate_range_end(&range); 2589 if (src_folio) 2590 folio_put(src_folio); 2591 return err; 2592 } 2593 #endif /* CONFIG_USERFAULTFD */ 2594 2595 /* 2596 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2597 * 2598 * Note that if it returns page table lock pointer, this routine returns without 2599 * unlocking page table lock. So callers must unlock it. 2600 */ 2601 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2602 { 2603 spinlock_t *ptl; 2604 ptl = pmd_lock(vma->vm_mm, pmd); 2605 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2606 pmd_devmap(*pmd))) 2607 return ptl; 2608 spin_unlock(ptl); 2609 return NULL; 2610 } 2611 2612 /* 2613 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 2614 * 2615 * Note that if it returns page table lock pointer, this routine returns without 2616 * unlocking page table lock. So callers must unlock it. 
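 *
 * Typical usage, sketched after zap_huge_pud() below (illustrative only):
 *
 *	ptl = __pud_trans_huge_lock(pud, vma);
 *	if (!ptl)
 *		return 0;
 *	... operate on the huge pud ...
 *	spin_unlock(ptl);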
2617 */ 2618 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2619 { 2620 spinlock_t *ptl; 2621 2622 ptl = pud_lock(vma->vm_mm, pud); 2623 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2624 return ptl; 2625 spin_unlock(ptl); 2626 return NULL; 2627 } 2628 2629 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2630 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2631 pud_t *pud, unsigned long addr) 2632 { 2633 spinlock_t *ptl; 2634 pud_t orig_pud; 2635 2636 ptl = __pud_trans_huge_lock(pud, vma); 2637 if (!ptl) 2638 return 0; 2639 2640 orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 2641 arch_check_zapped_pud(vma, orig_pud); 2642 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2643 if (vma_is_special_huge(vma)) { 2644 spin_unlock(ptl); 2645 /* No zero page support yet */ 2646 } else { 2647 /* No support for anonymous PUD pages yet */ 2648 BUG(); 2649 } 2650 return 1; 2651 } 2652 2653 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2654 unsigned long haddr) 2655 { 2656 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2657 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2658 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2659 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2660 2661 count_vm_event(THP_SPLIT_PUD); 2662 2663 pudp_huge_clear_flush(vma, haddr, pud); 2664 } 2665 2666 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2667 unsigned long address) 2668 { 2669 spinlock_t *ptl; 2670 struct mmu_notifier_range range; 2671 2672 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2673 address & HPAGE_PUD_MASK, 2674 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2675 mmu_notifier_invalidate_range_start(&range); 2676 ptl = pud_lock(vma->vm_mm, pud); 2677 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2678 goto out; 2679 __split_huge_pud_locked(vma, pud, range.start); 2680 2681 out: 2682 spin_unlock(ptl); 2683 mmu_notifier_invalidate_range_end(&range); 2684 } 2685 #else 2686 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2687 unsigned long address) 2688 { 2689 } 2690 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2691 2692 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2693 unsigned long haddr, pmd_t *pmd) 2694 { 2695 struct mm_struct *mm = vma->vm_mm; 2696 pgtable_t pgtable; 2697 pmd_t _pmd, old_pmd; 2698 unsigned long addr; 2699 pte_t *pte; 2700 int i; 2701 2702 /* 2703 * Leave pmd empty until pte is filled note that it is fine to delay 2704 * notification until mmu_notifier_invalidate_range_end() as we are 2705 * replacing a zero pmd write protected page with a zero pte write 2706 * protected page. 
2707 * 2708 * See Documentation/mm/mmu_notifier.rst 2709 */ 2710 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2711 2712 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2713 pmd_populate(mm, &_pmd, pgtable); 2714 2715 pte = pte_offset_map(&_pmd, haddr); 2716 VM_BUG_ON(!pte); 2717 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2718 pte_t entry; 2719 2720 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); 2721 entry = pte_mkspecial(entry); 2722 if (pmd_uffd_wp(old_pmd)) 2723 entry = pte_mkuffd_wp(entry); 2724 VM_BUG_ON(!pte_none(ptep_get(pte))); 2725 set_pte_at(mm, addr, pte, entry); 2726 pte++; 2727 } 2728 pte_unmap(pte - 1); 2729 smp_wmb(); /* make pte visible before pmd */ 2730 pmd_populate(mm, pmd, pgtable); 2731 } 2732 2733 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2734 unsigned long haddr, bool freeze) 2735 { 2736 struct mm_struct *mm = vma->vm_mm; 2737 struct folio *folio; 2738 struct page *page; 2739 pgtable_t pgtable; 2740 pmd_t old_pmd, _pmd; 2741 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; 2742 bool anon_exclusive = false, dirty = false; 2743 unsigned long addr; 2744 pte_t *pte; 2745 int i; 2746 2747 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2748 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2749 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2750 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2751 && !pmd_devmap(*pmd)); 2752 2753 count_vm_event(THP_SPLIT_PMD); 2754 2755 if (!vma_is_anonymous(vma)) { 2756 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2757 /* 2758 * We are going to unmap this huge page. So 2759 * just go ahead and zap it 2760 */ 2761 if (arch_needs_pgtable_deposit()) 2762 zap_deposited_table(mm, pmd); 2763 if (vma_is_special_huge(vma)) 2764 return; 2765 if (unlikely(is_pmd_migration_entry(old_pmd))) { 2766 swp_entry_t entry; 2767 2768 entry = pmd_to_swp_entry(old_pmd); 2769 folio = pfn_swap_entry_folio(entry); 2770 } else { 2771 page = pmd_page(old_pmd); 2772 folio = page_folio(page); 2773 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) 2774 folio_mark_dirty(folio); 2775 if (!folio_test_referenced(folio) && pmd_young(old_pmd)) 2776 folio_set_referenced(folio); 2777 folio_remove_rmap_pmd(folio, page, vma); 2778 folio_put(folio); 2779 } 2780 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 2781 return; 2782 } 2783 2784 if (is_huge_zero_pmd(*pmd)) { 2785 /* 2786 * FIXME: Do we want to invalidate secondary mmu by calling 2787 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below 2788 * inside __split_huge_pmd() ? 2789 * 2790 * We are going from a zero huge page write protected to zero 2791 * small page also write protected so it does not seems useful 2792 * to invalidate secondary mmu at this time. 
		 */
		return __split_huge_zero_page_pmd(vma, haddr, pmd);
	}

	pmd_migration = is_pmd_migration_entry(*pmd);
	if (unlikely(pmd_migration)) {
		swp_entry_t entry;

		old_pmd = *pmd;
		entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(entry);
		write = is_writable_migration_entry(entry);
		if (PageAnon(page))
			anon_exclusive = is_readable_exclusive_migration_entry(entry);
		young = is_migration_entry_young(entry);
		dirty = is_migration_entry_dirty(entry);
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		/*
		 * Up to this point the pmd is present and huge and userland
		 * has full access to the hugepage during the split (which
		 * happens in place). If we overwrite the pmd with the not-huge
		 * version pointing to the pte here (which of course we could
		 * if all CPUs were bug free), userland could trigger a small
		 * page size TLB miss on the small sized TLB while the hugepage
		 * TLB entry is still established in the huge TLB. Some CPUs
		 * don't like that. See
		 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf,
		 * Erratum 383 on page 105. Intel should be safe, but it also
		 * warns that it is only safe if the permission and cache
		 * attributes of the two entries loaded into the two TLBs are
		 * identical (which should be the case here). Still, it is
		 * generally safer to never allow small and huge TLB entries
		 * for the same virtual address to be loaded simultaneously.
		 * So instead of doing "pmd_populate(); flush_pmd_tlb_range();"
		 * we first mark the current pmd not-present (atomically,
		 * because pmd_trans_huge must remain set on the pmd until the
		 * split is complete for this pmd), then we flush the SMP TLB
		 * and finally we write the non-huge version of the pmd entry
		 * with pmd_populate.
		 */
		old_pmd = pmdp_invalidate(vma, haddr, pmd);
		page = pmd_page(old_pmd);
		folio = page_folio(page);
		if (pmd_dirty(old_pmd)) {
			dirty = true;
			folio_set_dirty(folio);
		}
		write = pmd_write(old_pmd);
		young = pmd_young(old_pmd);
		soft_dirty = pmd_soft_dirty(old_pmd);
		uffd_wp = pmd_uffd_wp(old_pmd);

		VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
		VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

		/*
		 * Without "freeze", we'll simply split the PMD, propagating the
		 * PageAnonExclusive() flag for each PTE by setting it for
		 * each subpage -- no need to (temporarily) clear.
		 *
		 * With "freeze" we want to replace mapped pages by
		 * migration entries right away. This is only possible if we
		 * managed to clear PageAnonExclusive() -- see
		 * set_pmd_migration_entry().
		 *
		 * In case we cannot clear PageAnonExclusive(), split the PMD
		 * only and let try_to_migrate_one() fail later.
		 *
		 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
		 */
		anon_exclusive = PageAnonExclusive(page);
		if (freeze && anon_exclusive &&
		    folio_try_share_anon_rmap_pmd(folio, page))
			freeze = false;
		if (!freeze) {
			rmap_t rmap_flags = RMAP_NONE;

			folio_ref_add(folio, HPAGE_PMD_NR - 1);
			if (anon_exclusive)
				rmap_flags |= RMAP_EXCLUSIVE;
			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
						 vma, haddr, rmap_flags);
		}
	}

	/*
	 * Withdraw the table only after we mark the pmd entry invalid.
2882 * This's critical for some architectures (Power). 2883 */ 2884 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2885 pmd_populate(mm, &_pmd, pgtable); 2886 2887 pte = pte_offset_map(&_pmd, haddr); 2888 VM_BUG_ON(!pte); 2889 2890 /* 2891 * Note that NUMA hinting access restrictions are not transferred to 2892 * avoid any possibility of altering permissions across VMAs. 2893 */ 2894 if (freeze || pmd_migration) { 2895 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2896 pte_t entry; 2897 swp_entry_t swp_entry; 2898 2899 if (write) 2900 swp_entry = make_writable_migration_entry( 2901 page_to_pfn(page + i)); 2902 else if (anon_exclusive) 2903 swp_entry = make_readable_exclusive_migration_entry( 2904 page_to_pfn(page + i)); 2905 else 2906 swp_entry = make_readable_migration_entry( 2907 page_to_pfn(page + i)); 2908 if (young) 2909 swp_entry = make_migration_entry_young(swp_entry); 2910 if (dirty) 2911 swp_entry = make_migration_entry_dirty(swp_entry); 2912 entry = swp_entry_to_pte(swp_entry); 2913 if (soft_dirty) 2914 entry = pte_swp_mksoft_dirty(entry); 2915 if (uffd_wp) 2916 entry = pte_swp_mkuffd_wp(entry); 2917 2918 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2919 set_pte_at(mm, addr, pte + i, entry); 2920 } 2921 } else { 2922 pte_t entry; 2923 2924 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); 2925 if (write) 2926 entry = pte_mkwrite(entry, vma); 2927 if (!young) 2928 entry = pte_mkold(entry); 2929 /* NOTE: this may set soft-dirty too on some archs */ 2930 if (dirty) 2931 entry = pte_mkdirty(entry); 2932 if (soft_dirty) 2933 entry = pte_mksoft_dirty(entry); 2934 if (uffd_wp) 2935 entry = pte_mkuffd_wp(entry); 2936 2937 for (i = 0; i < HPAGE_PMD_NR; i++) 2938 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2939 2940 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); 2941 } 2942 pte_unmap(pte); 2943 2944 if (!pmd_migration) 2945 folio_remove_rmap_pmd(folio, page, vma); 2946 if (freeze) 2947 put_page(page); 2948 2949 smp_wmb(); /* make pte visible before pmd */ 2950 pmd_populate(mm, pmd, pgtable); 2951 } 2952 2953 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, 2954 pmd_t *pmd, bool freeze, struct folio *folio) 2955 { 2956 VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio)); 2957 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); 2958 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); 2959 VM_BUG_ON(freeze && !folio); 2960 2961 /* 2962 * When the caller requests to set up a migration entry, we 2963 * require a folio to check the PMD against. Otherwise, there 2964 * is a risk of replacing the wrong folio. 
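	 *
	 * Hence the check below: if a folio was supplied but the pmd no
	 * longer maps that folio, the entry has been changed under us and we
	 * return without splitting anything.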
2965 */ 2966 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || 2967 is_pmd_migration_entry(*pmd)) { 2968 if (folio && folio != pmd_folio(*pmd)) 2969 return; 2970 __split_huge_pmd_locked(vma, pmd, address, freeze); 2971 } 2972 } 2973 2974 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2975 unsigned long address, bool freeze, struct folio *folio) 2976 { 2977 spinlock_t *ptl; 2978 struct mmu_notifier_range range; 2979 2980 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2981 address & HPAGE_PMD_MASK, 2982 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2983 mmu_notifier_invalidate_range_start(&range); 2984 ptl = pmd_lock(vma->vm_mm, pmd); 2985 split_huge_pmd_locked(vma, range.start, pmd, freeze, folio); 2986 spin_unlock(ptl); 2987 mmu_notifier_invalidate_range_end(&range); 2988 } 2989 2990 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2991 bool freeze, struct folio *folio) 2992 { 2993 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); 2994 2995 if (!pmd) 2996 return; 2997 2998 __split_huge_pmd(vma, pmd, address, freeze, folio); 2999 } 3000 3001 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) 3002 { 3003 /* 3004 * If the new address isn't hpage aligned and it could previously 3005 * contain an hugepage: check if we need to split an huge pmd. 3006 */ 3007 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && 3008 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), 3009 ALIGN(address, HPAGE_PMD_SIZE))) 3010 split_huge_pmd_address(vma, address, false, NULL); 3011 } 3012 3013 void vma_adjust_trans_huge(struct vm_area_struct *vma, 3014 unsigned long start, 3015 unsigned long end, 3016 long adjust_next) 3017 { 3018 /* Check if we need to split start first. */ 3019 split_huge_pmd_if_needed(vma, start); 3020 3021 /* Check if we need to split end next. */ 3022 split_huge_pmd_if_needed(vma, end); 3023 3024 /* 3025 * If we're also updating the next vma vm_start, 3026 * check if we need to split it. 3027 */ 3028 if (adjust_next > 0) { 3029 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); 3030 unsigned long nstart = next->vm_start; 3031 nstart += adjust_next; 3032 split_huge_pmd_if_needed(next, nstart); 3033 } 3034 } 3035 3036 static void unmap_folio(struct folio *folio) 3037 { 3038 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | 3039 TTU_BATCH_FLUSH; 3040 3041 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3042 3043 if (folio_test_pmd_mappable(folio)) 3044 ttu_flags |= TTU_SPLIT_HUGE_PMD; 3045 3046 /* 3047 * Anon pages need migration entries to preserve them, but file 3048 * pages can simply be left unmapped, then faulted back on demand. 3049 * If that is ever changed (perhaps for mlock), update remap_page(). 
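	 *
	 * This is reflected directly below: anon folios go through
	 * try_to_migrate(), file folios through try_to_unmap() with
	 * TTU_IGNORE_MLOCK, and remap_page() consequently only needs to
	 * restore migration entries for anon folios.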
3050 */ 3051 if (folio_test_anon(folio)) 3052 try_to_migrate(folio, ttu_flags); 3053 else 3054 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); 3055 3056 try_to_unmap_flush(); 3057 } 3058 3059 static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma, 3060 unsigned long addr, pmd_t *pmdp, 3061 struct folio *folio) 3062 { 3063 struct mm_struct *mm = vma->vm_mm; 3064 int ref_count, map_count; 3065 pmd_t orig_pmd = *pmdp; 3066 3067 if (folio_test_dirty(folio) || pmd_dirty(orig_pmd)) 3068 return false; 3069 3070 orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp); 3071 3072 /* 3073 * Syncing against concurrent GUP-fast: 3074 * - clear PMD; barrier; read refcount 3075 * - inc refcount; barrier; read PMD 3076 */ 3077 smp_mb(); 3078 3079 ref_count = folio_ref_count(folio); 3080 map_count = folio_mapcount(folio); 3081 3082 /* 3083 * Order reads for folio refcount and dirty flag 3084 * (see comments in __remove_mapping()). 3085 */ 3086 smp_rmb(); 3087 3088 /* 3089 * If the folio or its PMD is redirtied at this point, or if there 3090 * are unexpected references, we will give up to discard this folio 3091 * and remap it. 3092 * 3093 * The only folio refs must be one from isolation plus the rmap(s). 3094 */ 3095 if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) || 3096 ref_count != map_count + 1) { 3097 set_pmd_at(mm, addr, pmdp, orig_pmd); 3098 return false; 3099 } 3100 3101 folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma); 3102 zap_deposited_table(mm, pmdp); 3103 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); 3104 if (vma->vm_flags & VM_LOCKED) 3105 mlock_drain_local(); 3106 folio_put(folio); 3107 3108 return true; 3109 } 3110 3111 bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, 3112 pmd_t *pmdp, struct folio *folio) 3113 { 3114 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio); 3115 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 3116 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE)); 3117 3118 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) 3119 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); 3120 3121 return false; 3122 } 3123 3124 static void remap_page(struct folio *folio, unsigned long nr, int flags) 3125 { 3126 int i = 0; 3127 3128 /* If unmap_folio() uses try_to_migrate() on file, remove this check */ 3129 if (!folio_test_anon(folio)) 3130 return; 3131 for (;;) { 3132 remove_migration_ptes(folio, folio, RMP_LOCKED | flags); 3133 i += folio_nr_pages(folio); 3134 if (i >= nr) 3135 break; 3136 folio = folio_next(folio); 3137 } 3138 } 3139 3140 static void lru_add_page_tail(struct folio *folio, struct page *tail, 3141 struct lruvec *lruvec, struct list_head *list) 3142 { 3143 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3144 VM_BUG_ON_FOLIO(PageLRU(tail), folio); 3145 lockdep_assert_held(&lruvec->lru_lock); 3146 3147 if (list) { 3148 /* page reclaim is reclaiming a huge page */ 3149 VM_WARN_ON(folio_test_lru(folio)); 3150 get_page(tail); 3151 list_add_tail(&tail->lru, list); 3152 } else { 3153 /* head is still on lru (and we have it frozen) */ 3154 VM_WARN_ON(!folio_test_lru(folio)); 3155 if (folio_test_unevictable(folio)) 3156 tail->mlock_count = 0; 3157 else 3158 list_add_tail(&tail->lru, &folio->lru); 3159 SetPageLRU(tail); 3160 } 3161 } 3162 3163 static void __split_huge_page_tail(struct folio *folio, int tail, 3164 struct lruvec *lruvec, struct list_head *list, 3165 unsigned int new_order) 3166 { 3167 struct page *head = &folio->page; 3168 struct page *page_tail = head + tail; 3169 /* 3170 * 
Careful: new_folio is not a "real" folio before we cleared PageTail. 3171 * Don't pass it around before clear_compound_head(). 3172 */ 3173 struct folio *new_folio = (struct folio *)page_tail; 3174 3175 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 3176 3177 /* 3178 * Clone page flags before unfreezing refcount. 3179 * 3180 * After successful get_page_unless_zero() might follow flags change, 3181 * for example lock_page() which set PG_waiters. 3182 * 3183 * Note that for mapped sub-pages of an anonymous THP, 3184 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in 3185 * the migration entry instead from where remap_page() will restore it. 3186 * We can still have PG_anon_exclusive set on effectively unmapped and 3187 * unreferenced sub-pages of an anonymous THP: we can simply drop 3188 * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 3189 */ 3190 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 3191 page_tail->flags |= (head->flags & 3192 ((1L << PG_referenced) | 3193 (1L << PG_swapbacked) | 3194 (1L << PG_swapcache) | 3195 (1L << PG_mlocked) | 3196 (1L << PG_uptodate) | 3197 (1L << PG_active) | 3198 (1L << PG_workingset) | 3199 (1L << PG_locked) | 3200 (1L << PG_unevictable) | 3201 #ifdef CONFIG_ARCH_USES_PG_ARCH_2 3202 (1L << PG_arch_2) | 3203 #endif 3204 #ifdef CONFIG_ARCH_USES_PG_ARCH_3 3205 (1L << PG_arch_3) | 3206 #endif 3207 (1L << PG_dirty) | 3208 LRU_GEN_MASK | LRU_REFS_MASK)); 3209 3210 /* ->mapping in first and second tail page is replaced by other uses */ 3211 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 3212 page_tail); 3213 new_folio->mapping = folio->mapping; 3214 new_folio->index = folio->index + tail; 3215 3216 /* 3217 * page->private should not be set in tail pages. Fix up and warn once 3218 * if private is unexpectedly set. 3219 */ 3220 if (unlikely(page_tail->private)) { 3221 VM_WARN_ON_ONCE_PAGE(true, page_tail); 3222 page_tail->private = 0; 3223 } 3224 if (folio_test_swapcache(folio)) 3225 new_folio->swap.val = folio->swap.val + tail; 3226 3227 /* Page flags must be visible before we make the page non-compound. */ 3228 smp_wmb(); 3229 3230 /* 3231 * Clear PageTail before unfreezing page refcount. 3232 * 3233 * After successful get_page_unless_zero() might follow put_page() 3234 * which needs correct compound_head(). 3235 */ 3236 clear_compound_head(page_tail); 3237 if (new_order) { 3238 prep_compound_page(page_tail, new_order); 3239 folio_set_large_rmappable(new_folio); 3240 } 3241 3242 /* Finally unfreeze refcount. Additional reference from page cache. */ 3243 page_ref_unfreeze(page_tail, 3244 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? 3245 folio_nr_pages(new_folio) : 0)); 3246 3247 if (folio_test_young(folio)) 3248 folio_set_young(new_folio); 3249 if (folio_test_idle(folio)) 3250 folio_set_idle(new_folio); 3251 3252 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); 3253 3254 /* 3255 * always add to the tail because some iterators expect new 3256 * pages to show after the currently processed elements - e.g. 
3257 * migrate_pages 3258 */ 3259 lru_add_page_tail(folio, page_tail, lruvec, list); 3260 } 3261 3262 static void __split_huge_page(struct page *page, struct list_head *list, 3263 pgoff_t end, unsigned int new_order) 3264 { 3265 struct folio *folio = page_folio(page); 3266 struct page *head = &folio->page; 3267 struct lruvec *lruvec; 3268 struct address_space *swap_cache = NULL; 3269 unsigned long offset = 0; 3270 int i, nr_dropped = 0; 3271 unsigned int new_nr = 1 << new_order; 3272 int order = folio_order(folio); 3273 unsigned int nr = 1 << order; 3274 3275 /* complete memcg works before add pages to LRU */ 3276 split_page_memcg(head, order, new_order); 3277 3278 if (folio_test_anon(folio) && folio_test_swapcache(folio)) { 3279 offset = swap_cache_index(folio->swap); 3280 swap_cache = swap_address_space(folio->swap); 3281 xa_lock(&swap_cache->i_pages); 3282 } 3283 3284 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 3285 lruvec = folio_lruvec_lock(folio); 3286 3287 ClearPageHasHWPoisoned(head); 3288 3289 for (i = nr - new_nr; i >= new_nr; i -= new_nr) { 3290 struct folio *tail; 3291 __split_huge_page_tail(folio, i, lruvec, list, new_order); 3292 tail = page_folio(head + i); 3293 /* Some pages can be beyond EOF: drop them from page cache */ 3294 if (tail->index >= end) { 3295 if (shmem_mapping(folio->mapping)) 3296 nr_dropped++; 3297 else if (folio_test_clear_dirty(tail)) 3298 folio_account_cleaned(tail, 3299 inode_to_wb(folio->mapping->host)); 3300 __filemap_remove_folio(tail, NULL); 3301 folio_put(tail); 3302 } else if (!folio_test_anon(folio)) { 3303 __xa_store(&folio->mapping->i_pages, tail->index, 3304 tail, 0); 3305 } else if (swap_cache) { 3306 __xa_store(&swap_cache->i_pages, offset + i, 3307 tail, 0); 3308 } 3309 } 3310 3311 if (!new_order) 3312 ClearPageCompound(head); 3313 else { 3314 struct folio *new_folio = (struct folio *)head; 3315 3316 folio_set_order(new_folio, new_order); 3317 } 3318 unlock_page_lruvec(lruvec); 3319 /* Caller disabled irqs, so they are still disabled here */ 3320 3321 split_page_owner(head, order, new_order); 3322 pgalloc_tag_split(folio, order, new_order); 3323 3324 /* See comment in __split_huge_page_tail() */ 3325 if (folio_test_anon(folio)) { 3326 /* Additional pin to swap cache */ 3327 if (folio_test_swapcache(folio)) { 3328 folio_ref_add(folio, 1 + new_nr); 3329 xa_unlock(&swap_cache->i_pages); 3330 } else { 3331 folio_ref_inc(folio); 3332 } 3333 } else { 3334 /* Additional pin to page cache */ 3335 folio_ref_add(folio, 1 + new_nr); 3336 xa_unlock(&folio->mapping->i_pages); 3337 } 3338 local_irq_enable(); 3339 3340 if (nr_dropped) 3341 shmem_uncharge(folio->mapping->host, nr_dropped); 3342 remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); 3343 3344 /* 3345 * set page to its compound_head when split to non order-0 pages, so 3346 * we can skip unlocking it below, since PG_locked is transferred to 3347 * the compound_head of the page and the caller will unlock it. 3348 */ 3349 if (new_order) 3350 page = compound_head(page); 3351 3352 for (i = 0; i < nr; i += new_nr) { 3353 struct page *subpage = head + i; 3354 struct folio *new_folio = page_folio(subpage); 3355 if (subpage == page) 3356 continue; 3357 folio_unlock(new_folio); 3358 3359 /* 3360 * Subpages may be freed if there wasn't any mapping 3361 * like if add_to_swap() is running on a lru page that 3362 * had its mapping zapped. And freeing these pages 3363 * requires taking the lru_lock so we do the put_page 3364 * of the tail pages after the split is complete. 
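		 *
		 * free_page_and_swap_cache() below is what performs that
		 * deferred put_page() for every subpage except @page itself,
		 * which keeps the caller's reference.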
3365 */ 3366 free_page_and_swap_cache(subpage); 3367 } 3368 } 3369 3370 /* Racy check whether the huge page can be split */ 3371 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) 3372 { 3373 int extra_pins; 3374 3375 /* Additional pins from page cache */ 3376 if (folio_test_anon(folio)) 3377 extra_pins = folio_test_swapcache(folio) ? 3378 folio_nr_pages(folio) : 0; 3379 else 3380 extra_pins = folio_nr_pages(folio); 3381 if (pextra_pins) 3382 *pextra_pins = extra_pins; 3383 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 3384 caller_pins; 3385 } 3386 3387 /* 3388 * This function splits a large folio into smaller folios of order @new_order. 3389 * @page can point to any page of the large folio to split. The split operation 3390 * does not change the position of @page. 3391 * 3392 * Prerequisites: 3393 * 3394 * 1) The caller must hold a reference on the @page's owning folio, also known 3395 * as the large folio. 3396 * 3397 * 2) The large folio must be locked. 3398 * 3399 * 3) The folio must not be pinned. Any unexpected folio references, including 3400 * GUP pins, will result in the folio not getting split; instead, the caller 3401 * will receive an -EAGAIN. 3402 * 3403 * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not 3404 * supported for non-file-backed folios, because folio->_deferred_list, which 3405 * is used by partially mapped folios, is stored in subpage 2, but an order-1 3406 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, 3407 * since they do not use _deferred_list. 3408 * 3409 * After splitting, the caller's folio reference will be transferred to @page, 3410 * resulting in a raised refcount of @page after this call. The other pages may 3411 * be freed if they are not mapped. 3412 * 3413 * If @list is null, tail pages will be added to LRU list, otherwise, to @list. 3414 * 3415 * Pages in @new_order will inherit the mapping, flags, and so on from the 3416 * huge page. 3417 * 3418 * Returns 0 if the huge page was split successfully. 3419 * 3420 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if 3421 * the folio was concurrently removed from the page cache. 3422 * 3423 * Returns -EBUSY when trying to split the huge zeropage, if the folio is 3424 * under writeback, if fs-specific folio metadata cannot currently be 3425 * released, or if some unexpected race happened (e.g., anon VMA disappeared, 3426 * truncation). 3427 * 3428 * Callers should ensure that the order respects the address space mapping 3429 * min-order if one is set for non-anonymous folios. 3430 * 3431 * Returns -EINVAL when trying to split to an order that is incompatible 3432 * with the folio. Splitting to order 0 is compatible with all folios. 
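 *
 * Minimal usage sketch (illustrative only; assumes the caller already holds a
 * folio reference as required above):
 *
 *	folio_lock(folio);
 *	ret = split_huge_page_to_list_to_order(&folio->page, NULL, 0);
 *	folio_unlock(folio);
 *	// on success the folio is now order-0 and the remaining pages were
 *	// put on the LRU (or on @list, had one been passed)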
3433 */ 3434 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 3435 unsigned int new_order) 3436 { 3437 struct folio *folio = page_folio(page); 3438 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3439 /* reset xarray order to new order after split */ 3440 XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); 3441 bool is_anon = folio_test_anon(folio); 3442 struct address_space *mapping = NULL; 3443 struct anon_vma *anon_vma = NULL; 3444 int order = folio_order(folio); 3445 int extra_pins, ret; 3446 pgoff_t end; 3447 bool is_hzp; 3448 3449 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 3450 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3451 3452 if (new_order >= folio_order(folio)) 3453 return -EINVAL; 3454 3455 if (is_anon) { 3456 /* order-1 is not supported for anonymous THP. */ 3457 if (new_order == 1) { 3458 VM_WARN_ONCE(1, "Cannot split to order-1 folio"); 3459 return -EINVAL; 3460 } 3461 } else if (new_order) { 3462 /* Split shmem folio to non-zero order not supported */ 3463 if (shmem_mapping(folio->mapping)) { 3464 VM_WARN_ONCE(1, 3465 "Cannot split shmem folio to non-0 order"); 3466 return -EINVAL; 3467 } 3468 /* 3469 * No split if the file system does not support large folio. 3470 * Note that we might still have THPs in such mappings due to 3471 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping 3472 * does not actually support large folios properly. 3473 */ 3474 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3475 !mapping_large_folio_support(folio->mapping)) { 3476 VM_WARN_ONCE(1, 3477 "Cannot split file folio to non-0 order"); 3478 return -EINVAL; 3479 } 3480 } 3481 3482 /* Only swapping a whole PMD-mapped folio is supported */ 3483 if (folio_test_swapcache(folio) && new_order) 3484 return -EINVAL; 3485 3486 is_hzp = is_huge_zero_folio(folio); 3487 if (is_hzp) { 3488 pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); 3489 return -EBUSY; 3490 } 3491 3492 if (folio_test_writeback(folio)) 3493 return -EBUSY; 3494 3495 if (is_anon) { 3496 /* 3497 * The caller does not necessarily hold an mmap_lock that would 3498 * prevent the anon_vma disappearing so we first we take a 3499 * reference to it and then lock the anon_vma for write. This 3500 * is similar to folio_lock_anon_vma_read except the write lock 3501 * is taken to serialise against parallel split or collapse 3502 * operations. 3503 */ 3504 anon_vma = folio_get_anon_vma(folio); 3505 if (!anon_vma) { 3506 ret = -EBUSY; 3507 goto out; 3508 } 3509 end = -1; 3510 mapping = NULL; 3511 anon_vma_lock_write(anon_vma); 3512 } else { 3513 unsigned int min_order; 3514 gfp_t gfp; 3515 3516 mapping = folio->mapping; 3517 3518 /* Truncated ? 
*/ 3519 if (!mapping) { 3520 ret = -EBUSY; 3521 goto out; 3522 } 3523 3524 min_order = mapping_min_folio_order(folio->mapping); 3525 if (new_order < min_order) { 3526 VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", 3527 min_order); 3528 ret = -EINVAL; 3529 goto out; 3530 } 3531 3532 gfp = current_gfp_context(mapping_gfp_mask(mapping) & 3533 GFP_RECLAIM_MASK); 3534 3535 if (!filemap_release_folio(folio, gfp)) { 3536 ret = -EBUSY; 3537 goto out; 3538 } 3539 3540 xas_split_alloc(&xas, folio, folio_order(folio), gfp); 3541 if (xas_error(&xas)) { 3542 ret = xas_error(&xas); 3543 goto out; 3544 } 3545 3546 anon_vma = NULL; 3547 i_mmap_lock_read(mapping); 3548 3549 /* 3550 *__split_huge_page() may need to trim off pages beyond EOF: 3551 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 3552 * which cannot be nested inside the page tree lock. So note 3553 * end now: i_size itself may be changed at any moment, but 3554 * folio lock is good enough to serialize the trimming. 3555 */ 3556 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3557 if (shmem_mapping(mapping)) 3558 end = shmem_fallocend(mapping->host, end); 3559 } 3560 3561 /* 3562 * Racy check if we can split the page, before unmap_folio() will 3563 * split PMDs 3564 */ 3565 if (!can_split_folio(folio, 1, &extra_pins)) { 3566 ret = -EAGAIN; 3567 goto out_unlock; 3568 } 3569 3570 unmap_folio(folio); 3571 3572 /* block interrupt reentry in xa_lock and spinlock */ 3573 local_irq_disable(); 3574 if (mapping) { 3575 /* 3576 * Check if the folio is present in page cache. 3577 * We assume all tail are present too, if folio is there. 3578 */ 3579 xas_lock(&xas); 3580 xas_reset(&xas); 3581 if (xas_load(&xas) != folio) 3582 goto fail; 3583 } 3584 3585 /* Prevent deferred_split_scan() touching ->_refcount */ 3586 spin_lock(&ds_queue->split_queue_lock); 3587 if (folio_ref_freeze(folio, 1 + extra_pins)) { 3588 if (folio_order(folio) > 1 && 3589 !list_empty(&folio->_deferred_list)) { 3590 ds_queue->split_queue_len--; 3591 if (folio_test_partially_mapped(folio)) { 3592 folio_clear_partially_mapped(folio); 3593 mod_mthp_stat(folio_order(folio), 3594 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 3595 } 3596 /* 3597 * Reinitialize page_deferred_list after removing the 3598 * page from the split_queue, otherwise a subsequent 3599 * split will see list corruption when checking the 3600 * page_deferred_list. 
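			 *
			 * (Using list_del_init() below, rather than
			 * list_del(), is what leaves _deferred_list in the
			 * clean empty state that those later checks rely on.)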
3601 */ 3602 list_del_init(&folio->_deferred_list); 3603 } 3604 spin_unlock(&ds_queue->split_queue_lock); 3605 if (mapping) { 3606 int nr = folio_nr_pages(folio); 3607 3608 xas_split(&xas, folio, folio_order(folio)); 3609 if (folio_test_pmd_mappable(folio) && 3610 new_order < HPAGE_PMD_ORDER) { 3611 if (folio_test_swapbacked(folio)) { 3612 __lruvec_stat_mod_folio(folio, 3613 NR_SHMEM_THPS, -nr); 3614 } else { 3615 __lruvec_stat_mod_folio(folio, 3616 NR_FILE_THPS, -nr); 3617 filemap_nr_thps_dec(mapping); 3618 } 3619 } 3620 } 3621 3622 if (is_anon) { 3623 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); 3624 mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order)); 3625 } 3626 __split_huge_page(page, list, end, new_order); 3627 ret = 0; 3628 } else { 3629 spin_unlock(&ds_queue->split_queue_lock); 3630 fail: 3631 if (mapping) 3632 xas_unlock(&xas); 3633 local_irq_enable(); 3634 remap_page(folio, folio_nr_pages(folio), 0); 3635 ret = -EAGAIN; 3636 } 3637 3638 out_unlock: 3639 if (anon_vma) { 3640 anon_vma_unlock_write(anon_vma); 3641 put_anon_vma(anon_vma); 3642 } 3643 if (mapping) 3644 i_mmap_unlock_read(mapping); 3645 out: 3646 xas_destroy(&xas); 3647 if (order == HPAGE_PMD_ORDER) 3648 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3649 count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); 3650 return ret; 3651 } 3652 3653 int min_order_for_split(struct folio *folio) 3654 { 3655 if (folio_test_anon(folio)) 3656 return 0; 3657 3658 if (!folio->mapping) { 3659 if (folio_test_pmd_mappable(folio)) 3660 count_vm_event(THP_SPLIT_PAGE_FAILED); 3661 return -EBUSY; 3662 } 3663 3664 return mapping_min_folio_order(folio->mapping); 3665 } 3666 3667 int split_folio_to_list(struct folio *folio, struct list_head *list) 3668 { 3669 int ret = min_order_for_split(folio); 3670 3671 if (ret < 0) 3672 return ret; 3673 3674 return split_huge_page_to_list_to_order(&folio->page, list, ret); 3675 } 3676 3677 /* 3678 * __folio_unqueue_deferred_split() is not to be called directly: 3679 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h 3680 * limits its calls to those folios which may have a _deferred_list for 3681 * queueing THP splits, and that list is (racily observed to be) non-empty. 3682 * 3683 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is 3684 * zero: because even when split_queue_lock is held, a non-empty _deferred_list 3685 * might be in use on deferred_split_scan()'s unlocked on-stack list. 3686 * 3687 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is 3688 * therefore important to unqueue deferred split before changing folio memcg. 
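 *
 * (The two WARN_ON_ONCE()s at the top of the function check exactly these
 * preconditions: a folio refcount of zero, and a still-valid folio_memcg()
 * when memcg is enabled.)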
3689 */ 3690 bool __folio_unqueue_deferred_split(struct folio *folio) 3691 { 3692 struct deferred_split *ds_queue; 3693 unsigned long flags; 3694 bool unqueued = false; 3695 3696 WARN_ON_ONCE(folio_ref_count(folio)); 3697 WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); 3698 3699 ds_queue = get_deferred_split_queue(folio); 3700 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3701 if (!list_empty(&folio->_deferred_list)) { 3702 ds_queue->split_queue_len--; 3703 if (folio_test_partially_mapped(folio)) { 3704 folio_clear_partially_mapped(folio); 3705 mod_mthp_stat(folio_order(folio), 3706 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 3707 } 3708 list_del_init(&folio->_deferred_list); 3709 unqueued = true; 3710 } 3711 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3712 3713 return unqueued; /* useful for debug warnings */ 3714 } 3715 3716 /* partially_mapped=false won't clear PG_partially_mapped folio flag */ 3717 void deferred_split_folio(struct folio *folio, bool partially_mapped) 3718 { 3719 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3720 #ifdef CONFIG_MEMCG 3721 struct mem_cgroup *memcg = folio_memcg(folio); 3722 #endif 3723 unsigned long flags; 3724 3725 /* 3726 * Order 1 folios have no space for a deferred list, but we also 3727 * won't waste much memory by not adding them to the deferred list. 3728 */ 3729 if (folio_order(folio) <= 1) 3730 return; 3731 3732 if (!partially_mapped && !split_underused_thp) 3733 return; 3734 3735 /* 3736 * Exclude swapcache: originally to avoid a corrupt deferred split 3737 * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); 3738 * but if page reclaim is already handling the same folio, it is 3739 * unnecessary to handle it again in the shrinker, so excluding 3740 * swapcache here may still be a useful optimization. 
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
	struct mem_cgroup *memcg = folio_memcg(folio);
#endif
	unsigned long flags;

	/*
	 * Order 1 folios have no space for a deferred list, but we also
	 * won't waste much memory by not adding them to the deferred list.
	 */
	if (folio_order(folio) <= 1)
		return;

	if (!partially_mapped && !split_underused_thp)
		return;

	/*
	 * Exclude swapcache: originally to avoid a corrupt deferred split
	 * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
	 * but if page reclaim is already handling the same folio, it is
	 * unnecessary to handle it again in the shrinker, so excluding
	 * swapcache here may still be a useful optimization.
	 */
	if (folio_test_swapcache(folio))
		return;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (partially_mapped) {
		if (!folio_test_partially_mapped(folio)) {
			folio_set_partially_mapped(folio);
			if (folio_test_pmd_mappable(folio))
				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
		}
	} else {
		/* partially mapped folios cannot become non-partially mapped */
		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
	}
	if (list_empty(&folio->_deferred_list)) {
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
		if (memcg)
			set_shrinker_bit(memcg, folio_nid(folio),
					 deferred_split_shrinker->id);
#endif
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif
	return READ_ONCE(ds_queue->split_queue_len);
}

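/*
 * Heuristic used by deferred_split_scan() for folios that are not marked
 * partially mapped: a folio is "underused" when more than
 * khugepaged_max_ptes_none of its pages are zero-filled.  For example, with
 * HPAGE_PMD_NR == 512 (a 2 MiB PMD and 4 KiB base pages) and max_ptes_none
 * set to 100, finding the 101st zero-filled page reports the folio as
 * underused, while finding the 412th non-zero page bails out early.  With
 * khugepaged_max_ptes_none left at HPAGE_PMD_NR - 1 (the usual default), the
 * first check below disables the heuristic entirely.
 */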
static bool thp_underused(struct folio *folio)
{
	int num_zero_pages = 0, num_filled_pages = 0;
	void *kaddr;
	int i;

	if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
		return false;

	for (i = 0; i < folio_nr_pages(folio); i++) {
		kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
		if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
			num_zero_pages++;
			if (num_zero_pages > khugepaged_max_ptes_none) {
				kunmap_local(kaddr);
				return true;
			}
		} else {
			/*
			 * Another path for early exit once the number
			 * of non-zero filled pages exceeds threshold.
			 */
			num_filled_pages++;
			if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
				kunmap_local(kaddr);
				return false;
			}
		}
		kunmap_local(kaddr);
	}
	return false;
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list);
	struct folio *folio, *next, *prev = NULL;
	int split = 0, removed = 0;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Take pin on all head pages to avoid freeing them under us */
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
				 _deferred_list) {
		if (folio_try_get(folio)) {
			list_move(&folio->_deferred_list, &list);
		} else {
			/* We lost race with folio_put() */
			if (folio_test_partially_mapped(folio)) {
				folio_clear_partially_mapped(folio);
				mod_mthp_stat(folio_order(folio),
					      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
			}
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
		bool did_split = false;
		bool underused = false;

		if (!folio_test_partially_mapped(folio)) {
			underused = thp_underused(folio);
			if (!underused)
				goto next;
		}
		if (!folio_trylock(folio))
			goto next;
		if (!split_folio(folio)) {
			did_split = true;
			if (underused)
				count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
			split++;
		}
		folio_unlock(folio);
next:
		/*
		 * split_folio() removes folio from list on success.
		 * Only add back to the queue if folio is partially mapped.
		 * If thp_underused returns false, or if split_folio fails
		 * in the case it was underused, then consider it used and
		 * don't add it back to split_queue.
		 */
		if (did_split) {
			; /* folio already removed from list */
		} else if (!folio_test_partially_mapped(folio)) {
			list_del_init(&folio->_deferred_list);
			removed++;
		} else {
			/*
			 * That unlocked list_del_init() above would be unsafe,
			 * unless its folio is separated from any earlier folios
			 * left on the list (which may be concurrently unqueued)
			 * by one safe folio with refcount still raised.
			 */
			swap(folio, prev);
		}
		if (folio)
			folio_put(folio);
	}

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	ds_queue->split_queue_len -= removed;
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	if (prev)
		folio_put(prev);

	/*
	 * Stop shrinker if we didn't split any page, but the queue is empty.
	 * This can happen if pages were freed under us.
	 */
	if (!split && list_empty(&ds_queue->split_queue))
		return SHRINK_STOP;
	return split;
}

#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	struct folio *folio;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_zone(zone) {
		if (!managed_zone(zone))
			continue;
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			int nr_pages;

			page = pfn_to_online_page(pfn);
			if (!page || PageTail(page))
				continue;
			folio = page_folio(page);
			if (!folio_try_get(folio))
				continue;

			if (unlikely(page_folio(page) != folio))
				goto next;

			if (zone != folio_zone(folio))
				goto next;

			if (!folio_test_large(folio)
				|| folio_test_hugetlb(folio)
				|| !folio_test_lru(folio))
				goto next;

			total++;
			folio_lock(folio);
			nr_pages = folio_nr_pages(folio);
			if (!split_folio(folio))
				split++;
			pfn += nr_pages - 1;
			folio_unlock(folio);
next:
			folio_put(folio);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
	       is_vm_hugetlb_page(vma);
}

static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end, unsigned int new_order)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	task = find_get_task_by_vpid(pid);
	if (!task) {
		ret = -ESRCH;
		goto out;
	}

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = vma_lookup(mm, addr);
		struct folio_walk fw;
		struct folio *folio;
		struct address_space *mapping;
		unsigned int target_order = new_order;

		if (!vma)
			break;

		/* skip special VMA and hugetlb VMA */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		folio = folio_walk_start(&fw, vma, addr, 0);
		if (!folio)
			continue;

		if (!is_transparent_hugepage(folio))
			goto next;

		if (!folio_test_anon(folio)) {
			mapping = folio->mapping;
			target_order = max(new_order,
					   mapping_min_folio_order(mapping));
		}

		if (target_order >= folio_order(folio))
			goto next;

		total++;
		/*
		 * For folios with private, split_huge_page_to_list_to_order()
		 * will try to drop it before split and then check if the folio
		 * can be split or not. So skip the check here.
		 */
		if (!folio_test_private(folio) &&
		    !can_split_folio(folio, 0, NULL))
			goto next;

		if (!folio_trylock(folio))
			goto next;
		folio_get(folio);
		folio_walk_end(&fw, vma);

		if (!folio_test_anon(folio) && folio->mapping != mapping)
			goto unlock;

		if (!split_folio_to_order(folio, target_order))
			split++;

unlock:
		folio_unlock(folio);
		folio_put(folio);

		cond_resched();
		continue;
next:
		folio_walk_end(&fw, vma);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}

static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				    pgoff_t off_end, unsigned int new_order)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;
	unsigned int min_order;
	unsigned int target_order;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;
	min_order = mapping_min_folio_order(mapping);
	target_order = max(new_order, min_order);

	for (index = off_start; index < off_end; index += nr_pages) {
		struct folio *folio = filemap_get_folio(mapping, index);

		nr_pages = 1;
		if (IS_ERR(folio))
			continue;

		if (!folio_test_large(folio))
			goto next;

		total++;
		nr_pages = folio_nr_pages(folio);

		if (target_order >= folio_order(folio))
			goto next;

		if (!folio_trylock(folio))
			goto next;

		if (folio->mapping != mapping)
			goto unlock;

		if (!split_folio_to_order(folio, target_order))
			split++;

unlock:
		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);
	return ret;
}

#define MAX_INPUT_BUF_SZ 255

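/*
 * Input accepted by the debugfs "split_huge_pages" file, as parsed below
 * (illustrative examples; pids, paths and addresses are placeholders):
 *
 *   echo 1 > <debugfs>/split_huge_pages
 *	split all THPs in the system
 *
 *   echo "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>]" > <debugfs>/split_huge_pages
 *	split THPs mapped in that virtual address range of the given process,
 *	e.g. "1234,0x7f0000000000,0x7f0000400000,0"
 *
 *   echo "/path/to/file,0x<off_start>,0x<off_end>[,<new_order>]" > <debugfs>/split_huge_pages
 *	split file-backed THPs in the given page-offset range of that file,
 *	e.g. "/mnt/data/big.bin,0x0,0x200,0"
 *
 * See also Documentation/admin-guide/mm/transhuge.rst.
 */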
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				      size_t count, loff_t *ppops)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/*
	 * hold pid, start_vaddr, end_vaddr, new_order or
	 * file_path, off_start, off_end, new_order
	 */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;
	unsigned int new_order = 0;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		tok = strsep(&buf, ",");
		if (tok && buf) {
			strscpy(file_path, tok);
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
		if (ret != 2 && ret != 3) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3 && ret != 4) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;
}

static const struct file_operations split_huge_pages_fops = {
	.owner	 = THIS_MODULE,
	.write	 = split_huge_pages_write,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
			    struct page *page)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		folio_mark_dirty(folio);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	folio_remove_rmap_pmd(folio, page, vma);
	folio_put(folio);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct folio *folio = page_folio(new);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	folio_get(folio);
	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: this may contain setting soft-dirty on some archs */
	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (folio_test_anon(folio)) {
		rmap_t rmap_flags = RMAP_NONE;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
	} else {
		folio_add_file_rmap_pmd(folio, new, vma);
	}
	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif