// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	/* Check the intersection of requested and supported orders. */
	orders &= vma_is_anonymous(vma) ?
			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	/*
	 * Explicitly disabled through madvise or prctl, or some
	 * architectures may disable THP for some mappings, for
	 * example, s390 kvm.
	 */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return 0;
	/*
	 * If the hardware/firmware marked hugepage support disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file)) {
		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
						 !enforce_sysfs, vma->vm_mm, vm_flags);

		if (!vma_is_anon_shmem(vma))
			return global_huge ? orders : 0;
		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
						   vma, vma->vm_pgoff, global_huge);
	}

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may not be initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
				 HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
						 struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
						struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
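/*
 * Illustrative note (not from the original sources): use_zero_page is a
 * boolean sysfs knob built on the single_hugepage_flag_show()/
 * single_hugepage_flag_store() helpers above, exposed through the
 * transparent_hugepage kobject. A sketch of how it is driven from
 * userspace:
 *
 *	echo 1 > /sys/kernel/mm/transparent_hugepage/use_zero_page
 *	cat /sys/kernel/mm/transparent_hugepage/use_zero_page
 */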
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

static ssize_t thpsize_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t thpsize_enabled_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err;

		err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute thpsize_enabled_attr =
	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);

static struct attribute *thpsize_attrs[] = {
	&thpsize_enabled_attr.attr,
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group thpsize_attr_group = {
	.attrs = thpsize_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}

#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);

static struct attribute *stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
	&shmem_alloc_attr.attr,
	&shmem_fallback_attr.attr,
	&shmem_fallback_charge_attr.attr,
	&split_attr.attr,
	&split_failed_attr.attr,
	&split_deferred_attr.attr,
	NULL,
};

static struct attribute_group stats_attr_group = {
	.name = "stats",
	.attrs = stats_attrs,
};

static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		return ERR_PTR(-ENOMEM);

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		return ERR_PTR(ret);
	}

	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
	if (ret) {
		kobject_put(&thpsize->kobj);
		return ERR_PTR(ret);
	}

	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
	if (ret) {
		kobject_put(&thpsize->kobj);
		return ERR_PTR(ret);
	}

	thpsize->order = order;
	return thpsize;
}

static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
	       folio_test_large_rmappable(folio);
}

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	folio_zero_user(folio, vmf->address);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret;

	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
		return VM_FAULT_FALLBACK;
	ret = vmf_anon_prepare(vmf);
	if (ret)
		return ret;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct folio *zero_folio;
		vm_fault_t ret;

		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
		if (unlikely(!zero_folio)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_folio(pgtable, vma->vm_mm, vma,
						    haddr, vmf->pmd, zero_folio);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pgprot_t prot = vma->vm_page_prot;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
	       pmd_t *pmd, bool write)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (write)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				  pmd, _pmd, write))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * mm_get_huge_zero_folio() will never allocate a new
		 * folio here, since we already have a zero page to
		 * copy. It just takes a reference.
		 */
		mm_get_huge_zero_folio(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	src_folio = page_folio(src_page);

	folio_get(src_folio);
	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
		/* Page maybe pinned: split and retry the fault on PTEs. */
		folio_put(src_folio);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
	       pud_t *pud, bool write)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (write)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				  pud, _pud, write))
		update_mmu_cache_pud(vma, addr, pud);
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting since we don't split the page itself, only pud to
	 * a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	/*
	 * TODO: once we support anonymous pages, use
	 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
	 */
	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
		goto unlock;

	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
	spin_unlock(vmf->ptl);
}

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		folio_move_anon_rmap(folio, vma);
		SetPageAnonExclusive(page);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
}

static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
					   unsigned long addr, pmd_t pmd)
{
	struct page *page;

	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
		return false;

	/* Don't touch entries that are not even readable (NUMA hinting). */
	if (pmd_protnone(pmd))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (pmd_needs_soft_dirty_wp(vma, pmd))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_huge_pmd_wp(vma, pmd))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/* See can_change_pte_writable(). */
		page = vm_normal_page_pmd(vma, addr, pmd);
		return page && PageAnon(page) && PageAnonExclusive(page);
	}

	/* See can_change_pte_writable(). */
	return pmd_dirty(pmd);
}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pmd_t oldpmd = vmf->orig_pmd;
	pmd_t pmd;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
	bool writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
		goto out;
	}

	pmd = pmd_modify(oldpmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out_map;

	/* See similar comment in do_numa_page for explanation */
	if (!writable)
		flags |= TNF_NO_GROUP;

	nid = folio_nid(folio);
	/*
	 * For memory tiering mode, cpupid of slow memory page is used
	 * to record page access time. So use default value.
	 */
	if (node_is_toptier(nid))
		last_cpupid = folio_last_cpupid(folio);
	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
	if (target_nid == NUMA_NO_NODE)
		goto out_map;
	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
		flags |= TNF_MIGRATE_FAIL;
		goto out_map;
	}
	/* The folio is isolated and isolation code holds a folio reference. */
	spin_unlock(vmf->ptl);
	writable = false;

	if (!migrate_misplaced_folio(folio, vma, target_nid)) {
		flags |= TNF_MIGRATED;
		nid = target_nid;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);

	return 0;

out_map:
	/* Restore the PMD */
	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);
	goto out;
}

/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
			   pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct folio *folio;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(orig_pmd));
		goto out;
	}

	folio = pmd_folio(orig_pmd);
	/*
	 * If other processes are mapping this folio, we couldn't discard
	 * the folio unless they all do MADV_FREE so let's skip the folio.
	 */
	if (folio_likely_mapped_shared(folio))
		goto out;

	if (!folio_trylock(folio))
		goto out;

	/*
	 * If the user wants to discard part-pages of the THP, split it so
	 * MADV_FREE will deactivate only them.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		folio_get(folio);
		spin_unlock(ptl);
		split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
		goto out_unlocked;
	}

	if (folio_test_dirty(folio))
		folio_clear_dirty(folio);
	folio_unlock(folio);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	folio_mark_lazyfree(folio);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	arch_check_zapped_pmd(vma, orig_pmd);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct folio *folio = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			struct page *page = pmd_page(orig_pmd);

			folio = page_folio(page);
			folio_remove_rmap_pmd(folio, page, vma);
			WARN_ON_ONCE(folio_mapcount(folio) < 0);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			folio = pfn_swap_entry_folio(entry);
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		if (folio_test_anon(folio)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(folio),
				       -HPAGE_PMD_NR);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
	}
	return 1;
}

#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
1910 */ 1911 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 1912 } 1913 #endif 1914 1915 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 1916 { 1917 #ifdef CONFIG_MEM_SOFT_DIRTY 1918 if (unlikely(is_pmd_migration_entry(pmd))) 1919 pmd = pmd_swp_mksoft_dirty(pmd); 1920 else if (pmd_present(pmd)) 1921 pmd = pmd_mksoft_dirty(pmd); 1922 #endif 1923 return pmd; 1924 } 1925 1926 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 1927 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) 1928 { 1929 spinlock_t *old_ptl, *new_ptl; 1930 pmd_t pmd; 1931 struct mm_struct *mm = vma->vm_mm; 1932 bool force_flush = false; 1933 1934 /* 1935 * The destination pmd shouldn't be established, free_pgtables() 1936 * should have released it; but move_page_tables() might have already 1937 * inserted a page table, if racing against shmem/file collapse. 1938 */ 1939 if (!pmd_none(*new_pmd)) { 1940 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1941 return false; 1942 } 1943 1944 /* 1945 * We don't have to worry about the ordering of src and dst 1946 * ptlocks because exclusive mmap_lock prevents deadlock. 1947 */ 1948 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 1949 if (old_ptl) { 1950 new_ptl = pmd_lockptr(mm, new_pmd); 1951 if (new_ptl != old_ptl) 1952 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1953 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 1954 if (pmd_present(pmd)) 1955 force_flush = true; 1956 VM_BUG_ON(!pmd_none(*new_pmd)); 1957 1958 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 1959 pgtable_t pgtable; 1960 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1961 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1962 } 1963 pmd = move_soft_dirty_pmd(pmd); 1964 set_pmd_at(mm, new_addr, new_pmd, pmd); 1965 if (force_flush) 1966 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 1967 if (new_ptl != old_ptl) 1968 spin_unlock(new_ptl); 1969 spin_unlock(old_ptl); 1970 return true; 1971 } 1972 return false; 1973 } 1974 1975 /* 1976 * Returns 1977 * - 0 if PMD could not be locked 1978 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary 1979 * or if prot_numa but THP migration is not supported 1980 * - HPAGE_PMD_NR if protections changed and TLB flush necessary 1981 */ 1982 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1983 pmd_t *pmd, unsigned long addr, pgprot_t newprot, 1984 unsigned long cp_flags) 1985 { 1986 struct mm_struct *mm = vma->vm_mm; 1987 spinlock_t *ptl; 1988 pmd_t oldpmd, entry; 1989 bool prot_numa = cp_flags & MM_CP_PROT_NUMA; 1990 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 1991 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 1992 int ret = 1; 1993 1994 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1995 1996 if (prot_numa && !thp_migration_supported()) 1997 return 1; 1998 1999 ptl = __pmd_trans_huge_lock(pmd, vma); 2000 if (!ptl) 2001 return 0; 2002 2003 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2004 if (is_swap_pmd(*pmd)) { 2005 swp_entry_t entry = pmd_to_swp_entry(*pmd); 2006 struct folio *folio = pfn_swap_entry_folio(entry); 2007 pmd_t newpmd; 2008 2009 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 2010 if (is_writable_migration_entry(entry)) { 2011 /* 2012 * A protection check is difficult so 2013 * just be safe and disable write 2014 */ 2015 if (folio_test_anon(folio)) 2016 entry = make_readable_exclusive_migration_entry(swp_offset(entry)); 2017 else 2018 entry = make_readable_migration_entry(swp_offset(entry)); 2019 newpmd = swp_entry_to_pmd(entry); 2020 if (pmd_swp_soft_dirty(*pmd)) 2021 
newpmd = pmd_swp_mksoft_dirty(newpmd); 2022 } else { 2023 newpmd = *pmd; 2024 } 2025 2026 if (uffd_wp) 2027 newpmd = pmd_swp_mkuffd_wp(newpmd); 2028 else if (uffd_wp_resolve) 2029 newpmd = pmd_swp_clear_uffd_wp(newpmd); 2030 if (!pmd_same(*pmd, newpmd)) 2031 set_pmd_at(mm, addr, pmd, newpmd); 2032 goto unlock; 2033 } 2034 #endif 2035 2036 if (prot_numa) { 2037 struct folio *folio; 2038 bool toptier; 2039 /* 2040 * Avoid trapping faults against the zero page. The read-only 2041 * data is likely to be read-cached on the local CPU and 2042 * local/remote hits to the zero page are not interesting. 2043 */ 2044 if (is_huge_zero_pmd(*pmd)) 2045 goto unlock; 2046 2047 if (pmd_protnone(*pmd)) 2048 goto unlock; 2049 2050 folio = pmd_folio(*pmd); 2051 toptier = node_is_toptier(folio_nid(folio)); 2052 /* 2053 * Skip scanning top tier node if normal numa 2054 * balancing is disabled 2055 */ 2056 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 2057 toptier) 2058 goto unlock; 2059 2060 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && 2061 !toptier) 2062 folio_xchg_access_time(folio, 2063 jiffies_to_msecs(jiffies)); 2064 } 2065 /* 2066 * In case prot_numa, we are under mmap_read_lock(mm). It's critical 2067 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 2068 * which is also under mmap_read_lock(mm): 2069 * 2070 * CPU0: CPU1: 2071 * change_huge_pmd(prot_numa=1) 2072 * pmdp_huge_get_and_clear_notify() 2073 * madvise_dontneed() 2074 * zap_pmd_range() 2075 * pmd_trans_huge(*pmd) == 0 (without ptl) 2076 * // skip the pmd 2077 * set_pmd_at(); 2078 * // pmd is re-established 2079 * 2080 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 2081 * which may break userspace. 2082 * 2083 * pmdp_invalidate_ad() is required to make sure we don't miss 2084 * dirty/young flags set by hardware. 2085 */ 2086 oldpmd = pmdp_invalidate_ad(vma, addr, pmd); 2087 2088 entry = pmd_modify(oldpmd, newprot); 2089 if (uffd_wp) 2090 entry = pmd_mkuffd_wp(entry); 2091 else if (uffd_wp_resolve) 2092 /* 2093 * Leave the write bit to be handled by PF interrupt 2094 * handler, then things like COW could be properly 2095 * handled. 2096 */ 2097 entry = pmd_clear_uffd_wp(entry); 2098 2099 /* See change_pte_range(). */ 2100 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && 2101 can_change_pmd_writable(vma, addr, entry)) 2102 entry = pmd_mkwrite(entry, vma); 2103 2104 ret = HPAGE_PMD_NR; 2105 set_pmd_at(mm, addr, pmd, entry); 2106 2107 if (huge_pmd_needs_flush(oldpmd, entry)) 2108 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); 2109 unlock: 2110 spin_unlock(ptl); 2111 return ret; 2112 } 2113 2114 #ifdef CONFIG_USERFAULTFD 2115 /* 2116 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by 2117 * the caller, but it must return after releasing the page_table_lock. 2118 * Just move the page from src_pmd to dst_pmd if possible. 2119 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be 2120 * repeated by the caller, or other errors in case of failure. 
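 *
 * Hedged caller sketch (illustration only; the in-tree caller is the
 * UFFDIO_MOVE path in mm/userfaultfd.c, which also observes the locking
 * rules described above):
 *
 *	dst_pmdval = pmdp_get_lockless(dst_pmd);
 *	...take the src PT lock and the VMA locks...
 *	err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, dst_pmdval,
 *				  dst_vma, src_vma, dst_addr, src_addr);
 *	if (err == -EAGAIN)
 *		...drop the locks and retry...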
2121 */ 2122 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 2123 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 2124 unsigned long dst_addr, unsigned long src_addr) 2125 { 2126 pmd_t _dst_pmd, src_pmdval; 2127 struct page *src_page; 2128 struct folio *src_folio; 2129 struct anon_vma *src_anon_vma; 2130 spinlock_t *src_ptl, *dst_ptl; 2131 pgtable_t src_pgtable; 2132 struct mmu_notifier_range range; 2133 int err = 0; 2134 2135 src_pmdval = *src_pmd; 2136 src_ptl = pmd_lockptr(mm, src_pmd); 2137 2138 lockdep_assert_held(src_ptl); 2139 vma_assert_locked(src_vma); 2140 vma_assert_locked(dst_vma); 2141 2142 /* Sanity checks before the operation */ 2143 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || 2144 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { 2145 spin_unlock(src_ptl); 2146 return -EINVAL; 2147 } 2148 2149 if (!pmd_trans_huge(src_pmdval)) { 2150 spin_unlock(src_ptl); 2151 if (is_pmd_migration_entry(src_pmdval)) { 2152 pmd_migration_entry_wait(mm, &src_pmdval); 2153 return -EAGAIN; 2154 } 2155 return -ENOENT; 2156 } 2157 2158 src_page = pmd_page(src_pmdval); 2159 2160 if (!is_huge_zero_pmd(src_pmdval)) { 2161 if (unlikely(!PageAnonExclusive(src_page))) { 2162 spin_unlock(src_ptl); 2163 return -EBUSY; 2164 } 2165 2166 src_folio = page_folio(src_page); 2167 folio_get(src_folio); 2168 } else 2169 src_folio = NULL; 2170 2171 spin_unlock(src_ptl); 2172 2173 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); 2174 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, 2175 src_addr + HPAGE_PMD_SIZE); 2176 mmu_notifier_invalidate_range_start(&range); 2177 2178 if (src_folio) { 2179 folio_lock(src_folio); 2180 2181 /* 2182 * split_huge_page walks the anon_vma chain without the page 2183 * lock. Serialize against it with the anon_vma lock, the page 2184 * lock is not enough. 2185 */ 2186 src_anon_vma = folio_get_anon_vma(src_folio); 2187 if (!src_anon_vma) { 2188 err = -EAGAIN; 2189 goto unlock_folio; 2190 } 2191 anon_vma_lock_write(src_anon_vma); 2192 } else 2193 src_anon_vma = NULL; 2194 2195 dst_ptl = pmd_lockptr(mm, dst_pmd); 2196 double_pt_lock(src_ptl, dst_ptl); 2197 if (unlikely(!pmd_same(*src_pmd, src_pmdval) || 2198 !pmd_same(*dst_pmd, dst_pmdval))) { 2199 err = -EAGAIN; 2200 goto unlock_ptls; 2201 } 2202 if (src_folio) { 2203 if (folio_maybe_dma_pinned(src_folio) || 2204 !PageAnonExclusive(&src_folio->page)) { 2205 err = -EBUSY; 2206 goto unlock_ptls; 2207 } 2208 2209 if (WARN_ON_ONCE(!folio_test_head(src_folio)) || 2210 WARN_ON_ONCE(!folio_test_anon(src_folio))) { 2211 err = -EBUSY; 2212 goto unlock_ptls; 2213 } 2214 2215 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2216 /* Folio got pinned from under us. Put it back and fail the move. 
*/ 2217 if (folio_maybe_dma_pinned(src_folio)) { 2218 set_pmd_at(mm, src_addr, src_pmd, src_pmdval); 2219 err = -EBUSY; 2220 goto unlock_ptls; 2221 } 2222 2223 folio_move_anon_rmap(src_folio, dst_vma); 2224 src_folio->index = linear_page_index(dst_vma, dst_addr); 2225 2226 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); 2227 /* Follow mremap() behavior and treat the entry dirty after the move */ 2228 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); 2229 } else { 2230 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2231 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); 2232 } 2233 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); 2234 2235 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); 2236 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2237 unlock_ptls: 2238 double_pt_unlock(src_ptl, dst_ptl); 2239 if (src_anon_vma) { 2240 anon_vma_unlock_write(src_anon_vma); 2241 put_anon_vma(src_anon_vma); 2242 } 2243 unlock_folio: 2244 /* unblock rmap walks */ 2245 if (src_folio) 2246 folio_unlock(src_folio); 2247 mmu_notifier_invalidate_range_end(&range); 2248 if (src_folio) 2249 folio_put(src_folio); 2250 return err; 2251 } 2252 #endif /* CONFIG_USERFAULTFD */ 2253 2254 /* 2255 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2256 * 2257 * Note that if it returns page table lock pointer, this routine returns without 2258 * unlocking page table lock. So callers must unlock it. 2259 */ 2260 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2261 { 2262 spinlock_t *ptl; 2263 ptl = pmd_lock(vma->vm_mm, pmd); 2264 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2265 pmd_devmap(*pmd))) 2266 return ptl; 2267 spin_unlock(ptl); 2268 return NULL; 2269 } 2270 2271 /* 2272 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 2273 * 2274 * Note that if it returns page table lock pointer, this routine returns without 2275 * unlocking page table lock. So callers must unlock it. 
2276 */ 2277 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2278 { 2279 spinlock_t *ptl; 2280 2281 ptl = pud_lock(vma->vm_mm, pud); 2282 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2283 return ptl; 2284 spin_unlock(ptl); 2285 return NULL; 2286 } 2287 2288 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2289 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2290 pud_t *pud, unsigned long addr) 2291 { 2292 spinlock_t *ptl; 2293 2294 ptl = __pud_trans_huge_lock(pud, vma); 2295 if (!ptl) 2296 return 0; 2297 2298 pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 2299 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2300 if (vma_is_special_huge(vma)) { 2301 spin_unlock(ptl); 2302 /* No zero page support yet */ 2303 } else { 2304 /* No support for anonymous PUD pages yet */ 2305 BUG(); 2306 } 2307 return 1; 2308 } 2309 2310 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2311 unsigned long haddr) 2312 { 2313 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2314 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2315 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2316 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2317 2318 count_vm_event(THP_SPLIT_PUD); 2319 2320 pudp_huge_clear_flush(vma, haddr, pud); 2321 } 2322 2323 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2324 unsigned long address) 2325 { 2326 spinlock_t *ptl; 2327 struct mmu_notifier_range range; 2328 2329 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2330 address & HPAGE_PUD_MASK, 2331 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2332 mmu_notifier_invalidate_range_start(&range); 2333 ptl = pud_lock(vma->vm_mm, pud); 2334 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2335 goto out; 2336 __split_huge_pud_locked(vma, pud, range.start); 2337 2338 out: 2339 spin_unlock(ptl); 2340 mmu_notifier_invalidate_range_end(&range); 2341 } 2342 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2343 2344 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2345 unsigned long haddr, pmd_t *pmd) 2346 { 2347 struct mm_struct *mm = vma->vm_mm; 2348 pgtable_t pgtable; 2349 pmd_t _pmd, old_pmd; 2350 unsigned long addr; 2351 pte_t *pte; 2352 int i; 2353 2354 /* 2355 * Leave pmd empty until pte is filled note that it is fine to delay 2356 * notification until mmu_notifier_invalidate_range_end() as we are 2357 * replacing a zero pmd write protected page with a zero pte write 2358 * protected page. 
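 * In other words, no secondary MMU can be left with a stale writable
 * translation: both the old and the new entries map the read-only
 * zero page.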
2359 * 2360 * See Documentation/mm/mmu_notifier.rst 2361 */ 2362 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2363 2364 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2365 pmd_populate(mm, &_pmd, pgtable); 2366 2367 pte = pte_offset_map(&_pmd, haddr); 2368 VM_BUG_ON(!pte); 2369 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2370 pte_t entry; 2371 2372 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); 2373 entry = pte_mkspecial(entry); 2374 if (pmd_uffd_wp(old_pmd)) 2375 entry = pte_mkuffd_wp(entry); 2376 VM_BUG_ON(!pte_none(ptep_get(pte))); 2377 set_pte_at(mm, addr, pte, entry); 2378 pte++; 2379 } 2380 pte_unmap(pte - 1); 2381 smp_wmb(); /* make pte visible before pmd */ 2382 pmd_populate(mm, pmd, pgtable); 2383 } 2384 2385 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2386 unsigned long haddr, bool freeze) 2387 { 2388 struct mm_struct *mm = vma->vm_mm; 2389 struct folio *folio; 2390 struct page *page; 2391 pgtable_t pgtable; 2392 pmd_t old_pmd, _pmd; 2393 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; 2394 bool anon_exclusive = false, dirty = false; 2395 unsigned long addr; 2396 pte_t *pte; 2397 int i; 2398 2399 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2400 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2401 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2402 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2403 && !pmd_devmap(*pmd)); 2404 2405 count_vm_event(THP_SPLIT_PMD); 2406 2407 if (!vma_is_anonymous(vma)) { 2408 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); 2409 /* 2410 * We are going to unmap this huge page. So 2411 * just go ahead and zap it 2412 */ 2413 if (arch_needs_pgtable_deposit()) 2414 zap_deposited_table(mm, pmd); 2415 if (vma_is_special_huge(vma)) 2416 return; 2417 if (unlikely(is_pmd_migration_entry(old_pmd))) { 2418 swp_entry_t entry; 2419 2420 entry = pmd_to_swp_entry(old_pmd); 2421 folio = pfn_swap_entry_folio(entry); 2422 } else { 2423 page = pmd_page(old_pmd); 2424 folio = page_folio(page); 2425 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) 2426 folio_mark_dirty(folio); 2427 if (!folio_test_referenced(folio) && pmd_young(old_pmd)) 2428 folio_set_referenced(folio); 2429 folio_remove_rmap_pmd(folio, page, vma); 2430 folio_put(folio); 2431 } 2432 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); 2433 return; 2434 } 2435 2436 if (is_huge_zero_pmd(*pmd)) { 2437 /* 2438 * FIXME: Do we want to invalidate secondary mmu by calling 2439 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below 2440 * inside __split_huge_pmd() ? 2441 * 2442 * We are going from a zero huge page write protected to zero 2443 * small page also write protected so it does not seems useful 2444 * to invalidate secondary mmu at this time. 
2445 */ 2446 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2447 } 2448 2449 pmd_migration = is_pmd_migration_entry(*pmd); 2450 if (unlikely(pmd_migration)) { 2451 swp_entry_t entry; 2452 2453 old_pmd = *pmd; 2454 entry = pmd_to_swp_entry(old_pmd); 2455 page = pfn_swap_entry_to_page(entry); 2456 write = is_writable_migration_entry(entry); 2457 if (PageAnon(page)) 2458 anon_exclusive = is_readable_exclusive_migration_entry(entry); 2459 young = is_migration_entry_young(entry); 2460 dirty = is_migration_entry_dirty(entry); 2461 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2462 uffd_wp = pmd_swp_uffd_wp(old_pmd); 2463 } else { 2464 /* 2465 * Up to this point the pmd is present and huge and userland has 2466 * the whole access to the hugepage during the split (which 2467 * happens in place). If we overwrite the pmd with the not-huge 2468 * version pointing to the pte here (which of course we could if 2469 * all CPUs were bug free), userland could trigger a small page 2470 * size TLB miss on the small sized TLB while the hugepage TLB 2471 * entry is still established in the huge TLB. Some CPU doesn't 2472 * like that. See 2473 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum 2474 * 383 on page 105. Intel should be safe but is also warns that 2475 * it's only safe if the permission and cache attributes of the 2476 * two entries loaded in the two TLB is identical (which should 2477 * be the case here). But it is generally safer to never allow 2478 * small and huge TLB entries for the same virtual address to be 2479 * loaded simultaneously. So instead of doing "pmd_populate(); 2480 * flush_pmd_tlb_range();" we first mark the current pmd 2481 * notpresent (atomically because here the pmd_trans_huge must 2482 * remain set at all times on the pmd until the split is 2483 * complete for this pmd), then we flush the SMP TLB and finally 2484 * we write the non-huge version of the pmd entry with 2485 * pmd_populate. 2486 */ 2487 old_pmd = pmdp_invalidate(vma, haddr, pmd); 2488 page = pmd_page(old_pmd); 2489 folio = page_folio(page); 2490 if (pmd_dirty(old_pmd)) { 2491 dirty = true; 2492 folio_set_dirty(folio); 2493 } 2494 write = pmd_write(old_pmd); 2495 young = pmd_young(old_pmd); 2496 soft_dirty = pmd_soft_dirty(old_pmd); 2497 uffd_wp = pmd_uffd_wp(old_pmd); 2498 2499 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); 2500 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); 2501 2502 /* 2503 * Without "freeze", we'll simply split the PMD, propagating the 2504 * PageAnonExclusive() flag for each PTE by setting it for 2505 * each subpage -- no need to (temporarily) clear. 2506 * 2507 * With "freeze" we want to replace mapped pages by 2508 * migration entries right away. This is only possible if we 2509 * managed to clear PageAnonExclusive() -- see 2510 * set_pmd_migration_entry(). 2511 * 2512 * In case we cannot clear PageAnonExclusive(), split the PMD 2513 * only and let try_to_migrate_one() fail later. 2514 * 2515 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 2516 */ 2517 anon_exclusive = PageAnonExclusive(page); 2518 if (freeze && anon_exclusive && 2519 folio_try_share_anon_rmap_pmd(folio, page)) 2520 freeze = false; 2521 if (!freeze) { 2522 rmap_t rmap_flags = RMAP_NONE; 2523 2524 folio_ref_add(folio, HPAGE_PMD_NR - 1); 2525 if (anon_exclusive) 2526 rmap_flags |= RMAP_EXCLUSIVE; 2527 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, 2528 vma, haddr, rmap_flags); 2529 } 2530 } 2531 2532 /* 2533 * Withdraw the table only after we mark the pmd entry invalid. 
2534 * This's critical for some architectures (Power). 2535 */ 2536 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2537 pmd_populate(mm, &_pmd, pgtable); 2538 2539 pte = pte_offset_map(&_pmd, haddr); 2540 VM_BUG_ON(!pte); 2541 2542 /* 2543 * Note that NUMA hinting access restrictions are not transferred to 2544 * avoid any possibility of altering permissions across VMAs. 2545 */ 2546 if (freeze || pmd_migration) { 2547 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2548 pte_t entry; 2549 swp_entry_t swp_entry; 2550 2551 if (write) 2552 swp_entry = make_writable_migration_entry( 2553 page_to_pfn(page + i)); 2554 else if (anon_exclusive) 2555 swp_entry = make_readable_exclusive_migration_entry( 2556 page_to_pfn(page + i)); 2557 else 2558 swp_entry = make_readable_migration_entry( 2559 page_to_pfn(page + i)); 2560 if (young) 2561 swp_entry = make_migration_entry_young(swp_entry); 2562 if (dirty) 2563 swp_entry = make_migration_entry_dirty(swp_entry); 2564 entry = swp_entry_to_pte(swp_entry); 2565 if (soft_dirty) 2566 entry = pte_swp_mksoft_dirty(entry); 2567 if (uffd_wp) 2568 entry = pte_swp_mkuffd_wp(entry); 2569 2570 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2571 set_pte_at(mm, addr, pte + i, entry); 2572 } 2573 } else { 2574 pte_t entry; 2575 2576 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); 2577 if (write) 2578 entry = pte_mkwrite(entry, vma); 2579 if (!young) 2580 entry = pte_mkold(entry); 2581 /* NOTE: this may set soft-dirty too on some archs */ 2582 if (dirty) 2583 entry = pte_mkdirty(entry); 2584 if (soft_dirty) 2585 entry = pte_mksoft_dirty(entry); 2586 if (uffd_wp) 2587 entry = pte_mkuffd_wp(entry); 2588 2589 for (i = 0; i < HPAGE_PMD_NR; i++) 2590 VM_WARN_ON(!pte_none(ptep_get(pte + i))); 2591 2592 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); 2593 } 2594 pte_unmap(pte); 2595 2596 if (!pmd_migration) 2597 folio_remove_rmap_pmd(folio, page, vma); 2598 if (freeze) 2599 put_page(page); 2600 2601 smp_wmb(); /* make pte visible before pmd */ 2602 pmd_populate(mm, pmd, pgtable); 2603 } 2604 2605 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, 2606 pmd_t *pmd, bool freeze, struct folio *folio) 2607 { 2608 VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio)); 2609 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); 2610 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); 2611 VM_BUG_ON(freeze && !folio); 2612 2613 /* 2614 * When the caller requests to set up a migration entry, we 2615 * require a folio to check the PMD against. Otherwise, there 2616 * is a risk of replacing the wrong folio. 
2617 */ 2618 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || 2619 is_pmd_migration_entry(*pmd)) { 2620 if (folio && folio != pmd_folio(*pmd)) 2621 return; 2622 __split_huge_pmd_locked(vma, pmd, address, freeze); 2623 } 2624 } 2625 2626 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2627 unsigned long address, bool freeze, struct folio *folio) 2628 { 2629 spinlock_t *ptl; 2630 struct mmu_notifier_range range; 2631 2632 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 2633 address & HPAGE_PMD_MASK, 2634 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2635 mmu_notifier_invalidate_range_start(&range); 2636 ptl = pmd_lock(vma->vm_mm, pmd); 2637 split_huge_pmd_locked(vma, range.start, pmd, freeze, folio); 2638 spin_unlock(ptl); 2639 mmu_notifier_invalidate_range_end(&range); 2640 } 2641 2642 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2643 bool freeze, struct folio *folio) 2644 { 2645 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); 2646 2647 if (!pmd) 2648 return; 2649 2650 __split_huge_pmd(vma, pmd, address, freeze, folio); 2651 } 2652 2653 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) 2654 { 2655 /* 2656 * If the new address isn't hpage aligned and it could previously 2657 * contain an hugepage: check if we need to split an huge pmd. 2658 */ 2659 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && 2660 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), 2661 ALIGN(address, HPAGE_PMD_SIZE))) 2662 split_huge_pmd_address(vma, address, false, NULL); 2663 } 2664 2665 void vma_adjust_trans_huge(struct vm_area_struct *vma, 2666 unsigned long start, 2667 unsigned long end, 2668 long adjust_next) 2669 { 2670 /* Check if we need to split start first. */ 2671 split_huge_pmd_if_needed(vma, start); 2672 2673 /* Check if we need to split end next. */ 2674 split_huge_pmd_if_needed(vma, end); 2675 2676 /* 2677 * If we're also updating the next vma vm_start, 2678 * check if we need to split it. 2679 */ 2680 if (adjust_next > 0) { 2681 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); 2682 unsigned long nstart = next->vm_start; 2683 nstart += adjust_next; 2684 split_huge_pmd_if_needed(next, nstart); 2685 } 2686 } 2687 2688 static void unmap_folio(struct folio *folio) 2689 { 2690 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | 2691 TTU_BATCH_FLUSH; 2692 2693 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 2694 2695 if (folio_test_pmd_mappable(folio)) 2696 ttu_flags |= TTU_SPLIT_HUGE_PMD; 2697 2698 /* 2699 * Anon pages need migration entries to preserve them, but file 2700 * pages can simply be left unmapped, then faulted back on demand. 2701 * If that is ever changed (perhaps for mlock), update remap_page(). 
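 * (remap_page() below already returns early for !anon folios for
 * exactly this reason.)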
2702 */ 2703 if (folio_test_anon(folio)) 2704 try_to_migrate(folio, ttu_flags); 2705 else 2706 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); 2707 2708 try_to_unmap_flush(); 2709 } 2710 2711 static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma, 2712 unsigned long addr, pmd_t *pmdp, 2713 struct folio *folio) 2714 { 2715 struct mm_struct *mm = vma->vm_mm; 2716 int ref_count, map_count; 2717 pmd_t orig_pmd = *pmdp; 2718 2719 if (folio_test_dirty(folio) || pmd_dirty(orig_pmd)) 2720 return false; 2721 2722 orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp); 2723 2724 /* 2725 * Syncing against concurrent GUP-fast: 2726 * - clear PMD; barrier; read refcount 2727 * - inc refcount; barrier; read PMD 2728 */ 2729 smp_mb(); 2730 2731 ref_count = folio_ref_count(folio); 2732 map_count = folio_mapcount(folio); 2733 2734 /* 2735 * Order reads for folio refcount and dirty flag 2736 * (see comments in __remove_mapping()). 2737 */ 2738 smp_rmb(); 2739 2740 /* 2741 * If the folio or its PMD is redirtied at this point, or if there 2742 * are unexpected references, we will give up to discard this folio 2743 * and remap it. 2744 * 2745 * The only folio refs must be one from isolation plus the rmap(s). 2746 */ 2747 if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) || 2748 ref_count != map_count + 1) { 2749 set_pmd_at(mm, addr, pmdp, orig_pmd); 2750 return false; 2751 } 2752 2753 folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma); 2754 zap_deposited_table(mm, pmdp); 2755 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); 2756 if (vma->vm_flags & VM_LOCKED) 2757 mlock_drain_local(); 2758 folio_put(folio); 2759 2760 return true; 2761 } 2762 2763 bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, 2764 pmd_t *pmdp, struct folio *folio) 2765 { 2766 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio); 2767 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 2768 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE)); 2769 2770 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) 2771 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); 2772 2773 return false; 2774 } 2775 2776 static void remap_page(struct folio *folio, unsigned long nr) 2777 { 2778 int i = 0; 2779 2780 /* If unmap_folio() uses try_to_migrate() on file, remove this check */ 2781 if (!folio_test_anon(folio)) 2782 return; 2783 for (;;) { 2784 remove_migration_ptes(folio, folio, true); 2785 i += folio_nr_pages(folio); 2786 if (i >= nr) 2787 break; 2788 folio = folio_next(folio); 2789 } 2790 } 2791 2792 static void lru_add_page_tail(struct page *head, struct page *tail, 2793 struct lruvec *lruvec, struct list_head *list) 2794 { 2795 VM_BUG_ON_PAGE(!PageHead(head), head); 2796 VM_BUG_ON_PAGE(PageLRU(tail), head); 2797 lockdep_assert_held(&lruvec->lru_lock); 2798 2799 if (list) { 2800 /* page reclaim is reclaiming a huge page */ 2801 VM_WARN_ON(PageLRU(head)); 2802 get_page(tail); 2803 list_add_tail(&tail->lru, list); 2804 } else { 2805 /* head is still on lru (and we have it frozen) */ 2806 VM_WARN_ON(!PageLRU(head)); 2807 if (PageUnevictable(tail)) 2808 tail->mlock_count = 0; 2809 else 2810 list_add_tail(&tail->lru, &head->lru); 2811 SetPageLRU(tail); 2812 } 2813 } 2814 2815 static void __split_huge_page_tail(struct folio *folio, int tail, 2816 struct lruvec *lruvec, struct list_head *list, 2817 unsigned int new_order) 2818 { 2819 struct page *head = &folio->page; 2820 struct page *page_tail = head + tail; 2821 /* 2822 * Careful: new_folio is not a "real" folio before we cleared PageTail. 
2823 * Don't pass it around before clear_compound_head(). 2824 */ 2825 struct folio *new_folio = (struct folio *)page_tail; 2826 2827 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 2828 2829 /* 2830 * Clone page flags before unfreezing refcount. 2831 * 2832 * After successful get_page_unless_zero() might follow flags change, 2833 * for example lock_page() which set PG_waiters. 2834 * 2835 * Note that for mapped sub-pages of an anonymous THP, 2836 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in 2837 * the migration entry instead from where remap_page() will restore it. 2838 * We can still have PG_anon_exclusive set on effectively unmapped and 2839 * unreferenced sub-pages of an anonymous THP: we can simply drop 2840 * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 2841 */ 2842 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 2843 page_tail->flags |= (head->flags & 2844 ((1L << PG_referenced) | 2845 (1L << PG_swapbacked) | 2846 (1L << PG_swapcache) | 2847 (1L << PG_mlocked) | 2848 (1L << PG_uptodate) | 2849 (1L << PG_active) | 2850 (1L << PG_workingset) | 2851 (1L << PG_locked) | 2852 (1L << PG_unevictable) | 2853 #ifdef CONFIG_ARCH_USES_PG_ARCH_X 2854 (1L << PG_arch_2) | 2855 (1L << PG_arch_3) | 2856 #endif 2857 (1L << PG_dirty) | 2858 LRU_GEN_MASK | LRU_REFS_MASK)); 2859 2860 /* ->mapping in first and second tail page is replaced by other uses */ 2861 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 2862 page_tail); 2863 page_tail->mapping = head->mapping; 2864 page_tail->index = head->index + tail; 2865 2866 /* 2867 * page->private should not be set in tail pages. Fix up and warn once 2868 * if private is unexpectedly set. 2869 */ 2870 if (unlikely(page_tail->private)) { 2871 VM_WARN_ON_ONCE_PAGE(true, page_tail); 2872 page_tail->private = 0; 2873 } 2874 if (folio_test_swapcache(folio)) 2875 new_folio->swap.val = folio->swap.val + tail; 2876 2877 /* Page flags must be visible before we make the page non-compound. */ 2878 smp_wmb(); 2879 2880 /* 2881 * Clear PageTail before unfreezing page refcount. 2882 * 2883 * After successful get_page_unless_zero() might follow put_page() 2884 * which needs correct compound_head(). 2885 */ 2886 clear_compound_head(page_tail); 2887 if (new_order) { 2888 prep_compound_page(page_tail, new_order); 2889 folio_set_large_rmappable(new_folio); 2890 } 2891 2892 /* Finally unfreeze refcount. Additional reference from page cache. */ 2893 page_ref_unfreeze(page_tail, 2894 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? 2895 folio_nr_pages(new_folio) : 0)); 2896 2897 if (folio_test_young(folio)) 2898 folio_set_young(new_folio); 2899 if (folio_test_idle(folio)) 2900 folio_set_idle(new_folio); 2901 2902 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); 2903 2904 /* 2905 * always add to the tail because some iterators expect new 2906 * pages to show after the currently processed elements - e.g. 
2907 * migrate_pages 2908 */ 2909 lru_add_page_tail(head, page_tail, lruvec, list); 2910 } 2911 2912 static void __split_huge_page(struct page *page, struct list_head *list, 2913 pgoff_t end, unsigned int new_order) 2914 { 2915 struct folio *folio = page_folio(page); 2916 struct page *head = &folio->page; 2917 struct lruvec *lruvec; 2918 struct address_space *swap_cache = NULL; 2919 unsigned long offset = 0; 2920 int i, nr_dropped = 0; 2921 unsigned int new_nr = 1 << new_order; 2922 int order = folio_order(folio); 2923 unsigned int nr = 1 << order; 2924 2925 /* complete memcg works before add pages to LRU */ 2926 split_page_memcg(head, order, new_order); 2927 2928 if (folio_test_anon(folio) && folio_test_swapcache(folio)) { 2929 offset = swap_cache_index(folio->swap); 2930 swap_cache = swap_address_space(folio->swap); 2931 xa_lock(&swap_cache->i_pages); 2932 } 2933 2934 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 2935 lruvec = folio_lruvec_lock(folio); 2936 2937 ClearPageHasHWPoisoned(head); 2938 2939 for (i = nr - new_nr; i >= new_nr; i -= new_nr) { 2940 __split_huge_page_tail(folio, i, lruvec, list, new_order); 2941 /* Some pages can be beyond EOF: drop them from page cache */ 2942 if (head[i].index >= end) { 2943 struct folio *tail = page_folio(head + i); 2944 2945 if (shmem_mapping(folio->mapping)) 2946 nr_dropped++; 2947 else if (folio_test_clear_dirty(tail)) 2948 folio_account_cleaned(tail, 2949 inode_to_wb(folio->mapping->host)); 2950 __filemap_remove_folio(tail, NULL); 2951 folio_put(tail); 2952 } else if (!PageAnon(page)) { 2953 __xa_store(&folio->mapping->i_pages, head[i].index, 2954 head + i, 0); 2955 } else if (swap_cache) { 2956 __xa_store(&swap_cache->i_pages, offset + i, 2957 head + i, 0); 2958 } 2959 } 2960 2961 if (!new_order) 2962 ClearPageCompound(head); 2963 else { 2964 struct folio *new_folio = (struct folio *)head; 2965 2966 folio_set_order(new_folio, new_order); 2967 } 2968 unlock_page_lruvec(lruvec); 2969 /* Caller disabled irqs, so they are still disabled here */ 2970 2971 split_page_owner(head, order, new_order); 2972 pgalloc_tag_split(head, 1 << order); 2973 2974 /* See comment in __split_huge_page_tail() */ 2975 if (folio_test_anon(folio)) { 2976 /* Additional pin to swap cache */ 2977 if (folio_test_swapcache(folio)) { 2978 folio_ref_add(folio, 1 + new_nr); 2979 xa_unlock(&swap_cache->i_pages); 2980 } else { 2981 folio_ref_inc(folio); 2982 } 2983 } else { 2984 /* Additional pin to page cache */ 2985 folio_ref_add(folio, 1 + new_nr); 2986 xa_unlock(&folio->mapping->i_pages); 2987 } 2988 local_irq_enable(); 2989 2990 if (nr_dropped) 2991 shmem_uncharge(folio->mapping->host, nr_dropped); 2992 remap_page(folio, nr); 2993 2994 /* 2995 * set page to its compound_head when split to non order-0 pages, so 2996 * we can skip unlocking it below, since PG_locked is transferred to 2997 * the compound_head of the page and the caller will unlock it. 2998 */ 2999 if (new_order) 3000 page = compound_head(page); 3001 3002 for (i = 0; i < nr; i += new_nr) { 3003 struct page *subpage = head + i; 3004 struct folio *new_folio = page_folio(subpage); 3005 if (subpage == page) 3006 continue; 3007 folio_unlock(new_folio); 3008 3009 /* 3010 * Subpages may be freed if there wasn't any mapping 3011 * like if add_to_swap() is running on a lru page that 3012 * had its mapping zapped. And freeing these pages 3013 * requires taking the lru_lock so we do the put_page 3014 * of the tail pages after the split is complete. 
3015 */ 3016 free_page_and_swap_cache(subpage); 3017 } 3018 } 3019 3020 /* Racy check whether the huge page can be split */ 3021 bool can_split_folio(struct folio *folio, int *pextra_pins) 3022 { 3023 int extra_pins; 3024 3025 /* Additional pins from page cache */ 3026 if (folio_test_anon(folio)) 3027 extra_pins = folio_test_swapcache(folio) ? 3028 folio_nr_pages(folio) : 0; 3029 else 3030 extra_pins = folio_nr_pages(folio); 3031 if (pextra_pins) 3032 *pextra_pins = extra_pins; 3033 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; 3034 } 3035 3036 /* 3037 * This function splits a large folio into smaller folios of order @new_order. 3038 * @page can point to any page of the large folio to split. The split operation 3039 * does not change the position of @page. 3040 * 3041 * Prerequisites: 3042 * 3043 * 1) The caller must hold a reference on the @page's owning folio, also known 3044 * as the large folio. 3045 * 3046 * 2) The large folio must be locked. 3047 * 3048 * 3) The folio must not be pinned. Any unexpected folio references, including 3049 * GUP pins, will result in the folio not getting split; instead, the caller 3050 * will receive an -EAGAIN. 3051 * 3052 * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not 3053 * supported for non-file-backed folios, because folio->_deferred_list, which 3054 * is used by partially mapped folios, is stored in subpage 2, but an order-1 3055 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, 3056 * since they do not use _deferred_list. 3057 * 3058 * After splitting, the caller's folio reference will be transferred to @page, 3059 * resulting in a raised refcount of @page after this call. The other pages may 3060 * be freed if they are not mapped. 3061 * 3062 * If @list is null, tail pages will be added to LRU list, otherwise, to @list. 3063 * 3064 * Pages in @new_order will inherit the mapping, flags, and so on from the 3065 * huge page. 3066 * 3067 * Returns 0 if the huge page was split successfully. 3068 * 3069 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if 3070 * the folio was concurrently removed from the page cache. 3071 * 3072 * Returns -EBUSY when trying to split the huge zeropage, if the folio is 3073 * under writeback, if fs-specific folio metadata cannot currently be 3074 * released, or if some unexpected race happened (e.g., anon VMA disappeared, 3075 * truncation). 3076 * 3077 * Returns -EINVAL when trying to split to an order that is incompatible 3078 * with the folio. Splitting to order 0 is compatible with all folios. 3079 */ 3080 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 3081 unsigned int new_order) 3082 { 3083 struct folio *folio = page_folio(page); 3084 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3085 /* reset xarray order to new order after split */ 3086 XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); 3087 struct anon_vma *anon_vma = NULL; 3088 struct address_space *mapping = NULL; 3089 int order = folio_order(folio); 3090 int extra_pins, ret; 3091 pgoff_t end; 3092 bool is_hzp; 3093 3094 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 3095 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 3096 3097 if (new_order >= folio_order(folio)) 3098 return -EINVAL; 3099 3100 if (folio_test_anon(folio)) { 3101 /* order-1 is not supported for anonymous THP. 
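 * (folio->_deferred_list lives in the second tail page, which an
 * order-1 folio does not have; see the function comment above.)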
*/ 3102 if (new_order == 1) { 3103 VM_WARN_ONCE(1, "Cannot split to order-1 folio"); 3104 return -EINVAL; 3105 } 3106 } else if (new_order) { 3107 /* Split shmem folio to non-zero order not supported */ 3108 if (shmem_mapping(folio->mapping)) { 3109 VM_WARN_ONCE(1, 3110 "Cannot split shmem folio to non-0 order"); 3111 return -EINVAL; 3112 } 3113 /* 3114 * No split if the file system does not support large folio. 3115 * Note that we might still have THPs in such mappings due to 3116 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping 3117 * does not actually support large folios properly. 3118 */ 3119 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3120 !mapping_large_folio_support(folio->mapping)) { 3121 VM_WARN_ONCE(1, 3122 "Cannot split file folio to non-0 order"); 3123 return -EINVAL; 3124 } 3125 } 3126 3127 /* Only swapping a whole PMD-mapped folio is supported */ 3128 if (folio_test_swapcache(folio) && new_order) 3129 return -EINVAL; 3130 3131 is_hzp = is_huge_zero_folio(folio); 3132 if (is_hzp) { 3133 pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); 3134 return -EBUSY; 3135 } 3136 3137 if (folio_test_writeback(folio)) 3138 return -EBUSY; 3139 3140 if (folio_test_anon(folio)) { 3141 /* 3142 * The caller does not necessarily hold an mmap_lock that would 3143 * prevent the anon_vma disappearing so we first we take a 3144 * reference to it and then lock the anon_vma for write. This 3145 * is similar to folio_lock_anon_vma_read except the write lock 3146 * is taken to serialise against parallel split or collapse 3147 * operations. 3148 */ 3149 anon_vma = folio_get_anon_vma(folio); 3150 if (!anon_vma) { 3151 ret = -EBUSY; 3152 goto out; 3153 } 3154 end = -1; 3155 mapping = NULL; 3156 anon_vma_lock_write(anon_vma); 3157 } else { 3158 gfp_t gfp; 3159 3160 mapping = folio->mapping; 3161 3162 /* Truncated ? */ 3163 if (!mapping) { 3164 ret = -EBUSY; 3165 goto out; 3166 } 3167 3168 gfp = current_gfp_context(mapping_gfp_mask(mapping) & 3169 GFP_RECLAIM_MASK); 3170 3171 if (!filemap_release_folio(folio, gfp)) { 3172 ret = -EBUSY; 3173 goto out; 3174 } 3175 3176 xas_split_alloc(&xas, folio, folio_order(folio), gfp); 3177 if (xas_error(&xas)) { 3178 ret = xas_error(&xas); 3179 goto out; 3180 } 3181 3182 anon_vma = NULL; 3183 i_mmap_lock_read(mapping); 3184 3185 /* 3186 *__split_huge_page() may need to trim off pages beyond EOF: 3187 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 3188 * which cannot be nested inside the page tree lock. So note 3189 * end now: i_size itself may be changed at any moment, but 3190 * folio lock is good enough to serialize the trimming. 3191 */ 3192 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3193 if (shmem_mapping(mapping)) 3194 end = shmem_fallocend(mapping->host, end); 3195 } 3196 3197 /* 3198 * Racy check if we can split the page, before unmap_folio() will 3199 * split PMDs 3200 */ 3201 if (!can_split_folio(folio, &extra_pins)) { 3202 ret = -EAGAIN; 3203 goto out_unlock; 3204 } 3205 3206 unmap_folio(folio); 3207 3208 /* block interrupt reentry in xa_lock and spinlock */ 3209 local_irq_disable(); 3210 if (mapping) { 3211 /* 3212 * Check if the folio is present in page cache. 3213 * We assume all tail are present too, if folio is there. 
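 * (A large folio is stored in the page cache as a single multi-index
 * entry, so looking up the head index is sufficient.)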
3214 */ 3215 xas_lock(&xas); 3216 xas_reset(&xas); 3217 if (xas_load(&xas) != folio) 3218 goto fail; 3219 } 3220 3221 /* Prevent deferred_split_scan() touching ->_refcount */ 3222 spin_lock(&ds_queue->split_queue_lock); 3223 if (folio_ref_freeze(folio, 1 + extra_pins)) { 3224 if (folio_order(folio) > 1 && 3225 !list_empty(&folio->_deferred_list)) { 3226 ds_queue->split_queue_len--; 3227 /* 3228 * Reinitialize page_deferred_list after removing the 3229 * page from the split_queue, otherwise a subsequent 3230 * split will see list corruption when checking the 3231 * page_deferred_list. 3232 */ 3233 list_del_init(&folio->_deferred_list); 3234 } 3235 spin_unlock(&ds_queue->split_queue_lock); 3236 if (mapping) { 3237 int nr = folio_nr_pages(folio); 3238 3239 xas_split(&xas, folio, folio_order(folio)); 3240 if (folio_test_pmd_mappable(folio) && 3241 new_order < HPAGE_PMD_ORDER) { 3242 if (folio_test_swapbacked(folio)) { 3243 __lruvec_stat_mod_folio(folio, 3244 NR_SHMEM_THPS, -nr); 3245 } else { 3246 __lruvec_stat_mod_folio(folio, 3247 NR_FILE_THPS, -nr); 3248 filemap_nr_thps_dec(mapping); 3249 } 3250 } 3251 } 3252 3253 __split_huge_page(page, list, end, new_order); 3254 ret = 0; 3255 } else { 3256 spin_unlock(&ds_queue->split_queue_lock); 3257 fail: 3258 if (mapping) 3259 xas_unlock(&xas); 3260 local_irq_enable(); 3261 remap_page(folio, folio_nr_pages(folio)); 3262 ret = -EAGAIN; 3263 } 3264 3265 out_unlock: 3266 if (anon_vma) { 3267 anon_vma_unlock_write(anon_vma); 3268 put_anon_vma(anon_vma); 3269 } 3270 if (mapping) 3271 i_mmap_unlock_read(mapping); 3272 out: 3273 xas_destroy(&xas); 3274 if (order == HPAGE_PMD_ORDER) 3275 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3276 count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); 3277 return ret; 3278 } 3279 3280 void __folio_undo_large_rmappable(struct folio *folio) 3281 { 3282 struct deferred_split *ds_queue; 3283 unsigned long flags; 3284 3285 ds_queue = get_deferred_split_queue(folio); 3286 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3287 if (!list_empty(&folio->_deferred_list)) { 3288 ds_queue->split_queue_len--; 3289 list_del_init(&folio->_deferred_list); 3290 } 3291 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3292 } 3293 3294 void deferred_split_folio(struct folio *folio) 3295 { 3296 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3297 #ifdef CONFIG_MEMCG 3298 struct mem_cgroup *memcg = folio_memcg(folio); 3299 #endif 3300 unsigned long flags; 3301 3302 /* 3303 * Order 1 folios have no space for a deferred list, but we also 3304 * won't waste much memory by not adding them to the deferred list. 3305 */ 3306 if (folio_order(folio) <= 1) 3307 return; 3308 3309 /* 3310 * The try_to_unmap() in page reclaim path might reach here too, 3311 * this may cause a race condition to corrupt deferred split queue. 3312 * And, if page reclaim is already handling the same folio, it is 3313 * unnecessary to handle it again in shrinker. 3314 * 3315 * Check the swapcache flag to determine if the folio is being 3316 * handled by page reclaim since THP swap would add the folio into 3317 * swap cache before calling try_to_unmap(). 
3318 */ 3319 if (folio_test_swapcache(folio)) 3320 return; 3321 3322 if (!list_empty(&folio->_deferred_list)) 3323 return; 3324 3325 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3326 if (list_empty(&folio->_deferred_list)) { 3327 if (folio_test_pmd_mappable(folio)) 3328 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 3329 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); 3330 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); 3331 ds_queue->split_queue_len++; 3332 #ifdef CONFIG_MEMCG 3333 if (memcg) 3334 set_shrinker_bit(memcg, folio_nid(folio), 3335 deferred_split_shrinker->id); 3336 #endif 3337 } 3338 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3339 } 3340 3341 static unsigned long deferred_split_count(struct shrinker *shrink, 3342 struct shrink_control *sc) 3343 { 3344 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3345 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 3346 3347 #ifdef CONFIG_MEMCG 3348 if (sc->memcg) 3349 ds_queue = &sc->memcg->deferred_split_queue; 3350 #endif 3351 return READ_ONCE(ds_queue->split_queue_len); 3352 } 3353 3354 static unsigned long deferred_split_scan(struct shrinker *shrink, 3355 struct shrink_control *sc) 3356 { 3357 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3358 struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 3359 unsigned long flags; 3360 LIST_HEAD(list); 3361 struct folio *folio, *next; 3362 int split = 0; 3363 3364 #ifdef CONFIG_MEMCG 3365 if (sc->memcg) 3366 ds_queue = &sc->memcg->deferred_split_queue; 3367 #endif 3368 3369 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3370 /* Take pin on all head pages to avoid freeing them under us */ 3371 list_for_each_entry_safe(folio, next, &ds_queue->split_queue, 3372 _deferred_list) { 3373 if (folio_try_get(folio)) { 3374 list_move(&folio->_deferred_list, &list); 3375 } else { 3376 /* We lost race with folio_put() */ 3377 list_del_init(&folio->_deferred_list); 3378 ds_queue->split_queue_len--; 3379 } 3380 if (!--sc->nr_to_scan) 3381 break; 3382 } 3383 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3384 3385 list_for_each_entry_safe(folio, next, &list, _deferred_list) { 3386 if (!folio_trylock(folio)) 3387 goto next; 3388 /* split_huge_page() removes page from list on success */ 3389 if (!split_folio(folio)) 3390 split++; 3391 folio_unlock(folio); 3392 next: 3393 folio_put(folio); 3394 } 3395 3396 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 3397 list_splice_tail(&list, &ds_queue->split_queue); 3398 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3399 3400 /* 3401 * Stop shrinker if we didn't split any page, but the queue is empty. 3402 * This can happen if pages were freed under us. 
3403 */ 3404 if (!split && list_empty(&ds_queue->split_queue)) 3405 return SHRINK_STOP; 3406 return split; 3407 } 3408 3409 #ifdef CONFIG_DEBUG_FS 3410 static void split_huge_pages_all(void) 3411 { 3412 struct zone *zone; 3413 struct page *page; 3414 struct folio *folio; 3415 unsigned long pfn, max_zone_pfn; 3416 unsigned long total = 0, split = 0; 3417 3418 pr_debug("Split all THPs\n"); 3419 for_each_zone(zone) { 3420 if (!managed_zone(zone)) 3421 continue; 3422 max_zone_pfn = zone_end_pfn(zone); 3423 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 3424 int nr_pages; 3425 3426 page = pfn_to_online_page(pfn); 3427 if (!page || PageTail(page)) 3428 continue; 3429 folio = page_folio(page); 3430 if (!folio_try_get(folio)) 3431 continue; 3432 3433 if (unlikely(page_folio(page) != folio)) 3434 goto next; 3435 3436 if (zone != folio_zone(folio)) 3437 goto next; 3438 3439 if (!folio_test_large(folio) 3440 || folio_test_hugetlb(folio) 3441 || !folio_test_lru(folio)) 3442 goto next; 3443 3444 total++; 3445 folio_lock(folio); 3446 nr_pages = folio_nr_pages(folio); 3447 if (!split_folio(folio)) 3448 split++; 3449 pfn += nr_pages - 1; 3450 folio_unlock(folio); 3451 next: 3452 folio_put(folio); 3453 cond_resched(); 3454 } 3455 } 3456 3457 pr_debug("%lu of %lu THP split\n", split, total); 3458 } 3459 3460 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) 3461 { 3462 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || 3463 is_vm_hugetlb_page(vma); 3464 } 3465 3466 static int split_huge_pages_pid(int pid, unsigned long vaddr_start, 3467 unsigned long vaddr_end, unsigned int new_order) 3468 { 3469 int ret = 0; 3470 struct task_struct *task; 3471 struct mm_struct *mm; 3472 unsigned long total = 0, split = 0; 3473 unsigned long addr; 3474 3475 vaddr_start &= PAGE_MASK; 3476 vaddr_end &= PAGE_MASK; 3477 3478 /* Find the task_struct from pid */ 3479 rcu_read_lock(); 3480 task = find_task_by_vpid(pid); 3481 if (!task) { 3482 rcu_read_unlock(); 3483 ret = -ESRCH; 3484 goto out; 3485 } 3486 get_task_struct(task); 3487 rcu_read_unlock(); 3488 3489 /* Find the mm_struct */ 3490 mm = get_task_mm(task); 3491 put_task_struct(task); 3492 3493 if (!mm) { 3494 ret = -EINVAL; 3495 goto out; 3496 } 3497 3498 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", 3499 pid, vaddr_start, vaddr_end); 3500 3501 mmap_read_lock(mm); 3502 /* 3503 * always increase addr by PAGE_SIZE, since we could have a PTE page 3504 * table filled with PTE-mapped THPs, each of which is distinct. 3505 */ 3506 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { 3507 struct vm_area_struct *vma = vma_lookup(mm, addr); 3508 struct page *page; 3509 struct folio *folio; 3510 3511 if (!vma) 3512 break; 3513 3514 /* skip special VMA and hugetlb VMA */ 3515 if (vma_not_suitable_for_thp_split(vma)) { 3516 addr = vma->vm_end; 3517 continue; 3518 } 3519 3520 /* FOLL_DUMP to ignore special (like zero) pages */ 3521 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 3522 3523 if (IS_ERR_OR_NULL(page)) 3524 continue; 3525 3526 folio = page_folio(page); 3527 if (!is_transparent_hugepage(folio)) 3528 goto next; 3529 3530 if (new_order >= folio_order(folio)) 3531 goto next; 3532 3533 total++; 3534 /* 3535 * For folios with private, split_huge_page_to_list_to_order() 3536 * will try to drop it before split and then check if the folio 3537 * can be split or not. So skip the check here. 
3538 */ 3539 if (!folio_test_private(folio) && 3540 !can_split_folio(folio, NULL)) 3541 goto next; 3542 3543 if (!folio_trylock(folio)) 3544 goto next; 3545 3546 if (!split_folio_to_order(folio, new_order)) 3547 split++; 3548 3549 folio_unlock(folio); 3550 next: 3551 folio_put(folio); 3552 cond_resched(); 3553 } 3554 mmap_read_unlock(mm); 3555 mmput(mm); 3556 3557 pr_debug("%lu of %lu THP split\n", split, total); 3558 3559 out: 3560 return ret; 3561 } 3562 3563 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, 3564 pgoff_t off_end, unsigned int new_order) 3565 { 3566 struct filename *file; 3567 struct file *candidate; 3568 struct address_space *mapping; 3569 int ret = -EINVAL; 3570 pgoff_t index; 3571 int nr_pages = 1; 3572 unsigned long total = 0, split = 0; 3573 3574 file = getname_kernel(file_path); 3575 if (IS_ERR(file)) 3576 return ret; 3577 3578 candidate = file_open_name(file, O_RDONLY, 0); 3579 if (IS_ERR(candidate)) 3580 goto out; 3581 3582 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", 3583 file_path, off_start, off_end); 3584 3585 mapping = candidate->f_mapping; 3586 3587 for (index = off_start; index < off_end; index += nr_pages) { 3588 struct folio *folio = filemap_get_folio(mapping, index); 3589 3590 nr_pages = 1; 3591 if (IS_ERR(folio)) 3592 continue; 3593 3594 if (!folio_test_large(folio)) 3595 goto next; 3596 3597 total++; 3598 nr_pages = folio_nr_pages(folio); 3599 3600 if (new_order >= folio_order(folio)) 3601 goto next; 3602 3603 if (!folio_trylock(folio)) 3604 goto next; 3605 3606 if (!split_folio_to_order(folio, new_order)) 3607 split++; 3608 3609 folio_unlock(folio); 3610 next: 3611 folio_put(folio); 3612 cond_resched(); 3613 } 3614 3615 filp_close(candidate, NULL); 3616 ret = 0; 3617 3618 pr_debug("%lu of %lu file-backed THP split\n", split, total); 3619 out: 3620 putname(file); 3621 return ret; 3622 } 3623 3624 #define MAX_INPUT_BUF_SZ 255 3625 3626 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, 3627 size_t count, loff_t *ppops) 3628 { 3629 static DEFINE_MUTEX(split_debug_mutex); 3630 ssize_t ret; 3631 /* 3632 * hold pid, start_vaddr, end_vaddr, new_order or 3633 * file_path, off_start, off_end, new_order 3634 */ 3635 char input_buf[MAX_INPUT_BUF_SZ]; 3636 int pid; 3637 unsigned long vaddr_start, vaddr_end; 3638 unsigned int new_order = 0; 3639 3640 ret = mutex_lock_interruptible(&split_debug_mutex); 3641 if (ret) 3642 return ret; 3643 3644 ret = -EFAULT; 3645 3646 memset(input_buf, 0, MAX_INPUT_BUF_SZ); 3647 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) 3648 goto out; 3649 3650 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; 3651 3652 if (input_buf[0] == '/') { 3653 char *tok; 3654 char *buf = input_buf; 3655 char file_path[MAX_INPUT_BUF_SZ]; 3656 pgoff_t off_start = 0, off_end = 0; 3657 size_t input_len = strlen(input_buf); 3658 3659 tok = strsep(&buf, ","); 3660 if (tok) { 3661 strcpy(file_path, tok); 3662 } else { 3663 ret = -EINVAL; 3664 goto out; 3665 } 3666 3667 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); 3668 if (ret != 2 && ret != 3) { 3669 ret = -EINVAL; 3670 goto out; 3671 } 3672 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); 3673 if (!ret) 3674 ret = input_len; 3675 3676 goto out; 3677 } 3678 3679 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); 3680 if (ret == 1 && pid == 1) { 3681 split_huge_pages_all(); 3682 ret = strlen(input_buf); 3683 goto 
out; 3684 } else if (ret != 3 && ret != 4) { 3685 ret = -EINVAL; 3686 goto out; 3687 } 3688 3689 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); 3690 if (!ret) 3691 ret = strlen(input_buf); 3692 out: 3693 mutex_unlock(&split_debug_mutex); 3694 return ret; 3695 3696 } 3697 3698 static const struct file_operations split_huge_pages_fops = { 3699 .owner = THIS_MODULE, 3700 .write = split_huge_pages_write, 3701 .llseek = no_llseek, 3702 }; 3703 3704 static int __init split_huge_pages_debugfs(void) 3705 { 3706 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 3707 &split_huge_pages_fops); 3708 return 0; 3709 } 3710 late_initcall(split_huge_pages_debugfs); 3711 #endif 3712 3713 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 3714 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 3715 struct page *page) 3716 { 3717 struct folio *folio = page_folio(page); 3718 struct vm_area_struct *vma = pvmw->vma; 3719 struct mm_struct *mm = vma->vm_mm; 3720 unsigned long address = pvmw->address; 3721 bool anon_exclusive; 3722 pmd_t pmdval; 3723 swp_entry_t entry; 3724 pmd_t pmdswp; 3725 3726 if (!(pvmw->pmd && !pvmw->pte)) 3727 return 0; 3728 3729 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 3730 pmdval = pmdp_invalidate(vma, address, pvmw->pmd); 3731 3732 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ 3733 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); 3734 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { 3735 set_pmd_at(mm, address, pvmw->pmd, pmdval); 3736 return -EBUSY; 3737 } 3738 3739 if (pmd_dirty(pmdval)) 3740 folio_mark_dirty(folio); 3741 if (pmd_write(pmdval)) 3742 entry = make_writable_migration_entry(page_to_pfn(page)); 3743 else if (anon_exclusive) 3744 entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); 3745 else 3746 entry = make_readable_migration_entry(page_to_pfn(page)); 3747 if (pmd_young(pmdval)) 3748 entry = make_migration_entry_young(entry); 3749 if (pmd_dirty(pmdval)) 3750 entry = make_migration_entry_dirty(entry); 3751 pmdswp = swp_entry_to_pmd(entry); 3752 if (pmd_soft_dirty(pmdval)) 3753 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 3754 if (pmd_uffd_wp(pmdval)) 3755 pmdswp = pmd_swp_mkuffd_wp(pmdswp); 3756 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 3757 folio_remove_rmap_pmd(folio, page, vma); 3758 folio_put(folio); 3759 trace_set_migration_pmd(address, pmd_val(pmdswp)); 3760 3761 return 0; 3762 } 3763 3764 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 3765 { 3766 struct folio *folio = page_folio(new); 3767 struct vm_area_struct *vma = pvmw->vma; 3768 struct mm_struct *mm = vma->vm_mm; 3769 unsigned long address = pvmw->address; 3770 unsigned long haddr = address & HPAGE_PMD_MASK; 3771 pmd_t pmde; 3772 swp_entry_t entry; 3773 3774 if (!(pvmw->pmd && !pvmw->pte)) 3775 return; 3776 3777 entry = pmd_to_swp_entry(*pvmw->pmd); 3778 folio_get(folio); 3779 pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); 3780 if (pmd_swp_soft_dirty(*pvmw->pmd)) 3781 pmde = pmd_mksoft_dirty(pmde); 3782 if (is_writable_migration_entry(entry)) 3783 pmde = pmd_mkwrite(pmde, vma); 3784 if (pmd_swp_uffd_wp(*pvmw->pmd)) 3785 pmde = pmd_mkuffd_wp(pmde); 3786 if (!is_migration_entry_young(entry)) 3787 pmde = pmd_mkold(pmde); 3788 /* NOTE: this may contain setting soft-dirty on some archs */ 3789 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) 3790 pmde = pmd_mkdirty(pmde); 3791 3792 if (folio_test_anon(folio)) { 3793 rmap_t rmap_flags = RMAP_NONE; 
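		/*
		 * Writable and readable-exclusive migration entries record
		 * that the page was anon-exclusive before migration; restore
		 * RMAP_EXCLUSIVE for those. A plain readable entry means the
		 * page may be shared.
		 */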
3794 3795 if (!is_readable_migration_entry(entry)) 3796 rmap_flags |= RMAP_EXCLUSIVE; 3797 3798 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); 3799 } else { 3800 folio_add_file_rmap_pmd(folio, new, vma); 3801 } 3802 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); 3803 set_pmd_at(mm, haddr, pvmw->pmd, pmde); 3804 3805 /* No need to invalidate - it was non-present before */ 3806 update_mmu_cache_pmd(vma, address, pvmw->pmd); 3807 trace_remove_migration_pmd(address, pmd_val(pmde)); 3808 } 3809 #endif 3810
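
/*
 * Hedged overview sketch (illustration only): during THP migration the
 * two helpers above pair up roughly as
 *
 *	set_pmd_migration_entry(&pvmw, &old_folio->page);	(from try_to_migrate())
 *	...copy folio contents and transfer the mapping...
 *	remove_migration_pmd(&pvmw, &new_folio->page);		(from remove_migration_ptes())
 *
 * old_folio/new_folio and the pvmw walks are named here only for
 * illustration; see mm/rmap.c and mm/migrate.c for the real call sites.
 */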