1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2009 Red Hat, Inc. 4 */ 5 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/mm.h> 9 #include <linux/sched.h> 10 #include <linux/sched/coredump.h> 11 #include <linux/sched/numa_balancing.h> 12 #include <linux/highmem.h> 13 #include <linux/hugetlb.h> 14 #include <linux/mmu_notifier.h> 15 #include <linux/rmap.h> 16 #include <linux/swap.h> 17 #include <linux/shrinker.h> 18 #include <linux/mm_inline.h> 19 #include <linux/swapops.h> 20 #include <linux/dax.h> 21 #include <linux/khugepaged.h> 22 #include <linux/freezer.h> 23 #include <linux/pfn_t.h> 24 #include <linux/mman.h> 25 #include <linux/memremap.h> 26 #include <linux/pagemap.h> 27 #include <linux/debugfs.h> 28 #include <linux/migrate.h> 29 #include <linux/hashtable.h> 30 #include <linux/userfaultfd_k.h> 31 #include <linux/page_idle.h> 32 #include <linux/shmem_fs.h> 33 #include <linux/oom.h> 34 #include <linux/numa.h> 35 36 #include <asm/tlb.h> 37 #include <asm/pgalloc.h> 38 #include "internal.h" 39 40 /* 41 * By default, transparent hugepage support is disabled in order to avoid 42 * risking an increased memory footprint for applications that are not 43 * guaranteed to benefit from it. When transparent hugepage support is 44 * enabled, it is for all mappings, and khugepaged scans all mappings. 45 * Defrag is invoked by khugepaged hugepage allocations and by page faults 46 * for all hugepage allocations. 47 */ 48 unsigned long transparent_hugepage_flags __read_mostly = 49 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 50 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 51 #endif 52 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 53 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 54 #endif 55 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| 56 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 57 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 58 59 static struct shrinker deferred_split_shrinker; 60 61 static atomic_t huge_zero_refcount; 62 struct page *huge_zero_page __read_mostly; 63 64 bool transparent_hugepage_enabled(struct vm_area_struct *vma) 65 { 66 /* The addr is used to check if the vma size fits */ 67 unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; 68 69 if (!transhuge_vma_suitable(vma, addr)) 70 return false; 71 if (vma_is_anonymous(vma)) 72 return __transparent_hugepage_enabled(vma); 73 if (vma_is_shmem(vma)) 74 return shmem_huge_enabled(vma); 75 76 return false; 77 } 78 79 static struct page *get_huge_zero_page(void) 80 { 81 struct page *zero_page; 82 retry: 83 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 84 return READ_ONCE(huge_zero_page); 85 86 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 87 HPAGE_PMD_ORDER); 88 if (!zero_page) { 89 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 90 return NULL; 91 } 92 count_vm_event(THP_ZERO_PAGE_ALLOC); 93 preempt_disable(); 94 if (cmpxchg(&huge_zero_page, NULL, zero_page)) { 95 preempt_enable(); 96 __free_pages(zero_page, compound_order(zero_page)); 97 goto retry; 98 } 99 100 /* We take additional reference here. It will be put back by shrinker */ 101 atomic_set(&huge_zero_refcount, 2); 102 preempt_enable(); 103 return READ_ONCE(huge_zero_page); 104 } 105 106 static void put_huge_zero_page(void) 107 { 108 /* 109 * Counter should never go to zero here. Only shrinker can put 110 * last reference. 
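	 *
	 * Roughly: the first get_huge_zero_page() installs the page with a
	 * count of two (one for the requesting mm, one kept on behalf of the
	 * shrinker); every further mm_get_huge_zero_page() only bumps the
	 * count. Once all mms have dropped theirs, the count is back to one
	 * and only shrink_huge_zero_page_scan() may take it to zero and free
	 * the page.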
111 */ 112 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 113 } 114 115 struct page *mm_get_huge_zero_page(struct mm_struct *mm) 116 { 117 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 118 return READ_ONCE(huge_zero_page); 119 120 if (!get_huge_zero_page()) 121 return NULL; 122 123 if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 124 put_huge_zero_page(); 125 126 return READ_ONCE(huge_zero_page); 127 } 128 129 void mm_put_huge_zero_page(struct mm_struct *mm) 130 { 131 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 132 put_huge_zero_page(); 133 } 134 135 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, 136 struct shrink_control *sc) 137 { 138 /* we can free zero page only if last reference remains */ 139 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 140 } 141 142 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, 143 struct shrink_control *sc) 144 { 145 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 146 struct page *zero_page = xchg(&huge_zero_page, NULL); 147 BUG_ON(zero_page == NULL); 148 __free_pages(zero_page, compound_order(zero_page)); 149 return HPAGE_PMD_NR; 150 } 151 152 return 0; 153 } 154 155 static struct shrinker huge_zero_page_shrinker = { 156 .count_objects = shrink_huge_zero_page_count, 157 .scan_objects = shrink_huge_zero_page_scan, 158 .seeks = DEFAULT_SEEKS, 159 }; 160 161 #ifdef CONFIG_SYSFS 162 static ssize_t enabled_show(struct kobject *kobj, 163 struct kobj_attribute *attr, char *buf) 164 { 165 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) 166 return sprintf(buf, "[always] madvise never\n"); 167 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) 168 return sprintf(buf, "always [madvise] never\n"); 169 else 170 return sprintf(buf, "always madvise [never]\n"); 171 } 172 173 static ssize_t enabled_store(struct kobject *kobj, 174 struct kobj_attribute *attr, 175 const char *buf, size_t count) 176 { 177 ssize_t ret = count; 178 179 if (!memcmp("always", buf, 180 min(sizeof("always")-1, count))) { 181 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 182 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 183 } else if (!memcmp("madvise", buf, 184 min(sizeof("madvise")-1, count))) { 185 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 186 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 187 } else if (!memcmp("never", buf, 188 min(sizeof("never")-1, count))) { 189 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); 190 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); 191 } else 192 ret = -EINVAL; 193 194 if (ret > 0) { 195 int err = start_stop_khugepaged(); 196 if (err) 197 ret = err; 198 } 199 return ret; 200 } 201 static struct kobj_attribute enabled_attr = 202 __ATTR(enabled, 0644, enabled_show, enabled_store); 203 204 ssize_t single_hugepage_flag_show(struct kobject *kobj, 205 struct kobj_attribute *attr, char *buf, 206 enum transparent_hugepage_flag flag) 207 { 208 return sprintf(buf, "%d\n", 209 !!test_bit(flag, &transparent_hugepage_flags)); 210 } 211 212 ssize_t single_hugepage_flag_store(struct kobject *kobj, 213 struct kobj_attribute *attr, 214 const char *buf, size_t count, 215 enum transparent_hugepage_flag flag) 216 { 217 unsigned long value; 218 int ret; 219 220 ret = kstrtoul(buf, 10, &value); 221 if (ret < 0) 222 return ret; 223 if (value > 1) 224 return -EINVAL; 225 226 if (value) 227 set_bit(flag, 
&transparent_hugepage_flags); 228 else 229 clear_bit(flag, &transparent_hugepage_flags); 230 231 return count; 232 } 233 234 static ssize_t defrag_show(struct kobject *kobj, 235 struct kobj_attribute *attr, char *buf) 236 { 237 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 238 return sprintf(buf, "[always] defer defer+madvise madvise never\n"); 239 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 240 return sprintf(buf, "always [defer] defer+madvise madvise never\n"); 241 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 242 return sprintf(buf, "always defer [defer+madvise] madvise never\n"); 243 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 244 return sprintf(buf, "always defer defer+madvise [madvise] never\n"); 245 return sprintf(buf, "always defer defer+madvise madvise [never]\n"); 246 } 247 248 static ssize_t defrag_store(struct kobject *kobj, 249 struct kobj_attribute *attr, 250 const char *buf, size_t count) 251 { 252 if (!memcmp("always", buf, 253 min(sizeof("always")-1, count))) { 254 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 255 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 256 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 257 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 258 } else if (!memcmp("defer+madvise", buf, 259 min(sizeof("defer+madvise")-1, count))) { 260 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 261 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 262 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 263 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 264 } else if (!memcmp("defer", buf, 265 min(sizeof("defer")-1, count))) { 266 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 267 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 268 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 269 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 270 } else if (!memcmp("madvise", buf, 271 min(sizeof("madvise")-1, count))) { 272 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 273 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 274 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 275 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 276 } else if (!memcmp("never", buf, 277 min(sizeof("never")-1, count))) { 278 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); 279 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); 280 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); 281 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); 282 } else 283 return -EINVAL; 284 285 return count; 286 } 287 static struct kobj_attribute defrag_attr = 288 __ATTR(defrag, 0644, defrag_show, defrag_store); 289 290 static ssize_t use_zero_page_show(struct kobject *kobj, 291 struct kobj_attribute *attr, char *buf) 292 { 293 return single_hugepage_flag_show(kobj, attr, buf, 294 
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 295 } 296 static ssize_t use_zero_page_store(struct kobject *kobj, 297 struct kobj_attribute *attr, const char *buf, size_t count) 298 { 299 return single_hugepage_flag_store(kobj, attr, buf, count, 300 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 301 } 302 static struct kobj_attribute use_zero_page_attr = 303 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); 304 305 static ssize_t hpage_pmd_size_show(struct kobject *kobj, 306 struct kobj_attribute *attr, char *buf) 307 { 308 return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); 309 } 310 static struct kobj_attribute hpage_pmd_size_attr = 311 __ATTR_RO(hpage_pmd_size); 312 313 #ifdef CONFIG_DEBUG_VM 314 static ssize_t debug_cow_show(struct kobject *kobj, 315 struct kobj_attribute *attr, char *buf) 316 { 317 return single_hugepage_flag_show(kobj, attr, buf, 318 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 319 } 320 static ssize_t debug_cow_store(struct kobject *kobj, 321 struct kobj_attribute *attr, 322 const char *buf, size_t count) 323 { 324 return single_hugepage_flag_store(kobj, attr, buf, count, 325 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 326 } 327 static struct kobj_attribute debug_cow_attr = 328 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); 329 #endif /* CONFIG_DEBUG_VM */ 330 331 static struct attribute *hugepage_attr[] = { 332 &enabled_attr.attr, 333 &defrag_attr.attr, 334 &use_zero_page_attr.attr, 335 &hpage_pmd_size_attr.attr, 336 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) 337 &shmem_enabled_attr.attr, 338 #endif 339 #ifdef CONFIG_DEBUG_VM 340 &debug_cow_attr.attr, 341 #endif 342 NULL, 343 }; 344 345 static const struct attribute_group hugepage_attr_group = { 346 .attrs = hugepage_attr, 347 }; 348 349 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 350 { 351 int err; 352 353 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 354 if (unlikely(!*hugepage_kobj)) { 355 pr_err("failed to create transparent hugepage kobject\n"); 356 return -ENOMEM; 357 } 358 359 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 360 if (err) { 361 pr_err("failed to register transparent hugepage group\n"); 362 goto delete_obj; 363 } 364 365 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 366 if (err) { 367 pr_err("failed to register transparent hugepage group\n"); 368 goto remove_hp_group; 369 } 370 371 return 0; 372 373 remove_hp_group: 374 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 375 delete_obj: 376 kobject_put(*hugepage_kobj); 377 return err; 378 } 379 380 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 381 { 382 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 383 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 384 kobject_put(hugepage_kobj); 385 } 386 #else 387 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 388 { 389 return 0; 390 } 391 392 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 393 { 394 } 395 #endif /* CONFIG_SYSFS */ 396 397 static int __init hugepage_init(void) 398 { 399 int err; 400 struct kobject *hugepage_kobj; 401 402 if (!has_transparent_hugepage()) { 403 transparent_hugepage_flags = 0; 404 return -EINVAL; 405 } 406 407 /* 408 * hugepages can't be allocated by the buddy allocator 409 */ 410 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); 411 /* 412 * we use page->mapping and page->index in second tail page 413 * as list_head: assuming THP order >= 2 414 */ 415 
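	/*
	 * For example, on x86_64 with 4K pages: HPAGE_PMD_ORDER =
	 * PMD_SHIFT - PAGE_SHIFT = 21 - 12 = 9 (512 pages, 2MB per THP),
	 * which satisfies both checks here: 2 <= 9 < MAX_ORDER (11).
	 */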
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); 416 417 err = hugepage_init_sysfs(&hugepage_kobj); 418 if (err) 419 goto err_sysfs; 420 421 err = khugepaged_init(); 422 if (err) 423 goto err_slab; 424 425 err = register_shrinker(&huge_zero_page_shrinker); 426 if (err) 427 goto err_hzp_shrinker; 428 err = register_shrinker(&deferred_split_shrinker); 429 if (err) 430 goto err_split_shrinker; 431 432 /* 433 * By default disable transparent hugepages on smaller systems, 434 * where the extra memory used could hurt more than TLB overhead 435 * is likely to save. The admin can still enable it through /sys. 436 */ 437 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { 438 transparent_hugepage_flags = 0; 439 return 0; 440 } 441 442 err = start_stop_khugepaged(); 443 if (err) 444 goto err_khugepaged; 445 446 return 0; 447 err_khugepaged: 448 unregister_shrinker(&deferred_split_shrinker); 449 err_split_shrinker: 450 unregister_shrinker(&huge_zero_page_shrinker); 451 err_hzp_shrinker: 452 khugepaged_destroy(); 453 err_slab: 454 hugepage_exit_sysfs(hugepage_kobj); 455 err_sysfs: 456 return err; 457 } 458 subsys_initcall(hugepage_init); 459 460 static int __init setup_transparent_hugepage(char *str) 461 { 462 int ret = 0; 463 if (!str) 464 goto out; 465 if (!strcmp(str, "always")) { 466 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 467 &transparent_hugepage_flags); 468 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 469 &transparent_hugepage_flags); 470 ret = 1; 471 } else if (!strcmp(str, "madvise")) { 472 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 473 &transparent_hugepage_flags); 474 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 475 &transparent_hugepage_flags); 476 ret = 1; 477 } else if (!strcmp(str, "never")) { 478 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 479 &transparent_hugepage_flags); 480 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 481 &transparent_hugepage_flags); 482 ret = 1; 483 } 484 out: 485 if (!ret) 486 pr_warn("transparent_hugepage= cannot parse, ignored\n"); 487 return ret; 488 } 489 __setup("transparent_hugepage=", setup_transparent_hugepage); 490 491 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 492 { 493 if (likely(vma->vm_flags & VM_WRITE)) 494 pmd = pmd_mkwrite(pmd); 495 return pmd; 496 } 497 498 static inline struct list_head *page_deferred_list(struct page *page) 499 { 500 /* ->lru in the tail pages is occupied by compound_head. 
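	 * The first tail page already carries the compound metadata
	 * (compound_dtor, compound_order, compound_mapcount), so the
	 * deferred split list_head is overlaid on the second tail page
	 * instead; this is why hugepage_init() insists on THP order >= 2.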
	 */
	return &page[2].deferred_list;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	unsigned long addr;
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
					      off >> PAGE_SHIFT, flags);
	if (IS_ERR_VALUE(addr))
		return 0;

	addr += (off - addr) & (size - 1);
	return addr;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (addr)
		goto out;
	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
	if (addr)
		return addr;

out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
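	 *
	 * That is, roughly: clear_huge_page() -> smp_wmb() (inside
	 * __SetPageUptodate) -> set_pmd_at(), so any thread that observes
	 * the new pmd also observes the zeroed page contents.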
586 */ 587 __SetPageUptodate(page); 588 589 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 590 if (unlikely(!pmd_none(*vmf->pmd))) { 591 goto unlock_release; 592 } else { 593 pmd_t entry; 594 595 ret = check_stable_address_space(vma->vm_mm); 596 if (ret) 597 goto unlock_release; 598 599 /* Deliver the page fault to userland */ 600 if (userfaultfd_missing(vma)) { 601 vm_fault_t ret2; 602 603 spin_unlock(vmf->ptl); 604 mem_cgroup_cancel_charge(page, memcg, true); 605 put_page(page); 606 pte_free(vma->vm_mm, pgtable); 607 ret2 = handle_userfault(vmf, VM_UFFD_MISSING); 608 VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); 609 return ret2; 610 } 611 612 entry = mk_huge_pmd(page, vma->vm_page_prot); 613 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 614 page_add_new_anon_rmap(page, vma, haddr, true); 615 mem_cgroup_commit_charge(page, memcg, false, true); 616 lru_cache_add_active_or_unevictable(page, vma); 617 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 618 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); 619 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 620 mm_inc_nr_ptes(vma->vm_mm); 621 spin_unlock(vmf->ptl); 622 count_vm_event(THP_FAULT_ALLOC); 623 count_memcg_events(memcg, THP_FAULT_ALLOC, 1); 624 } 625 626 return 0; 627 unlock_release: 628 spin_unlock(vmf->ptl); 629 release: 630 if (pgtable) 631 pte_free(vma->vm_mm, pgtable); 632 mem_cgroup_cancel_charge(page, memcg, true); 633 put_page(page); 634 return ret; 635 636 } 637 638 /* 639 * always: directly stall for all thp allocations 640 * defer: wake kswapd and fail if not immediately available 641 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise 642 * fail if not immediately available 643 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately 644 * available 645 * never: never stall for any thp allocation 646 */ 647 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) 648 { 649 const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); 650 gfp_t this_node = 0; 651 652 #ifdef CONFIG_NUMA 653 struct mempolicy *pol; 654 /* 655 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not 656 * specified, to express a general desire to stay on the current 657 * node for optimistic allocation attempts. If the defrag mode 658 * and/or madvise hint requires the direct reclaim then we prefer 659 * to fallback to other node rather than node reclaim because that 660 * can lead to excessive reclaim even though there is free memory 661 * on other nodes. We expect that NUMA preferences are specified 662 * by memory policies. 663 */ 664 pol = get_vma_policy(vma, addr); 665 if (pol->mode != MPOL_BIND) 666 this_node = __GFP_THISNODE; 667 mpol_cond_put(pol); 668 #endif 669 670 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 671 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 672 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 673 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; 674 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 675 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : 676 __GFP_KSWAPD_RECLAIM | this_node); 677 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 678 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : 679 this_node); 680 return GFP_TRANSHUGE_LIGHT | this_node; 681 } 682 683 /* Caller must hold page table lock. 
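 *
 * A typical call pattern, mirroring do_huge_pmd_anonymous_page() below:
 *
 *	ptl = pmd_lock(mm, pmd);
 *	if (pmd_none(*pmd))
 *		set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page);
 *	spin_unlock(ptl);
 *
 * The deposited @pgtable is consumed only when this returns true;
 * otherwise the caller still owns it and must free it.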
*/ 684 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 685 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 686 struct page *zero_page) 687 { 688 pmd_t entry; 689 if (!pmd_none(*pmd)) 690 return false; 691 entry = mk_pmd(zero_page, vma->vm_page_prot); 692 entry = pmd_mkhuge(entry); 693 if (pgtable) 694 pgtable_trans_huge_deposit(mm, pmd, pgtable); 695 set_pmd_at(mm, haddr, pmd, entry); 696 mm_inc_nr_ptes(mm); 697 return true; 698 } 699 700 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) 701 { 702 struct vm_area_struct *vma = vmf->vma; 703 gfp_t gfp; 704 struct page *page; 705 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 706 707 if (!transhuge_vma_suitable(vma, haddr)) 708 return VM_FAULT_FALLBACK; 709 if (unlikely(anon_vma_prepare(vma))) 710 return VM_FAULT_OOM; 711 if (unlikely(khugepaged_enter(vma, vma->vm_flags))) 712 return VM_FAULT_OOM; 713 if (!(vmf->flags & FAULT_FLAG_WRITE) && 714 !mm_forbids_zeropage(vma->vm_mm) && 715 transparent_hugepage_use_zero_page()) { 716 pgtable_t pgtable; 717 struct page *zero_page; 718 bool set; 719 vm_fault_t ret; 720 pgtable = pte_alloc_one(vma->vm_mm); 721 if (unlikely(!pgtable)) 722 return VM_FAULT_OOM; 723 zero_page = mm_get_huge_zero_page(vma->vm_mm); 724 if (unlikely(!zero_page)) { 725 pte_free(vma->vm_mm, pgtable); 726 count_vm_event(THP_FAULT_FALLBACK); 727 return VM_FAULT_FALLBACK; 728 } 729 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 730 ret = 0; 731 set = false; 732 if (pmd_none(*vmf->pmd)) { 733 ret = check_stable_address_space(vma->vm_mm); 734 if (ret) { 735 spin_unlock(vmf->ptl); 736 } else if (userfaultfd_missing(vma)) { 737 spin_unlock(vmf->ptl); 738 ret = handle_userfault(vmf, VM_UFFD_MISSING); 739 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 740 } else { 741 set_huge_zero_page(pgtable, vma->vm_mm, vma, 742 haddr, vmf->pmd, zero_page); 743 spin_unlock(vmf->ptl); 744 set = true; 745 } 746 } else 747 spin_unlock(vmf->ptl); 748 if (!set) 749 pte_free(vma->vm_mm, pgtable); 750 return ret; 751 } 752 gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 753 page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); 754 if (unlikely(!page)) { 755 count_vm_event(THP_FAULT_FALLBACK); 756 return VM_FAULT_FALLBACK; 757 } 758 prep_transhuge_page(page); 759 return __do_huge_pmd_anonymous_page(vmf, page, gfp); 760 } 761 762 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 763 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, 764 pgtable_t pgtable) 765 { 766 struct mm_struct *mm = vma->vm_mm; 767 pmd_t entry; 768 spinlock_t *ptl; 769 770 ptl = pmd_lock(mm, pmd); 771 if (!pmd_none(*pmd)) { 772 if (write) { 773 if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { 774 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); 775 goto out_unlock; 776 } 777 entry = pmd_mkyoung(*pmd); 778 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 779 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) 780 update_mmu_cache_pmd(vma, addr, pmd); 781 } 782 783 goto out_unlock; 784 } 785 786 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); 787 if (pfn_t_devmap(pfn)) 788 entry = pmd_mkdevmap(entry); 789 if (write) { 790 entry = pmd_mkyoung(pmd_mkdirty(entry)); 791 entry = maybe_pmd_mkwrite(entry, vma); 792 } 793 794 if (pgtable) { 795 pgtable_trans_huge_deposit(mm, pmd, pgtable); 796 mm_inc_nr_ptes(mm); 797 pgtable = NULL; 798 } 799 800 set_pmd_at(mm, addr, pmd, entry); 801 update_mmu_cache_pmd(vma, addr, pmd); 802 803 out_unlock: 804 spin_unlock(ptl); 805 if (pgtable) 806 pte_free(mm, pgtable); 807 } 808 809 vm_fault_t 
vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) 810 { 811 unsigned long addr = vmf->address & PMD_MASK; 812 struct vm_area_struct *vma = vmf->vma; 813 pgprot_t pgprot = vma->vm_page_prot; 814 pgtable_t pgtable = NULL; 815 816 /* 817 * If we had pmd_special, we could avoid all these restrictions, 818 * but we need to be consistent with PTEs and architectures that 819 * can't support a 'special' bit. 820 */ 821 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 822 !pfn_t_devmap(pfn)); 823 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 824 (VM_PFNMAP|VM_MIXEDMAP)); 825 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 826 827 if (addr < vma->vm_start || addr >= vma->vm_end) 828 return VM_FAULT_SIGBUS; 829 830 if (arch_needs_pgtable_deposit()) { 831 pgtable = pte_alloc_one(vma->vm_mm); 832 if (!pgtable) 833 return VM_FAULT_OOM; 834 } 835 836 track_pfn_insert(vma, &pgprot, pfn); 837 838 insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); 839 return VM_FAULT_NOPAGE; 840 } 841 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 842 843 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 844 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) 845 { 846 if (likely(vma->vm_flags & VM_WRITE)) 847 pud = pud_mkwrite(pud); 848 return pud; 849 } 850 851 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 852 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) 853 { 854 struct mm_struct *mm = vma->vm_mm; 855 pud_t entry; 856 spinlock_t *ptl; 857 858 ptl = pud_lock(mm, pud); 859 if (!pud_none(*pud)) { 860 if (write) { 861 if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { 862 WARN_ON_ONCE(!is_huge_zero_pud(*pud)); 863 goto out_unlock; 864 } 865 entry = pud_mkyoung(*pud); 866 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); 867 if (pudp_set_access_flags(vma, addr, pud, entry, 1)) 868 update_mmu_cache_pud(vma, addr, pud); 869 } 870 goto out_unlock; 871 } 872 873 entry = pud_mkhuge(pfn_t_pud(pfn, prot)); 874 if (pfn_t_devmap(pfn)) 875 entry = pud_mkdevmap(entry); 876 if (write) { 877 entry = pud_mkyoung(pud_mkdirty(entry)); 878 entry = maybe_pud_mkwrite(entry, vma); 879 } 880 set_pud_at(mm, addr, pud, entry); 881 update_mmu_cache_pud(vma, addr, pud); 882 883 out_unlock: 884 spin_unlock(ptl); 885 } 886 887 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) 888 { 889 unsigned long addr = vmf->address & PUD_MASK; 890 struct vm_area_struct *vma = vmf->vma; 891 pgprot_t pgprot = vma->vm_page_prot; 892 893 /* 894 * If we had pud_special, we could avoid all these restrictions, 895 * but we need to be consistent with PTEs and architectures that 896 * can't support a 'special' bit. 
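	 *
	 * As with the pmd variant above, the expected caller is a DAX fault
	 * handler that has a PUD-aligned pfn_t for device memory and does,
	 * roughly: return vmf_insert_pfn_pud(vmf, pfn, write); falling back
	 * to smaller faults when the mapping is not suitably aligned or
	 * sized.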
897 */ 898 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 899 !pfn_t_devmap(pfn)); 900 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 901 (VM_PFNMAP|VM_MIXEDMAP)); 902 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 903 904 if (addr < vma->vm_start || addr >= vma->vm_end) 905 return VM_FAULT_SIGBUS; 906 907 track_pfn_insert(vma, &pgprot, pfn); 908 909 insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); 910 return VM_FAULT_NOPAGE; 911 } 912 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 913 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 914 915 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 916 pmd_t *pmd, int flags) 917 { 918 pmd_t _pmd; 919 920 _pmd = pmd_mkyoung(*pmd); 921 if (flags & FOLL_WRITE) 922 _pmd = pmd_mkdirty(_pmd); 923 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 924 pmd, _pmd, flags & FOLL_WRITE)) 925 update_mmu_cache_pmd(vma, addr, pmd); 926 } 927 928 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 929 pmd_t *pmd, int flags, struct dev_pagemap **pgmap) 930 { 931 unsigned long pfn = pmd_pfn(*pmd); 932 struct mm_struct *mm = vma->vm_mm; 933 struct page *page; 934 935 assert_spin_locked(pmd_lockptr(mm, pmd)); 936 937 /* 938 * When we COW a devmap PMD entry, we split it into PTEs, so we should 939 * not be in this function with `flags & FOLL_COW` set. 940 */ 941 WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); 942 943 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 944 return NULL; 945 946 if (pmd_present(*pmd) && pmd_devmap(*pmd)) 947 /* pass */; 948 else 949 return NULL; 950 951 if (flags & FOLL_TOUCH) 952 touch_pmd(vma, addr, pmd, flags); 953 954 /* 955 * device mapped pages can only be returned if the 956 * caller will manage the page reference count. 
957 */ 958 if (!(flags & FOLL_GET)) 959 return ERR_PTR(-EEXIST); 960 961 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; 962 *pgmap = get_dev_pagemap(pfn, *pgmap); 963 if (!*pgmap) 964 return ERR_PTR(-EFAULT); 965 page = pfn_to_page(pfn); 966 get_page(page); 967 968 return page; 969 } 970 971 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 972 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 973 struct vm_area_struct *vma) 974 { 975 spinlock_t *dst_ptl, *src_ptl; 976 struct page *src_page; 977 pmd_t pmd; 978 pgtable_t pgtable = NULL; 979 int ret = -ENOMEM; 980 981 /* Skip if can be re-fill on fault */ 982 if (!vma_is_anonymous(vma)) 983 return 0; 984 985 pgtable = pte_alloc_one(dst_mm); 986 if (unlikely(!pgtable)) 987 goto out; 988 989 dst_ptl = pmd_lock(dst_mm, dst_pmd); 990 src_ptl = pmd_lockptr(src_mm, src_pmd); 991 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 992 993 ret = -EAGAIN; 994 pmd = *src_pmd; 995 996 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 997 if (unlikely(is_swap_pmd(pmd))) { 998 swp_entry_t entry = pmd_to_swp_entry(pmd); 999 1000 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1001 if (is_write_migration_entry(entry)) { 1002 make_migration_entry_read(&entry); 1003 pmd = swp_entry_to_pmd(entry); 1004 if (pmd_swp_soft_dirty(*src_pmd)) 1005 pmd = pmd_swp_mksoft_dirty(pmd); 1006 set_pmd_at(src_mm, addr, src_pmd, pmd); 1007 } 1008 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1009 mm_inc_nr_ptes(dst_mm); 1010 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1011 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1012 ret = 0; 1013 goto out_unlock; 1014 } 1015 #endif 1016 1017 if (unlikely(!pmd_trans_huge(pmd))) { 1018 pte_free(dst_mm, pgtable); 1019 goto out_unlock; 1020 } 1021 /* 1022 * When page table lock is held, the huge zero pmd should not be 1023 * under splitting since we don't split the page itself, only pmd to 1024 * a page table. 1025 */ 1026 if (is_huge_zero_pmd(pmd)) { 1027 struct page *zero_page; 1028 /* 1029 * get_huge_zero_page() will never allocate a new page here, 1030 * since we already have a zero page to copy. It just takes a 1031 * reference. 
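	 * (mm_get_huge_zero_page() also marks the child mm with
	 * MMF_HUGE_ZERO_PAGE, so parent and child each drop their own
	 * reference when their mm is torn down.)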
1032 */ 1033 zero_page = mm_get_huge_zero_page(dst_mm); 1034 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 1035 zero_page); 1036 ret = 0; 1037 goto out_unlock; 1038 } 1039 1040 src_page = pmd_page(pmd); 1041 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 1042 get_page(src_page); 1043 page_dup_rmap(src_page, true); 1044 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1045 mm_inc_nr_ptes(dst_mm); 1046 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1047 1048 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1049 pmd = pmd_mkold(pmd_wrprotect(pmd)); 1050 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1051 1052 ret = 0; 1053 out_unlock: 1054 spin_unlock(src_ptl); 1055 spin_unlock(dst_ptl); 1056 out: 1057 return ret; 1058 } 1059 1060 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1061 static void touch_pud(struct vm_area_struct *vma, unsigned long addr, 1062 pud_t *pud, int flags) 1063 { 1064 pud_t _pud; 1065 1066 _pud = pud_mkyoung(*pud); 1067 if (flags & FOLL_WRITE) 1068 _pud = pud_mkdirty(_pud); 1069 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, 1070 pud, _pud, flags & FOLL_WRITE)) 1071 update_mmu_cache_pud(vma, addr, pud); 1072 } 1073 1074 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, 1075 pud_t *pud, int flags, struct dev_pagemap **pgmap) 1076 { 1077 unsigned long pfn = pud_pfn(*pud); 1078 struct mm_struct *mm = vma->vm_mm; 1079 struct page *page; 1080 1081 assert_spin_locked(pud_lockptr(mm, pud)); 1082 1083 if (flags & FOLL_WRITE && !pud_write(*pud)) 1084 return NULL; 1085 1086 if (pud_present(*pud) && pud_devmap(*pud)) 1087 /* pass */; 1088 else 1089 return NULL; 1090 1091 if (flags & FOLL_TOUCH) 1092 touch_pud(vma, addr, pud, flags); 1093 1094 /* 1095 * device mapped pages can only be returned if the 1096 * caller will manage the page reference count. 1097 */ 1098 if (!(flags & FOLL_GET)) 1099 return ERR_PTR(-EEXIST); 1100 1101 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; 1102 *pgmap = get_dev_pagemap(pfn, *pgmap); 1103 if (!*pgmap) 1104 return ERR_PTR(-EFAULT); 1105 page = pfn_to_page(pfn); 1106 get_page(page); 1107 1108 return page; 1109 } 1110 1111 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1112 pud_t *dst_pud, pud_t *src_pud, unsigned long addr, 1113 struct vm_area_struct *vma) 1114 { 1115 spinlock_t *dst_ptl, *src_ptl; 1116 pud_t pud; 1117 int ret; 1118 1119 dst_ptl = pud_lock(dst_mm, dst_pud); 1120 src_ptl = pud_lockptr(src_mm, src_pud); 1121 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1122 1123 ret = -EAGAIN; 1124 pud = *src_pud; 1125 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) 1126 goto out_unlock; 1127 1128 /* 1129 * When page table lock is held, the huge zero pud should not be 1130 * under splitting since we don't split the page itself, only pud to 1131 * a page table. 
1132 */ 1133 if (is_huge_zero_pud(pud)) { 1134 /* No huge zero pud yet */ 1135 } 1136 1137 pudp_set_wrprotect(src_mm, addr, src_pud); 1138 pud = pud_mkold(pud_wrprotect(pud)); 1139 set_pud_at(dst_mm, addr, dst_pud, pud); 1140 1141 ret = 0; 1142 out_unlock: 1143 spin_unlock(src_ptl); 1144 spin_unlock(dst_ptl); 1145 return ret; 1146 } 1147 1148 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) 1149 { 1150 pud_t entry; 1151 unsigned long haddr; 1152 bool write = vmf->flags & FAULT_FLAG_WRITE; 1153 1154 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); 1155 if (unlikely(!pud_same(*vmf->pud, orig_pud))) 1156 goto unlock; 1157 1158 entry = pud_mkyoung(orig_pud); 1159 if (write) 1160 entry = pud_mkdirty(entry); 1161 haddr = vmf->address & HPAGE_PUD_MASK; 1162 if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) 1163 update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); 1164 1165 unlock: 1166 spin_unlock(vmf->ptl); 1167 } 1168 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1169 1170 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) 1171 { 1172 pmd_t entry; 1173 unsigned long haddr; 1174 bool write = vmf->flags & FAULT_FLAG_WRITE; 1175 1176 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1177 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) 1178 goto unlock; 1179 1180 entry = pmd_mkyoung(orig_pmd); 1181 if (write) 1182 entry = pmd_mkdirty(entry); 1183 haddr = vmf->address & HPAGE_PMD_MASK; 1184 if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) 1185 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); 1186 1187 unlock: 1188 spin_unlock(vmf->ptl); 1189 } 1190 1191 static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, 1192 pmd_t orig_pmd, struct page *page) 1193 { 1194 struct vm_area_struct *vma = vmf->vma; 1195 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1196 struct mem_cgroup *memcg; 1197 pgtable_t pgtable; 1198 pmd_t _pmd; 1199 int i; 1200 vm_fault_t ret = 0; 1201 struct page **pages; 1202 struct mmu_notifier_range range; 1203 1204 pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), 1205 GFP_KERNEL); 1206 if (unlikely(!pages)) { 1207 ret |= VM_FAULT_OOM; 1208 goto out; 1209 } 1210 1211 for (i = 0; i < HPAGE_PMD_NR; i++) { 1212 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, 1213 vmf->address, page_to_nid(page)); 1214 if (unlikely(!pages[i] || 1215 mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, 1216 GFP_KERNEL, &memcg, false))) { 1217 if (pages[i]) 1218 put_page(pages[i]); 1219 while (--i >= 0) { 1220 memcg = (void *)page_private(pages[i]); 1221 set_page_private(pages[i], 0); 1222 mem_cgroup_cancel_charge(pages[i], memcg, 1223 false); 1224 put_page(pages[i]); 1225 } 1226 kfree(pages); 1227 ret |= VM_FAULT_OOM; 1228 goto out; 1229 } 1230 set_page_private(pages[i], (unsigned long)memcg); 1231 } 1232 1233 for (i = 0; i < HPAGE_PMD_NR; i++) { 1234 copy_user_highpage(pages[i], page + i, 1235 haddr + PAGE_SIZE * i, vma); 1236 __SetPageUptodate(pages[i]); 1237 cond_resched(); 1238 } 1239 1240 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1241 haddr, haddr + HPAGE_PMD_SIZE); 1242 mmu_notifier_invalidate_range_start(&range); 1243 1244 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1245 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) 1246 goto out_free_pages; 1247 VM_BUG_ON_PAGE(!PageHead(page), page); 1248 1249 /* 1250 * Leave pmd empty until pte is filled note we must notify here as 1251 * concurrent CPU thread might write to new page before the call to 1252 * 
mmu_notifier_invalidate_range_end() happens which can lead to a 1253 * device seeing memory write in different order than CPU. 1254 * 1255 * See Documentation/vm/mmu_notifier.rst 1256 */ 1257 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); 1258 1259 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); 1260 pmd_populate(vma->vm_mm, &_pmd, pgtable); 1261 1262 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1263 pte_t entry; 1264 entry = mk_pte(pages[i], vma->vm_page_prot); 1265 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1266 memcg = (void *)page_private(pages[i]); 1267 set_page_private(pages[i], 0); 1268 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); 1269 mem_cgroup_commit_charge(pages[i], memcg, false, false); 1270 lru_cache_add_active_or_unevictable(pages[i], vma); 1271 vmf->pte = pte_offset_map(&_pmd, haddr); 1272 VM_BUG_ON(!pte_none(*vmf->pte)); 1273 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); 1274 pte_unmap(vmf->pte); 1275 } 1276 kfree(pages); 1277 1278 smp_wmb(); /* make pte visible before pmd */ 1279 pmd_populate(vma->vm_mm, vmf->pmd, pgtable); 1280 page_remove_rmap(page, true); 1281 spin_unlock(vmf->ptl); 1282 1283 /* 1284 * No need to double call mmu_notifier->invalidate_range() callback as 1285 * the above pmdp_huge_clear_flush_notify() did already call it. 1286 */ 1287 mmu_notifier_invalidate_range_only_end(&range); 1288 1289 ret |= VM_FAULT_WRITE; 1290 put_page(page); 1291 1292 out: 1293 return ret; 1294 1295 out_free_pages: 1296 spin_unlock(vmf->ptl); 1297 mmu_notifier_invalidate_range_end(&range); 1298 for (i = 0; i < HPAGE_PMD_NR; i++) { 1299 memcg = (void *)page_private(pages[i]); 1300 set_page_private(pages[i], 0); 1301 mem_cgroup_cancel_charge(pages[i], memcg, false); 1302 put_page(pages[i]); 1303 } 1304 kfree(pages); 1305 goto out; 1306 } 1307 1308 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) 1309 { 1310 struct vm_area_struct *vma = vmf->vma; 1311 struct page *page = NULL, *new_page; 1312 struct mem_cgroup *memcg; 1313 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1314 struct mmu_notifier_range range; 1315 gfp_t huge_gfp; /* for allocation and charge */ 1316 vm_fault_t ret = 0; 1317 1318 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); 1319 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1320 if (is_huge_zero_pmd(orig_pmd)) 1321 goto alloc; 1322 spin_lock(vmf->ptl); 1323 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) 1324 goto out_unlock; 1325 1326 page = pmd_page(orig_pmd); 1327 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); 1328 /* 1329 * We can only reuse the page if nobody else maps the huge page or it's 1330 * part. 
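	 * In practice that means the page is mapped exactly once and has no
	 * extra swap references, which is what reuse_swap_page() checks
	 * below once the page lock is held.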
1331 */ 1332 if (!trylock_page(page)) { 1333 get_page(page); 1334 spin_unlock(vmf->ptl); 1335 lock_page(page); 1336 spin_lock(vmf->ptl); 1337 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1338 unlock_page(page); 1339 put_page(page); 1340 goto out_unlock; 1341 } 1342 put_page(page); 1343 } 1344 if (reuse_swap_page(page, NULL)) { 1345 pmd_t entry; 1346 entry = pmd_mkyoung(orig_pmd); 1347 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1348 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1349 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1350 ret |= VM_FAULT_WRITE; 1351 unlock_page(page); 1352 goto out_unlock; 1353 } 1354 unlock_page(page); 1355 get_page(page); 1356 spin_unlock(vmf->ptl); 1357 alloc: 1358 if (__transparent_hugepage_enabled(vma) && 1359 !transparent_hugepage_debug_cow()) { 1360 huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 1361 new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, 1362 haddr, numa_node_id()); 1363 } else 1364 new_page = NULL; 1365 1366 if (likely(new_page)) { 1367 prep_transhuge_page(new_page); 1368 } else { 1369 if (!page) { 1370 split_huge_pmd(vma, vmf->pmd, vmf->address); 1371 ret |= VM_FAULT_FALLBACK; 1372 } else { 1373 ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); 1374 if (ret & VM_FAULT_OOM) { 1375 split_huge_pmd(vma, vmf->pmd, vmf->address); 1376 ret |= VM_FAULT_FALLBACK; 1377 } 1378 put_page(page); 1379 } 1380 count_vm_event(THP_FAULT_FALLBACK); 1381 goto out; 1382 } 1383 1384 if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, 1385 huge_gfp, &memcg, true))) { 1386 put_page(new_page); 1387 split_huge_pmd(vma, vmf->pmd, vmf->address); 1388 if (page) 1389 put_page(page); 1390 ret |= VM_FAULT_FALLBACK; 1391 count_vm_event(THP_FAULT_FALLBACK); 1392 goto out; 1393 } 1394 1395 count_vm_event(THP_FAULT_ALLOC); 1396 count_memcg_events(memcg, THP_FAULT_ALLOC, 1); 1397 1398 if (!page) 1399 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); 1400 else 1401 copy_user_huge_page(new_page, page, vmf->address, 1402 vma, HPAGE_PMD_NR); 1403 __SetPageUptodate(new_page); 1404 1405 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1406 haddr, haddr + HPAGE_PMD_SIZE); 1407 mmu_notifier_invalidate_range_start(&range); 1408 1409 spin_lock(vmf->ptl); 1410 if (page) 1411 put_page(page); 1412 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { 1413 spin_unlock(vmf->ptl); 1414 mem_cgroup_cancel_charge(new_page, memcg, true); 1415 put_page(new_page); 1416 goto out_mn; 1417 } else { 1418 pmd_t entry; 1419 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1420 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1421 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); 1422 page_add_new_anon_rmap(new_page, vma, haddr, true); 1423 mem_cgroup_commit_charge(new_page, memcg, false, true); 1424 lru_cache_add_active_or_unevictable(new_page, vma); 1425 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); 1426 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1427 if (!page) { 1428 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1429 } else { 1430 VM_BUG_ON_PAGE(!PageHead(page), page); 1431 page_remove_rmap(page, true); 1432 put_page(page); 1433 } 1434 ret |= VM_FAULT_WRITE; 1435 } 1436 spin_unlock(vmf->ptl); 1437 out_mn: 1438 /* 1439 * No need to double call mmu_notifier->invalidate_range() callback as 1440 * the above pmdp_huge_clear_flush_notify() did already call it. 
1441 */ 1442 mmu_notifier_invalidate_range_only_end(&range); 1443 out: 1444 return ret; 1445 out_unlock: 1446 spin_unlock(vmf->ptl); 1447 return ret; 1448 } 1449 1450 /* 1451 * FOLL_FORCE can write to even unwritable pmd's, but only 1452 * after we've gone through a COW cycle and they are dirty. 1453 */ 1454 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) 1455 { 1456 return pmd_write(pmd) || 1457 ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); 1458 } 1459 1460 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1461 unsigned long addr, 1462 pmd_t *pmd, 1463 unsigned int flags) 1464 { 1465 struct mm_struct *mm = vma->vm_mm; 1466 struct page *page = NULL; 1467 1468 assert_spin_locked(pmd_lockptr(mm, pmd)); 1469 1470 if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) 1471 goto out; 1472 1473 /* Avoid dumping huge zero page */ 1474 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1475 return ERR_PTR(-EFAULT); 1476 1477 /* Full NUMA hinting faults to serialise migration in fault paths */ 1478 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) 1479 goto out; 1480 1481 page = pmd_page(*pmd); 1482 VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); 1483 if (flags & FOLL_TOUCH) 1484 touch_pmd(vma, addr, pmd, flags); 1485 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1486 /* 1487 * We don't mlock() pte-mapped THPs. This way we can avoid 1488 * leaking mlocked pages into non-VM_LOCKED VMAs. 1489 * 1490 * For anon THP: 1491 * 1492 * In most cases the pmd is the only mapping of the page as we 1493 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for 1494 * writable private mappings in populate_vma_page_range(). 1495 * 1496 * The only scenario when we have the page shared here is if we 1497 * mlocking read-only mapping shared over fork(). We skip 1498 * mlocking such pages. 1499 * 1500 * For file THP: 1501 * 1502 * We can expect PageDoubleMap() to be stable under page lock: 1503 * for file pages we set it in page_add_file_rmap(), which 1504 * requires page to be locked. 1505 */ 1506 1507 if (PageAnon(page) && compound_mapcount(page) != 1) 1508 goto skip_mlock; 1509 if (PageDoubleMap(page) || !page->mapping) 1510 goto skip_mlock; 1511 if (!trylock_page(page)) 1512 goto skip_mlock; 1513 lru_add_drain(); 1514 if (page->mapping && !PageDoubleMap(page)) 1515 mlock_vma_page(page); 1516 unlock_page(page); 1517 } 1518 skip_mlock: 1519 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1520 VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); 1521 if (flags & FOLL_GET) 1522 get_page(page); 1523 1524 out: 1525 return page; 1526 } 1527 1528 /* NUMA hinting page fault entry point for trans huge pmds */ 1529 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) 1530 { 1531 struct vm_area_struct *vma = vmf->vma; 1532 struct anon_vma *anon_vma = NULL; 1533 struct page *page; 1534 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1535 int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); 1536 int target_nid, last_cpupid = -1; 1537 bool page_locked; 1538 bool migrated = false; 1539 bool was_writable; 1540 int flags = 0; 1541 1542 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1543 if (unlikely(!pmd_same(pmd, *vmf->pmd))) 1544 goto out_unlock; 1545 1546 /* 1547 * If there are potential migrations, wait for completion and retry 1548 * without disrupting NUMA hinting information. Do not relock and 1549 * check_same as the page may no longer be mapped. 
1550 */ 1551 if (unlikely(pmd_trans_migrating(*vmf->pmd))) { 1552 page = pmd_page(*vmf->pmd); 1553 if (!get_page_unless_zero(page)) 1554 goto out_unlock; 1555 spin_unlock(vmf->ptl); 1556 put_and_wait_on_page_locked(page); 1557 goto out; 1558 } 1559 1560 page = pmd_page(pmd); 1561 BUG_ON(is_huge_zero_page(page)); 1562 page_nid = page_to_nid(page); 1563 last_cpupid = page_cpupid_last(page); 1564 count_vm_numa_event(NUMA_HINT_FAULTS); 1565 if (page_nid == this_nid) { 1566 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1567 flags |= TNF_FAULT_LOCAL; 1568 } 1569 1570 /* See similar comment in do_numa_page for explanation */ 1571 if (!pmd_savedwrite(pmd)) 1572 flags |= TNF_NO_GROUP; 1573 1574 /* 1575 * Acquire the page lock to serialise THP migrations but avoid dropping 1576 * page_table_lock if at all possible 1577 */ 1578 page_locked = trylock_page(page); 1579 target_nid = mpol_misplaced(page, vma, haddr); 1580 if (target_nid == NUMA_NO_NODE) { 1581 /* If the page was locked, there are no parallel migrations */ 1582 if (page_locked) 1583 goto clear_pmdnuma; 1584 } 1585 1586 /* Migration could have started since the pmd_trans_migrating check */ 1587 if (!page_locked) { 1588 page_nid = NUMA_NO_NODE; 1589 if (!get_page_unless_zero(page)) 1590 goto out_unlock; 1591 spin_unlock(vmf->ptl); 1592 put_and_wait_on_page_locked(page); 1593 goto out; 1594 } 1595 1596 /* 1597 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma 1598 * to serialises splits 1599 */ 1600 get_page(page); 1601 spin_unlock(vmf->ptl); 1602 anon_vma = page_lock_anon_vma_read(page); 1603 1604 /* Confirm the PMD did not change while page_table_lock was released */ 1605 spin_lock(vmf->ptl); 1606 if (unlikely(!pmd_same(pmd, *vmf->pmd))) { 1607 unlock_page(page); 1608 put_page(page); 1609 page_nid = NUMA_NO_NODE; 1610 goto out_unlock; 1611 } 1612 1613 /* Bail if we fail to protect against THP splits for any reason */ 1614 if (unlikely(!anon_vma)) { 1615 put_page(page); 1616 page_nid = NUMA_NO_NODE; 1617 goto clear_pmdnuma; 1618 } 1619 1620 /* 1621 * Since we took the NUMA fault, we must have observed the !accessible 1622 * bit. Make sure all other CPUs agree with that, to avoid them 1623 * modifying the page we're about to migrate. 1624 * 1625 * Must be done under PTL such that we'll observe the relevant 1626 * inc_tlb_flush_pending(). 1627 * 1628 * We are not sure a pending tlb flush here is for a huge page 1629 * mapping or not. Hence use the tlb range variant 1630 */ 1631 if (mm_tlb_flush_pending(vma->vm_mm)) { 1632 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); 1633 /* 1634 * change_huge_pmd() released the pmd lock before 1635 * invalidating the secondary MMUs sharing the primary 1636 * MMU pagetables (with ->invalidate_range()). The 1637 * mmu_notifier_invalidate_range_end() (which 1638 * internally calls ->invalidate_range()) in 1639 * change_pmd_range() will run after us, so we can't 1640 * rely on it here and we need an explicit invalidate. 1641 */ 1642 mmu_notifier_invalidate_range(vma->vm_mm, haddr, 1643 haddr + HPAGE_PMD_SIZE); 1644 } 1645 1646 /* 1647 * Migrate the THP to the requested node, returns with page unlocked 1648 * and access rights restored. 
1649 */ 1650 spin_unlock(vmf->ptl); 1651 1652 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, 1653 vmf->pmd, pmd, vmf->address, page, target_nid); 1654 if (migrated) { 1655 flags |= TNF_MIGRATED; 1656 page_nid = target_nid; 1657 } else 1658 flags |= TNF_MIGRATE_FAIL; 1659 1660 goto out; 1661 clear_pmdnuma: 1662 BUG_ON(!PageLocked(page)); 1663 was_writable = pmd_savedwrite(pmd); 1664 pmd = pmd_modify(pmd, vma->vm_page_prot); 1665 pmd = pmd_mkyoung(pmd); 1666 if (was_writable) 1667 pmd = pmd_mkwrite(pmd); 1668 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); 1669 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1670 unlock_page(page); 1671 out_unlock: 1672 spin_unlock(vmf->ptl); 1673 1674 out: 1675 if (anon_vma) 1676 page_unlock_anon_vma_read(anon_vma); 1677 1678 if (page_nid != NUMA_NO_NODE) 1679 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, 1680 flags); 1681 1682 return 0; 1683 } 1684 1685 /* 1686 * Return true if we do MADV_FREE successfully on entire pmd page. 1687 * Otherwise, return false. 1688 */ 1689 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1690 pmd_t *pmd, unsigned long addr, unsigned long next) 1691 { 1692 spinlock_t *ptl; 1693 pmd_t orig_pmd; 1694 struct page *page; 1695 struct mm_struct *mm = tlb->mm; 1696 bool ret = false; 1697 1698 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1699 1700 ptl = pmd_trans_huge_lock(pmd, vma); 1701 if (!ptl) 1702 goto out_unlocked; 1703 1704 orig_pmd = *pmd; 1705 if (is_huge_zero_pmd(orig_pmd)) 1706 goto out; 1707 1708 if (unlikely(!pmd_present(orig_pmd))) { 1709 VM_BUG_ON(thp_migration_supported() && 1710 !is_pmd_migration_entry(orig_pmd)); 1711 goto out; 1712 } 1713 1714 page = pmd_page(orig_pmd); 1715 /* 1716 * If other processes are mapping this page, we couldn't discard 1717 * the page unless they all do MADV_FREE so let's skip the page. 1718 */ 1719 if (page_mapcount(page) != 1) 1720 goto out; 1721 1722 if (!trylock_page(page)) 1723 goto out; 1724 1725 /* 1726 * If user want to discard part-pages of THP, split it so MADV_FREE 1727 * will deactivate only them. 1728 */ 1729 if (next - addr != HPAGE_PMD_SIZE) { 1730 get_page(page); 1731 spin_unlock(ptl); 1732 split_huge_page(page); 1733 unlock_page(page); 1734 put_page(page); 1735 goto out_unlocked; 1736 } 1737 1738 if (PageDirty(page)) 1739 ClearPageDirty(page); 1740 unlock_page(page); 1741 1742 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 1743 pmdp_invalidate(vma, addr, pmd); 1744 orig_pmd = pmd_mkold(orig_pmd); 1745 orig_pmd = pmd_mkclean(orig_pmd); 1746 1747 set_pmd_at(mm, addr, pmd, orig_pmd); 1748 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1749 } 1750 1751 mark_page_lazyfree(page); 1752 ret = true; 1753 out: 1754 spin_unlock(ptl); 1755 out_unlocked: 1756 return ret; 1757 } 1758 1759 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 1760 { 1761 pgtable_t pgtable; 1762 1763 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1764 pte_free(mm, pgtable); 1765 mm_dec_nr_ptes(mm); 1766 } 1767 1768 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1769 pmd_t *pmd, unsigned long addr) 1770 { 1771 pmd_t orig_pmd; 1772 spinlock_t *ptl; 1773 1774 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 1775 1776 ptl = __pmd_trans_huge_lock(pmd, vma); 1777 if (!ptl) 1778 return 0; 1779 /* 1780 * For architectures like ppc64 we look at deposited pgtable 1781 * when calling pmdp_huge_get_and_clear. So do the 1782 * pgtable_trans_huge_withdraw after finishing pmdp related 1783 * operations. 
1784 */ 1785 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, 1786 tlb->fullmm); 1787 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1788 if (vma_is_dax(vma)) { 1789 if (arch_needs_pgtable_deposit()) 1790 zap_deposited_table(tlb->mm, pmd); 1791 spin_unlock(ptl); 1792 if (is_huge_zero_pmd(orig_pmd)) 1793 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); 1794 } else if (is_huge_zero_pmd(orig_pmd)) { 1795 zap_deposited_table(tlb->mm, pmd); 1796 spin_unlock(ptl); 1797 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); 1798 } else { 1799 struct page *page = NULL; 1800 int flush_needed = 1; 1801 1802 if (pmd_present(orig_pmd)) { 1803 page = pmd_page(orig_pmd); 1804 page_remove_rmap(page, true); 1805 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1806 VM_BUG_ON_PAGE(!PageHead(page), page); 1807 } else if (thp_migration_supported()) { 1808 swp_entry_t entry; 1809 1810 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); 1811 entry = pmd_to_swp_entry(orig_pmd); 1812 page = pfn_to_page(swp_offset(entry)); 1813 flush_needed = 0; 1814 } else 1815 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 1816 1817 if (PageAnon(page)) { 1818 zap_deposited_table(tlb->mm, pmd); 1819 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1820 } else { 1821 if (arch_needs_pgtable_deposit()) 1822 zap_deposited_table(tlb->mm, pmd); 1823 add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); 1824 } 1825 1826 spin_unlock(ptl); 1827 if (flush_needed) 1828 tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); 1829 } 1830 return 1; 1831 } 1832 1833 #ifndef pmd_move_must_withdraw 1834 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 1835 spinlock_t *old_pmd_ptl, 1836 struct vm_area_struct *vma) 1837 { 1838 /* 1839 * With split pmd lock we also need to move preallocated 1840 * PTE page table if new_pmd is on different PMD page table. 1841 * 1842 * We also don't deposit and withdraw tables for file pages. 1843 */ 1844 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 1845 } 1846 #endif 1847 1848 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 1849 { 1850 #ifdef CONFIG_MEM_SOFT_DIRTY 1851 if (unlikely(is_pmd_migration_entry(pmd))) 1852 pmd = pmd_swp_mksoft_dirty(pmd); 1853 else if (pmd_present(pmd)) 1854 pmd = pmd_mksoft_dirty(pmd); 1855 #endif 1856 return pmd; 1857 } 1858 1859 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 1860 unsigned long new_addr, unsigned long old_end, 1861 pmd_t *old_pmd, pmd_t *new_pmd) 1862 { 1863 spinlock_t *old_ptl, *new_ptl; 1864 pmd_t pmd; 1865 struct mm_struct *mm = vma->vm_mm; 1866 bool force_flush = false; 1867 1868 if ((old_addr & ~HPAGE_PMD_MASK) || 1869 (new_addr & ~HPAGE_PMD_MASK) || 1870 old_end - old_addr < HPAGE_PMD_SIZE) 1871 return false; 1872 1873 /* 1874 * The destination pmd shouldn't be established, free_pgtables() 1875 * should have release it. 1876 */ 1877 if (WARN_ON(!pmd_none(*new_pmd))) { 1878 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1879 return false; 1880 } 1881 1882 /* 1883 * We don't have to worry about the ordering of src and dst 1884 * ptlocks because exclusive mmap_sem prevents deadlock. 
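	 * Both page table locks hang off the same mm and mremap() holds
	 * mmap_sem for write, so no other path can take the two ptlocks in
	 * the opposite order; when they differ we still take the new one
	 * with spin_lock_nested() purely to keep lockdep happy.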
1885 */ 1886 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 1887 if (old_ptl) { 1888 new_ptl = pmd_lockptr(mm, new_pmd); 1889 if (new_ptl != old_ptl) 1890 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1891 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 1892 if (pmd_present(pmd)) 1893 force_flush = true; 1894 VM_BUG_ON(!pmd_none(*new_pmd)); 1895 1896 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 1897 pgtable_t pgtable; 1898 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1899 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1900 } 1901 pmd = move_soft_dirty_pmd(pmd); 1902 set_pmd_at(mm, new_addr, new_pmd, pmd); 1903 if (force_flush) 1904 flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 1905 if (new_ptl != old_ptl) 1906 spin_unlock(new_ptl); 1907 spin_unlock(old_ptl); 1908 return true; 1909 } 1910 return false; 1911 } 1912 1913 /* 1914 * Returns 1915 * - 0 if PMD could not be locked 1916 * - 1 if PMD was locked but protections unchange and TLB flush unnecessary 1917 * - HPAGE_PMD_NR is protections changed and TLB flush necessary 1918 */ 1919 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1920 unsigned long addr, pgprot_t newprot, int prot_numa) 1921 { 1922 struct mm_struct *mm = vma->vm_mm; 1923 spinlock_t *ptl; 1924 pmd_t entry; 1925 bool preserve_write; 1926 int ret; 1927 1928 ptl = __pmd_trans_huge_lock(pmd, vma); 1929 if (!ptl) 1930 return 0; 1931 1932 preserve_write = prot_numa && pmd_write(*pmd); 1933 ret = 1; 1934 1935 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1936 if (is_swap_pmd(*pmd)) { 1937 swp_entry_t entry = pmd_to_swp_entry(*pmd); 1938 1939 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 1940 if (is_write_migration_entry(entry)) { 1941 pmd_t newpmd; 1942 /* 1943 * A protection check is difficult so 1944 * just be safe and disable write 1945 */ 1946 make_migration_entry_read(&entry); 1947 newpmd = swp_entry_to_pmd(entry); 1948 if (pmd_swp_soft_dirty(*pmd)) 1949 newpmd = pmd_swp_mksoft_dirty(newpmd); 1950 set_pmd_at(mm, addr, pmd, newpmd); 1951 } 1952 goto unlock; 1953 } 1954 #endif 1955 1956 /* 1957 * Avoid trapping faults against the zero page. The read-only 1958 * data is likely to be read-cached on the local CPU and 1959 * local/remote hits to the zero page are not interesting. 1960 */ 1961 if (prot_numa && is_huge_zero_pmd(*pmd)) 1962 goto unlock; 1963 1964 if (prot_numa && pmd_protnone(*pmd)) 1965 goto unlock; 1966 1967 /* 1968 * In case prot_numa, we are under down_read(mmap_sem). It's critical 1969 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 1970 * which is also under down_read(mmap_sem): 1971 * 1972 * CPU0: CPU1: 1973 * change_huge_pmd(prot_numa=1) 1974 * pmdp_huge_get_and_clear_notify() 1975 * madvise_dontneed() 1976 * zap_pmd_range() 1977 * pmd_trans_huge(*pmd) == 0 (without ptl) 1978 * // skip the pmd 1979 * set_pmd_at(); 1980 * // pmd is re-established 1981 * 1982 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 1983 * which may break userspace. 1984 * 1985 * pmdp_invalidate() is required to make sure we don't miss 1986 * dirty/young flags set by hardware. 
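 *
 * The pmd value returned by pmdp_invalidate() still carries those
 * dirty/young bits, so the new protection is applied to that value
 * (rather than to a freshly built entry) before it is written back
 * with set_pmd_at() below.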
1987 */ 1988 entry = pmdp_invalidate(vma, addr, pmd); 1989 1990 entry = pmd_modify(entry, newprot); 1991 if (preserve_write) 1992 entry = pmd_mk_savedwrite(entry); 1993 ret = HPAGE_PMD_NR; 1994 set_pmd_at(mm, addr, pmd, entry); 1995 BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); 1996 unlock: 1997 spin_unlock(ptl); 1998 return ret; 1999 } 2000 2001 /* 2002 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2003 * 2004 * Note that if it returns page table lock pointer, this routine returns without 2005 * unlocking page table lock. So callers must unlock it. 2006 */ 2007 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2008 { 2009 spinlock_t *ptl; 2010 ptl = pmd_lock(vma->vm_mm, pmd); 2011 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2012 pmd_devmap(*pmd))) 2013 return ptl; 2014 spin_unlock(ptl); 2015 return NULL; 2016 } 2017 2018 /* 2019 * Returns true if a given pud maps a thp, false otherwise. 2020 * 2021 * Note that if it returns true, this routine returns without unlocking page 2022 * table lock. So callers must unlock it. 2023 */ 2024 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2025 { 2026 spinlock_t *ptl; 2027 2028 ptl = pud_lock(vma->vm_mm, pud); 2029 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2030 return ptl; 2031 spin_unlock(ptl); 2032 return NULL; 2033 } 2034 2035 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2036 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2037 pud_t *pud, unsigned long addr) 2038 { 2039 spinlock_t *ptl; 2040 2041 ptl = __pud_trans_huge_lock(pud, vma); 2042 if (!ptl) 2043 return 0; 2044 /* 2045 * For architectures like ppc64 we look at deposited pgtable 2046 * when calling pudp_huge_get_and_clear. So do the 2047 * pgtable_trans_huge_withdraw after finishing pudp related 2048 * operations. 2049 */ 2050 pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); 2051 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2052 if (vma_is_dax(vma)) { 2053 spin_unlock(ptl); 2054 /* No zero page support yet */ 2055 } else { 2056 /* No support for anonymous PUD pages yet */ 2057 BUG(); 2058 } 2059 return 1; 2060 } 2061 2062 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2063 unsigned long haddr) 2064 { 2065 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2066 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2067 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2068 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2069 2070 count_vm_event(THP_SPLIT_PUD); 2071 2072 pudp_huge_clear_flush_notify(vma, haddr, pud); 2073 } 2074 2075 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2076 unsigned long address) 2077 { 2078 spinlock_t *ptl; 2079 struct mmu_notifier_range range; 2080 2081 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 2082 address & HPAGE_PUD_MASK, 2083 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2084 mmu_notifier_invalidate_range_start(&range); 2085 ptl = pud_lock(vma->vm_mm, pud); 2086 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2087 goto out; 2088 __split_huge_pud_locked(vma, pud, range.start); 2089 2090 out: 2091 spin_unlock(ptl); 2092 /* 2093 * No need to double call mmu_notifier->invalidate_range() callback as 2094 * the above pudp_huge_clear_flush_notify() did already call it. 
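 * That is why mmu_notifier_invalidate_range_only_end() is used below:
 * it ends the invalidation range without calling ->invalidate_range()
 * a second time.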
2095 */ 2096 mmu_notifier_invalidate_range_only_end(&range); 2097 } 2098 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2099 2100 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2101 unsigned long haddr, pmd_t *pmd) 2102 { 2103 struct mm_struct *mm = vma->vm_mm; 2104 pgtable_t pgtable; 2105 pmd_t _pmd; 2106 int i; 2107 2108 /* 2109 * Leave pmd empty until pte is filled note that it is fine to delay 2110 * notification until mmu_notifier_invalidate_range_end() as we are 2111 * replacing a zero pmd write protected page with a zero pte write 2112 * protected page. 2113 * 2114 * See Documentation/vm/mmu_notifier.rst 2115 */ 2116 pmdp_huge_clear_flush(vma, haddr, pmd); 2117 2118 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2119 pmd_populate(mm, &_pmd, pgtable); 2120 2121 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2122 pte_t *pte, entry; 2123 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2124 entry = pte_mkspecial(entry); 2125 pte = pte_offset_map(&_pmd, haddr); 2126 VM_BUG_ON(!pte_none(*pte)); 2127 set_pte_at(mm, haddr, pte, entry); 2128 pte_unmap(pte); 2129 } 2130 smp_wmb(); /* make pte visible before pmd */ 2131 pmd_populate(mm, pmd, pgtable); 2132 } 2133 2134 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2135 unsigned long haddr, bool freeze) 2136 { 2137 struct mm_struct *mm = vma->vm_mm; 2138 struct page *page; 2139 pgtable_t pgtable; 2140 pmd_t old_pmd, _pmd; 2141 bool young, write, soft_dirty, pmd_migration = false; 2142 unsigned long addr; 2143 int i; 2144 2145 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2146 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2147 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2148 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2149 && !pmd_devmap(*pmd)); 2150 2151 count_vm_event(THP_SPLIT_PMD); 2152 2153 if (!vma_is_anonymous(vma)) { 2154 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2155 /* 2156 * We are going to unmap this huge page. So 2157 * just go ahead and zap it 2158 */ 2159 if (arch_needs_pgtable_deposit()) 2160 zap_deposited_table(mm, pmd); 2161 if (vma_is_dax(vma)) 2162 return; 2163 page = pmd_page(_pmd); 2164 if (!PageDirty(page) && pmd_dirty(_pmd)) 2165 set_page_dirty(page); 2166 if (!PageReferenced(page) && pmd_young(_pmd)) 2167 SetPageReferenced(page); 2168 page_remove_rmap(page, true); 2169 put_page(page); 2170 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); 2171 return; 2172 } else if (is_huge_zero_pmd(*pmd)) { 2173 /* 2174 * FIXME: Do we want to invalidate secondary mmu by calling 2175 * mmu_notifier_invalidate_range() see comments below inside 2176 * __split_huge_pmd() ? 2177 * 2178 * We are going from a zero huge page write protected to zero 2179 * small page also write protected so it does not seems useful 2180 * to invalidate secondary mmu at this time. 2181 */ 2182 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2183 } 2184 2185 /* 2186 * Up to this point the pmd is present and huge and userland has the 2187 * whole access to the hugepage during the split (which happens in 2188 * place). If we overwrite the pmd with the not-huge version pointing 2189 * to the pte here (which of course we could if all CPUs were bug 2190 * free), userland could trigger a small page size TLB miss on the 2191 * small sized TLB while the hugepage TLB entry is still established in 2192 * the huge TLB. Some CPU doesn't like that. 2193 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 2194 * 383 on page 93. 
Intel should be safe but is also warns that it's 2195 * only safe if the permission and cache attributes of the two entries 2196 * loaded in the two TLB is identical (which should be the case here). 2197 * But it is generally safer to never allow small and huge TLB entries 2198 * for the same virtual address to be loaded simultaneously. So instead 2199 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the 2200 * current pmd notpresent (atomically because here the pmd_trans_huge 2201 * must remain set at all times on the pmd until the split is complete 2202 * for this pmd), then we flush the SMP TLB and finally we write the 2203 * non-huge version of the pmd entry with pmd_populate. 2204 */ 2205 old_pmd = pmdp_invalidate(vma, haddr, pmd); 2206 2207 pmd_migration = is_pmd_migration_entry(old_pmd); 2208 if (unlikely(pmd_migration)) { 2209 swp_entry_t entry; 2210 2211 entry = pmd_to_swp_entry(old_pmd); 2212 page = pfn_to_page(swp_offset(entry)); 2213 write = is_write_migration_entry(entry); 2214 young = false; 2215 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2216 } else { 2217 page = pmd_page(old_pmd); 2218 if (pmd_dirty(old_pmd)) 2219 SetPageDirty(page); 2220 write = pmd_write(old_pmd); 2221 young = pmd_young(old_pmd); 2222 soft_dirty = pmd_soft_dirty(old_pmd); 2223 } 2224 VM_BUG_ON_PAGE(!page_count(page), page); 2225 page_ref_add(page, HPAGE_PMD_NR - 1); 2226 2227 /* 2228 * Withdraw the table only after we mark the pmd entry invalid. 2229 * This's critical for some architectures (Power). 2230 */ 2231 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2232 pmd_populate(mm, &_pmd, pgtable); 2233 2234 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2235 pte_t entry, *pte; 2236 /* 2237 * Note that NUMA hinting access restrictions are not 2238 * transferred to avoid any possibility of altering 2239 * permissions across VMAs. 2240 */ 2241 if (freeze || pmd_migration) { 2242 swp_entry_t swp_entry; 2243 swp_entry = make_migration_entry(page + i, write); 2244 entry = swp_entry_to_pte(swp_entry); 2245 if (soft_dirty) 2246 entry = pte_swp_mksoft_dirty(entry); 2247 } else { 2248 entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); 2249 entry = maybe_mkwrite(entry, vma); 2250 if (!write) 2251 entry = pte_wrprotect(entry); 2252 if (!young) 2253 entry = pte_mkold(entry); 2254 if (soft_dirty) 2255 entry = pte_mksoft_dirty(entry); 2256 } 2257 pte = pte_offset_map(&_pmd, addr); 2258 BUG_ON(!pte_none(*pte)); 2259 set_pte_at(mm, addr, pte, entry); 2260 atomic_inc(&page[i]._mapcount); 2261 pte_unmap(pte); 2262 } 2263 2264 /* 2265 * Set PG_double_map before dropping compound_mapcount to avoid 2266 * false-negative page_mapped(). 2267 */ 2268 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { 2269 for (i = 0; i < HPAGE_PMD_NR; i++) 2270 atomic_inc(&page[i]._mapcount); 2271 } 2272 2273 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 2274 /* Last compound_mapcount is gone. 
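 * The page is now mapped only by ptes (if at all), so drop the
 * NR_ANON_THPS accounting and, if the page was double mapped, the
 * extra per-subpage _mapcount references taken for PageDoubleMap.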
*/ 2275 __dec_node_page_state(page, NR_ANON_THPS); 2276 if (TestClearPageDoubleMap(page)) { 2277 /* No need in mapcount reference anymore */ 2278 for (i = 0; i < HPAGE_PMD_NR; i++) 2279 atomic_dec(&page[i]._mapcount); 2280 } 2281 } 2282 2283 smp_wmb(); /* make pte visible before pmd */ 2284 pmd_populate(mm, pmd, pgtable); 2285 2286 if (freeze) { 2287 for (i = 0; i < HPAGE_PMD_NR; i++) { 2288 page_remove_rmap(page + i, false); 2289 put_page(page + i); 2290 } 2291 } 2292 } 2293 2294 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2295 unsigned long address, bool freeze, struct page *page) 2296 { 2297 spinlock_t *ptl; 2298 struct mmu_notifier_range range; 2299 2300 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 2301 address & HPAGE_PMD_MASK, 2302 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2303 mmu_notifier_invalidate_range_start(&range); 2304 ptl = pmd_lock(vma->vm_mm, pmd); 2305 2306 /* 2307 * If caller asks to setup a migration entries, we need a page to check 2308 * pmd against. Otherwise we can end up replacing wrong page. 2309 */ 2310 VM_BUG_ON(freeze && !page); 2311 if (page && page != pmd_page(*pmd)) 2312 goto out; 2313 2314 if (pmd_trans_huge(*pmd)) { 2315 page = pmd_page(*pmd); 2316 if (PageMlocked(page)) 2317 clear_page_mlock(page); 2318 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) 2319 goto out; 2320 __split_huge_pmd_locked(vma, pmd, range.start, freeze); 2321 out: 2322 spin_unlock(ptl); 2323 /* 2324 * No need to double call mmu_notifier->invalidate_range() callback. 2325 * They are 3 cases to consider inside __split_huge_pmd_locked(): 2326 * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious 2327 * 2) __split_huge_zero_page_pmd() read only zero page and any write 2328 * fault will trigger a flush_notify before pointing to a new page 2329 * (it is fine if the secondary mmu keeps pointing to the old zero 2330 * page in the meantime) 2331 * 3) Split a huge pmd into pte pointing to the same page. No need 2332 * to invalidate secondary tlb entry they are all still valid. 2333 * any further changes to individual pte will notify. So no need 2334 * to call mmu_notifier->invalidate_range() 2335 */ 2336 mmu_notifier_invalidate_range_only_end(&range); 2337 } 2338 2339 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2340 bool freeze, struct page *page) 2341 { 2342 pgd_t *pgd; 2343 p4d_t *p4d; 2344 pud_t *pud; 2345 pmd_t *pmd; 2346 2347 pgd = pgd_offset(vma->vm_mm, address); 2348 if (!pgd_present(*pgd)) 2349 return; 2350 2351 p4d = p4d_offset(pgd, address); 2352 if (!p4d_present(*p4d)) 2353 return; 2354 2355 pud = pud_offset(p4d, address); 2356 if (!pud_present(*pud)) 2357 return; 2358 2359 pmd = pmd_offset(pud, address); 2360 2361 __split_huge_pmd(vma, pmd, address, freeze, page); 2362 } 2363 2364 void vma_adjust_trans_huge(struct vm_area_struct *vma, 2365 unsigned long start, 2366 unsigned long end, 2367 long adjust_next) 2368 { 2369 /* 2370 * If the new start address isn't hpage aligned and it could 2371 * previously contain an hugepage: check if we need to split 2372 * an huge pmd. 2373 */ 2374 if (start & ~HPAGE_PMD_MASK && 2375 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2376 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2377 split_huge_pmd_address(vma, start, false, NULL); 2378 2379 /* 2380 * If the new end address isn't hpage aligned and it could 2381 * previously contain an hugepage: check if we need to split 2382 * an huge pmd. 
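 * For example, a partial munmap() that leaves the new end in the middle
 * of a PMD-mapped THP needs the huge pmd split so that the truncated
 * part can be unmapped at pte granularity.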
2383 */ 2384 if (end & ~HPAGE_PMD_MASK && 2385 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2386 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2387 split_huge_pmd_address(vma, end, false, NULL); 2388 2389 /* 2390 * If we're also updating the vma->vm_next->vm_start, if the new 2391 * vm_next->vm_start isn't page aligned and it could previously 2392 * contain an hugepage: check if we need to split an huge pmd. 2393 */ 2394 if (adjust_next > 0) { 2395 struct vm_area_struct *next = vma->vm_next; 2396 unsigned long nstart = next->vm_start; 2397 nstart += adjust_next << PAGE_SHIFT; 2398 if (nstart & ~HPAGE_PMD_MASK && 2399 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 2400 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 2401 split_huge_pmd_address(next, nstart, false, NULL); 2402 } 2403 } 2404 2405 static void unmap_page(struct page *page) 2406 { 2407 enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | 2408 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; 2409 bool unmap_success; 2410 2411 VM_BUG_ON_PAGE(!PageHead(page), page); 2412 2413 if (PageAnon(page)) 2414 ttu_flags |= TTU_SPLIT_FREEZE; 2415 2416 unmap_success = try_to_unmap(page, ttu_flags); 2417 VM_BUG_ON_PAGE(!unmap_success, page); 2418 } 2419 2420 static void remap_page(struct page *page) 2421 { 2422 int i; 2423 if (PageTransHuge(page)) { 2424 remove_migration_ptes(page, page, true); 2425 } else { 2426 for (i = 0; i < HPAGE_PMD_NR; i++) 2427 remove_migration_ptes(page + i, page + i, true); 2428 } 2429 } 2430 2431 static void __split_huge_page_tail(struct page *head, int tail, 2432 struct lruvec *lruvec, struct list_head *list) 2433 { 2434 struct page *page_tail = head + tail; 2435 2436 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 2437 2438 /* 2439 * Clone page flags before unfreezing refcount. 2440 * 2441 * After successful get_page_unless_zero() might follow flags change, 2442 * for exmaple lock_page() which set PG_waiters. 2443 */ 2444 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 2445 page_tail->flags |= (head->flags & 2446 ((1L << PG_referenced) | 2447 (1L << PG_swapbacked) | 2448 (1L << PG_swapcache) | 2449 (1L << PG_mlocked) | 2450 (1L << PG_uptodate) | 2451 (1L << PG_active) | 2452 (1L << PG_workingset) | 2453 (1L << PG_locked) | 2454 (1L << PG_unevictable) | 2455 (1L << PG_dirty))); 2456 2457 /* ->mapping in first tail page is compound_mapcount */ 2458 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 2459 page_tail); 2460 page_tail->mapping = head->mapping; 2461 page_tail->index = head->index + tail; 2462 2463 /* Page flags must be visible before we make the page non-compound. */ 2464 smp_wmb(); 2465 2466 /* 2467 * Clear PageTail before unfreezing page refcount. 2468 * 2469 * After successful get_page_unless_zero() might follow put_page() 2470 * which needs correct compound_head(). 2471 */ 2472 clear_compound_head(page_tail); 2473 2474 /* Finally unfreeze refcount. Additional reference from page cache. */ 2475 page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || 2476 PageSwapCache(head))); 2477 2478 if (page_is_young(head)) 2479 set_page_young(page_tail); 2480 if (page_is_idle(head)) 2481 set_page_idle(page_tail); 2482 2483 page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); 2484 2485 /* 2486 * always add to the tail because some iterators expect new 2487 * pages to show after the currently processed elements - e.g. 
2488 * migrate_pages 2489 */ 2490 lru_add_page_tail(head, page_tail, lruvec, list); 2491 } 2492 2493 static void __split_huge_page(struct page *page, struct list_head *list, 2494 pgoff_t end, unsigned long flags) 2495 { 2496 struct page *head = compound_head(page); 2497 pg_data_t *pgdat = page_pgdat(head); 2498 struct lruvec *lruvec; 2499 int i; 2500 2501 lruvec = mem_cgroup_page_lruvec(head, pgdat); 2502 2503 /* complete memcg works before add pages to LRU */ 2504 mem_cgroup_split_huge_fixup(head); 2505 2506 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 2507 __split_huge_page_tail(head, i, lruvec, list); 2508 /* Some pages can be beyond i_size: drop them from page cache */ 2509 if (head[i].index >= end) { 2510 ClearPageDirty(head + i); 2511 __delete_from_page_cache(head + i, NULL); 2512 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) 2513 shmem_uncharge(head->mapping->host, 1); 2514 put_page(head + i); 2515 } 2516 } 2517 2518 ClearPageCompound(head); 2519 /* See comment in __split_huge_page_tail() */ 2520 if (PageAnon(head)) { 2521 /* Additional pin to swap cache */ 2522 if (PageSwapCache(head)) 2523 page_ref_add(head, 2); 2524 else 2525 page_ref_inc(head); 2526 } else { 2527 /* Additional pin to page cache */ 2528 page_ref_add(head, 2); 2529 xa_unlock(&head->mapping->i_pages); 2530 } 2531 2532 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 2533 2534 remap_page(head); 2535 2536 for (i = 0; i < HPAGE_PMD_NR; i++) { 2537 struct page *subpage = head + i; 2538 if (subpage == page) 2539 continue; 2540 unlock_page(subpage); 2541 2542 /* 2543 * Subpages may be freed if there wasn't any mapping 2544 * like if add_to_swap() is running on a lru page that 2545 * had its mapping zapped. And freeing these pages 2546 * requires taking the lru_lock so we do the put_page 2547 * of the tail pages after the split is complete. 2548 */ 2549 put_page(subpage); 2550 } 2551 } 2552 2553 int total_mapcount(struct page *page) 2554 { 2555 int i, compound, ret; 2556 2557 VM_BUG_ON_PAGE(PageTail(page), page); 2558 2559 if (likely(!PageCompound(page))) 2560 return atomic_read(&page->_mapcount) + 1; 2561 2562 compound = compound_mapcount(page); 2563 if (PageHuge(page)) 2564 return compound; 2565 ret = compound; 2566 for (i = 0; i < HPAGE_PMD_NR; i++) 2567 ret += atomic_read(&page[i]._mapcount) + 1; 2568 /* File pages has compound_mapcount included in _mapcount */ 2569 if (!PageAnon(page)) 2570 return ret - compound * HPAGE_PMD_NR; 2571 if (PageDoubleMap(page)) 2572 ret -= HPAGE_PMD_NR; 2573 return ret; 2574 } 2575 2576 /* 2577 * This calculates accurately how many mappings a transparent hugepage 2578 * has (unlike page_mapcount() which isn't fully accurate). This full 2579 * accuracy is primarily needed to know if copy-on-write faults can 2580 * reuse the page and change the mapping to read-write instead of 2581 * copying them. At the same time this returns the total_mapcount too. 2582 * 2583 * The function returns the highest mapcount any one of the subpages 2584 * has. If the return value is one, even if different processes are 2585 * mapping different subpages of the transparent hugepage, they can 2586 * all reuse it, because each process is reusing a different subpage. 2587 * 2588 * The total_mapcount is instead counting all virtual mappings of the 2589 * subpages. 
For example, two processes each mapping a different subpage of a THP (with no compound mapping left) get a return value of one but a total_mapcount of two. If the total_mapcount is equal to "one", it tells the 2590 * caller that all mappings belong to the same "mm" and in turn the 2591 * anon_vma of the transparent hugepage can become the local 2592 * vma->anon_vma, as no other process may be mapping any of the subpages. 2593 * 2594 * It would be more accurate to replace page_mapcount() with 2595 * page_trans_huge_mapcount(); however, we only use 2596 * page_trans_huge_mapcount() in copy-on-write faults, where we 2597 * need full accuracy to avoid breaking page pinning, because 2598 * page_trans_huge_mapcount() is slower than page_mapcount(). 2599 */ 2600 int page_trans_huge_mapcount(struct page *page, int *total_mapcount) 2601 { 2602 int i, ret, _total_mapcount, mapcount; 2603 2604 /* hugetlbfs shouldn't call it */ 2605 VM_BUG_ON_PAGE(PageHuge(page), page); 2606 2607 if (likely(!PageTransCompound(page))) { 2608 mapcount = atomic_read(&page->_mapcount) + 1; 2609 if (total_mapcount) 2610 *total_mapcount = mapcount; 2611 return mapcount; 2612 } 2613 2614 page = compound_head(page); 2615 2616 _total_mapcount = ret = 0; 2617 for (i = 0; i < HPAGE_PMD_NR; i++) { 2618 mapcount = atomic_read(&page[i]._mapcount) + 1; 2619 ret = max(ret, mapcount); 2620 _total_mapcount += mapcount; 2621 } 2622 if (PageDoubleMap(page)) { 2623 ret -= 1; 2624 _total_mapcount -= HPAGE_PMD_NR; 2625 } 2626 mapcount = compound_mapcount(page); 2627 ret += mapcount; 2628 _total_mapcount += mapcount; 2629 if (total_mapcount) 2630 *total_mapcount = _total_mapcount; 2631 return ret; 2632 } 2633 2634 /* Racy check whether the huge page can be split */ 2635 bool can_split_huge_page(struct page *page, int *pextra_pins) 2636 { 2637 int extra_pins; 2638 2639 /* Additional pins from page cache */ 2640 if (PageAnon(page)) 2641 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; 2642 else 2643 extra_pins = HPAGE_PMD_NR; 2644 if (pextra_pins) 2645 *pextra_pins = extra_pins; 2646 return total_mapcount(page) == page_count(page) - extra_pins - 1; 2647 } 2648 2649 /* 2650 * This function splits a huge page into normal pages. @page can point to any 2651 * subpage of the huge page to split. Split doesn't change the position of @page. 2652 * 2653 * The caller must hold a pin on @page, otherwise split fails with -EBUSY. 2654 * The huge page must be locked. 2655 * 2656 * If @list is null, tail pages will be added to the LRU list, otherwise to @list. 2657 * 2658 * Both head page and tail pages will inherit mapping, flags, and so on from 2659 * the hugepage. 2660 * 2661 * The GUP pin and PG_locked are transferred to @page. The remaining subpages 2662 * can be freed if they are not mapped. 2663 * 2664 * Returns 0 if the hugepage is split successfully. 2665 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under 2666 * us.
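 *
 * A typical caller pattern is therefore (a sketch; deferred_split_scan()
 * and the debugfs handler below do essentially this):
 *
 *	if (trylock_page(page)) {
 *		if (!split_huge_page(page))
 *			split++;
 *		unlock_page(page);
 *	}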
2667 */ 2668 int split_huge_page_to_list(struct page *page, struct list_head *list) 2669 { 2670 struct page *head = compound_head(page); 2671 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 2672 struct anon_vma *anon_vma = NULL; 2673 struct address_space *mapping = NULL; 2674 int count, mapcount, extra_pins, ret; 2675 bool mlocked; 2676 unsigned long flags; 2677 pgoff_t end; 2678 2679 VM_BUG_ON_PAGE(is_huge_zero_page(page), page); 2680 VM_BUG_ON_PAGE(!PageLocked(page), page); 2681 VM_BUG_ON_PAGE(!PageCompound(page), page); 2682 2683 if (PageWriteback(page)) 2684 return -EBUSY; 2685 2686 if (PageAnon(head)) { 2687 /* 2688 * The caller does not necessarily hold an mmap_sem that would 2689 * prevent the anon_vma disappearing so we first we take a 2690 * reference to it and then lock the anon_vma for write. This 2691 * is similar to page_lock_anon_vma_read except the write lock 2692 * is taken to serialise against parallel split or collapse 2693 * operations. 2694 */ 2695 anon_vma = page_get_anon_vma(head); 2696 if (!anon_vma) { 2697 ret = -EBUSY; 2698 goto out; 2699 } 2700 end = -1; 2701 mapping = NULL; 2702 anon_vma_lock_write(anon_vma); 2703 } else { 2704 mapping = head->mapping; 2705 2706 /* Truncated ? */ 2707 if (!mapping) { 2708 ret = -EBUSY; 2709 goto out; 2710 } 2711 2712 anon_vma = NULL; 2713 i_mmap_lock_read(mapping); 2714 2715 /* 2716 *__split_huge_page() may need to trim off pages beyond EOF: 2717 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 2718 * which cannot be nested inside the page tree lock. So note 2719 * end now: i_size itself may be changed at any moment, but 2720 * head page lock is good enough to serialize the trimming. 2721 */ 2722 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 2723 } 2724 2725 /* 2726 * Racy check if we can split the page, before unmap_page() will 2727 * split PMDs 2728 */ 2729 if (!can_split_huge_page(head, &extra_pins)) { 2730 ret = -EBUSY; 2731 goto out_unlock; 2732 } 2733 2734 mlocked = PageMlocked(page); 2735 unmap_page(head); 2736 VM_BUG_ON_PAGE(compound_mapcount(head), head); 2737 2738 /* Make sure the page is not on per-CPU pagevec as it takes pin */ 2739 if (mlocked) 2740 lru_add_drain(); 2741 2742 /* prevent PageLRU to go away from under us, and freeze lru stats */ 2743 spin_lock_irqsave(&pgdata->lru_lock, flags); 2744 2745 if (mapping) { 2746 XA_STATE(xas, &mapping->i_pages, page_index(head)); 2747 2748 /* 2749 * Check if the head page is present in page cache. 2750 * We assume all tail are present too, if head is there. 
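 * If it is not, we back out through the fail: label below and
 * return -EBUSY.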
2751 */ 2752 xa_lock(&mapping->i_pages); 2753 if (xas_load(&xas) != head) 2754 goto fail; 2755 } 2756 2757 /* Prevent deferred_split_scan() touching ->_refcount */ 2758 spin_lock(&pgdata->split_queue_lock); 2759 count = page_count(head); 2760 mapcount = total_mapcount(head); 2761 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { 2762 if (!list_empty(page_deferred_list(head))) { 2763 pgdata->split_queue_len--; 2764 list_del(page_deferred_list(head)); 2765 } 2766 if (mapping) 2767 __dec_node_page_state(page, NR_SHMEM_THPS); 2768 spin_unlock(&pgdata->split_queue_lock); 2769 __split_huge_page(page, list, end, flags); 2770 if (PageSwapCache(head)) { 2771 swp_entry_t entry = { .val = page_private(head) }; 2772 2773 ret = split_swap_cluster(entry); 2774 } else 2775 ret = 0; 2776 } else { 2777 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 2778 pr_alert("total_mapcount: %u, page_count(): %u\n", 2779 mapcount, count); 2780 if (PageTail(page)) 2781 dump_page(head, NULL); 2782 dump_page(page, "total_mapcount(head) > 0"); 2783 BUG(); 2784 } 2785 spin_unlock(&pgdata->split_queue_lock); 2786 fail: if (mapping) 2787 xa_unlock(&mapping->i_pages); 2788 spin_unlock_irqrestore(&pgdata->lru_lock, flags); 2789 remap_page(head); 2790 ret = -EBUSY; 2791 } 2792 2793 out_unlock: 2794 if (anon_vma) { 2795 anon_vma_unlock_write(anon_vma); 2796 put_anon_vma(anon_vma); 2797 } 2798 if (mapping) 2799 i_mmap_unlock_read(mapping); 2800 out: 2801 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 2802 return ret; 2803 } 2804 2805 void free_transhuge_page(struct page *page) 2806 { 2807 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2808 unsigned long flags; 2809 2810 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2811 if (!list_empty(page_deferred_list(page))) { 2812 pgdata->split_queue_len--; 2813 list_del(page_deferred_list(page)); 2814 } 2815 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2816 free_compound_page(page); 2817 } 2818 2819 void deferred_split_huge_page(struct page *page) 2820 { 2821 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2822 unsigned long flags; 2823 2824 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 2825 2826 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2827 if (list_empty(page_deferred_list(page))) { 2828 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 2829 list_add_tail(page_deferred_list(page), &pgdata->split_queue); 2830 pgdata->split_queue_len++; 2831 } 2832 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2833 } 2834 2835 static unsigned long deferred_split_count(struct shrinker *shrink, 2836 struct shrink_control *sc) 2837 { 2838 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2839 return READ_ONCE(pgdata->split_queue_len); 2840 } 2841 2842 static unsigned long deferred_split_scan(struct shrinker *shrink, 2843 struct shrink_control *sc) 2844 { 2845 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2846 unsigned long flags; 2847 LIST_HEAD(list), *pos, *next; 2848 struct page *page; 2849 int split = 0; 2850 2851 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2852 /* Take pin on all head pages to avoid freeing them under us */ 2853 list_for_each_safe(pos, next, &pgdata->split_queue) { 2854 page = list_entry((void *)pos, struct page, mapping); 2855 page = compound_head(page); 2856 if (get_page_unless_zero(page)) { 2857 list_move(page_deferred_list(page), &list); 2858 } else { 2859 /* We lost race with put_compound_page() */ 2860 list_del_init(page_deferred_list(page)); 2861 pgdata->split_queue_len--; 2862 } 2863 if 
(!--sc->nr_to_scan) 2864 break; 2865 } 2866 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2867 2868 list_for_each_safe(pos, next, &list) { 2869 page = list_entry((void *)pos, struct page, mapping); 2870 if (!trylock_page(page)) 2871 goto next; 2872 /* split_huge_page() removes page from list on success */ 2873 if (!split_huge_page(page)) 2874 split++; 2875 unlock_page(page); 2876 next: 2877 put_page(page); 2878 } 2879 2880 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2881 list_splice_tail(&list, &pgdata->split_queue); 2882 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2883 2884 /* 2885 * Stop shrinker if we didn't split any page, but the queue is empty. 2886 * This can happen if pages were freed under us. 2887 */ 2888 if (!split && list_empty(&pgdata->split_queue)) 2889 return SHRINK_STOP; 2890 return split; 2891 } 2892 2893 static struct shrinker deferred_split_shrinker = { 2894 .count_objects = deferred_split_count, 2895 .scan_objects = deferred_split_scan, 2896 .seeks = DEFAULT_SEEKS, 2897 .flags = SHRINKER_NUMA_AWARE, 2898 }; 2899 2900 #ifdef CONFIG_DEBUG_FS 2901 static int split_huge_pages_set(void *data, u64 val) 2902 { 2903 struct zone *zone; 2904 struct page *page; 2905 unsigned long pfn, max_zone_pfn; 2906 unsigned long total = 0, split = 0; 2907 2908 if (val != 1) 2909 return -EINVAL; 2910 2911 for_each_populated_zone(zone) { 2912 max_zone_pfn = zone_end_pfn(zone); 2913 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 2914 if (!pfn_valid(pfn)) 2915 continue; 2916 2917 page = pfn_to_page(pfn); 2918 if (!get_page_unless_zero(page)) 2919 continue; 2920 2921 if (zone != page_zone(page)) 2922 goto next; 2923 2924 if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) 2925 goto next; 2926 2927 total++; 2928 lock_page(page); 2929 if (!split_huge_page(page)) 2930 split++; 2931 unlock_page(page); 2932 next: 2933 put_page(page); 2934 } 2935 } 2936 2937 pr_info("%lu of %lu THP split\n", split, total); 2938 2939 return 0; 2940 } 2941 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, 2942 "%llu\n"); 2943 2944 static int __init split_huge_pages_debugfs(void) 2945 { 2946 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 2947 &split_huge_pages_fops); 2948 return 0; 2949 } 2950 late_initcall(split_huge_pages_debugfs); 2951 #endif 2952 2953 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2954 void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 2955 struct page *page) 2956 { 2957 struct vm_area_struct *vma = pvmw->vma; 2958 struct mm_struct *mm = vma->vm_mm; 2959 unsigned long address = pvmw->address; 2960 pmd_t pmdval; 2961 swp_entry_t entry; 2962 pmd_t pmdswp; 2963 2964 if (!(pvmw->pmd && !pvmw->pte)) 2965 return; 2966 2967 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 2968 pmdval = *pvmw->pmd; 2969 pmdp_invalidate(vma, address, pvmw->pmd); 2970 if (pmd_dirty(pmdval)) 2971 set_page_dirty(page); 2972 entry = make_migration_entry(page, pmd_write(pmdval)); 2973 pmdswp = swp_entry_to_pmd(entry); 2974 if (pmd_soft_dirty(pmdval)) 2975 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 2976 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 2977 page_remove_rmap(page, true); 2978 put_page(page); 2979 } 2980 2981 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 2982 { 2983 struct vm_area_struct *vma = pvmw->vma; 2984 struct mm_struct *mm = vma->vm_mm; 2985 unsigned long address = pvmw->address; 2986 unsigned long mmun_start = address & HPAGE_PMD_MASK; 2987 pmd_t pmde; 2988 swp_entry_t entry; 2989 
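	/*
	 * Re-establish a present huge pmd for the target page once migration
	 * has completed: rebuild the pmd from the migration entry (preserving
	 * write permission and soft-dirty), take a reference on the new page,
	 * restore its rmap and update the MMU cache.
	 */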
2990 if (!(pvmw->pmd && !pvmw->pte)) 2991 return; 2992 2993 entry = pmd_to_swp_entry(*pvmw->pmd); 2994 get_page(new); 2995 pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); 2996 if (pmd_swp_soft_dirty(*pvmw->pmd)) 2997 pmde = pmd_mksoft_dirty(pmde); 2998 if (is_write_migration_entry(entry)) 2999 pmde = maybe_pmd_mkwrite(pmde, vma); 3000 3001 flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); 3002 if (PageAnon(new)) 3003 page_add_anon_rmap(new, vma, mmun_start, true); 3004 else 3005 page_add_file_rmap(new, true); 3006 set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); 3007 if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) 3008 mlock_vma_page(new); 3009 update_mmu_cache_pmd(vma, address, pvmw->pmd); 3010 } 3011 #endif 3012
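/*
 * Usage sketch for the debugfs knob created above (assumes CONFIG_DEBUG_FS
 * and debugfs mounted at /sys/kernel/debug): writing "1" to the file walks
 * every populated zone and tries to split all THPs, e.g. from userspace:
 *
 *	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 *
 * The outcome is reported via pr_info() as "<split> of <total> THP split".
 */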