/*
 * Copyright (C) 2009  Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications without a
 * guaranteed benefit. When transparent hugepage support is enabled, it is
 * enabled for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;

static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take an additional reference here. It will be put back by the shrinker. */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}

static void put_huge_zero_page(void)
{
	/*
	 * The counter should never go to zero here. Only the shrinker can
	 * put the last reference.
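	 *
	 * Rough sketch of the lifecycle implemented above: the first
	 * get_huge_zero_page() sets the count to 2 -- one reference for the
	 * caller plus one "base" reference that is only dropped (1 -> 0) by
	 * shrink_huge_zero_page_scan() under memory pressure, which then
	 * frees the page and forces the next user to allocate it again.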
94 */ 95 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 96 } 97 98 struct page *mm_get_huge_zero_page(struct mm_struct *mm) 99 { 100 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 101 return READ_ONCE(huge_zero_page); 102 103 if (!get_huge_zero_page()) 104 return NULL; 105 106 if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 107 put_huge_zero_page(); 108 109 return READ_ONCE(huge_zero_page); 110 } 111 112 void mm_put_huge_zero_page(struct mm_struct *mm) 113 { 114 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) 115 put_huge_zero_page(); 116 } 117 118 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, 119 struct shrink_control *sc) 120 { 121 /* we can free zero page only if last reference remains */ 122 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 123 } 124 125 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, 126 struct shrink_control *sc) 127 { 128 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 129 struct page *zero_page = xchg(&huge_zero_page, NULL); 130 BUG_ON(zero_page == NULL); 131 __free_pages(zero_page, compound_order(zero_page)); 132 return HPAGE_PMD_NR; 133 } 134 135 return 0; 136 } 137 138 static struct shrinker huge_zero_page_shrinker = { 139 .count_objects = shrink_huge_zero_page_count, 140 .scan_objects = shrink_huge_zero_page_scan, 141 .seeks = DEFAULT_SEEKS, 142 }; 143 144 #ifdef CONFIG_SYSFS 145 146 static ssize_t triple_flag_store(struct kobject *kobj, 147 struct kobj_attribute *attr, 148 const char *buf, size_t count, 149 enum transparent_hugepage_flag enabled, 150 enum transparent_hugepage_flag deferred, 151 enum transparent_hugepage_flag req_madv) 152 { 153 if (!memcmp("defer", buf, 154 min(sizeof("defer")-1, count))) { 155 if (enabled == deferred) 156 return -EINVAL; 157 clear_bit(enabled, &transparent_hugepage_flags); 158 clear_bit(req_madv, &transparent_hugepage_flags); 159 set_bit(deferred, &transparent_hugepage_flags); 160 } else if (!memcmp("always", buf, 161 min(sizeof("always")-1, count))) { 162 clear_bit(deferred, &transparent_hugepage_flags); 163 clear_bit(req_madv, &transparent_hugepage_flags); 164 set_bit(enabled, &transparent_hugepage_flags); 165 } else if (!memcmp("madvise", buf, 166 min(sizeof("madvise")-1, count))) { 167 clear_bit(enabled, &transparent_hugepage_flags); 168 clear_bit(deferred, &transparent_hugepage_flags); 169 set_bit(req_madv, &transparent_hugepage_flags); 170 } else if (!memcmp("never", buf, 171 min(sizeof("never")-1, count))) { 172 clear_bit(enabled, &transparent_hugepage_flags); 173 clear_bit(req_madv, &transparent_hugepage_flags); 174 clear_bit(deferred, &transparent_hugepage_flags); 175 } else 176 return -EINVAL; 177 178 return count; 179 } 180 181 static ssize_t enabled_show(struct kobject *kobj, 182 struct kobj_attribute *attr, char *buf) 183 { 184 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) 185 return sprintf(buf, "[always] madvise never\n"); 186 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) 187 return sprintf(buf, "always [madvise] never\n"); 188 else 189 return sprintf(buf, "always madvise [never]\n"); 190 } 191 192 static ssize_t enabled_store(struct kobject *kobj, 193 struct kobj_attribute *attr, 194 const char *buf, size_t count) 195 { 196 ssize_t ret; 197 198 ret = triple_flag_store(kobj, attr, buf, count, 199 TRANSPARENT_HUGEPAGE_FLAG, 200 TRANSPARENT_HUGEPAGE_FLAG, 201 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 202 203 if (ret > 0) { 204 int err = start_stop_khugepaged(); 205 if (err) 206 
ret = err; 207 } 208 209 return ret; 210 } 211 static struct kobj_attribute enabled_attr = 212 __ATTR(enabled, 0644, enabled_show, enabled_store); 213 214 ssize_t single_hugepage_flag_show(struct kobject *kobj, 215 struct kobj_attribute *attr, char *buf, 216 enum transparent_hugepage_flag flag) 217 { 218 return sprintf(buf, "%d\n", 219 !!test_bit(flag, &transparent_hugepage_flags)); 220 } 221 222 ssize_t single_hugepage_flag_store(struct kobject *kobj, 223 struct kobj_attribute *attr, 224 const char *buf, size_t count, 225 enum transparent_hugepage_flag flag) 226 { 227 unsigned long value; 228 int ret; 229 230 ret = kstrtoul(buf, 10, &value); 231 if (ret < 0) 232 return ret; 233 if (value > 1) 234 return -EINVAL; 235 236 if (value) 237 set_bit(flag, &transparent_hugepage_flags); 238 else 239 clear_bit(flag, &transparent_hugepage_flags); 240 241 return count; 242 } 243 244 /* 245 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind 246 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of 247 * memory just to allocate one more hugepage. 248 */ 249 static ssize_t defrag_show(struct kobject *kobj, 250 struct kobj_attribute *attr, char *buf) 251 { 252 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 253 return sprintf(buf, "[always] defer madvise never\n"); 254 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 255 return sprintf(buf, "always [defer] madvise never\n"); 256 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 257 return sprintf(buf, "always defer [madvise] never\n"); 258 else 259 return sprintf(buf, "always defer madvise [never]\n"); 260 261 } 262 static ssize_t defrag_store(struct kobject *kobj, 263 struct kobj_attribute *attr, 264 const char *buf, size_t count) 265 { 266 return triple_flag_store(kobj, attr, buf, count, 267 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 268 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 269 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 270 } 271 static struct kobj_attribute defrag_attr = 272 __ATTR(defrag, 0644, defrag_show, defrag_store); 273 274 static ssize_t use_zero_page_show(struct kobject *kobj, 275 struct kobj_attribute *attr, char *buf) 276 { 277 return single_hugepage_flag_show(kobj, attr, buf, 278 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 279 } 280 static ssize_t use_zero_page_store(struct kobject *kobj, 281 struct kobj_attribute *attr, const char *buf, size_t count) 282 { 283 return single_hugepage_flag_store(kobj, attr, buf, count, 284 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 285 } 286 static struct kobj_attribute use_zero_page_attr = 287 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); 288 #ifdef CONFIG_DEBUG_VM 289 static ssize_t debug_cow_show(struct kobject *kobj, 290 struct kobj_attribute *attr, char *buf) 291 { 292 return single_hugepage_flag_show(kobj, attr, buf, 293 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 294 } 295 static ssize_t debug_cow_store(struct kobject *kobj, 296 struct kobj_attribute *attr, 297 const char *buf, size_t count) 298 { 299 return single_hugepage_flag_store(kobj, attr, buf, count, 300 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 301 } 302 static struct kobj_attribute debug_cow_attr = 303 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); 304 #endif /* CONFIG_DEBUG_VM */ 305 306 static struct attribute *hugepage_attr[] = { 307 &enabled_attr.attr, 308 &defrag_attr.attr, 309 &use_zero_page_attr.attr, 310 #if defined(CONFIG_SHMEM) && 
defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) 311 &shmem_enabled_attr.attr, 312 #endif 313 #ifdef CONFIG_DEBUG_VM 314 &debug_cow_attr.attr, 315 #endif 316 NULL, 317 }; 318 319 static struct attribute_group hugepage_attr_group = { 320 .attrs = hugepage_attr, 321 }; 322 323 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 324 { 325 int err; 326 327 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 328 if (unlikely(!*hugepage_kobj)) { 329 pr_err("failed to create transparent hugepage kobject\n"); 330 return -ENOMEM; 331 } 332 333 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 334 if (err) { 335 pr_err("failed to register transparent hugepage group\n"); 336 goto delete_obj; 337 } 338 339 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 340 if (err) { 341 pr_err("failed to register transparent hugepage group\n"); 342 goto remove_hp_group; 343 } 344 345 return 0; 346 347 remove_hp_group: 348 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 349 delete_obj: 350 kobject_put(*hugepage_kobj); 351 return err; 352 } 353 354 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 355 { 356 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 357 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 358 kobject_put(hugepage_kobj); 359 } 360 #else 361 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 362 { 363 return 0; 364 } 365 366 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 367 { 368 } 369 #endif /* CONFIG_SYSFS */ 370 371 static int __init hugepage_init(void) 372 { 373 int err; 374 struct kobject *hugepage_kobj; 375 376 if (!has_transparent_hugepage()) { 377 transparent_hugepage_flags = 0; 378 return -EINVAL; 379 } 380 381 /* 382 * hugepages can't be allocated by the buddy allocator 383 */ 384 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); 385 /* 386 * we use page->mapping and page->index in second tail page 387 * as list_head: assuming THP order >= 2 388 */ 389 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); 390 391 err = hugepage_init_sysfs(&hugepage_kobj); 392 if (err) 393 goto err_sysfs; 394 395 err = khugepaged_init(); 396 if (err) 397 goto err_slab; 398 399 err = register_shrinker(&huge_zero_page_shrinker); 400 if (err) 401 goto err_hzp_shrinker; 402 err = register_shrinker(&deferred_split_shrinker); 403 if (err) 404 goto err_split_shrinker; 405 406 /* 407 * By default disable transparent hugepages on smaller systems, 408 * where the extra memory used could hurt more than TLB overhead 409 * is likely to save. The admin can still enable it through /sys. 
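	 * For example (paths as created by hugepage_init_sysfs() above):
	 *
	 *	echo always  > /sys/kernel/mm/transparent_hugepage/enabled
	 *	echo defer   > /sys/kernel/mm/transparent_hugepage/defrag
	 *
	 * with the accepted keywords matching enabled_store()/defrag_store().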
410 */ 411 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { 412 transparent_hugepage_flags = 0; 413 return 0; 414 } 415 416 err = start_stop_khugepaged(); 417 if (err) 418 goto err_khugepaged; 419 420 return 0; 421 err_khugepaged: 422 unregister_shrinker(&deferred_split_shrinker); 423 err_split_shrinker: 424 unregister_shrinker(&huge_zero_page_shrinker); 425 err_hzp_shrinker: 426 khugepaged_destroy(); 427 err_slab: 428 hugepage_exit_sysfs(hugepage_kobj); 429 err_sysfs: 430 return err; 431 } 432 subsys_initcall(hugepage_init); 433 434 static int __init setup_transparent_hugepage(char *str) 435 { 436 int ret = 0; 437 if (!str) 438 goto out; 439 if (!strcmp(str, "always")) { 440 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 441 &transparent_hugepage_flags); 442 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 443 &transparent_hugepage_flags); 444 ret = 1; 445 } else if (!strcmp(str, "madvise")) { 446 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 447 &transparent_hugepage_flags); 448 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 449 &transparent_hugepage_flags); 450 ret = 1; 451 } else if (!strcmp(str, "never")) { 452 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 453 &transparent_hugepage_flags); 454 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 455 &transparent_hugepage_flags); 456 ret = 1; 457 } 458 out: 459 if (!ret) 460 pr_warn("transparent_hugepage= cannot parse, ignored\n"); 461 return ret; 462 } 463 __setup("transparent_hugepage=", setup_transparent_hugepage); 464 465 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 466 { 467 if (likely(vma->vm_flags & VM_WRITE)) 468 pmd = pmd_mkwrite(pmd); 469 return pmd; 470 } 471 472 static inline struct list_head *page_deferred_list(struct page *page) 473 { 474 /* 475 * ->lru in the tail pages is occupied by compound_head. 476 * Let's use ->mapping + ->index in the second tail page as list_head. 
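	 *
	 * The cast below treats the adjacent ->mapping and ->index fields of
	 * page[2] as a struct list_head, which is why hugepage_init() above
	 * insists on HPAGE_PMD_ORDER >= 2 (so a second tail page exists).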
 */
	return (struct list_head *)&page[2].mapping;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	unsigned long addr;
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
					      off >> PAGE_SHIFT, flags);
	if (IS_ERR_VALUE(addr))
		return 0;

	addr += (off - addr) & (size - 1);
	return addr;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (addr)
		goto out;
	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
	if (addr)
		return addr;

out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
		gfp_t gfp)
{
	struct vm_area_struct *vma = fe->vma;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(vma->vm_mm, haddr);
	if (unlikely(!pgtable)) {
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		return VM_FAULT_OOM;
	}

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
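	 * Without that ordering another CPU could observe the new pmd (and
	 * hence the mapping) before the zeroing stores, and userspace could
	 * read stale data through the freshly mapped huge page.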
563 */ 564 __SetPageUptodate(page); 565 566 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 567 if (unlikely(!pmd_none(*fe->pmd))) { 568 spin_unlock(fe->ptl); 569 mem_cgroup_cancel_charge(page, memcg, true); 570 put_page(page); 571 pte_free(vma->vm_mm, pgtable); 572 } else { 573 pmd_t entry; 574 575 /* Deliver the page fault to userland */ 576 if (userfaultfd_missing(vma)) { 577 int ret; 578 579 spin_unlock(fe->ptl); 580 mem_cgroup_cancel_charge(page, memcg, true); 581 put_page(page); 582 pte_free(vma->vm_mm, pgtable); 583 ret = handle_userfault(fe, VM_UFFD_MISSING); 584 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 585 return ret; 586 } 587 588 entry = mk_huge_pmd(page, vma->vm_page_prot); 589 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 590 page_add_new_anon_rmap(page, vma, haddr, true); 591 mem_cgroup_commit_charge(page, memcg, false, true); 592 lru_cache_add_active_or_unevictable(page, vma); 593 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); 594 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 595 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 596 atomic_long_inc(&vma->vm_mm->nr_ptes); 597 spin_unlock(fe->ptl); 598 count_vm_event(THP_FAULT_ALLOC); 599 } 600 601 return 0; 602 } 603 604 /* 605 * If THP defrag is set to always then directly reclaim/compact as necessary 606 * If set to defer then do only background reclaim/compact and defer to khugepaged 607 * If set to madvise and the VMA is flagged then directly reclaim/compact 608 * When direct reclaim/compact is allowed, don't retry except for flagged VMA's 609 */ 610 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) 611 { 612 bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); 613 614 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 615 &transparent_hugepage_flags) && vma_madvised) 616 return GFP_TRANSHUGE; 617 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 618 &transparent_hugepage_flags)) 619 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 620 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 621 &transparent_hugepage_flags)) 622 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 623 624 return GFP_TRANSHUGE_LIGHT; 625 } 626 627 /* Caller must hold page table lock. 
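 * A minimal sketch of the expected calling pattern, as used by
 * do_huge_pmd_anonymous_page() below:
 *
 *	ptl = pmd_lock(mm, pmd);
 *	if (pmd_none(*pmd))
 *		set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page);
 *	spin_unlock(ptl);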
*/ 628 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 629 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 630 struct page *zero_page) 631 { 632 pmd_t entry; 633 if (!pmd_none(*pmd)) 634 return false; 635 entry = mk_pmd(zero_page, vma->vm_page_prot); 636 entry = pmd_mkhuge(entry); 637 if (pgtable) 638 pgtable_trans_huge_deposit(mm, pmd, pgtable); 639 set_pmd_at(mm, haddr, pmd, entry); 640 atomic_long_inc(&mm->nr_ptes); 641 return true; 642 } 643 644 int do_huge_pmd_anonymous_page(struct fault_env *fe) 645 { 646 struct vm_area_struct *vma = fe->vma; 647 gfp_t gfp; 648 struct page *page; 649 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 650 651 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) 652 return VM_FAULT_FALLBACK; 653 if (unlikely(anon_vma_prepare(vma))) 654 return VM_FAULT_OOM; 655 if (unlikely(khugepaged_enter(vma, vma->vm_flags))) 656 return VM_FAULT_OOM; 657 if (!(fe->flags & FAULT_FLAG_WRITE) && 658 !mm_forbids_zeropage(vma->vm_mm) && 659 transparent_hugepage_use_zero_page()) { 660 pgtable_t pgtable; 661 struct page *zero_page; 662 bool set; 663 int ret; 664 pgtable = pte_alloc_one(vma->vm_mm, haddr); 665 if (unlikely(!pgtable)) 666 return VM_FAULT_OOM; 667 zero_page = mm_get_huge_zero_page(vma->vm_mm); 668 if (unlikely(!zero_page)) { 669 pte_free(vma->vm_mm, pgtable); 670 count_vm_event(THP_FAULT_FALLBACK); 671 return VM_FAULT_FALLBACK; 672 } 673 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 674 ret = 0; 675 set = false; 676 if (pmd_none(*fe->pmd)) { 677 if (userfaultfd_missing(vma)) { 678 spin_unlock(fe->ptl); 679 ret = handle_userfault(fe, VM_UFFD_MISSING); 680 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 681 } else { 682 set_huge_zero_page(pgtable, vma->vm_mm, vma, 683 haddr, fe->pmd, zero_page); 684 spin_unlock(fe->ptl); 685 set = true; 686 } 687 } else 688 spin_unlock(fe->ptl); 689 if (!set) 690 pte_free(vma->vm_mm, pgtable); 691 return ret; 692 } 693 gfp = alloc_hugepage_direct_gfpmask(vma); 694 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); 695 if (unlikely(!page)) { 696 count_vm_event(THP_FAULT_FALLBACK); 697 return VM_FAULT_FALLBACK; 698 } 699 prep_transhuge_page(page); 700 return __do_huge_pmd_anonymous_page(fe, page, gfp); 701 } 702 703 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 704 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) 705 { 706 struct mm_struct *mm = vma->vm_mm; 707 pmd_t entry; 708 spinlock_t *ptl; 709 710 ptl = pmd_lock(mm, pmd); 711 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); 712 if (pfn_t_devmap(pfn)) 713 entry = pmd_mkdevmap(entry); 714 if (write) { 715 entry = pmd_mkyoung(pmd_mkdirty(entry)); 716 entry = maybe_pmd_mkwrite(entry, vma); 717 } 718 set_pmd_at(mm, addr, pmd, entry); 719 update_mmu_cache_pmd(vma, addr, pmd); 720 spin_unlock(ptl); 721 } 722 723 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 724 pmd_t *pmd, pfn_t pfn, bool write) 725 { 726 pgprot_t pgprot = vma->vm_page_prot; 727 /* 728 * If we had pmd_special, we could avoid all these restrictions, 729 * but we need to be consistent with PTEs and architectures that 730 * can't support a 'special' bit. 
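	 *
	 * Illustrative caller sketch only (not taken from a real driver):
	 * a DAX-style PMD fault handler that already holds a device pfn,
	 * with pmd_addr standing for the PMD-aligned fault address:
	 *
	 *	pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);
	 *	return vmf_insert_pfn_pmd(vma, pmd_addr, pmd, pfn, write);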
731 */ 732 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 733 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 734 (VM_PFNMAP|VM_MIXEDMAP)); 735 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 736 BUG_ON(!pfn_t_devmap(pfn)); 737 738 if (addr < vma->vm_start || addr >= vma->vm_end) 739 return VM_FAULT_SIGBUS; 740 if (track_pfn_insert(vma, &pgprot, pfn)) 741 return VM_FAULT_SIGBUS; 742 insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); 743 return VM_FAULT_NOPAGE; 744 } 745 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 746 747 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 748 pmd_t *pmd) 749 { 750 pmd_t _pmd; 751 752 /* 753 * We should set the dirty bit only for FOLL_WRITE but for now 754 * the dirty bit in the pmd is meaningless. And if the dirty 755 * bit will become meaningful and we'll only set it with 756 * FOLL_WRITE, an atomic set_bit will be required on the pmd to 757 * set the young bit, instead of the current set_pmd_at. 758 */ 759 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 760 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 761 pmd, _pmd, 1)) 762 update_mmu_cache_pmd(vma, addr, pmd); 763 } 764 765 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 766 pmd_t *pmd, int flags) 767 { 768 unsigned long pfn = pmd_pfn(*pmd); 769 struct mm_struct *mm = vma->vm_mm; 770 struct dev_pagemap *pgmap; 771 struct page *page; 772 773 assert_spin_locked(pmd_lockptr(mm, pmd)); 774 775 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 776 return NULL; 777 778 if (pmd_present(*pmd) && pmd_devmap(*pmd)) 779 /* pass */; 780 else 781 return NULL; 782 783 if (flags & FOLL_TOUCH) 784 touch_pmd(vma, addr, pmd); 785 786 /* 787 * device mapped pages can only be returned if the 788 * caller will manage the page reference count. 789 */ 790 if (!(flags & FOLL_GET)) 791 return ERR_PTR(-EEXIST); 792 793 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; 794 pgmap = get_dev_pagemap(pfn, NULL); 795 if (!pgmap) 796 return ERR_PTR(-EFAULT); 797 page = pfn_to_page(pfn); 798 get_page(page); 799 put_dev_pagemap(pgmap); 800 801 return page; 802 } 803 804 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 805 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 806 struct vm_area_struct *vma) 807 { 808 spinlock_t *dst_ptl, *src_ptl; 809 struct page *src_page; 810 pmd_t pmd; 811 pgtable_t pgtable = NULL; 812 int ret = -ENOMEM; 813 814 /* Skip if can be re-fill on fault */ 815 if (!vma_is_anonymous(vma)) 816 return 0; 817 818 pgtable = pte_alloc_one(dst_mm, addr); 819 if (unlikely(!pgtable)) 820 goto out; 821 822 dst_ptl = pmd_lock(dst_mm, dst_pmd); 823 src_ptl = pmd_lockptr(src_mm, src_pmd); 824 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 825 826 ret = -EAGAIN; 827 pmd = *src_pmd; 828 if (unlikely(!pmd_trans_huge(pmd))) { 829 pte_free(dst_mm, pgtable); 830 goto out_unlock; 831 } 832 /* 833 * When page table lock is held, the huge zero pmd should not be 834 * under splitting since we don't split the page itself, only pmd to 835 * a page table. 836 */ 837 if (is_huge_zero_pmd(pmd)) { 838 struct page *zero_page; 839 /* 840 * get_huge_zero_page() will never allocate a new page here, 841 * since we already have a zero page to copy. It just takes a 842 * reference. 
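		 * The pgtable deposited by set_huge_zero_page() below is not
		 * wasted either: it is consumed if the huge zero pmd is later
		 * split back to normal ptes (see __split_huge_zero_page_pmd()).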
843 */ 844 zero_page = mm_get_huge_zero_page(dst_mm); 845 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 846 zero_page); 847 ret = 0; 848 goto out_unlock; 849 } 850 851 src_page = pmd_page(pmd); 852 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 853 get_page(src_page); 854 page_dup_rmap(src_page, true); 855 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 856 atomic_long_inc(&dst_mm->nr_ptes); 857 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 858 859 pmdp_set_wrprotect(src_mm, addr, src_pmd); 860 pmd = pmd_mkold(pmd_wrprotect(pmd)); 861 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 862 863 ret = 0; 864 out_unlock: 865 spin_unlock(src_ptl); 866 spin_unlock(dst_ptl); 867 out: 868 return ret; 869 } 870 871 void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) 872 { 873 pmd_t entry; 874 unsigned long haddr; 875 876 fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); 877 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 878 goto unlock; 879 880 entry = pmd_mkyoung(orig_pmd); 881 haddr = fe->address & HPAGE_PMD_MASK; 882 if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, 883 fe->flags & FAULT_FLAG_WRITE)) 884 update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); 885 886 unlock: 887 spin_unlock(fe->ptl); 888 } 889 890 static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, 891 struct page *page) 892 { 893 struct vm_area_struct *vma = fe->vma; 894 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 895 struct mem_cgroup *memcg; 896 pgtable_t pgtable; 897 pmd_t _pmd; 898 int ret = 0, i; 899 struct page **pages; 900 unsigned long mmun_start; /* For mmu_notifiers */ 901 unsigned long mmun_end; /* For mmu_notifiers */ 902 903 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 904 GFP_KERNEL); 905 if (unlikely(!pages)) { 906 ret |= VM_FAULT_OOM; 907 goto out; 908 } 909 910 for (i = 0; i < HPAGE_PMD_NR; i++) { 911 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 912 __GFP_OTHER_NODE, vma, 913 fe->address, page_to_nid(page)); 914 if (unlikely(!pages[i] || 915 mem_cgroup_try_charge(pages[i], vma->vm_mm, 916 GFP_KERNEL, &memcg, false))) { 917 if (pages[i]) 918 put_page(pages[i]); 919 while (--i >= 0) { 920 memcg = (void *)page_private(pages[i]); 921 set_page_private(pages[i], 0); 922 mem_cgroup_cancel_charge(pages[i], memcg, 923 false); 924 put_page(pages[i]); 925 } 926 kfree(pages); 927 ret |= VM_FAULT_OOM; 928 goto out; 929 } 930 set_page_private(pages[i], (unsigned long)memcg); 931 } 932 933 for (i = 0; i < HPAGE_PMD_NR; i++) { 934 copy_user_highpage(pages[i], page + i, 935 haddr + PAGE_SIZE * i, vma); 936 __SetPageUptodate(pages[i]); 937 cond_resched(); 938 } 939 940 mmun_start = haddr; 941 mmun_end = haddr + HPAGE_PMD_SIZE; 942 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 943 944 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 945 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 946 goto out_free_pages; 947 VM_BUG_ON_PAGE(!PageHead(page), page); 948 949 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); 950 /* leave pmd empty until pte is filled */ 951 952 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); 953 pmd_populate(vma->vm_mm, &_pmd, pgtable); 954 955 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 956 pte_t entry; 957 entry = mk_pte(pages[i], vma->vm_page_prot); 958 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 959 memcg = (void *)page_private(pages[i]); 960 set_page_private(pages[i], 0); 961 page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); 962 mem_cgroup_commit_charge(pages[i], memcg, false, false); 
		lru_cache_add_active_or_unevictable(pages[i], vma);
		fe->pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*fe->pte));
		set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
		pte_unmap(fe->pte);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(vma->vm_mm, fe->pmd, pgtable);
	page_remove_rmap(page, true);
	spin_unlock(fe->ptl);

	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(fe->ptl);
	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		mem_cgroup_cancel_charge(pages[i], memcg, false);
		put_page(pages[i]);
	}
	kfree(pages);
	goto out;
}

int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
{
	struct vm_area_struct *vma = fe->vma;
	struct page *page = NULL, *new_page;
	struct mem_cgroup *memcg;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	gfp_t huge_gfp;			/* for allocation and charge */
	int ret = 0;

	fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(fe->ptl);
	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
	/*
	 * We can only reuse the page if nobody else maps the huge page or
	 * its part.
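	 * The check below uses page_trans_huge_mapcount() rather than
	 * page_mapcount() because only the former counts pte-mapped
	 * subpages precisely; see the comment above that function.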
1021 */ 1022 if (page_trans_huge_mapcount(page, NULL) == 1) { 1023 pmd_t entry; 1024 entry = pmd_mkyoung(orig_pmd); 1025 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1026 if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) 1027 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1028 ret |= VM_FAULT_WRITE; 1029 goto out_unlock; 1030 } 1031 get_page(page); 1032 spin_unlock(fe->ptl); 1033 alloc: 1034 if (transparent_hugepage_enabled(vma) && 1035 !transparent_hugepage_debug_cow()) { 1036 huge_gfp = alloc_hugepage_direct_gfpmask(vma); 1037 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); 1038 } else 1039 new_page = NULL; 1040 1041 if (likely(new_page)) { 1042 prep_transhuge_page(new_page); 1043 } else { 1044 if (!page) { 1045 split_huge_pmd(vma, fe->pmd, fe->address); 1046 ret |= VM_FAULT_FALLBACK; 1047 } else { 1048 ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); 1049 if (ret & VM_FAULT_OOM) { 1050 split_huge_pmd(vma, fe->pmd, fe->address); 1051 ret |= VM_FAULT_FALLBACK; 1052 } 1053 put_page(page); 1054 } 1055 count_vm_event(THP_FAULT_FALLBACK); 1056 goto out; 1057 } 1058 1059 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, 1060 huge_gfp, &memcg, true))) { 1061 put_page(new_page); 1062 split_huge_pmd(vma, fe->pmd, fe->address); 1063 if (page) 1064 put_page(page); 1065 ret |= VM_FAULT_FALLBACK; 1066 count_vm_event(THP_FAULT_FALLBACK); 1067 goto out; 1068 } 1069 1070 count_vm_event(THP_FAULT_ALLOC); 1071 1072 if (!page) 1073 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); 1074 else 1075 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1076 __SetPageUptodate(new_page); 1077 1078 mmun_start = haddr; 1079 mmun_end = haddr + HPAGE_PMD_SIZE; 1080 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1081 1082 spin_lock(fe->ptl); 1083 if (page) 1084 put_page(page); 1085 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { 1086 spin_unlock(fe->ptl); 1087 mem_cgroup_cancel_charge(new_page, memcg, true); 1088 put_page(new_page); 1089 goto out_mn; 1090 } else { 1091 pmd_t entry; 1092 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1093 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1094 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); 1095 page_add_new_anon_rmap(new_page, vma, haddr, true); 1096 mem_cgroup_commit_charge(new_page, memcg, false, true); 1097 lru_cache_add_active_or_unevictable(new_page, vma); 1098 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 1099 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1100 if (!page) { 1101 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1102 } else { 1103 VM_BUG_ON_PAGE(!PageHead(page), page); 1104 page_remove_rmap(page, true); 1105 put_page(page); 1106 } 1107 ret |= VM_FAULT_WRITE; 1108 } 1109 spin_unlock(fe->ptl); 1110 out_mn: 1111 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1112 out: 1113 return ret; 1114 out_unlock: 1115 spin_unlock(fe->ptl); 1116 return ret; 1117 } 1118 1119 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1120 unsigned long addr, 1121 pmd_t *pmd, 1122 unsigned int flags) 1123 { 1124 struct mm_struct *mm = vma->vm_mm; 1125 struct page *page = NULL; 1126 1127 assert_spin_locked(pmd_lockptr(mm, pmd)); 1128 1129 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1130 goto out; 1131 1132 /* Avoid dumping huge zero page */ 1133 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1134 return ERR_PTR(-EFAULT); 1135 1136 /* Full NUMA hinting faults to serialise migration in fault paths */ 1137 if ((flags & FOLL_NUMA) && 
	    pmd_protnone(*pmd))
		goto out;

	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd);
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * We don't mlock() pte-mapped THPs. This way we can avoid
		 * leaking mlocked pages into non-VM_LOCKED VMAs.
		 *
		 * For anon THP:
		 *
		 * In most cases the pmd is the only mapping of the page as we
		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
		 * writable private mappings in populate_vma_page_range().
		 *
		 * The only scenario where we have the page shared here is if
		 * we are mlocking a read-only mapping shared over fork(). We
		 * skip mlocking such pages.
		 *
		 * For file THP:
		 *
		 * We can expect PageDoubleMap() to be stable under page lock:
		 * for file pages we set it in page_add_file_rmap(), which
		 * requires page to be locked.
		 */

		if (PageAnon(page) && compound_mapcount(page) != 1)
			goto skip_mlock;
		if (PageDoubleMap(page) || !page->mapping)
			goto skip_mlock;
		if (!trylock_page(page))
			goto skip_mlock;
		lru_add_drain();
		if (page->mapping && !PageDoubleMap(page))
			mlock_vma_page(page);
		unlock_page(page);
	}
skip_mlock:
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
	if (flags & FOLL_GET)
		get_page(page);

out:
	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
{
	struct vm_area_struct *vma = fe->vma;
	struct anon_vma *anon_vma = NULL;
	struct page *page;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
	int page_nid = -1, this_nid = numa_node_id();
	int target_nid, last_cpupid = -1;
	bool page_locked;
	bool migrated = false;
	bool was_writable;
	int flags = 0;

	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
	if (unlikely(!pmd_same(pmd, *fe->pmd)))
		goto out_unlock;

	/*
	 * If there are potential migrations, wait for completion and retry
	 * without disrupting NUMA hinting information. Do not relock and
	 * check_same as the page may no longer be mapped.
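	 * The retry is implicit: we return with the fault unhandled and the
	 * task simply faults again once the migration holding the page lock
	 * has completed.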
1209 */ 1210 if (unlikely(pmd_trans_migrating(*fe->pmd))) { 1211 page = pmd_page(*fe->pmd); 1212 spin_unlock(fe->ptl); 1213 wait_on_page_locked(page); 1214 goto out; 1215 } 1216 1217 page = pmd_page(pmd); 1218 BUG_ON(is_huge_zero_page(page)); 1219 page_nid = page_to_nid(page); 1220 last_cpupid = page_cpupid_last(page); 1221 count_vm_numa_event(NUMA_HINT_FAULTS); 1222 if (page_nid == this_nid) { 1223 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1224 flags |= TNF_FAULT_LOCAL; 1225 } 1226 1227 /* See similar comment in do_numa_page for explanation */ 1228 if (!pmd_write(pmd)) 1229 flags |= TNF_NO_GROUP; 1230 1231 /* 1232 * Acquire the page lock to serialise THP migrations but avoid dropping 1233 * page_table_lock if at all possible 1234 */ 1235 page_locked = trylock_page(page); 1236 target_nid = mpol_misplaced(page, vma, haddr); 1237 if (target_nid == -1) { 1238 /* If the page was locked, there are no parallel migrations */ 1239 if (page_locked) 1240 goto clear_pmdnuma; 1241 } 1242 1243 /* Migration could have started since the pmd_trans_migrating check */ 1244 if (!page_locked) { 1245 spin_unlock(fe->ptl); 1246 wait_on_page_locked(page); 1247 page_nid = -1; 1248 goto out; 1249 } 1250 1251 /* 1252 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma 1253 * to serialises splits 1254 */ 1255 get_page(page); 1256 spin_unlock(fe->ptl); 1257 anon_vma = page_lock_anon_vma_read(page); 1258 1259 /* Confirm the PMD did not change while page_table_lock was released */ 1260 spin_lock(fe->ptl); 1261 if (unlikely(!pmd_same(pmd, *fe->pmd))) { 1262 unlock_page(page); 1263 put_page(page); 1264 page_nid = -1; 1265 goto out_unlock; 1266 } 1267 1268 /* Bail if we fail to protect against THP splits for any reason */ 1269 if (unlikely(!anon_vma)) { 1270 put_page(page); 1271 page_nid = -1; 1272 goto clear_pmdnuma; 1273 } 1274 1275 /* 1276 * Migrate the THP to the requested node, returns with page unlocked 1277 * and access rights restored. 1278 */ 1279 spin_unlock(fe->ptl); 1280 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, 1281 fe->pmd, pmd, fe->address, page, target_nid); 1282 if (migrated) { 1283 flags |= TNF_MIGRATED; 1284 page_nid = target_nid; 1285 } else 1286 flags |= TNF_MIGRATE_FAIL; 1287 1288 goto out; 1289 clear_pmdnuma: 1290 BUG_ON(!PageLocked(page)); 1291 was_writable = pmd_write(pmd); 1292 pmd = pmd_modify(pmd, vma->vm_page_prot); 1293 pmd = pmd_mkyoung(pmd); 1294 if (was_writable) 1295 pmd = pmd_mkwrite(pmd); 1296 set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); 1297 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1298 unlock_page(page); 1299 out_unlock: 1300 spin_unlock(fe->ptl); 1301 1302 out: 1303 if (anon_vma) 1304 page_unlock_anon_vma_read(anon_vma); 1305 1306 if (page_nid != -1) 1307 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); 1308 1309 return 0; 1310 } 1311 1312 /* 1313 * Return true if we do MADV_FREE successfully on entire pmd page. 1314 * Otherwise, return false. 
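 *
 * Reached from userspace via madvise(addr, len, MADV_FREE) (illustrative
 * call; the exact plumbing lives in mm/madvise.c). A range smaller than
 * HPAGE_PMD_SIZE causes the THP to be split below so that only the covered
 * subpages are deactivated.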
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct page *page;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	page = pmd_page(orig_pmd);
	/*
	 * If other processes are mapping this page, we can't discard
	 * the page unless they all do MADV_FREE, so let's skip the page.
	 */
	if (page_mapcount(page) != 1)
		goto out;

	if (!trylock_page(page))
		goto out;

	/*
	 * If the user wants to discard only part of the THP, split it so
	 * MADV_FREE will deactivate only those pages.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		get_page(page);
		spin_unlock(ptl);
		split_huge_page(page);
		put_page(page);
		unlock_page(page);
		goto out_unlocked;
	}

	if (PageDirty(page))
		ClearPageDirty(page);
	unlock_page(page);

	if (PageActive(page))
		deactivate_page(page);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at the deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing the pmdp related
	 * operations.
 */
	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_dax(vma)) {
		spin_unlock(ptl);
		if (is_huge_zero_pmd(orig_pmd))
			tlb_remove_page(tlb, pmd_page(orig_pmd));
	} else if (is_huge_zero_pmd(orig_pmd)) {
		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
		atomic_long_dec(&tlb->mm->nr_ptes);
		spin_unlock(ptl);
		tlb_remove_page(tlb, pmd_page(orig_pmd));
	} else {
		struct page *page = pmd_page(orig_pmd);
		page_remove_rmap(page, true);
		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
		VM_BUG_ON_PAGE(!PageHead(page), page);
		if (PageAnon(page)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
			pte_free(tlb->mm, pgtable);
			atomic_long_dec(&tlb->mm->nr_ptes);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
		}
		spin_unlock(ptl);
		tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
	}
	return 1;
}

bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, unsigned long old_end,
		  pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;

	if ((old_addr & ~HPAGE_PMD_MASK) ||
	    (new_addr & ~HPAGE_PMD_MASK) ||
	    old_end - old_addr < HPAGE_PMD_SIZE)
		return false;

	/*
	 * The destination pmd shouldn't be established; free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_sem prevents deadlock.
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
				vma_is_anonymous(vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if the PMD could not be locked
 *  - 1 if the PMD was locked but protections are unchanged and no TLB flush is necessary
 *  - HPAGE_PMD_NR if protections were changed and a TLB flush is necessary
 */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, pgprot_t newprot, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	int ret = 0;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		pmd_t entry;
		bool preserve_write = prot_numa && pmd_write(*pmd);
		ret = 1;

		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
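		 * There is also nothing to gain: the zero page is never
		 * migrated, so a NUMA hinting fault on it would be pure
		 * overhead.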
1499 */ 1500 if (prot_numa && is_huge_zero_pmd(*pmd)) { 1501 spin_unlock(ptl); 1502 return ret; 1503 } 1504 1505 if (!prot_numa || !pmd_protnone(*pmd)) { 1506 entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); 1507 entry = pmd_modify(entry, newprot); 1508 if (preserve_write) 1509 entry = pmd_mkwrite(entry); 1510 ret = HPAGE_PMD_NR; 1511 set_pmd_at(mm, addr, pmd, entry); 1512 BUG_ON(vma_is_anonymous(vma) && !preserve_write && 1513 pmd_write(entry)); 1514 } 1515 spin_unlock(ptl); 1516 } 1517 1518 return ret; 1519 } 1520 1521 /* 1522 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 1523 * 1524 * Note that if it returns page table lock pointer, this routine returns without 1525 * unlocking page table lock. So callers must unlock it. 1526 */ 1527 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1528 { 1529 spinlock_t *ptl; 1530 ptl = pmd_lock(vma->vm_mm, pmd); 1531 if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) 1532 return ptl; 1533 spin_unlock(ptl); 1534 return NULL; 1535 } 1536 1537 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 1538 unsigned long haddr, pmd_t *pmd) 1539 { 1540 struct mm_struct *mm = vma->vm_mm; 1541 pgtable_t pgtable; 1542 pmd_t _pmd; 1543 int i; 1544 1545 /* leave pmd empty until pte is filled */ 1546 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1547 1548 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1549 pmd_populate(mm, &_pmd, pgtable); 1550 1551 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1552 pte_t *pte, entry; 1553 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 1554 entry = pte_mkspecial(entry); 1555 pte = pte_offset_map(&_pmd, haddr); 1556 VM_BUG_ON(!pte_none(*pte)); 1557 set_pte_at(mm, haddr, pte, entry); 1558 pte_unmap(pte); 1559 } 1560 smp_wmb(); /* make pte visible before pmd */ 1561 pmd_populate(mm, pmd, pgtable); 1562 } 1563 1564 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 1565 unsigned long haddr, bool freeze) 1566 { 1567 struct mm_struct *mm = vma->vm_mm; 1568 struct page *page; 1569 pgtable_t pgtable; 1570 pmd_t _pmd; 1571 bool young, write, dirty, soft_dirty; 1572 unsigned long addr; 1573 int i; 1574 1575 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 1576 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 1577 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 1578 VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); 1579 1580 count_vm_event(THP_SPLIT_PMD); 1581 1582 if (!vma_is_anonymous(vma)) { 1583 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1584 if (vma_is_dax(vma)) 1585 return; 1586 page = pmd_page(_pmd); 1587 if (!PageReferenced(page) && pmd_young(_pmd)) 1588 SetPageReferenced(page); 1589 page_remove_rmap(page, true); 1590 put_page(page); 1591 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); 1592 return; 1593 } else if (is_huge_zero_pmd(*pmd)) { 1594 return __split_huge_zero_page_pmd(vma, haddr, pmd); 1595 } 1596 1597 page = pmd_page(*pmd); 1598 VM_BUG_ON_PAGE(!page_count(page), page); 1599 page_ref_add(page, HPAGE_PMD_NR - 1); 1600 write = pmd_write(*pmd); 1601 young = pmd_young(*pmd); 1602 dirty = pmd_dirty(*pmd); 1603 soft_dirty = pmd_soft_dirty(*pmd); 1604 1605 pmdp_huge_split_prepare(vma, haddr, pmd); 1606 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1607 pmd_populate(mm, &_pmd, pgtable); 1608 1609 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 1610 pte_t entry, *pte; 1611 /* 1612 * Note that NUMA hinting access restrictions are not 1613 * transferred to avoid any possibility of 
altering
		 * permissions across VMAs.
		 */
		if (freeze) {
			swp_entry_t swp_entry;
			swp_entry = make_migration_entry(page + i, write);
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
		} else {
			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
			entry = maybe_mkwrite(entry, vma);
			if (!write)
				entry = pte_wrprotect(entry);
			if (!young)
				entry = pte_mkold(entry);
			if (soft_dirty)
				entry = pte_mksoft_dirty(entry);
		}
		if (dirty)
			SetPageDirty(page + i);
		pte = pte_offset_map(&_pmd, addr);
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, entry);
		atomic_inc(&page[i]._mapcount);
		pte_unmap(pte);
	}

	/*
	 * Set PG_double_map before dropping compound_mapcount to avoid
	 * false-negative page_mapped().
	 */
	if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
		for (i = 0; i < HPAGE_PMD_NR; i++)
			atomic_inc(&page[i]._mapcount);
	}

	if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
		/* Last compound_mapcount is gone. */
		__dec_node_page_state(page, NR_ANON_THPS);
		if (TestClearPageDoubleMap(page)) {
			/* No need in mapcount reference anymore */
			for (i = 0; i < HPAGE_PMD_NR; i++)
				atomic_dec(&page[i]._mapcount);
		}
	}

	smp_wmb(); /* make pte visible before pmd */
	/*
	 * Up to this point the pmd is present and huge and userland has the
	 * whole access to the hugepage during the split (which happens in
	 * place). If we overwrite the pmd with the not-huge version pointing
	 * to the pte here (which of course we could if all CPUs were bug
	 * free), userland could trigger a small page size TLB miss on the
	 * small sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPUs don't like that.
	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
	 * 383 on page 93. Intel should be safe but it also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLBs are identical (which should be the case here).
	 * But it is generally safer to never allow small and huge TLB entries
	 * for the same virtual address to be loaded simultaneously. So instead
	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
	 * current pmd notpresent (atomically because here the pmd_trans_huge
	 * and pmd_trans_splitting must remain set at all times on the pmd
	 * until the split is complete for this pmd), then we flush the SMP TLB
	 * and finally we write the non-huge version of the pmd entry with
	 * pmd_populate.
	 */
	pmdp_invalidate(vma, haddr, pmd);
	pmd_populate(mm, pmd, pgtable);

	if (freeze) {
		for (i = 0; i < HPAGE_PMD_NR; i++) {
			page_remove_rmap(page + i, false);
			put_page(page + i);
		}
	}
}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct page *page)
{
	spinlock_t *ptl;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
	ptl = pmd_lock(mm, pmd);

	/*
	 * If the caller asks to set up migration entries, we need a page to
	 * check the pmd against. Otherwise we can end up replacing the
	 * wrong page.
1706 */ 1707 VM_BUG_ON(freeze && !page); 1708 if (page && page != pmd_page(*pmd)) 1709 goto out; 1710 1711 if (pmd_trans_huge(*pmd)) { 1712 page = pmd_page(*pmd); 1713 if (PageMlocked(page)) 1714 clear_page_mlock(page); 1715 } else if (!pmd_devmap(*pmd)) 1716 goto out; 1717 __split_huge_pmd_locked(vma, pmd, haddr, freeze); 1718 out: 1719 spin_unlock(ptl); 1720 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); 1721 } 1722 1723 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 1724 bool freeze, struct page *page) 1725 { 1726 pgd_t *pgd; 1727 pud_t *pud; 1728 pmd_t *pmd; 1729 1730 pgd = pgd_offset(vma->vm_mm, address); 1731 if (!pgd_present(*pgd)) 1732 return; 1733 1734 pud = pud_offset(pgd, address); 1735 if (!pud_present(*pud)) 1736 return; 1737 1738 pmd = pmd_offset(pud, address); 1739 1740 __split_huge_pmd(vma, pmd, address, freeze, page); 1741 } 1742 1743 void vma_adjust_trans_huge(struct vm_area_struct *vma, 1744 unsigned long start, 1745 unsigned long end, 1746 long adjust_next) 1747 { 1748 /* 1749 * If the new start address isn't hpage aligned and it could 1750 * previously contain an hugepage: check if we need to split 1751 * an huge pmd. 1752 */ 1753 if (start & ~HPAGE_PMD_MASK && 1754 (start & HPAGE_PMD_MASK) >= vma->vm_start && 1755 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 1756 split_huge_pmd_address(vma, start, false, NULL); 1757 1758 /* 1759 * If the new end address isn't hpage aligned and it could 1760 * previously contain an hugepage: check if we need to split 1761 * an huge pmd. 1762 */ 1763 if (end & ~HPAGE_PMD_MASK && 1764 (end & HPAGE_PMD_MASK) >= vma->vm_start && 1765 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 1766 split_huge_pmd_address(vma, end, false, NULL); 1767 1768 /* 1769 * If we're also updating the vma->vm_next->vm_start, if the new 1770 * vm_next->vm_start isn't page aligned and it could previously 1771 * contain an hugepage: check if we need to split an huge pmd. 
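	 * Note that adjust_next is expressed in pages, hence the
	 * << PAGE_SHIFT conversion below before the alignment checks.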
1772 */ 1773 if (adjust_next > 0) { 1774 struct vm_area_struct *next = vma->vm_next; 1775 unsigned long nstart = next->vm_start; 1776 nstart += adjust_next << PAGE_SHIFT; 1777 if (nstart & ~HPAGE_PMD_MASK && 1778 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 1779 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 1780 split_huge_pmd_address(next, nstart, false, NULL); 1781 } 1782 } 1783 1784 static void freeze_page(struct page *page) 1785 { 1786 enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | 1787 TTU_RMAP_LOCKED; 1788 int i, ret; 1789 1790 VM_BUG_ON_PAGE(!PageHead(page), page); 1791 1792 if (PageAnon(page)) 1793 ttu_flags |= TTU_MIGRATION; 1794 1795 /* We only need TTU_SPLIT_HUGE_PMD once */ 1796 ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); 1797 for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { 1798 /* Cut short if the page is unmapped */ 1799 if (page_count(page) == 1) 1800 return; 1801 1802 ret = try_to_unmap(page + i, ttu_flags); 1803 } 1804 VM_BUG_ON_PAGE(ret, page + i - 1); 1805 } 1806 1807 static void unfreeze_page(struct page *page) 1808 { 1809 int i; 1810 1811 for (i = 0; i < HPAGE_PMD_NR; i++) 1812 remove_migration_ptes(page + i, page + i, true); 1813 } 1814 1815 static void __split_huge_page_tail(struct page *head, int tail, 1816 struct lruvec *lruvec, struct list_head *list) 1817 { 1818 struct page *page_tail = head + tail; 1819 1820 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 1821 VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); 1822 1823 /* 1824 * tail_page->_refcount is zero and not changing from under us. But 1825 * get_page_unless_zero() may be running from under us on the 1826 * tail_page. If we used atomic_set() below instead of atomic_inc() or 1827 * atomic_add(), we would then run atomic_set() concurrently with 1828 * get_page_unless_zero(), and atomic_set() is implemented in C not 1829 * using locked ops. spin_unlock on x86 sometime uses locked ops 1830 * because of PPro errata 66, 92, so unless somebody can guarantee 1831 * atomic_set() here would be safe on all archs (and not only on x86), 1832 * it's safer to use atomic_inc()/atomic_add(). 1833 */ 1834 if (PageAnon(head)) { 1835 page_ref_inc(page_tail); 1836 } else { 1837 /* Additional pin to radix tree */ 1838 page_ref_add(page_tail, 2); 1839 } 1840 1841 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1842 page_tail->flags |= (head->flags & 1843 ((1L << PG_referenced) | 1844 (1L << PG_swapbacked) | 1845 (1L << PG_mlocked) | 1846 (1L << PG_uptodate) | 1847 (1L << PG_active) | 1848 (1L << PG_locked) | 1849 (1L << PG_unevictable) | 1850 (1L << PG_dirty))); 1851 1852 /* 1853 * After clearing PageTail the gup refcount can be released. 1854 * Page flags also must be visible before we make the page non-compound. 
static void __split_huge_page(struct page *page, struct list_head *list,
		unsigned long flags)
{
	struct page *head = compound_head(page);
	struct zone *zone = page_zone(head);
	struct lruvec *lruvec;
	pgoff_t end = -1;
	int i;

	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);

	/* complete the memcg work before adding pages to the LRU */
	mem_cgroup_split_huge_fixup(head);

	if (!PageAnon(page))
		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);

	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
		__split_huge_page_tail(head, i, lruvec, list);
		/* Some pages can be beyond i_size: drop them from page cache */
		if (head[i].index >= end) {
			__ClearPageDirty(head + i);
			__delete_from_page_cache(head + i, NULL);
			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
				shmem_uncharge(head->mapping->host, 1);
			put_page(head + i);
		}
	}

	ClearPageCompound(head);
	/* See comment in __split_huge_page_tail() */
	if (PageAnon(head)) {
		page_ref_inc(head);
	} else {
		/* Additional pin for the radix tree */
		page_ref_add(head, 2);
		spin_unlock(&head->mapping->tree_lock);
	}

	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);

	unfreeze_page(head);

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		struct page *subpage = head + i;
		if (subpage == page)
			continue;
		unlock_page(subpage);

		/*
		 * Subpages may be freed if there wasn't any mapping left,
		 * e.g. if add_to_swap() is running on an LRU page that had
		 * its mapping zapped. Freeing these pages requires taking
		 * the lru_lock, so we do the put_page of the tail pages
		 * after the split is complete.
		 */
		put_page(subpage);
	}
}

int total_mapcount(struct page *page)
{
	int i, compound, ret;

	VM_BUG_ON_PAGE(PageTail(page), page);

	if (likely(!PageCompound(page)))
		return atomic_read(&page->_mapcount) + 1;

	compound = compound_mapcount(page);
	if (PageHuge(page))
		return compound;
	ret = compound;
	for (i = 0; i < HPAGE_PMD_NR; i++)
		ret += atomic_read(&page[i]._mapcount) + 1;
	/* File pages have compound_mapcount included in _mapcount */
	if (!PageAnon(page))
		return ret - compound * HPAGE_PMD_NR;
	if (PageDoubleMap(page))
		ret -= HPAGE_PMD_NR;
	return ret;
}
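/*
 * Illustrative sketch (not part of the original file; the helper name is
 * hypothetical): how total_mapcount() relates to the refcount when deciding
 * whether a THP can be split.  Every mapping holds one page reference, so
 * once the caller's pin (and, for file pages, the radix tree pins) are
 * subtracted, the remaining references must all be mappings or the split
 * has to fail.  This mirrors the racy check in split_huge_page_to_list()
 * below.
 */
static inline bool __maybe_unused thp_looks_splittable(struct page *head,
							int extra_pins)
{
	return total_mapcount(head) == page_count(head) - extra_pins - 1;
}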
/*
 * This calculates accurately how many mappings a transparent hugepage
 * has (unlike page_mapcount() which isn't fully accurate). This full
 * accuracy is primarily needed to know if copy-on-write faults can
 * reuse the page and change the mapping to read-write instead of
 * copying it. At the same time this returns the total_mapcount too.
 *
 * The function returns the highest mapcount any one of the subpages
 * has. If the return value is one, even if different processes are
 * mapping different subpages of the transparent hugepage, they can
 * all reuse it, because each process is reusing a different subpage.
 *
 * The total_mapcount is instead counting all virtual mappings of the
 * subpages. If the total_mapcount is equal to "one", it tells the
 * caller all mappings belong to the same "mm" and in turn the
 * anon_vma of the transparent hugepage can become the vma->anon_vma
 * local one as no other process may be mapping any of the subpages.
 *
 * It would be more accurate to replace page_mapcount() with
 * page_trans_huge_mapcount(), however we only use
 * page_trans_huge_mapcount() in the copy-on-write faults where we
 * need full accuracy to avoid breaking page pinning, because
 * page_trans_huge_mapcount() is slower than page_mapcount().
 */
int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
{
	int i, ret, _total_mapcount, mapcount;

	/* hugetlbfs shouldn't call it */
	VM_BUG_ON_PAGE(PageHuge(page), page);

	if (likely(!PageTransCompound(page))) {
		mapcount = atomic_read(&page->_mapcount) + 1;
		if (total_mapcount)
			*total_mapcount = mapcount;
		return mapcount;
	}

	page = compound_head(page);

	_total_mapcount = ret = 0;
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mapcount = atomic_read(&page[i]._mapcount) + 1;
		ret = max(ret, mapcount);
		_total_mapcount += mapcount;
	}
	if (PageDoubleMap(page)) {
		ret -= 1;
		_total_mapcount -= HPAGE_PMD_NR;
	}
	mapcount = compound_mapcount(page);
	ret += mapcount;
	_total_mapcount += mapcount;
	if (total_mapcount)
		*total_mapcount = _total_mapcount;
	return ret;
}
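/*
 * Illustrative sketch (hypothetical caller, not from this file): how a
 * copy-on-write fault handler might use page_trans_huge_mapcount() the way
 * the comment above describes.  The real reuse decision lives in the fault
 * paths and also has to take swap references into account.
 */
static bool __maybe_unused thp_cow_can_reuse(struct page *page)
{
	int total_mapcount;

	/*
	 * If no subpage is mapped more than once, every process that maps
	 * part of the THP is touching its own subpage and can keep it.
	 */
	return page_trans_huge_mapcount(page, &total_mapcount) == 1;
}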
/*
 * This function splits a huge page into normal pages. @page can point to any
 * subpage of the huge page to split. Split doesn't change the position of
 * @page.
 *
 * Only the caller may hold a pin on the @page; if anybody else holds extra
 * pins, the split fails with -EBUSY. The huge page must be locked.
 *
 * If @list is null, the tail pages will be added to the LRU list; otherwise
 * they are added to @list.
 *
 * Both head page and tail pages will inherit mapping, flags, and so on from
 * the hugepage.
 *
 * The GUP pin and PG_locked are transferred to @page. The remaining subpages
 * can be freed if they are not mapped.
 *
 * Returns 0 if the hugepage is split successfully.
 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
 * us.
 */
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
	struct page *head = compound_head(page);
	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;
	int count, mapcount, extra_pins, ret;
	bool mlocked;
	unsigned long flags;

	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (PageAnon(head)) {
		/*
		 * The caller does not necessarily hold an mmap_sem that would
		 * prevent the anon_vma from disappearing, so we first take a
		 * reference to it and then lock the anon_vma for write. This
		 * is similar to page_lock_anon_vma_read except the write lock
		 * is taken to serialise against parallel split or collapse
		 * operations.
		 */
		anon_vma = page_get_anon_vma(head);
		if (!anon_vma) {
			ret = -EBUSY;
			goto out;
		}
		extra_pins = 0;
		mapping = NULL;
		anon_vma_lock_write(anon_vma);
	} else {
		mapping = head->mapping;

		/* Truncated? */
		if (!mapping) {
			ret = -EBUSY;
			goto out;
		}

		/* Additional pins from the radix tree */
		extra_pins = HPAGE_PMD_NR;
		anon_vma = NULL;
		i_mmap_lock_read(mapping);
	}

	/*
	 * Racy check whether we can split the page, before freeze_page()
	 * splits the PMDs.
	 */
	if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
		ret = -EBUSY;
		goto out_unlock;
	}

	mlocked = PageMlocked(page);
	freeze_page(head);
	VM_BUG_ON_PAGE(compound_mapcount(head), head);

	/* Make sure the page is not on a per-CPU pagevec, as that takes a pin */
	if (mlocked)
		lru_add_drain();

	/* prevent PageLRU from going away from under us, and freeze the LRU stats */
	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);

	if (mapping) {
		void **pslot;

		spin_lock(&mapping->tree_lock);
		pslot = radix_tree_lookup_slot(&mapping->page_tree,
				page_index(head));
		/*
		 * Check if the head page is present in the radix tree.
		 * We assume all tails are present too, if the head is there.
		 */
		if (radix_tree_deref_slot_protected(pslot,
					&mapping->tree_lock) != head)
			goto fail;
	}

	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&pgdata->split_queue_lock);
	count = page_count(head);
	mapcount = total_mapcount(head);
	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
		if (!list_empty(page_deferred_list(head))) {
			pgdata->split_queue_len--;
			list_del(page_deferred_list(head));
		}
		if (mapping)
			__dec_node_page_state(page, NR_SHMEM_THPS);
		spin_unlock(&pgdata->split_queue_lock);
		__split_huge_page(page, list, flags);
		ret = 0;
	} else {
		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
			pr_alert("total_mapcount: %u, page_count(): %u\n",
					mapcount, count);
			if (PageTail(page))
				dump_page(head, NULL);
			dump_page(page, "total_mapcount(head) > 0");
			BUG();
		}
		spin_unlock(&pgdata->split_queue_lock);
fail:		if (mapping)
			spin_unlock(&mapping->tree_lock);
		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
		unfreeze_page(head);
		ret = -EBUSY;
	}

out_unlock:
	if (anon_vma) {
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_read(mapping);
out:
	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	return ret;
}
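/*
 * Illustrative sketch (hypothetical caller, not from this file): the calling
 * convention documented above.  The caller pins and locks the page, attempts
 * the split, then drops the lock and the pin; on success @page is now a
 * regular order-0 page.  split_huge_page() is the header wrapper around
 * split_huge_page_to_list(page, NULL).
 */
static int __maybe_unused try_to_split_thp(struct page *page)
{
	int ret;

	if (!get_page_unless_zero(page))
		return -EBUSY;

	lock_page(page);
	ret = split_huge_page(page);
	unlock_page(page);
	put_page(page);

	return ret;
}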
void free_transhuge_page(struct page *page)
{
	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
	unsigned long flags;

	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
	if (!list_empty(page_deferred_list(page))) {
		pgdata->split_queue_len--;
		list_del(page_deferred_list(page));
	}
	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
	free_compound_page(page);
}

void deferred_split_huge_page(struct page *page)
{
	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageTransHuge(page), page);

	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
	if (list_empty(page_deferred_list(page))) {
		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
		pgdata->split_queue_len++;
	}
	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
}

static unsigned long deferred_split_count(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	return ACCESS_ONCE(pgdata->split_queue_len);
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	unsigned long flags;
	LIST_HEAD(list), *pos, *next;
	struct page *page;
	int split = 0;

	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
	/* Take a pin on all head pages to avoid freeing them under us */
	list_for_each_safe(pos, next, &pgdata->split_queue) {
		page = list_entry((void *)pos, struct page, mapping);
		page = compound_head(page);
		if (get_page_unless_zero(page)) {
			list_move(page_deferred_list(page), &list);
		} else {
			/* We lost the race with put_compound_page() */
			list_del_init(page_deferred_list(page));
			pgdata->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);

	list_for_each_safe(pos, next, &list) {
		page = list_entry((void *)pos, struct page, mapping);
		lock_page(page);
		/* split_huge_page() removes page from list on success */
		if (!split_huge_page(page))
			split++;
		unlock_page(page);
		put_page(page);
	}

	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
	list_splice_tail(&list, &pgdata->split_queue);
	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);

	/*
	 * Stop the shrinker if we didn't split any page but the queue is
	 * empty anyway. This can happen if the pages were freed under us.
	 */
	if (!split && list_empty(&pgdata->split_queue))
		return SHRINK_STOP;
	return split;
}
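/*
 * Illustrative note (not part of the original file): deferred_split_huge_page()
 * is typically called when a THP becomes partially unmapped (for example when
 * only some of its subpages are unmapped by the rmap code).  The split cannot
 * be done directly in that context, so the page is queued and the shrinker
 * below splits it under memory pressure, letting the unused subpages be
 * freed back to the allocator.
 */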
static struct shrinker deferred_split_shrinker = {
	.count_objects = deferred_split_count,
	.scan_objects = deferred_split_scan,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE,
};

#ifdef CONFIG_DEBUG_FS
static int split_huge_pages_set(void *data, u64 val)
{
	struct zone *zone;
	struct page *page;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	if (val != 1)
		return -EINVAL;

	for_each_populated_zone(zone) {
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			if (!pfn_valid(pfn))
				continue;

			page = pfn_to_page(pfn);
			if (!get_page_unless_zero(page))
				continue;

			if (zone != page_zone(page))
				goto next;

			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
				goto next;

			total++;
			lock_page(page);
			if (!split_huge_page(page))
				split++;
			unlock_page(page);
next:
			put_page(page);
		}
	}

	pr_info("%lu of %lu THP split\n", split, total);

	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
		"%llu\n");

static int __init split_huge_pages_debugfs(void)
{
	void *ret;

	ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			&split_huge_pages_fops);
	if (!ret)
		pr_warn("Failed to create split_huge_pages in debugfs\n");
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif
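/*
 * Illustrative usage note (not part of the original file): the debugfs knob
 * created above is a testing aid that walks every populated zone and tries
 * to split every THP it finds, e.g. from userspace:
 *
 *	# echo 1 > /sys/kernel/debug/split_huge_pages
 *
 * Any value other than 1 is rejected with -EINVAL, and the number of pages
 * split is reported via pr_info() in the kernel log.
 */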