/*
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_EXCEED_NONE_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PAGE_RO,
	SCAN_NO_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_SWAP_CACHE_PAGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications without a guaranteed
 * benefit. When transparent hugepage support is enabled, it is enabled for
 * all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/* default: scan 8*512 ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * By default, collapse a hugepage if there is at least one pte mapped like
 * it would have been mapped had the vma been large enough during the page
 * fault.
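 *
 * khugepaged_max_ptes_none defaults to HPAGE_PMD_NR - 1 (511 for 2MB huge
 * pages); the default value is assigned in hugepage_init() below.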
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;

static int khugepaged(void *none);
static int khugepaged_slab_init(void);
static void khugepaged_slab_exit(void);

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that mm to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};
static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

static struct shrinker deferred_split_shrinker;

static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	for_each_populated_zone(zone)
		nr_zones++;

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types. There are 3
	 * MIGRATE_TYPES we care about.
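	 *
	 * As a worked example (assuming 2MB pageblocks, i.e.
	 * pageblock_nr_pages == 512, and a single populated zone): the lines
	 * below add 3 * 3 * 512 pages on top of the 2 * 512 reserved above,
	 * before applying the 5%-of-lowmem cap and converting to kilobytes.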
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow reserving more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}
	setup_per_zone_wmarks();
}

static int start_stop_khugepaged(void)
{
	int err = 0;
	if (khugepaged_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);

		set_recommended_min_free_kbytes();
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
fail:
	return err;
}

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;

struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take an additional reference here. It will be put back by the shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}

void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free the zero page only if the last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ?
		HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS

static ssize_t triple_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag deferred,
				 enum transparent_hugepage_flag req_madv)
{
	if (!memcmp("defer", buf,
		    min(sizeof("defer")-1, count))) {
		if (enabled == deferred)
			return -EINVAL;
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		set_bit(deferred, &transparent_hugepage_flags);
	} else if (!memcmp("always", buf,
			   min(sizeof("always")-1, count))) {
		clear_bit(deferred, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		set_bit(enabled, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(deferred, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		clear_bit(deferred, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret;

	ret = triple_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

	if (ret > 0) {
		int err;

		mutex_lock(&khugepaged_mutex);
		err = start_stop_khugepaged();
		mutex_unlock(&khugepaged_mutex);

		if (err)
			ret = err;
	}

	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);

static ssize_t single_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value >
	    1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

/*
 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive; it's never worth swapping tons of
 * memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [madvise] never\n");
	else
		return sprintf(buf, "always defer madvise [never]\n");

}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return triple_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = kstrtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);

/*
 * max_ptes_none controls whether khugepaged should collapse hugepages over
 * any unmapped ptes, in turn potentially increasing the memory footprint of
 * the vmas. When max_ptes_none is 0, khugepaged will not reduce the available
 * free memory in the system as it runs. Increasing max_ptes_none will instead
 * potentially reduce the free memory in the system during the khugepaged scan.
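 *
 * The value is exposed as
 * /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none and is
 * clamped to HPAGE_PMD_NR - 1 by the store handler below.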
 */
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

static struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_slab_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_slab_exit();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
	return pmd_mkhuge(mk_pmd(page, prot));
}

static inline struct list_head *page_deferred_list(struct page *page)
{
	/*
	 * ->lru in the tail pages is occupied by compound_head.
	 * Let's use ->mapping + ->index in the second tail page as list_head.
	 */
	return (struct list_head *)&page[2].mapping;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in the second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmd,
					struct page *page, gfp_t gfp,
					unsigned int flags)
{
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	spinlock_t *ptl;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable)) {
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		return VM_FAULT_OOM;
	}

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
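	 * (set_pmd_at() is what publishes the mapping; without the barrier
	 * another CPU could observe the pmd before the zeroed contents.)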
	 */
	__SetPageUptodate(page);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_none(*pmd))) {
		spin_unlock(ptl);
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		pte_free(mm, pgtable);
	} else {
		pmd_t entry;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			int ret;

			spin_unlock(ptl);
			mem_cgroup_cancel_charge(page, memcg, true);
			put_page(page);
			pte_free(mm, pgtable);
			ret = handle_userfault(vma, address, flags,
					       VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		mem_cgroup_commit_charge(page, memcg, false, true);
		lru_cache_add_active_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		set_pmd_at(mm, haddr, pmd, entry);
		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&mm->nr_ptes);
		spin_unlock(ptl);
		count_vm_event(THP_FAULT_ALLOC);
	}

	return 0;
}

/*
 * If THP is set to always then directly reclaim/compact as necessary
 * If set to defer then do no reclaim and defer to khugepaged
 * If set to madvise and the VMA is flagged then directly reclaim/compact
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	gfp_t reclaim_flags = 0;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
	    (vma->vm_flags & VM_HUGEPAGE))
		reclaim_flags = __GFP_DIRECT_RECLAIM;
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		reclaim_flags = __GFP_KSWAPD_RECLAIM;
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		reclaim_flags = __GFP_DIRECT_RECLAIM;

	return GFP_TRANSHUGE | reclaim_flags;
}

/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}

/* Caller must hold page table lock.
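 * (both callers below, do_huge_pmd_anonymous_page() and copy_huge_pmd(),
 * take pmd_lock() on the pmd being populated).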
 */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	atomic_long_inc(&mm->nr_ptes);
	return true;
}

int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       unsigned int flags)
{
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
			transparent_hugepage_use_zero_page()) {
		spinlock_t *ptl;
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		int ret;
		pgtable = pte_alloc_one(mm, haddr);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = get_huge_zero_page();
		if (unlikely(!zero_page)) {
			pte_free(mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		ptl = pmd_lock(mm, pmd);
		ret = 0;
		set = false;
		if (pmd_none(*pmd)) {
			if (userfaultfd_missing(vma)) {
				spin_unlock(ptl);
				ret = handle_userfault(vma, address, flags,
						       VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, mm, vma,
						   haddr, pmd,
						   zero_page);
				spin_unlock(ptl);
				set = true;
			}
		} else
			spin_unlock(ptl);
		if (!set) {
			pte_free(mm, pgtable);
			put_huge_zero_page();
		}
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
					    flags);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}
	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);
	spin_unlock(ptl);
}

int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmd, pfn_t pfn, bool write)
{
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
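	 * (the BUG_ON()s below mirror the checks done by the pte-sized
	 * vm_insert_pfn() path).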
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON(!pfn_t_devmap(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;
	if (track_pfn_insert(vma, &pgprot, pfn))
		return VM_FAULT_SIGBUS;
	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd)
{
	pmd_t _pmd;

	/*
	 * We should set the dirty bit only for FOLL_WRITE but for now
	 * the dirty bit in the pmd is meaningless. And if the dirty
	 * bit will become meaningful and we'll only set it with
	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
	 * set the young bit, instead of the current set_pmd_at.
	 */
	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				  pmd, _pmd, 1))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);
	put_dev_pagemap(pgmap);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret;

	if (!vma_is_dax(vma)) {
		ret = -ENOMEM;
		pgtable = pte_alloc_one(dst_mm, addr);
		if (unlikely(!pgtable))
			goto out;
	}

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		struct page *zero_page;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_page = get_huge_zero_page();
		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_page);
		ret = 0;
		goto out_unlock;
	}

	if (!vma_is_dax(vma)) {
		/* thp accounting separate from pmd_devmap accounting */
		src_page = pmd_page(pmd);
		VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
		get_page(src_page);
		page_dup_rmap(src_page, true);
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&dst_mm->nr_ptes);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	}

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

void huge_pmd_set_accessed(struct mm_struct *mm,
			   struct vm_area_struct *vma,
			   unsigned long address,
			   pmd_t *pmd, pmd_t orig_pmd,
			   int dirty)
{
	spinlock_t *ptl;
	pmd_t entry;
	unsigned long haddr;

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	haddr = address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
		update_mmu_cache_pmd(vma, address, pmd);

unlock:
	spin_unlock(ptl);
}

static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
					struct page *page,
					unsigned long haddr)
{
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE,
					       vma, address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
						   &memcg, false))) {
			if (pages[i])
				put_page(pages[i]);
			while (--i >= 0) {
				memcg = (void *)page_private(pages[i]);
				set_page_private(pages[i], 0);
				mem_cgroup_cancel_charge(pages[i], memcg,
						false);
				put_page(pages[i]);
			}
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
		set_page_private(pages[i], (unsigned long)memcg);
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON_PAGE(!PageHead(page), page);

	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry),
				      vma);
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		page_add_new_anon_rmap(pages[i], vma, haddr, false);
		mem_cgroup_commit_charge(pages[i], memcg, false, false);
		lru_cache_add_active_or_unevictable(pages[i], vma);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	page_remove_rmap(page, true);
	spin_unlock(ptl);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		mem_cgroup_cancel_charge(pages[i], memcg, false);
		put_page(pages[i]);
	}
	kfree(pages);
	goto out;
}

int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	spinlock_t *ptl;
	int ret = 0;
	struct page *page = NULL, *new_page;
	struct mem_cgroup *memcg;
	unsigned long haddr;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	gfp_t huge_gfp;			/* for allocation and charge */

	ptl = pmd_lockptr(mm, pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
	haddr = address & HPAGE_PMD_MASK;
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(ptl);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
	/*
	 * We can only reuse the page if nobody else maps the huge page or
	 * its part.
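	 * page_trans_huge_mapcount() == 1 below means this mm holds the only
	 * mapping (pmd or pte-mapped subpages), so the pmd can simply be made
	 * writable again instead of copying.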
	 */
	if (page_trans_huge_mapcount(page, NULL) == 1) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
			update_mmu_cache_pmd(vma, address, pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	get_page(page);
	spin_unlock(ptl);
alloc:
	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow()) {
		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
	} else
		new_page = NULL;

	if (likely(new_page)) {
		prep_transhuge_page(new_page);
	} else {
		if (!page) {
			split_huge_pmd(vma, pmd, address);
			ret |= VM_FAULT_FALLBACK;
		} else {
			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
					pmd, orig_pmd, page, haddr);
			if (ret & VM_FAULT_OOM) {
				split_huge_pmd(vma, pmd, address);
				ret |= VM_FAULT_FALLBACK;
			}
			put_page(page);
		}
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
					   true))) {
		put_page(new_page);
		if (page) {
			split_huge_pmd(vma, pmd, address);
			put_page(page);
		} else
			split_huge_pmd(vma, pmd, address);
		ret |= VM_FAULT_FALLBACK;
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	count_vm_event(THP_FAULT_ALLOC);

	if (!page)
		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(ptl);
	if (page)
		put_page(page);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		spin_unlock(ptl);
		mem_cgroup_cancel_charge(new_page, memcg, true);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr, true);
		mem_cgroup_commit_charge(new_page, memcg, false, true);
		lru_cache_add_active_or_unevictable(new_page, vma);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache_pmd(vma, address, pmd);
		if (!page) {
			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
			put_huge_zero_page();
		} else {
			VM_BUG_ON_PAGE(!PageHead(page), page);
			page_remove_rmap(page, true);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return ret;
out_unlock:
	spin_unlock(ptl);
	return ret;
}

struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		goto out;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
		return ERR_PTR(-EFAULT);

	/* Full NUMA hinting faults to serialise migration in fault paths */
	if ((flags & FOLL_NUMA) &&
	    pmd_protnone(*pmd))
		goto out;

	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page), page);
	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd);
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * We don't mlock() pte-mapped THPs. This way we can avoid
		 * leaking mlocked pages into non-VM_LOCKED VMAs.
		 *
		 * In most cases the pmd is the only mapping of the page as we
		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
		 * writable private mappings in populate_vma_page_range().
		 *
		 * The only scenario when we have the page shared here is if we
		 * are mlocking a read-only mapping shared over fork(). We skip
		 * mlocking such pages.
		 */
		if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
				page->mapping && trylock_page(page)) {
			lru_add_drain();
			if (page->mapping)
				mlock_vma_page(page);
			unlock_page(page);
		}
	}
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page), page);
	if (flags & FOLL_GET)
		get_page(page);

out:
	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
	spinlock_t *ptl;
	struct anon_vma *anon_vma = NULL;
	struct page *page;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	int page_nid = -1, this_nid = numa_node_id();
	int target_nid, last_cpupid = -1;
	bool page_locked;
	bool migrated = false;
	bool was_writable;
	int flags = 0;

	/* A PROT_NONE fault should not end up here */
	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));

	ptl = pmd_lock(mm, pmdp);
	if (unlikely(!pmd_same(pmd, *pmdp)))
		goto out_unlock;

	/*
	 * If there are potential migrations, wait for completion and retry
	 * without disrupting NUMA hinting information. Do not relock and
	 * check_same as the page may no longer be mapped.
	 */
	if (unlikely(pmd_trans_migrating(*pmdp))) {
		page = pmd_page(*pmdp);
		spin_unlock(ptl);
		wait_on_page_locked(page);
		goto out;
	}

	page = pmd_page(pmd);
	BUG_ON(is_huge_zero_page(page));
	page_nid = page_to_nid(page);
	last_cpupid = page_cpupid_last(page);
	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == this_nid) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		flags |= TNF_FAULT_LOCAL;
	}

	/* See similar comment in do_numa_page for explanation */
	if (!(vma->vm_flags & VM_WRITE))
		flags |= TNF_NO_GROUP;

	/*
	 * Acquire the page lock to serialise THP migrations but avoid dropping
	 * page_table_lock if at all possible
	 */
	page_locked = trylock_page(page);
	target_nid = mpol_misplaced(page, vma, haddr);
	if (target_nid == -1) {
		/* If the page was locked, there are no parallel migrations */
		if (page_locked)
			goto clear_pmdnuma;
	}

	/* Migration could have started since the pmd_trans_migrating check */
	if (!page_locked) {
		spin_unlock(ptl);
		wait_on_page_locked(page);
		page_nid = -1;
		goto out;
	}

	/*
	 * Page is misplaced. Page lock serialises migrations.
	 * Acquire anon_vma to serialise splits.
	 */
	get_page(page);
	spin_unlock(ptl);
	anon_vma = page_lock_anon_vma_read(page);

	/* Confirm the PMD did not change while page_table_lock was released */
	spin_lock(ptl);
	if (unlikely(!pmd_same(pmd, *pmdp))) {
		unlock_page(page);
		put_page(page);
		page_nid = -1;
		goto out_unlock;
	}

	/* Bail if we fail to protect against THP splits for any reason */
	if (unlikely(!anon_vma)) {
		put_page(page);
		page_nid = -1;
		goto clear_pmdnuma;
	}

	/*
	 * Migrate the THP to the requested node, returns with page unlocked
	 * and access rights restored.
	 */
	spin_unlock(ptl);
	migrated = migrate_misplaced_transhuge_page(mm, vma,
				pmdp, pmd, addr, page, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
	} else
		flags |= TNF_MIGRATE_FAIL;

	goto out;
clear_pmdnuma:
	BUG_ON(!PageLocked(page));
	was_writable = pmd_write(pmd);
	pmd = pmd_modify(pmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (was_writable)
		pmd = pmd_mkwrite(pmd);
	set_pmd_at(mm, haddr, pmdp, pmd);
	update_mmu_cache_pmd(vma, addr, pmdp);
	unlock_page(page);
out_unlock:
	spin_unlock(ptl);

out:
	if (anon_vma)
		page_unlock_anon_vma_read(anon_vma);

	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);

	return 0;
}

int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)

{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct page *page;
	struct mm_struct *mm = tlb->mm;
	int ret = 0;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd)) {
		ret = 1;
		goto out;
	}

	page = pmd_page(orig_pmd);
	/*
	 * If other processes are mapping this page, we can't discard
	 * the page unless they all do MADV_FREE so let's skip the page.
	 */
	if (page_mapcount(page) != 1)
		goto out;

	if (!trylock_page(page))
		goto out;

	/*
	 * If the user wants to discard part of the pages of the THP, split it
	 * so MADV_FREE will deactivate only them.
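	 * (split_huge_page() relies on the page lock taken just above).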
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		get_page(page);
		spin_unlock(ptl);
		if (split_huge_page(page)) {
			put_page(page);
			unlock_page(page);
			goto out_unlocked;
		}
		put_page(page);
		unlock_page(page);
		ret = 1;
		goto out_unlocked;
	}

	if (PageDirty(page))
		ClearPageDirty(page);
	unlock_page(page);

	if (PageActive(page))
		deactivate_page(page);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}
	ret = 1;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_dax(vma)) {
		spin_unlock(ptl);
		if (is_huge_zero_pmd(orig_pmd))
			tlb_remove_page(tlb, pmd_page(orig_pmd));
	} else if (is_huge_zero_pmd(orig_pmd)) {
		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
		atomic_long_dec(&tlb->mm->nr_ptes);
		spin_unlock(ptl);
		tlb_remove_page(tlb, pmd_page(orig_pmd));
	} else {
		struct page *page = pmd_page(orig_pmd);
		page_remove_rmap(page, true);
		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		VM_BUG_ON_PAGE(!PageHead(page), page);
		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
		atomic_long_dec(&tlb->mm->nr_ptes);
		spin_unlock(ptl);
		tlb_remove_page(tlb, page);
	}
	return 1;
}

bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, unsigned long old_end,
		  pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;

	if ((old_addr & ~HPAGE_PMD_MASK) ||
	    (new_addr & ~HPAGE_PMD_MASK) ||
	    old_end - old_addr < HPAGE_PMD_SIZE)
		return false;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_sem prevents deadlock.
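	 * (mremap() runs with mmap_sem held for writing, so no concurrent
	 * fault or split can take these two locks in the opposite order).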
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
				vma_is_anonymous(vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, pgprot_t newprot, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	int ret = 0;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		pmd_t entry;
		bool preserve_write = prot_numa && pmd_write(*pmd);
		ret = 1;

		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (prot_numa && is_huge_zero_pmd(*pmd)) {
			spin_unlock(ptl);
			return ret;
		}

		if (!prot_numa || !pmd_protnone(*pmd)) {
			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
			entry = pmd_modify(entry, newprot);
			if (preserve_write)
				entry = pmd_mkwrite(entry);
			ret = HPAGE_PMD_NR;
			set_pmd_at(mm, addr, pmd, entry);
			BUG_ON(!preserve_write && pmd_write(entry));
		}
		spin_unlock(ptl);
	}

	return ret;
}

/*
 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns a page table lock pointer, this routine returns
 * without unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;
	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)

int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		/*
		 * Be somewhat over-protective like KSM for now!
		 */
		if (*vm_flags & VM_NO_THP)
			return -EINVAL;
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes good for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
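		 * khugepaged_enter_vma_merge() only registers the mm on the
		 * scan list; the actual collapse happens later from the
		 * khugepaged thread.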
		 */
		if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
			return -ENOMEM;
		break;
	case MADV_NOHUGEPAGE:
		/*
		 * Be somewhat over-protective like KSM for now!
		 */
		if (*vm_flags & VM_NO_THP)
			return -EINVAL;
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}

static int __init khugepaged_slab_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	return 0;
}

static void __init khugepaged_slab_exit(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;

	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
		if (mm == mm_slot->mm)
			return mm_slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}

static inline int khugepaged_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

int __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		free_mm_slot(mm_slot);
		return 0;
	}

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	atomic_inc(&mm->mm_count);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);

	return 0;
}

int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	unsigned long hstart, hend;
	if (!vma->anon_vma)
		/*
		 * Not yet faulted in so we will register later in the
		 * page fault if needed.
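		 * (do_huge_pmd_anonymous_page() above calls
		 * khugepaged_enter() at fault time).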
1953 */ 1954 return 0; 1955 if (vma->vm_ops || (vm_flags & VM_NO_THP)) 1956 /* khugepaged not yet working on file or special mappings */ 1957 return 0; 1958 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1959 hend = vma->vm_end & HPAGE_PMD_MASK; 1960 if (hstart < hend) 1961 return khugepaged_enter(vma, vm_flags); 1962 return 0; 1963 } 1964 1965 void __khugepaged_exit(struct mm_struct *mm) 1966 { 1967 struct mm_slot *mm_slot; 1968 int free = 0; 1969 1970 spin_lock(&khugepaged_mm_lock); 1971 mm_slot = get_mm_slot(mm); 1972 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1973 hash_del(&mm_slot->hash); 1974 list_del(&mm_slot->mm_node); 1975 free = 1; 1976 } 1977 spin_unlock(&khugepaged_mm_lock); 1978 1979 if (free) { 1980 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1981 free_mm_slot(mm_slot); 1982 mmdrop(mm); 1983 } else if (mm_slot) { 1984 /* 1985 * This is required to serialize against 1986 * khugepaged_test_exit() (which is guaranteed to run 1987 * under mmap sem read mode). Stop here (after we 1988 * return all pagetables will be destroyed) until 1989 * khugepaged has finished working on the pagetables 1990 * under the mmap_sem. 1991 */ 1992 down_write(&mm->mmap_sem); 1993 up_write(&mm->mmap_sem); 1994 } 1995 } 1996 1997 static void release_pte_page(struct page *page) 1998 { 1999 /* 0 stands for page_is_file_cache(page) == false */ 2000 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 2001 unlock_page(page); 2002 putback_lru_page(page); 2003 } 2004 2005 static void release_pte_pages(pte_t *pte, pte_t *_pte) 2006 { 2007 while (--_pte >= pte) { 2008 pte_t pteval = *_pte; 2009 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) 2010 release_pte_page(pte_page(pteval)); 2011 } 2012 } 2013 2014 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2015 unsigned long address, 2016 pte_t *pte) 2017 { 2018 struct page *page = NULL; 2019 pte_t *_pte; 2020 int none_or_zero = 0, result = 0; 2021 bool referenced = false, writable = false; 2022 2023 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2024 _pte++, address += PAGE_SIZE) { 2025 pte_t pteval = *_pte; 2026 if (pte_none(pteval) || (pte_present(pteval) && 2027 is_zero_pfn(pte_pfn(pteval)))) { 2028 if (!userfaultfd_armed(vma) && 2029 ++none_or_zero <= khugepaged_max_ptes_none) { 2030 continue; 2031 } else { 2032 result = SCAN_EXCEED_NONE_PTE; 2033 goto out; 2034 } 2035 } 2036 if (!pte_present(pteval)) { 2037 result = SCAN_PTE_NON_PRESENT; 2038 goto out; 2039 } 2040 page = vm_normal_page(vma, address, pteval); 2041 if (unlikely(!page)) { 2042 result = SCAN_PAGE_NULL; 2043 goto out; 2044 } 2045 2046 VM_BUG_ON_PAGE(PageCompound(page), page); 2047 VM_BUG_ON_PAGE(!PageAnon(page), page); 2048 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 2049 2050 /* 2051 * We can do it before isolate_lru_page because the 2052 * page can't be freed from under us. NOTE: PG_lock 2053 * is needed to serialize against split_huge_page 2054 * when invoked from the VM. 2055 */ 2056 if (!trylock_page(page)) { 2057 result = SCAN_PAGE_LOCK; 2058 goto out; 2059 } 2060 2061 /* 2062 * cannot use mapcount: can't collapse if there's a gup pin. 2063 * The page must only be referenced by the scanned process 2064 * and page swap cache. 
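		 * In other words the only references allowed here are the one
		 * from this mapping plus, possibly, one from the swap cache,
		 * which is exactly what the check below computes.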
2065 */ 2066 if (page_count(page) != 1 + !!PageSwapCache(page)) { 2067 unlock_page(page); 2068 result = SCAN_PAGE_COUNT; 2069 goto out; 2070 } 2071 if (pte_write(pteval)) { 2072 writable = true; 2073 } else { 2074 if (PageSwapCache(page) && 2075 !reuse_swap_page(page, NULL)) { 2076 unlock_page(page); 2077 result = SCAN_SWAP_CACHE_PAGE; 2078 goto out; 2079 } 2080 /* 2081 * Page is not in the swap cache. It can be collapsed 2082 * into a THP. 2083 */ 2084 } 2085 2086 /* 2087 * Isolate the page to avoid collapsing an hugepage 2088 * currently in use by the VM. 2089 */ 2090 if (isolate_lru_page(page)) { 2091 unlock_page(page); 2092 result = SCAN_DEL_PAGE_LRU; 2093 goto out; 2094 } 2095 /* 0 stands for page_is_file_cache(page) == false */ 2096 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2097 VM_BUG_ON_PAGE(!PageLocked(page), page); 2098 VM_BUG_ON_PAGE(PageLRU(page), page); 2099 2100 /* If there is no mapped pte young don't collapse the page */ 2101 if (pte_young(pteval) || 2102 page_is_young(page) || PageReferenced(page) || 2103 mmu_notifier_test_young(vma->vm_mm, address)) 2104 referenced = true; 2105 } 2106 if (likely(writable)) { 2107 if (likely(referenced)) { 2108 result = SCAN_SUCCEED; 2109 trace_mm_collapse_huge_page_isolate(page, none_or_zero, 2110 referenced, writable, result); 2111 return 1; 2112 } 2113 } else { 2114 result = SCAN_PAGE_RO; 2115 } 2116 2117 out: 2118 release_pte_pages(pte, _pte); 2119 trace_mm_collapse_huge_page_isolate(page, none_or_zero, 2120 referenced, writable, result); 2121 return 0; 2122 } 2123 2124 static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2125 struct vm_area_struct *vma, 2126 unsigned long address, 2127 spinlock_t *ptl) 2128 { 2129 pte_t *_pte; 2130 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 2131 pte_t pteval = *_pte; 2132 struct page *src_page; 2133 2134 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2135 clear_user_highpage(page, address); 2136 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2137 if (is_zero_pfn(pte_pfn(pteval))) { 2138 /* 2139 * ptl mostly unnecessary. 2140 */ 2141 spin_lock(ptl); 2142 /* 2143 * paravirt calls inside pte_clear here are 2144 * superfluous. 2145 */ 2146 pte_clear(vma->vm_mm, address, _pte); 2147 spin_unlock(ptl); 2148 } 2149 } else { 2150 src_page = pte_page(pteval); 2151 copy_user_highpage(page, src_page, address, vma); 2152 VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); 2153 release_pte_page(src_page); 2154 /* 2155 * ptl mostly unnecessary, but preempt has to 2156 * be disabled to update the per-cpu stats 2157 * inside page_remove_rmap(). 2158 */ 2159 spin_lock(ptl); 2160 /* 2161 * paravirt calls inside pte_clear here are 2162 * superfluous. 2163 */ 2164 pte_clear(vma->vm_mm, address, _pte); 2165 page_remove_rmap(src_page, false); 2166 spin_unlock(ptl); 2167 free_page_and_swap_cache(src_page); 2168 } 2169 2170 address += PAGE_SIZE; 2171 page++; 2172 } 2173 } 2174 2175 static void khugepaged_alloc_sleep(void) 2176 { 2177 DEFINE_WAIT(wait); 2178 2179 add_wait_queue(&khugepaged_wait, &wait); 2180 freezable_schedule_timeout_interruptible( 2181 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2182 remove_wait_queue(&khugepaged_wait, &wait); 2183 } 2184 2185 static int khugepaged_node_load[MAX_NUMNODES]; 2186 2187 static bool khugepaged_scan_abort(int nid) 2188 { 2189 int i; 2190 2191 /* 2192 * If zone_reclaim_mode is disabled, then no extra effort is made to 2193 * allocate memory locally. 
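	 * Otherwise abort the collapse as soon as the scan encounters pages
	 * on nodes farther apart than RECLAIM_DISTANCE, mirroring the
	 * zone-reclaim placement policy enforced below.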
2194 */ 2195 if (!zone_reclaim_mode) 2196 return false; 2197 2198 /* If there is a count for this node already, it must be acceptable */ 2199 if (khugepaged_node_load[nid]) 2200 return false; 2201 2202 for (i = 0; i < MAX_NUMNODES; i++) { 2203 if (!khugepaged_node_load[i]) 2204 continue; 2205 if (node_distance(nid, i) > RECLAIM_DISTANCE) 2206 return true; 2207 } 2208 return false; 2209 } 2210 2211 #ifdef CONFIG_NUMA 2212 static int khugepaged_find_target_node(void) 2213 { 2214 static int last_khugepaged_target_node = NUMA_NO_NODE; 2215 int nid, target_node = 0, max_value = 0; 2216 2217 /* find first node with max normal pages hit */ 2218 for (nid = 0; nid < MAX_NUMNODES; nid++) 2219 if (khugepaged_node_load[nid] > max_value) { 2220 max_value = khugepaged_node_load[nid]; 2221 target_node = nid; 2222 } 2223 2224 /* do some balance if several nodes have the same hit record */ 2225 if (target_node <= last_khugepaged_target_node) 2226 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; 2227 nid++) 2228 if (max_value == khugepaged_node_load[nid]) { 2229 target_node = nid; 2230 break; 2231 } 2232 2233 last_khugepaged_target_node = target_node; 2234 return target_node; 2235 } 2236 2237 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2238 { 2239 if (IS_ERR(*hpage)) { 2240 if (!*wait) 2241 return false; 2242 2243 *wait = false; 2244 *hpage = NULL; 2245 khugepaged_alloc_sleep(); 2246 } else if (*hpage) { 2247 put_page(*hpage); 2248 *hpage = NULL; 2249 } 2250 2251 return true; 2252 } 2253 2254 static struct page * 2255 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, 2256 unsigned long address, int node) 2257 { 2258 VM_BUG_ON_PAGE(*hpage, *hpage); 2259 2260 /* 2261 * Before allocating the hugepage, release the mmap_sem read lock. 2262 * The allocation can take potentially a long time if it involves 2263 * sync compaction, and we do not need to hold the mmap_sem during 2264 * that. We will recheck the vma after taking it again in write mode. 
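	 * (collapse_huge_page() re-validates the vma with find_vma() and
	 * hugepage_vma_check() once mmap_sem is re-taken for write.)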
2265 */ 2266 up_read(&mm->mmap_sem); 2267 2268 *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); 2269 if (unlikely(!*hpage)) { 2270 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2271 *hpage = ERR_PTR(-ENOMEM); 2272 return NULL; 2273 } 2274 2275 prep_transhuge_page(*hpage); 2276 count_vm_event(THP_COLLAPSE_ALLOC); 2277 return *hpage; 2278 } 2279 #else 2280 static int khugepaged_find_target_node(void) 2281 { 2282 return 0; 2283 } 2284 2285 static inline struct page *alloc_khugepaged_hugepage(void) 2286 { 2287 struct page *page; 2288 2289 page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), 2290 HPAGE_PMD_ORDER); 2291 if (page) 2292 prep_transhuge_page(page); 2293 return page; 2294 } 2295 2296 static struct page *khugepaged_alloc_hugepage(bool *wait) 2297 { 2298 struct page *hpage; 2299 2300 do { 2301 hpage = alloc_khugepaged_hugepage(); 2302 if (!hpage) { 2303 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2304 if (!*wait) 2305 return NULL; 2306 2307 *wait = false; 2308 khugepaged_alloc_sleep(); 2309 } else 2310 count_vm_event(THP_COLLAPSE_ALLOC); 2311 } while (unlikely(!hpage) && likely(khugepaged_enabled())); 2312 2313 return hpage; 2314 } 2315 2316 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2317 { 2318 if (!*hpage) 2319 *hpage = khugepaged_alloc_hugepage(wait); 2320 2321 if (unlikely(!*hpage)) 2322 return false; 2323 2324 return true; 2325 } 2326 2327 static struct page * 2328 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, 2329 unsigned long address, int node) 2330 { 2331 up_read(&mm->mmap_sem); 2332 VM_BUG_ON(!*hpage); 2333 2334 return *hpage; 2335 } 2336 #endif 2337 2338 static bool hugepage_vma_check(struct vm_area_struct *vma) 2339 { 2340 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2341 (vma->vm_flags & VM_NOHUGEPAGE)) 2342 return false; 2343 if (!vma->anon_vma || vma->vm_ops) 2344 return false; 2345 if (is_vma_temporary_stack(vma)) 2346 return false; 2347 return !(vma->vm_flags & VM_NO_THP); 2348 } 2349 2350 static void collapse_huge_page(struct mm_struct *mm, 2351 unsigned long address, 2352 struct page **hpage, 2353 struct vm_area_struct *vma, 2354 int node) 2355 { 2356 pmd_t *pmd, _pmd; 2357 pte_t *pte; 2358 pgtable_t pgtable; 2359 struct page *new_page; 2360 spinlock_t *pmd_ptl, *pte_ptl; 2361 int isolated = 0, result = 0; 2362 unsigned long hstart, hend; 2363 struct mem_cgroup *memcg; 2364 unsigned long mmun_start; /* For mmu_notifiers */ 2365 unsigned long mmun_end; /* For mmu_notifiers */ 2366 gfp_t gfp; 2367 2368 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2369 2370 /* Only allocate from the target node */ 2371 gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; 2372 2373 /* release the mmap_sem read lock. */ 2374 new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); 2375 if (!new_page) { 2376 result = SCAN_ALLOC_HUGE_PAGE_FAIL; 2377 goto out_nolock; 2378 } 2379 2380 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { 2381 result = SCAN_CGROUP_CHARGE_FAIL; 2382 goto out_nolock; 2383 } 2384 2385 /* 2386 * Prevent all access to pagetables with the exception of 2387 * gup_fast later hanlded by the ptep_clear_flush and the VM 2388 * handled by the anon_vma lock + PG_lock. 
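	 * Taking mmap_sem for write below also keeps the fault path from
	 * instantiating new ptes in the range while it is being collapsed.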
2389 */ 2390 down_write(&mm->mmap_sem); 2391 if (unlikely(khugepaged_test_exit(mm))) { 2392 result = SCAN_ANY_PROCESS; 2393 goto out; 2394 } 2395 2396 vma = find_vma(mm, address); 2397 if (!vma) { 2398 result = SCAN_VMA_NULL; 2399 goto out; 2400 } 2401 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2402 hend = vma->vm_end & HPAGE_PMD_MASK; 2403 if (address < hstart || address + HPAGE_PMD_SIZE > hend) { 2404 result = SCAN_ADDRESS_RANGE; 2405 goto out; 2406 } 2407 if (!hugepage_vma_check(vma)) { 2408 result = SCAN_VMA_CHECK; 2409 goto out; 2410 } 2411 pmd = mm_find_pmd(mm, address); 2412 if (!pmd) { 2413 result = SCAN_PMD_NULL; 2414 goto out; 2415 } 2416 2417 anon_vma_lock_write(vma->anon_vma); 2418 2419 pte = pte_offset_map(pmd, address); 2420 pte_ptl = pte_lockptr(mm, pmd); 2421 2422 mmun_start = address; 2423 mmun_end = address + HPAGE_PMD_SIZE; 2424 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2425 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 2426 /* 2427 * After this gup_fast can't run anymore. This also removes 2428 * any huge TLB entry from the CPU so we won't allow 2429 * huge and small TLB entries for the same virtual address 2430 * to avoid the risk of CPU bugs in that area. 2431 */ 2432 _pmd = pmdp_collapse_flush(vma, address, pmd); 2433 spin_unlock(pmd_ptl); 2434 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2435 2436 spin_lock(pte_ptl); 2437 isolated = __collapse_huge_page_isolate(vma, address, pte); 2438 spin_unlock(pte_ptl); 2439 2440 if (unlikely(!isolated)) { 2441 pte_unmap(pte); 2442 spin_lock(pmd_ptl); 2443 BUG_ON(!pmd_none(*pmd)); 2444 /* 2445 * We can only use set_pmd_at when establishing 2446 * hugepmds and never for establishing regular pmds that 2447 * points to regular pagetables. Use pmd_populate for that 2448 */ 2449 pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 2450 spin_unlock(pmd_ptl); 2451 anon_vma_unlock_write(vma->anon_vma); 2452 result = SCAN_FAIL; 2453 goto out; 2454 } 2455 2456 /* 2457 * All pages are isolated and locked so anon_vma rmap 2458 * can't run anymore. 2459 */ 2460 anon_vma_unlock_write(vma->anon_vma); 2461 2462 __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); 2463 pte_unmap(pte); 2464 __SetPageUptodate(new_page); 2465 pgtable = pmd_pgtable(_pmd); 2466 2467 _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); 2468 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2469 2470 /* 2471 * spin_lock() below is not the equivalent of smp_wmb(), so 2472 * this is needed to avoid the copy_huge_page writes to become 2473 * visible after the set_pmd_at() write. 
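	 * In other words, make sure the copied page contents are visible to
	 * other CPUs before the huge pmd that publishes them is set.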
2474 */ 2475 smp_wmb(); 2476 2477 spin_lock(pmd_ptl); 2478 BUG_ON(!pmd_none(*pmd)); 2479 page_add_new_anon_rmap(new_page, vma, address, true); 2480 mem_cgroup_commit_charge(new_page, memcg, false, true); 2481 lru_cache_add_active_or_unevictable(new_page, vma); 2482 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2483 set_pmd_at(mm, address, pmd, _pmd); 2484 update_mmu_cache_pmd(vma, address, pmd); 2485 spin_unlock(pmd_ptl); 2486 2487 *hpage = NULL; 2488 2489 khugepaged_pages_collapsed++; 2490 result = SCAN_SUCCEED; 2491 out_up_write: 2492 up_write(&mm->mmap_sem); 2493 trace_mm_collapse_huge_page(mm, isolated, result); 2494 return; 2495 2496 out_nolock: 2497 trace_mm_collapse_huge_page(mm, isolated, result); 2498 return; 2499 out: 2500 mem_cgroup_cancel_charge(new_page, memcg, true); 2501 goto out_up_write; 2502 } 2503 2504 static int khugepaged_scan_pmd(struct mm_struct *mm, 2505 struct vm_area_struct *vma, 2506 unsigned long address, 2507 struct page **hpage) 2508 { 2509 pmd_t *pmd; 2510 pte_t *pte, *_pte; 2511 int ret = 0, none_or_zero = 0, result = 0; 2512 struct page *page = NULL; 2513 unsigned long _address; 2514 spinlock_t *ptl; 2515 int node = NUMA_NO_NODE; 2516 bool writable = false, referenced = false; 2517 2518 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2519 2520 pmd = mm_find_pmd(mm, address); 2521 if (!pmd) { 2522 result = SCAN_PMD_NULL; 2523 goto out; 2524 } 2525 2526 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 2527 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2528 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2529 _pte++, _address += PAGE_SIZE) { 2530 pte_t pteval = *_pte; 2531 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2532 if (!userfaultfd_armed(vma) && 2533 ++none_or_zero <= khugepaged_max_ptes_none) { 2534 continue; 2535 } else { 2536 result = SCAN_EXCEED_NONE_PTE; 2537 goto out_unmap; 2538 } 2539 } 2540 if (!pte_present(pteval)) { 2541 result = SCAN_PTE_NON_PRESENT; 2542 goto out_unmap; 2543 } 2544 if (pte_write(pteval)) 2545 writable = true; 2546 2547 page = vm_normal_page(vma, _address, pteval); 2548 if (unlikely(!page)) { 2549 result = SCAN_PAGE_NULL; 2550 goto out_unmap; 2551 } 2552 2553 /* TODO: teach khugepaged to collapse THP mapped with pte */ 2554 if (PageCompound(page)) { 2555 result = SCAN_PAGE_COMPOUND; 2556 goto out_unmap; 2557 } 2558 2559 /* 2560 * Record which node the original page is from and save this 2561 * information to khugepaged_node_load[]. 2562 * Khupaged will allocate hugepage from the node has the max 2563 * hit record. 2564 */ 2565 node = page_to_nid(page); 2566 if (khugepaged_scan_abort(node)) { 2567 result = SCAN_SCAN_ABORT; 2568 goto out_unmap; 2569 } 2570 khugepaged_node_load[node]++; 2571 if (!PageLRU(page)) { 2572 result = SCAN_PAGE_LRU; 2573 goto out_unmap; 2574 } 2575 if (PageLocked(page)) { 2576 result = SCAN_PAGE_LOCK; 2577 goto out_unmap; 2578 } 2579 if (!PageAnon(page)) { 2580 result = SCAN_PAGE_ANON; 2581 goto out_unmap; 2582 } 2583 2584 /* 2585 * cannot use mapcount: can't collapse if there's a gup pin. 2586 * The page must only be referenced by the scanned process 2587 * and page swap cache. 
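		 * The same check is repeated under the pte lock in
		 * __collapse_huge_page_isolate() before the actual collapse.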
2588 */ 2589 if (page_count(page) != 1 + !!PageSwapCache(page)) { 2590 result = SCAN_PAGE_COUNT; 2591 goto out_unmap; 2592 } 2593 if (pte_young(pteval) || 2594 page_is_young(page) || PageReferenced(page) || 2595 mmu_notifier_test_young(vma->vm_mm, address)) 2596 referenced = true; 2597 } 2598 if (writable) { 2599 if (referenced) { 2600 result = SCAN_SUCCEED; 2601 ret = 1; 2602 } else { 2603 result = SCAN_NO_REFERENCED_PAGE; 2604 } 2605 } else { 2606 result = SCAN_PAGE_RO; 2607 } 2608 out_unmap: 2609 pte_unmap_unlock(pte, ptl); 2610 if (ret) { 2611 node = khugepaged_find_target_node(); 2612 /* collapse_huge_page will return with the mmap_sem released */ 2613 collapse_huge_page(mm, address, hpage, vma, node); 2614 } 2615 out: 2616 trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, 2617 none_or_zero, result); 2618 return ret; 2619 } 2620 2621 static void collect_mm_slot(struct mm_slot *mm_slot) 2622 { 2623 struct mm_struct *mm = mm_slot->mm; 2624 2625 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2626 2627 if (khugepaged_test_exit(mm)) { 2628 /* free mm_slot */ 2629 hash_del(&mm_slot->hash); 2630 list_del(&mm_slot->mm_node); 2631 2632 /* 2633 * Not strictly needed because the mm exited already. 2634 * 2635 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2636 */ 2637 2638 /* khugepaged_mm_lock actually not necessary for the below */ 2639 free_mm_slot(mm_slot); 2640 mmdrop(mm); 2641 } 2642 } 2643 2644 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2645 struct page **hpage) 2646 __releases(&khugepaged_mm_lock) 2647 __acquires(&khugepaged_mm_lock) 2648 { 2649 struct mm_slot *mm_slot; 2650 struct mm_struct *mm; 2651 struct vm_area_struct *vma; 2652 int progress = 0; 2653 2654 VM_BUG_ON(!pages); 2655 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2656 2657 if (khugepaged_scan.mm_slot) 2658 mm_slot = khugepaged_scan.mm_slot; 2659 else { 2660 mm_slot = list_entry(khugepaged_scan.mm_head.next, 2661 struct mm_slot, mm_node); 2662 khugepaged_scan.address = 0; 2663 khugepaged_scan.mm_slot = mm_slot; 2664 } 2665 spin_unlock(&khugepaged_mm_lock); 2666 2667 mm = mm_slot->mm; 2668 down_read(&mm->mmap_sem); 2669 if (unlikely(khugepaged_test_exit(mm))) 2670 vma = NULL; 2671 else 2672 vma = find_vma(mm, khugepaged_scan.address); 2673 2674 progress++; 2675 for (; vma; vma = vma->vm_next) { 2676 unsigned long hstart, hend; 2677 2678 cond_resched(); 2679 if (unlikely(khugepaged_test_exit(mm))) { 2680 progress++; 2681 break; 2682 } 2683 if (!hugepage_vma_check(vma)) { 2684 skip: 2685 progress++; 2686 continue; 2687 } 2688 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2689 hend = vma->vm_end & HPAGE_PMD_MASK; 2690 if (hstart >= hend) 2691 goto skip; 2692 if (khugepaged_scan.address > hend) 2693 goto skip; 2694 if (khugepaged_scan.address < hstart) 2695 khugepaged_scan.address = hstart; 2696 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2697 2698 while (khugepaged_scan.address < hend) { 2699 int ret; 2700 cond_resched(); 2701 if (unlikely(khugepaged_test_exit(mm))) 2702 goto breakouterloop; 2703 2704 VM_BUG_ON(khugepaged_scan.address < hstart || 2705 khugepaged_scan.address + HPAGE_PMD_SIZE > 2706 hend); 2707 ret = khugepaged_scan_pmd(mm, vma, 2708 khugepaged_scan.address, 2709 hpage); 2710 /* move to next address */ 2711 khugepaged_scan.address += HPAGE_PMD_SIZE; 2712 progress += HPAGE_PMD_NR; 2713 if (ret) 2714 /* we released mmap_sem so break loop */ 2715 goto breakouterloop_mmap_sem; 2716 if (progress >= pages) 2717 goto 
breakouterloop; 2718 } 2719 } 2720 breakouterloop: 2721 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2722 breakouterloop_mmap_sem: 2723 2724 spin_lock(&khugepaged_mm_lock); 2725 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2726 /* 2727 * Release the current mm_slot if this mm is about to die, or 2728 * if we scanned all vmas of this mm. 2729 */ 2730 if (khugepaged_test_exit(mm) || !vma) { 2731 /* 2732 * Make sure that if mm_users is reaching zero while 2733 * khugepaged runs here, khugepaged_exit will find 2734 * mm_slot not pointing to the exiting mm. 2735 */ 2736 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2737 khugepaged_scan.mm_slot = list_entry( 2738 mm_slot->mm_node.next, 2739 struct mm_slot, mm_node); 2740 khugepaged_scan.address = 0; 2741 } else { 2742 khugepaged_scan.mm_slot = NULL; 2743 khugepaged_full_scans++; 2744 } 2745 2746 collect_mm_slot(mm_slot); 2747 } 2748 2749 return progress; 2750 } 2751 2752 static int khugepaged_has_work(void) 2753 { 2754 return !list_empty(&khugepaged_scan.mm_head) && 2755 khugepaged_enabled(); 2756 } 2757 2758 static int khugepaged_wait_event(void) 2759 { 2760 return !list_empty(&khugepaged_scan.mm_head) || 2761 kthread_should_stop(); 2762 } 2763 2764 static void khugepaged_do_scan(void) 2765 { 2766 struct page *hpage = NULL; 2767 unsigned int progress = 0, pass_through_head = 0; 2768 unsigned int pages = khugepaged_pages_to_scan; 2769 bool wait = true; 2770 2771 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2772 2773 while (progress < pages) { 2774 if (!khugepaged_prealloc_page(&hpage, &wait)) 2775 break; 2776 2777 cond_resched(); 2778 2779 if (unlikely(kthread_should_stop() || try_to_freeze())) 2780 break; 2781 2782 spin_lock(&khugepaged_mm_lock); 2783 if (!khugepaged_scan.mm_slot) 2784 pass_through_head++; 2785 if (khugepaged_has_work() && 2786 pass_through_head < 2) 2787 progress += khugepaged_scan_mm_slot(pages - progress, 2788 &hpage); 2789 else 2790 progress = pages; 2791 spin_unlock(&khugepaged_mm_lock); 2792 } 2793 2794 if (!IS_ERR_OR_NULL(hpage)) 2795 put_page(hpage); 2796 } 2797 2798 static bool khugepaged_should_wakeup(void) 2799 { 2800 return kthread_should_stop() || 2801 time_after_eq(jiffies, khugepaged_sleep_expire); 2802 } 2803 2804 static void khugepaged_wait_work(void) 2805 { 2806 if (khugepaged_has_work()) { 2807 const unsigned long scan_sleep_jiffies = 2808 msecs_to_jiffies(khugepaged_scan_sleep_millisecs); 2809 2810 if (!scan_sleep_jiffies) 2811 return; 2812 2813 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; 2814 wait_event_freezable_timeout(khugepaged_wait, 2815 khugepaged_should_wakeup(), 2816 scan_sleep_jiffies); 2817 return; 2818 } 2819 2820 if (khugepaged_enabled()) 2821 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2822 } 2823 2824 static int khugepaged(void *none) 2825 { 2826 struct mm_slot *mm_slot; 2827 2828 set_freezable(); 2829 set_user_nice(current, MAX_NICE); 2830 2831 while (!kthread_should_stop()) { 2832 khugepaged_do_scan(); 2833 khugepaged_wait_work(); 2834 } 2835 2836 spin_lock(&khugepaged_mm_lock); 2837 mm_slot = khugepaged_scan.mm_slot; 2838 khugepaged_scan.mm_slot = NULL; 2839 if (mm_slot) 2840 collect_mm_slot(mm_slot); 2841 spin_unlock(&khugepaged_mm_lock); 2842 return 0; 2843 } 2844 2845 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2846 unsigned long haddr, pmd_t *pmd) 2847 { 2848 struct mm_struct *mm = vma->vm_mm; 2849 pgtable_t pgtable; 2850 pmd_t _pmd; 2851 int i; 2852 2853 /* leave pmd empty 
until pte is filled */ 2854 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2855 2856 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2857 pmd_populate(mm, &_pmd, pgtable); 2858 2859 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2860 pte_t *pte, entry; 2861 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2862 entry = pte_mkspecial(entry); 2863 pte = pte_offset_map(&_pmd, haddr); 2864 VM_BUG_ON(!pte_none(*pte)); 2865 set_pte_at(mm, haddr, pte, entry); 2866 pte_unmap(pte); 2867 } 2868 smp_wmb(); /* make pte visible before pmd */ 2869 pmd_populate(mm, pmd, pgtable); 2870 put_huge_zero_page(); 2871 } 2872 2873 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2874 unsigned long haddr, bool freeze) 2875 { 2876 struct mm_struct *mm = vma->vm_mm; 2877 struct page *page; 2878 pgtable_t pgtable; 2879 pmd_t _pmd; 2880 bool young, write, dirty; 2881 unsigned long addr; 2882 int i; 2883 2884 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2885 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2886 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2887 VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); 2888 2889 count_vm_event(THP_SPLIT_PMD); 2890 2891 if (vma_is_dax(vma)) { 2892 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2893 if (is_huge_zero_pmd(_pmd)) 2894 put_huge_zero_page(); 2895 return; 2896 } else if (is_huge_zero_pmd(*pmd)) { 2897 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2898 } 2899 2900 page = pmd_page(*pmd); 2901 VM_BUG_ON_PAGE(!page_count(page), page); 2902 page_ref_add(page, HPAGE_PMD_NR - 1); 2903 write = pmd_write(*pmd); 2904 young = pmd_young(*pmd); 2905 dirty = pmd_dirty(*pmd); 2906 2907 pmdp_huge_split_prepare(vma, haddr, pmd); 2908 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2909 pmd_populate(mm, &_pmd, pgtable); 2910 2911 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2912 pte_t entry, *pte; 2913 /* 2914 * Note that NUMA hinting access restrictions are not 2915 * transferred to avoid any possibility of altering 2916 * permissions across VMAs. 2917 */ 2918 if (freeze) { 2919 swp_entry_t swp_entry; 2920 swp_entry = make_migration_entry(page + i, write); 2921 entry = swp_entry_to_pte(swp_entry); 2922 } else { 2923 entry = mk_pte(page + i, vma->vm_page_prot); 2924 entry = maybe_mkwrite(entry, vma); 2925 if (!write) 2926 entry = pte_wrprotect(entry); 2927 if (!young) 2928 entry = pte_mkold(entry); 2929 } 2930 if (dirty) 2931 SetPageDirty(page + i); 2932 pte = pte_offset_map(&_pmd, addr); 2933 BUG_ON(!pte_none(*pte)); 2934 set_pte_at(mm, addr, pte, entry); 2935 atomic_inc(&page[i]._mapcount); 2936 pte_unmap(pte); 2937 } 2938 2939 /* 2940 * Set PG_double_map before dropping compound_mapcount to avoid 2941 * false-negative page_mapped(). 2942 */ 2943 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { 2944 for (i = 0; i < HPAGE_PMD_NR; i++) 2945 atomic_inc(&page[i]._mapcount); 2946 } 2947 2948 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 2949 /* Last compound_mapcount is gone. */ 2950 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 2951 if (TestClearPageDoubleMap(page)) { 2952 /* No need in mapcount reference anymore */ 2953 for (i = 0; i < HPAGE_PMD_NR; i++) 2954 atomic_dec(&page[i]._mapcount); 2955 } 2956 } 2957 2958 smp_wmb(); /* make pte visible before pmd */ 2959 /* 2960 * Up to this point the pmd is present and huge and userland has the 2961 * whole access to the hugepage during the split (which happens in 2962 * place). 
If we overwrite the pmd with the not-huge version pointing
	 * to the pte here (which of course we could if all CPUs were bug
	 * free), userland could trigger a small page size TLB miss on the
	 * small sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPUs don't like that.
	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
	 * 383 on page 93. Intel should be safe but it also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLBs are identical (which should be the case here).
	 * But it is generally safer to never allow small and huge TLB entries
	 * for the same virtual address to be loaded simultaneously. So instead
	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
	 * current pmd not present (atomically, because here the pmd_trans_huge
	 * and pmd_trans_splitting must remain set on the pmd at all times
	 * until the split is complete for this pmd), then we flush the SMP TLB
	 * and finally we write the non-huge version of the pmd entry with
	 * pmd_populate.
	 */
	pmdp_invalidate(vma, haddr, pmd);
	pmd_populate(mm, pmd, pgtable);

	if (freeze) {
		for (i = 0; i < HPAGE_PMD_NR; i++) {
			page_remove_rmap(page + i, false);
			put_page(page + i);
		}
	}
}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze)
{
	spinlock_t *ptl;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
	ptl = pmd_lock(mm, pmd);
	if (pmd_trans_huge(*pmd)) {
		struct page *page = pmd_page(*pmd);
		if (PageMlocked(page))
			clear_page_mlock(page);
	} else if (!pmd_devmap(*pmd))
		goto out;
	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
}

void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
		bool freeze, struct page *page)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
		return;

	/*
	 * If the caller asks to set up migration entries, we need a page to
	 * check the pmd against. Otherwise we can end up replacing the
	 * wrong page.
	 */
	VM_BUG_ON(freeze && !page);
	if (page && page != pmd_page(*pmd))
		return;

	/*
	 * The caller holds mmap_sem in write mode or the anon_vma lock,
	 * so a huge pmd cannot materialize from under us (khugepaged
	 * holds both mmap_sem in write mode and the anon_vma lock in
	 * write mode).
	 */
	__split_huge_pmd(vma, pmd, address, freeze);
}

void vma_adjust_trans_huge(struct vm_area_struct *vma,
			 unsigned long start,
			 unsigned long end,
			 long adjust_next)
{
	/*
	 * If the new start address isn't hpage aligned and it could
	 * previously contain a hugepage: check if we need to split
	 * a huge pmd.
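	 * For example (assuming 2MB huge pages), moving the start of a vma
	 * whose [2M, 4M) range is mapped by a huge pmd up to 3M leaves the
	 * new start unaligned, so that huge pmd has to be split first.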
3057 */ 3058 if (start & ~HPAGE_PMD_MASK && 3059 (start & HPAGE_PMD_MASK) >= vma->vm_start && 3060 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3061 split_huge_pmd_address(vma, start, false, NULL); 3062 3063 /* 3064 * If the new end address isn't hpage aligned and it could 3065 * previously contain an hugepage: check if we need to split 3066 * an huge pmd. 3067 */ 3068 if (end & ~HPAGE_PMD_MASK && 3069 (end & HPAGE_PMD_MASK) >= vma->vm_start && 3070 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3071 split_huge_pmd_address(vma, end, false, NULL); 3072 3073 /* 3074 * If we're also updating the vma->vm_next->vm_start, if the new 3075 * vm_next->vm_start isn't page aligned and it could previously 3076 * contain an hugepage: check if we need to split an huge pmd. 3077 */ 3078 if (adjust_next > 0) { 3079 struct vm_area_struct *next = vma->vm_next; 3080 unsigned long nstart = next->vm_start; 3081 nstart += adjust_next << PAGE_SHIFT; 3082 if (nstart & ~HPAGE_PMD_MASK && 3083 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 3084 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 3085 split_huge_pmd_address(next, nstart, false, NULL); 3086 } 3087 } 3088 3089 static void freeze_page(struct page *page) 3090 { 3091 enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | 3092 TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; 3093 int i, ret; 3094 3095 VM_BUG_ON_PAGE(!PageHead(page), page); 3096 3097 /* We only need TTU_SPLIT_HUGE_PMD once */ 3098 ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); 3099 for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { 3100 /* Cut short if the page is unmapped */ 3101 if (page_count(page) == 1) 3102 return; 3103 3104 ret = try_to_unmap(page + i, ttu_flags); 3105 } 3106 VM_BUG_ON(ret); 3107 } 3108 3109 static void unfreeze_page(struct page *page) 3110 { 3111 int i; 3112 3113 for (i = 0; i < HPAGE_PMD_NR; i++) 3114 remove_migration_ptes(page + i, page + i, true); 3115 } 3116 3117 static void __split_huge_page_tail(struct page *head, int tail, 3118 struct lruvec *lruvec, struct list_head *list) 3119 { 3120 struct page *page_tail = head + tail; 3121 3122 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 3123 VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); 3124 3125 /* 3126 * tail_page->_refcount is zero and not changing from under us. But 3127 * get_page_unless_zero() may be running from under us on the 3128 * tail_page. If we used atomic_set() below instead of atomic_inc(), we 3129 * would then run atomic_set() concurrently with 3130 * get_page_unless_zero(), and atomic_set() is implemented in C not 3131 * using locked ops. spin_unlock on x86 sometime uses locked ops 3132 * because of PPro errata 66, 92, so unless somebody can guarantee 3133 * atomic_set() here would be safe on all archs (and not only on x86), 3134 * it's safer to use atomic_inc(). 3135 */ 3136 page_ref_inc(page_tail); 3137 3138 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 3139 page_tail->flags |= (head->flags & 3140 ((1L << PG_referenced) | 3141 (1L << PG_swapbacked) | 3142 (1L << PG_mlocked) | 3143 (1L << PG_uptodate) | 3144 (1L << PG_active) | 3145 (1L << PG_locked) | 3146 (1L << PG_unevictable) | 3147 (1L << PG_dirty))); 3148 3149 /* 3150 * After clearing PageTail the gup refcount can be released. 3151 * Page flags also must be visible before we make the page non-compound. 
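	 * The smp_wmb() below provides that ordering before
	 * clear_compound_head() turns the tail into a standalone page.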
3152 */ 3153 smp_wmb(); 3154 3155 clear_compound_head(page_tail); 3156 3157 if (page_is_young(head)) 3158 set_page_young(page_tail); 3159 if (page_is_idle(head)) 3160 set_page_idle(page_tail); 3161 3162 /* ->mapping in first tail page is compound_mapcount */ 3163 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 3164 page_tail); 3165 page_tail->mapping = head->mapping; 3166 3167 page_tail->index = head->index + tail; 3168 page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); 3169 lru_add_page_tail(head, page_tail, lruvec, list); 3170 } 3171 3172 static void __split_huge_page(struct page *page, struct list_head *list) 3173 { 3174 struct page *head = compound_head(page); 3175 struct zone *zone = page_zone(head); 3176 struct lruvec *lruvec; 3177 int i; 3178 3179 /* prevent PageLRU to go away from under us, and freeze lru stats */ 3180 spin_lock_irq(&zone->lru_lock); 3181 lruvec = mem_cgroup_page_lruvec(head, zone); 3182 3183 /* complete memcg works before add pages to LRU */ 3184 mem_cgroup_split_huge_fixup(head); 3185 3186 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) 3187 __split_huge_page_tail(head, i, lruvec, list); 3188 3189 ClearPageCompound(head); 3190 spin_unlock_irq(&zone->lru_lock); 3191 3192 unfreeze_page(head); 3193 3194 for (i = 0; i < HPAGE_PMD_NR; i++) { 3195 struct page *subpage = head + i; 3196 if (subpage == page) 3197 continue; 3198 unlock_page(subpage); 3199 3200 /* 3201 * Subpages may be freed if there wasn't any mapping 3202 * like if add_to_swap() is running on a lru page that 3203 * had its mapping zapped. And freeing these pages 3204 * requires taking the lru_lock so we do the put_page 3205 * of the tail pages after the split is complete. 3206 */ 3207 put_page(subpage); 3208 } 3209 } 3210 3211 int total_mapcount(struct page *page) 3212 { 3213 int i, ret; 3214 3215 VM_BUG_ON_PAGE(PageTail(page), page); 3216 3217 if (likely(!PageCompound(page))) 3218 return atomic_read(&page->_mapcount) + 1; 3219 3220 ret = compound_mapcount(page); 3221 if (PageHuge(page)) 3222 return ret; 3223 for (i = 0; i < HPAGE_PMD_NR; i++) 3224 ret += atomic_read(&page[i]._mapcount) + 1; 3225 if (PageDoubleMap(page)) 3226 ret -= HPAGE_PMD_NR; 3227 return ret; 3228 } 3229 3230 /* 3231 * This calculates accurately how many mappings a transparent hugepage 3232 * has (unlike page_mapcount() which isn't fully accurate). This full 3233 * accuracy is primarily needed to know if copy-on-write faults can 3234 * reuse the page and change the mapping to read-write instead of 3235 * copying them. At the same time this returns the total_mapcount too. 3236 * 3237 * The function returns the highest mapcount any one of the subpages 3238 * has. If the return value is one, even if different processes are 3239 * mapping different subpages of the transparent hugepage, they can 3240 * all reuse it, because each process is reusing a different subpage. 3241 * 3242 * The total_mapcount is instead counting all virtual mappings of the 3243 * subpages. If the total_mapcount is equal to "one", it tells the 3244 * caller all mappings belong to the same "mm" and in turn the 3245 * anon_vma of the transparent hugepage can become the vma->anon_vma 3246 * local one as no other process may be mapping any of the subpages. 
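 * For example (with 4K pages and 2M THPs), a pte-mapped THP whose 512
 * subpages are each mapped exactly once, possibly by different processes,
 * has a total_mapcount of 512 but a return value of 1, so each mapping may
 * reuse its own subpage on a write fault.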
3247 * 3248 * It would be more accurate to replace page_mapcount() with 3249 * page_trans_huge_mapcount(), however we only use 3250 * page_trans_huge_mapcount() in the copy-on-write faults where we 3251 * need full accuracy to avoid breaking page pinning, because 3252 * page_trans_huge_mapcount() is slower than page_mapcount(). 3253 */ 3254 int page_trans_huge_mapcount(struct page *page, int *total_mapcount) 3255 { 3256 int i, ret, _total_mapcount, mapcount; 3257 3258 /* hugetlbfs shouldn't call it */ 3259 VM_BUG_ON_PAGE(PageHuge(page), page); 3260 3261 if (likely(!PageTransCompound(page))) { 3262 mapcount = atomic_read(&page->_mapcount) + 1; 3263 if (total_mapcount) 3264 *total_mapcount = mapcount; 3265 return mapcount; 3266 } 3267 3268 page = compound_head(page); 3269 3270 _total_mapcount = ret = 0; 3271 for (i = 0; i < HPAGE_PMD_NR; i++) { 3272 mapcount = atomic_read(&page[i]._mapcount) + 1; 3273 ret = max(ret, mapcount); 3274 _total_mapcount += mapcount; 3275 } 3276 if (PageDoubleMap(page)) { 3277 ret -= 1; 3278 _total_mapcount -= HPAGE_PMD_NR; 3279 } 3280 mapcount = compound_mapcount(page); 3281 ret += mapcount; 3282 _total_mapcount += mapcount; 3283 if (total_mapcount) 3284 *total_mapcount = _total_mapcount; 3285 return ret; 3286 } 3287 3288 /* 3289 * This function splits huge page into normal pages. @page can point to any 3290 * subpage of huge page to split. Split doesn't change the position of @page. 3291 * 3292 * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. 3293 * The huge page must be locked. 3294 * 3295 * If @list is null, tail pages will be added to LRU list, otherwise, to @list. 3296 * 3297 * Both head page and tail pages will inherit mapping, flags, and so on from 3298 * the hugepage. 3299 * 3300 * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if 3301 * they are not mapped. 3302 * 3303 * Returns 0 if the hugepage is split successfully. 3304 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under 3305 * us. 3306 */ 3307 int split_huge_page_to_list(struct page *page, struct list_head *list) 3308 { 3309 struct page *head = compound_head(page); 3310 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 3311 struct anon_vma *anon_vma; 3312 int count, mapcount, ret; 3313 bool mlocked; 3314 unsigned long flags; 3315 3316 VM_BUG_ON_PAGE(is_huge_zero_page(page), page); 3317 VM_BUG_ON_PAGE(!PageAnon(page), page); 3318 VM_BUG_ON_PAGE(!PageLocked(page), page); 3319 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 3320 VM_BUG_ON_PAGE(!PageCompound(page), page); 3321 3322 /* 3323 * The caller does not necessarily hold an mmap_sem that would prevent 3324 * the anon_vma disappearing so we first we take a reference to it 3325 * and then lock the anon_vma for write. This is similar to 3326 * page_lock_anon_vma_read except the write lock is taken to serialise 3327 * against parallel split or collapse operations. 
3328 */ 3329 anon_vma = page_get_anon_vma(head); 3330 if (!anon_vma) { 3331 ret = -EBUSY; 3332 goto out; 3333 } 3334 anon_vma_lock_write(anon_vma); 3335 3336 /* 3337 * Racy check if we can split the page, before freeze_page() will 3338 * split PMDs 3339 */ 3340 if (total_mapcount(head) != page_count(head) - 1) { 3341 ret = -EBUSY; 3342 goto out_unlock; 3343 } 3344 3345 mlocked = PageMlocked(page); 3346 freeze_page(head); 3347 VM_BUG_ON_PAGE(compound_mapcount(head), head); 3348 3349 /* Make sure the page is not on per-CPU pagevec as it takes pin */ 3350 if (mlocked) 3351 lru_add_drain(); 3352 3353 /* Prevent deferred_split_scan() touching ->_refcount */ 3354 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3355 count = page_count(head); 3356 mapcount = total_mapcount(head); 3357 if (!mapcount && count == 1) { 3358 if (!list_empty(page_deferred_list(head))) { 3359 pgdata->split_queue_len--; 3360 list_del(page_deferred_list(head)); 3361 } 3362 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3363 __split_huge_page(page, list); 3364 ret = 0; 3365 } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 3366 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3367 pr_alert("total_mapcount: %u, page_count(): %u\n", 3368 mapcount, count); 3369 if (PageTail(page)) 3370 dump_page(head, NULL); 3371 dump_page(page, "total_mapcount(head) > 0"); 3372 BUG(); 3373 } else { 3374 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3375 unfreeze_page(head); 3376 ret = -EBUSY; 3377 } 3378 3379 out_unlock: 3380 anon_vma_unlock_write(anon_vma); 3381 put_anon_vma(anon_vma); 3382 out: 3383 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3384 return ret; 3385 } 3386 3387 void free_transhuge_page(struct page *page) 3388 { 3389 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3390 unsigned long flags; 3391 3392 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3393 if (!list_empty(page_deferred_list(page))) { 3394 pgdata->split_queue_len--; 3395 list_del(page_deferred_list(page)); 3396 } 3397 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3398 free_compound_page(page); 3399 } 3400 3401 void deferred_split_huge_page(struct page *page) 3402 { 3403 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3404 unsigned long flags; 3405 3406 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3407 3408 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3409 if (list_empty(page_deferred_list(page))) { 3410 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 3411 list_add_tail(page_deferred_list(page), &pgdata->split_queue); 3412 pgdata->split_queue_len++; 3413 } 3414 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3415 } 3416 3417 static unsigned long deferred_split_count(struct shrinker *shrink, 3418 struct shrink_control *sc) 3419 { 3420 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3421 return ACCESS_ONCE(pgdata->split_queue_len); 3422 } 3423 3424 static unsigned long deferred_split_scan(struct shrinker *shrink, 3425 struct shrink_control *sc) 3426 { 3427 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3428 unsigned long flags; 3429 LIST_HEAD(list), *pos, *next; 3430 struct page *page; 3431 int split = 0; 3432 3433 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3434 /* Take pin on all head pages to avoid freeing them under us */ 3435 list_for_each_safe(pos, next, &pgdata->split_queue) { 3436 page = list_entry((void *)pos, struct page, mapping); 3437 page = compound_head(page); 3438 if (get_page_unless_zero(page)) { 3439 
list_move(page_deferred_list(page), &list); 3440 } else { 3441 /* We lost race with put_compound_page() */ 3442 list_del_init(page_deferred_list(page)); 3443 pgdata->split_queue_len--; 3444 } 3445 if (!--sc->nr_to_scan) 3446 break; 3447 } 3448 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3449 3450 list_for_each_safe(pos, next, &list) { 3451 page = list_entry((void *)pos, struct page, mapping); 3452 lock_page(page); 3453 /* split_huge_page() removes page from list on success */ 3454 if (!split_huge_page(page)) 3455 split++; 3456 unlock_page(page); 3457 put_page(page); 3458 } 3459 3460 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3461 list_splice_tail(&list, &pgdata->split_queue); 3462 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3463 3464 /* 3465 * Stop shrinker if we didn't split any page, but the queue is empty. 3466 * This can happen if pages were freed under us. 3467 */ 3468 if (!split && list_empty(&pgdata->split_queue)) 3469 return SHRINK_STOP; 3470 return split; 3471 } 3472 3473 static struct shrinker deferred_split_shrinker = { 3474 .count_objects = deferred_split_count, 3475 .scan_objects = deferred_split_scan, 3476 .seeks = DEFAULT_SEEKS, 3477 .flags = SHRINKER_NUMA_AWARE, 3478 }; 3479 3480 #ifdef CONFIG_DEBUG_FS 3481 static int split_huge_pages_set(void *data, u64 val) 3482 { 3483 struct zone *zone; 3484 struct page *page; 3485 unsigned long pfn, max_zone_pfn; 3486 unsigned long total = 0, split = 0; 3487 3488 if (val != 1) 3489 return -EINVAL; 3490 3491 for_each_populated_zone(zone) { 3492 max_zone_pfn = zone_end_pfn(zone); 3493 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 3494 if (!pfn_valid(pfn)) 3495 continue; 3496 3497 page = pfn_to_page(pfn); 3498 if (!get_page_unless_zero(page)) 3499 continue; 3500 3501 if (zone != page_zone(page)) 3502 goto next; 3503 3504 if (!PageHead(page) || !PageAnon(page) || 3505 PageHuge(page)) 3506 goto next; 3507 3508 total++; 3509 lock_page(page); 3510 if (!split_huge_page(page)) 3511 split++; 3512 unlock_page(page); 3513 next: 3514 put_page(page); 3515 } 3516 } 3517 3518 pr_info("%lu of %lu THP split\n", split, total); 3519 3520 return 0; 3521 } 3522 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, 3523 "%llu\n"); 3524 3525 static int __init split_huge_pages_debugfs(void) 3526 { 3527 void *ret; 3528 3529 ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 3530 &split_huge_pages_fops); 3531 if (!ret) 3532 pr_warn("Failed to create split_huge_pages in debugfs"); 3533 return 0; 3534 } 3535 late_initcall(split_huge_pages_debugfs); 3536 #endif 3537
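/*
 * Illustrative usage of the knob created above (assuming debugfs is mounted
 * at the conventional /sys/kernel/debug):
 *
 *	echo 1 > /sys/kernel/debug/split_huge_pages
 *
 * walks every populated zone, attempts to split each anonymous THP it finds,
 * and reports the result as "<split> of <total> THP split" via pr_info().
 */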