/*
 * Copyright (C) 2009  Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

/*
 * By default transparent hugepage support is enabled for all mappings
 * and khugepaged scans all mappings. Defrag is only invoked by
 * khugepaged hugepage allocations and by page faults inside
 * MADV_HUGEPAGE regions, to avoid the risk of slowing down short lived
 * allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);

/* default: scan 8*512 ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default: collapse a hugepage if at least one pte is mapped, just as
 * would have happened if the vma had been large enough during the page
 * fault.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
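
/*
 * Illustrative note (editorial, not compiled into behaviour): the
 * tunables above are exported through sysfs, so the defaults can be
 * overridden at runtime, e.g.:
 *
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *	echo 0 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
 *
 * The wiring is done by hugepage_init_sysfs() further down in this file.
 */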

static int khugepaged(void *none);
static int mm_slots_hash_init(void);
static int khugepaged_slab_init(void);
static void khugepaged_slab_free(void);

#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash __read_mostly;
static struct kmem_cache *mm_slot_cache __read_mostly;

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};
static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

static int set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;
	extern int min_free_kbytes;

	if (!khugepaged_enabled())
		return 0;

	for_each_populated_zone(zone)
		nr_zones++;

	/* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost
	 * free of another type: one for a migratetype to fall back to
	 * and a second to avoid subsequent fallbacks of other types.
	 * There are 3 MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow reserving more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes)
		min_free_kbytes = recommended_min;
	setup_per_zone_wmarks();
	return 0;
}
late_initcall(set_recommended_min_free_kbytes);
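
/*
 * Worked example of the sizing above (illustrative; assumes 4 KiB pages,
 * 2 MiB pageblocks, i.e. pageblock_nr_pages == 512, and 4 populated
 * zones): the MIGRATE_RESERVE term is 512 * 4 * 2 == 4096 pages, and the
 * fallback term is 512 * 4 * 3 * 3 == 18432 pages (MIGRATE_PCPTYPES is
 * 3), for a total of 22528 pages == 88 MiB before the 5%-of-lowmem cap
 * is applied.
 */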

static int start_khugepaged(void)
{
	int err = 0;
	if (khugepaged_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (unlikely(IS_ERR(khugepaged_thread))) {
			printk(KERN_ERR
			       "khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);

		set_recommended_min_free_kbytes();
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}

	return err;
}

#ifdef CONFIG_SYSFS

static ssize_t double_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag enabled,
				enum transparent_hugepage_flag req_madv)
{
	if (test_bit(enabled, &transparent_hugepage_flags)) {
		VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
		return sprintf(buf, "[always] madvise never\n");
	} else if (test_bit(req_madv, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}
static ssize_t double_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag req_madv)
{
	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		set_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return double_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
}
static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret;

	ret = double_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

	if (ret > 0) {
		int err;

		mutex_lock(&khugepaged_mutex);
		err = start_khugepaged();
		mutex_unlock(&khugepaged_mutex);

		if (err)
			ret = err;
	}

	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);

static ssize_t single_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

/*
 * Currently the defrag setting only controls __GFP_WAIT: with defrag
 * off, the hugepage allocation will not wait on reclaim. A blind
 * __GFP_REPEAT would be too aggressive; it's never worth swapping tons
 * of memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return double_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
				TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return double_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};
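
/*
 * Illustrative note: registering the two attribute groups in this file
 * (see hugepage_init_sysfs() below) creates
 * /sys/kernel/mm/transparent_hugepage/{enabled,defrag} (plus debug_cow
 * with CONFIG_DEBUG_VM) from hugepage_attr_group above, and, because
 * khugepaged_attr_group further down has .name = "khugepaged", a
 * khugepaged/ subdirectory holding the khugepaged tunables.
 */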

static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);
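
/*
 * Usage note (illustrative): writing e.g.
 *
 *	echo 100 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
 *
 * takes effect immediately: the wake_up_interruptible() in the store
 * handlers above kicks khugepaged out of its current sleep instead of
 * letting the old interval expire first.
 */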

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = strict_strtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);

/*
 * max_ptes_none controls whether khugepaged should collapse hugepages
 * over any unmapped ptes, in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
 * reduce the available free memory in the system as it runs.
 * Increasing max_ptes_none will instead potentially reduce the free
 * memory in the system during the khugepaged scan.
 */
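/*
 * Worked example (illustrative; x86-64, where HPAGE_PMD_NR == 512):
 * with the default max_ptes_none of 511, a single mapped pte is enough
 * for khugepaged to collapse the surrounding 2 MiB region, while a
 * value of 0 requires all 512 ptes to be present already.
 */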
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = strict_strtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

static struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		printk(KERN_ERR "hugepage: failed kobject create\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		printk(KERN_ERR "hugepage: failed to register hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		printk(KERN_ERR "hugepage: failed to register khugepaged group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		return err;

	err = khugepaged_slab_init();
	if (err)
		goto out;

	err = mm_slots_hash_init();
	if (err) {
		khugepaged_slab_free();
		goto out;
	}

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
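	 *
	 * For example (illustrative, assuming 4 KiB pages, PAGE_SHIFT
	 * == 12): (512 << (20 - PAGE_SHIFT)) == 512 * 256 == 131072
	 * pages, i.e. 512 MB, so the check below fires on machines
	 * with less RAM than that.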
	 */
	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
		transparent_hugepage_flags = 0;

	start_khugepaged();

	return 0;
out:
	hugepage_exit_sysfs(hugepage_kobj);
	return err;
}
module_init(hugepage_init)

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		printk(KERN_WARNING
		       "transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
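
/*
 * Example (illustrative): booting with "transparent_hugepage=madvise"
 * on the kernel command line applies the MADV_HUGEPAGE-only policy
 * before init runs, equivalent to later writing "madvise" to
 * /sys/kernel/mm/transparent_hugepage/enabled.
 */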

static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
{
	pmd_t entry;
	entry = mk_pmd(page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	entry = pmd_mkhuge(entry);
	return entry;
}

static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long haddr, pmd_t *pmd,
					struct page *page)
{
	pgtable_t pgtable;

	VM_BUG_ON(!PageCompound(page));
	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable))
		return VM_FAULT_OOM;

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	__SetPageUptodate(page);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_none(*pmd))) {
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(page);
		put_page(page);
		pte_free(mm, pgtable);
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(page, vma);
		/*
		 * The spinlocking to take the lru_lock inside
		 * page_add_new_anon_rmap() acts as a full memory
		 * barrier to be sure clear_huge_page writes become
		 * visible after the set_pmd_at() write.
		 */
		page_add_new_anon_rmap(page, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
		pgtable_trans_huge_deposit(mm, pgtable);
		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm->nr_ptes++;
		spin_unlock(&mm->page_table_lock);
	}

	return 0;
}

static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}
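
/*
 * Illustrative expansion of the helper above:
 *
 *	alloc_hugepage_gfpmask(1, 0) == GFP_TRANSHUGE
 *		(defrag on: the allocation may wait, reclaim and compact)
 *	alloc_hugepage_gfpmask(0, 0) == GFP_TRANSHUGE & ~__GFP_WAIT
 *		(defrag off: fail fast rather than stall the fault)
 */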

static inline struct page *alloc_hugepage_vma(int defrag,
					      struct vm_area_struct *vma,
					      unsigned long haddr, int nd,
					      gfp_t extra_gfp)
{
	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
			       HPAGE_PMD_ORDER, vma, haddr, nd);
}

#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
			   HPAGE_PMD_ORDER);
}
#endif

int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       unsigned int flags)
{
	struct page *page;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pte_t *pte;

	if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
		if (unlikely(anon_vma_prepare(vma)))
			return VM_FAULT_OOM;
		if (unlikely(khugepaged_enter(vma)))
			return VM_FAULT_OOM;
		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
					  vma, haddr, numa_node_id(), 0);
		if (unlikely(!page)) {
			count_vm_event(THP_FAULT_FALLBACK);
			goto out;
		}
		count_vm_event(THP_FAULT_ALLOC);
		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
			put_page(page);
			goto out;
		}
		if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
							  page))) {
			mem_cgroup_uncharge_page(page);
			put_page(page);
			goto out;
		}

		return 0;
	}
out:
	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if a huge pmd could
	 * materialize from under us from a different thread.
	 */
	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
		return VM_FAULT_OOM;
	/* if a huge pmd materialized from under us just retry later */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;
	/*
	 * A regular pmd is established and it can't morph into a huge
	 * pmd from under us anymore at this point, because we hold the
	 * mmap_sem in read mode and khugepaged takes it in write mode.
	 * So now it's safe to run pte_offset_map().
	 */
	pte = pte_offset_map(pmd, address);
	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable;
	int ret;

	ret = -ENOMEM;
	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		goto out;

	spin_lock(&dst_mm->page_table_lock);
	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	if (unlikely(pmd_trans_splitting(pmd))) {
		/* split huge page running from under us */
		spin_unlock(&src_mm->page_table_lock);
		spin_unlock(&dst_mm->page_table_lock);
		pte_free(dst_mm, pgtable);

		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
		goto out;
	}
	src_page = pmd_page(pmd);
	VM_BUG_ON(!PageHead(src_page));
	get_page(src_page);
	page_dup_rmap(src_page);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
	pgtable_trans_huge_deposit(dst_mm, pgtable);
	dst_mm->nr_ptes++;

	ret = 0;
out_unlock:
	spin_unlock(&src_mm->page_table_lock);
	spin_unlock(&dst_mm->page_table_lock);
out:
	return ret;
}
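
/*
 * A note on the locking above (illustrative): copy_huge_pmd() takes
 * both page_table_locks, dst before src, and spin_lock_nested(...,
 * SINGLE_DEPTH_NESTING) tells lockdep that acquiring a second lock of
 * the same lock class is intentional here rather than a self-deadlock.
 */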

void huge_pmd_set_accessed(struct mm_struct *mm,
			   struct vm_area_struct *vma,
			   unsigned long address,
			   pmd_t *pmd, pmd_t orig_pmd,
			   int dirty)
{
	pmd_t entry;
	unsigned long haddr;

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	haddr = address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
		update_mmu_cache_pmd(vma, address, pmd);

unlock:
	spin_unlock(&mm->page_table_lock);
}

static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
					struct page *page,
					unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE,
					       vma, address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_newpage_charge(pages[i], mm,
						       GFP_KERNEL))) {
			if (pages[i])
				put_page(pages[i]);
			mem_cgroup_uncharge_start();
			while (--i >= 0) {
				mem_cgroup_uncharge_page(pages[i]);
				put_page(pages[i]);
			}
			mem_cgroup_uncharge_end();
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON(!PageHead(page));

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		page_add_new_anon_rmap(pages[i], vma, haddr);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	page_remove_rmap(page);
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	mem_cgroup_uncharge_start();
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mem_cgroup_uncharge_page(pages[i]);
		put_page(pages[i]);
	}
	mem_cgroup_uncharge_end();
	kfree(pages);
	goto out;
}

int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	int ret = 0;
	struct page *page, *new_page;
	unsigned long haddr;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	VM_BUG_ON(!vma->anon_vma);
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
	haddr = address & HPAGE_PMD_MASK;
	if (page_mapcount(page) == 1) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
			update_mmu_cache_pmd(vma, address, pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	get_page(page);
	spin_unlock(&mm->page_table_lock);

	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow())
		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
					      vma, haddr, numa_node_id(), 0);
	else
		new_page = NULL;

	if (unlikely(!new_page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
						   pmd, orig_pmd, page, haddr);
		if (ret & VM_FAULT_OOM)
			split_huge_page(page);
		put_page(page);
		goto out;
	}
	count_vm_event(THP_FAULT_ALLOC);

	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
		put_page(new_page);
		split_huge_page(page);
		put_page(page);
		ret |= VM_FAULT_OOM;
		goto out;
	}

	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	put_page(page);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(new_page);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		VM_BUG_ON(!PageHead(page));
		entry = mk_huge_pmd(new_page, vma);
		pmdp_clear_flush(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
entry); 987b113da65SDavid Miller update_mmu_cache_pmd(vma, address, pmd); 98871e3aac0SAndrea Arcangeli page_remove_rmap(page); 98971e3aac0SAndrea Arcangeli put_page(page); 99071e3aac0SAndrea Arcangeli ret |= VM_FAULT_WRITE; 99171e3aac0SAndrea Arcangeli } 9922ec74c3eSSagi Grimberg spin_unlock(&mm->page_table_lock); 9932ec74c3eSSagi Grimberg out_mn: 9942ec74c3eSSagi Grimberg mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 9952ec74c3eSSagi Grimberg out: 9962ec74c3eSSagi Grimberg return ret; 99771e3aac0SAndrea Arcangeli out_unlock: 99871e3aac0SAndrea Arcangeli spin_unlock(&mm->page_table_lock); 99971e3aac0SAndrea Arcangeli return ret; 100071e3aac0SAndrea Arcangeli } 100171e3aac0SAndrea Arcangeli 1002b676b293SDavid Rientjes struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 100371e3aac0SAndrea Arcangeli unsigned long addr, 100471e3aac0SAndrea Arcangeli pmd_t *pmd, 100571e3aac0SAndrea Arcangeli unsigned int flags) 100671e3aac0SAndrea Arcangeli { 1007b676b293SDavid Rientjes struct mm_struct *mm = vma->vm_mm; 100871e3aac0SAndrea Arcangeli struct page *page = NULL; 100971e3aac0SAndrea Arcangeli 101071e3aac0SAndrea Arcangeli assert_spin_locked(&mm->page_table_lock); 101171e3aac0SAndrea Arcangeli 101271e3aac0SAndrea Arcangeli if (flags & FOLL_WRITE && !pmd_write(*pmd)) 101371e3aac0SAndrea Arcangeli goto out; 101471e3aac0SAndrea Arcangeli 101571e3aac0SAndrea Arcangeli page = pmd_page(*pmd); 101671e3aac0SAndrea Arcangeli VM_BUG_ON(!PageHead(page)); 101771e3aac0SAndrea Arcangeli if (flags & FOLL_TOUCH) { 101871e3aac0SAndrea Arcangeli pmd_t _pmd; 101971e3aac0SAndrea Arcangeli /* 102071e3aac0SAndrea Arcangeli * We should set the dirty bit only for FOLL_WRITE but 102171e3aac0SAndrea Arcangeli * for now the dirty bit in the pmd is meaningless. 102271e3aac0SAndrea Arcangeli * And if the dirty bit will become meaningful and 102371e3aac0SAndrea Arcangeli * we'll only set it with FOLL_WRITE, an atomic 102471e3aac0SAndrea Arcangeli * set_bit will be required on the pmd to set the 102571e3aac0SAndrea Arcangeli * young bit, instead of the current set_pmd_at. 
		 */
		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();
			if (page->mapping)
				mlock_vma_page(page);
			unlock_page(page);
		}
	}
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON(!PageCompound(page));
	if (flags & FOLL_GET)
		get_page_foll(page);

out:
	return page;
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		struct page *page;
		pgtable_t pgtable;
		pmd_t orig_pmd;
		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
		page = pmd_page(orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		page_remove_rmap(page);
		VM_BUG_ON(page_mapcount(page) < 0);
		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		VM_BUG_ON(!PageHead(page));
		tlb->mm->nr_ptes--;
		spin_unlock(&tlb->mm->page_table_lock);
		tlb_remove_page(tlb, page);
		pte_free(tlb->mm, pgtable);
		ret = 1;
	}
	return ret;
}

int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		unsigned char *vec)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		/*
		 * All logical pages in the range are present
		 * if backed by a huge page.
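		 *
		 * For example (illustrative; x86-64 with a 2 MiB thp):
		 * a call covering one whole huge page sets all 512
		 * bytes of the vector to 1 via the memset() below.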
10830ca1634dSJohannes Weiner */ 10840ca1634dSJohannes Weiner spin_unlock(&vma->vm_mm->page_table_lock); 1085025c5b24SNaoya Horiguchi memset(vec, 1, (end - addr) >> PAGE_SHIFT); 1086025c5b24SNaoya Horiguchi ret = 1; 1087025c5b24SNaoya Horiguchi } 10880ca1634dSJohannes Weiner 10890ca1634dSJohannes Weiner return ret; 10900ca1634dSJohannes Weiner } 10910ca1634dSJohannes Weiner 109237a1c49aSAndrea Arcangeli int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 109337a1c49aSAndrea Arcangeli unsigned long old_addr, 109437a1c49aSAndrea Arcangeli unsigned long new_addr, unsigned long old_end, 109537a1c49aSAndrea Arcangeli pmd_t *old_pmd, pmd_t *new_pmd) 109637a1c49aSAndrea Arcangeli { 109737a1c49aSAndrea Arcangeli int ret = 0; 109837a1c49aSAndrea Arcangeli pmd_t pmd; 109937a1c49aSAndrea Arcangeli 110037a1c49aSAndrea Arcangeli struct mm_struct *mm = vma->vm_mm; 110137a1c49aSAndrea Arcangeli 110237a1c49aSAndrea Arcangeli if ((old_addr & ~HPAGE_PMD_MASK) || 110337a1c49aSAndrea Arcangeli (new_addr & ~HPAGE_PMD_MASK) || 110437a1c49aSAndrea Arcangeli old_end - old_addr < HPAGE_PMD_SIZE || 110537a1c49aSAndrea Arcangeli (new_vma->vm_flags & VM_NOHUGEPAGE)) 110637a1c49aSAndrea Arcangeli goto out; 110737a1c49aSAndrea Arcangeli 110837a1c49aSAndrea Arcangeli /* 110937a1c49aSAndrea Arcangeli * The destination pmd shouldn't be established, free_pgtables() 111037a1c49aSAndrea Arcangeli * should have released it. 111137a1c49aSAndrea Arcangeli */ 111237a1c49aSAndrea Arcangeli if (WARN_ON(!pmd_none(*new_pmd))) { 111337a1c49aSAndrea Arcangeli VM_BUG_ON(pmd_trans_huge(*new_pmd)); 111437a1c49aSAndrea Arcangeli goto out; 111537a1c49aSAndrea Arcangeli } 111637a1c49aSAndrea Arcangeli 1117025c5b24SNaoya Horiguchi ret = __pmd_trans_huge_lock(old_pmd, vma); 1118025c5b24SNaoya Horiguchi if (ret == 1) { 111937a1c49aSAndrea Arcangeli pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 112037a1c49aSAndrea Arcangeli VM_BUG_ON(!pmd_none(*new_pmd)); 112137a1c49aSAndrea Arcangeli set_pmd_at(mm, new_addr, new_pmd, pmd); 112237a1c49aSAndrea Arcangeli spin_unlock(&mm->page_table_lock); 112337a1c49aSAndrea Arcangeli } 112437a1c49aSAndrea Arcangeli out: 112537a1c49aSAndrea Arcangeli return ret; 112637a1c49aSAndrea Arcangeli } 112737a1c49aSAndrea Arcangeli 1128cd7548abSJohannes Weiner int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1129cd7548abSJohannes Weiner unsigned long addr, pgprot_t newprot) 1130cd7548abSJohannes Weiner { 1131cd7548abSJohannes Weiner struct mm_struct *mm = vma->vm_mm; 1132cd7548abSJohannes Weiner int ret = 0; 1133cd7548abSJohannes Weiner 1134025c5b24SNaoya Horiguchi if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1135cd7548abSJohannes Weiner pmd_t entry; 1136cd7548abSJohannes Weiner entry = pmdp_get_and_clear(mm, addr, pmd); 1137cd7548abSJohannes Weiner entry = pmd_modify(entry, newprot); 1138cd7548abSJohannes Weiner set_pmd_at(mm, addr, pmd, entry); 1139cd7548abSJohannes Weiner spin_unlock(&vma->vm_mm->page_table_lock); 1140cd7548abSJohannes Weiner ret = 1; 1141cd7548abSJohannes Weiner } 1142cd7548abSJohannes Weiner 1143cd7548abSJohannes Weiner return ret; 1144cd7548abSJohannes Weiner } 1145cd7548abSJohannes Weiner 1146025c5b24SNaoya Horiguchi /* 1147025c5b24SNaoya Horiguchi * Returns 1 if a given pmd maps a stable (not under splitting) thp. 1148025c5b24SNaoya Horiguchi * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
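 *
 * A minimal caller sketch (illustrative only; it simply mirrors the
 * pattern already used by zap_huge_pmd() and change_huge_pmd() above):
 *
 *	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
 *		... operate on the stable huge pmd ...
 *		spin_unlock(&vma->vm_mm->page_table_lock);
 *	}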
1149025c5b24SNaoya Horiguchi * 1150025c5b24SNaoya Horiguchi * Note that if it returns 1, this routine returns without unlocking page 1151025c5b24SNaoya Horiguchi * table locks. So callers must unlock them. 1152025c5b24SNaoya Horiguchi */ 1153025c5b24SNaoya Horiguchi int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1154025c5b24SNaoya Horiguchi { 1155025c5b24SNaoya Horiguchi spin_lock(&vma->vm_mm->page_table_lock); 1156025c5b24SNaoya Horiguchi if (likely(pmd_trans_huge(*pmd))) { 1157025c5b24SNaoya Horiguchi if (unlikely(pmd_trans_splitting(*pmd))) { 1158025c5b24SNaoya Horiguchi spin_unlock(&vma->vm_mm->page_table_lock); 1159025c5b24SNaoya Horiguchi wait_split_huge_page(vma->anon_vma, pmd); 1160025c5b24SNaoya Horiguchi return -1; 1161025c5b24SNaoya Horiguchi } else { 1162025c5b24SNaoya Horiguchi /* Thp mapped by 'pmd' is stable, so we can 1163025c5b24SNaoya Horiguchi * handle it as it is. */ 1164025c5b24SNaoya Horiguchi return 1; 1165025c5b24SNaoya Horiguchi } 1166025c5b24SNaoya Horiguchi } 1167025c5b24SNaoya Horiguchi spin_unlock(&vma->vm_mm->page_table_lock); 1168025c5b24SNaoya Horiguchi return 0; 1169025c5b24SNaoya Horiguchi } 1170025c5b24SNaoya Horiguchi 117171e3aac0SAndrea Arcangeli pmd_t *page_check_address_pmd(struct page *page, 117271e3aac0SAndrea Arcangeli struct mm_struct *mm, 117371e3aac0SAndrea Arcangeli unsigned long address, 117471e3aac0SAndrea Arcangeli enum page_check_address_pmd_flag flag) 117571e3aac0SAndrea Arcangeli { 117671e3aac0SAndrea Arcangeli pmd_t *pmd, *ret = NULL; 117771e3aac0SAndrea Arcangeli 117871e3aac0SAndrea Arcangeli if (address & ~HPAGE_PMD_MASK) 117971e3aac0SAndrea Arcangeli goto out; 118071e3aac0SAndrea Arcangeli 11816219049aSBob Liu pmd = mm_find_pmd(mm, address); 11826219049aSBob Liu if (!pmd) 118371e3aac0SAndrea Arcangeli goto out; 118471e3aac0SAndrea Arcangeli if (pmd_none(*pmd)) 118571e3aac0SAndrea Arcangeli goto out; 118671e3aac0SAndrea Arcangeli if (pmd_page(*pmd) != page) 118771e3aac0SAndrea Arcangeli goto out; 118894fcc585SAndrea Arcangeli /* 118994fcc585SAndrea Arcangeli * split_vma() may create temporary aliased mappings. There is 119094fcc585SAndrea Arcangeli * no risk as long as all huge pmd are found and have their 119194fcc585SAndrea Arcangeli * splitting bit set before __split_huge_page_refcount 119294fcc585SAndrea Arcangeli * runs. Finding the same huge pmd more than once during the 119394fcc585SAndrea Arcangeli * same rmap walk is not a problem. 
119494fcc585SAndrea Arcangeli */ 119594fcc585SAndrea Arcangeli if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && 119694fcc585SAndrea Arcangeli pmd_trans_splitting(*pmd)) 119794fcc585SAndrea Arcangeli goto out; 119871e3aac0SAndrea Arcangeli if (pmd_trans_huge(*pmd)) { 119971e3aac0SAndrea Arcangeli VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && 120071e3aac0SAndrea Arcangeli !pmd_trans_splitting(*pmd)); 120171e3aac0SAndrea Arcangeli ret = pmd; 120271e3aac0SAndrea Arcangeli } 120371e3aac0SAndrea Arcangeli out: 120471e3aac0SAndrea Arcangeli return ret; 120571e3aac0SAndrea Arcangeli } 120671e3aac0SAndrea Arcangeli 120771e3aac0SAndrea Arcangeli static int __split_huge_page_splitting(struct page *page, 120871e3aac0SAndrea Arcangeli struct vm_area_struct *vma, 120971e3aac0SAndrea Arcangeli unsigned long address) 121071e3aac0SAndrea Arcangeli { 121171e3aac0SAndrea Arcangeli struct mm_struct *mm = vma->vm_mm; 121271e3aac0SAndrea Arcangeli pmd_t *pmd; 121371e3aac0SAndrea Arcangeli int ret = 0; 12142ec74c3eSSagi Grimberg /* For mmu_notifiers */ 12152ec74c3eSSagi Grimberg const unsigned long mmun_start = address; 12162ec74c3eSSagi Grimberg const unsigned long mmun_end = address + HPAGE_PMD_SIZE; 121771e3aac0SAndrea Arcangeli 12182ec74c3eSSagi Grimberg mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 121971e3aac0SAndrea Arcangeli spin_lock(&mm->page_table_lock); 122071e3aac0SAndrea Arcangeli pmd = page_check_address_pmd(page, mm, address, 122171e3aac0SAndrea Arcangeli PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 122271e3aac0SAndrea Arcangeli if (pmd) { 122371e3aac0SAndrea Arcangeli /* 122471e3aac0SAndrea Arcangeli * We can't temporarily set the pmd to null in order 122571e3aac0SAndrea Arcangeli * to split it, the pmd must remain marked huge at all 122671e3aac0SAndrea Arcangeli * times or the VM won't take the pmd_trans_huge paths 12272b575eb6SPeter Zijlstra * and it won't wait on the anon_vma->root->mutex to 122871e3aac0SAndrea Arcangeli * serialize against split_huge_page*. 
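 *
 * (pmdp_splitting_flush() below does exactly that: it sets the
 * splitting bit and flushes the TLB while the pmd itself stays
 * huge and present.)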
122971e3aac0SAndrea Arcangeli */ 12302ec74c3eSSagi Grimberg pmdp_splitting_flush(vma, address, pmd); 123171e3aac0SAndrea Arcangeli ret = 1; 123271e3aac0SAndrea Arcangeli } 123371e3aac0SAndrea Arcangeli spin_unlock(&mm->page_table_lock); 12342ec74c3eSSagi Grimberg mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 123571e3aac0SAndrea Arcangeli 123671e3aac0SAndrea Arcangeli return ret; 123771e3aac0SAndrea Arcangeli } 123871e3aac0SAndrea Arcangeli 123971e3aac0SAndrea Arcangeli static void __split_huge_page_refcount(struct page *page) 124071e3aac0SAndrea Arcangeli { 124171e3aac0SAndrea Arcangeli int i; 124271e3aac0SAndrea Arcangeli struct zone *zone = page_zone(page); 1243fa9add64SHugh Dickins struct lruvec *lruvec; 124470b50f94SAndrea Arcangeli int tail_count = 0; 124571e3aac0SAndrea Arcangeli 124671e3aac0SAndrea Arcangeli /* prevent PageLRU from going away from under us, and freeze lru stats */ 124771e3aac0SAndrea Arcangeli spin_lock_irq(&zone->lru_lock); 1248fa9add64SHugh Dickins lruvec = mem_cgroup_page_lruvec(page, zone); 1249fa9add64SHugh Dickins 125071e3aac0SAndrea Arcangeli compound_lock(page); 1251e94c8a9cSKAMEZAWA Hiroyuki /* complete memcg works before adding pages to LRU */ 1252e94c8a9cSKAMEZAWA Hiroyuki mem_cgroup_split_huge_fixup(page); 125371e3aac0SAndrea Arcangeli 125445676885SShaohua Li for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 125571e3aac0SAndrea Arcangeli struct page *page_tail = page + i; 125671e3aac0SAndrea Arcangeli 125770b50f94SAndrea Arcangeli /* tail_page->_mapcount cannot change */ 125870b50f94SAndrea Arcangeli BUG_ON(page_mapcount(page_tail) < 0); 125970b50f94SAndrea Arcangeli tail_count += page_mapcount(page_tail); 126070b50f94SAndrea Arcangeli /* check for overflow */ 126170b50f94SAndrea Arcangeli BUG_ON(tail_count < 0); 126270b50f94SAndrea Arcangeli BUG_ON(atomic_read(&page_tail->_count) != 0); 126370b50f94SAndrea Arcangeli /* 126470b50f94SAndrea Arcangeli * tail_page->_count is zero and not changing from 126570b50f94SAndrea Arcangeli * under us. But get_page_unless_zero() may be running 126670b50f94SAndrea Arcangeli * from under us on the tail_page. If we used 126770b50f94SAndrea Arcangeli * atomic_set() below instead of atomic_add(), we 126870b50f94SAndrea Arcangeli * would then run atomic_set() concurrently with 126970b50f94SAndrea Arcangeli * get_page_unless_zero(), and atomic_set() is 127070b50f94SAndrea Arcangeli * implemented in C not using locked ops. spin_unlock 127170b50f94SAndrea Arcangeli * on x86 sometimes uses locked ops because of PPro 127270b50f94SAndrea Arcangeli * errata 66, 92, so unless somebody can guarantee 127370b50f94SAndrea Arcangeli * atomic_set() here would be safe on all archs (and 127470b50f94SAndrea Arcangeli * not only on x86), it's safer to use atomic_add(). 127570b50f94SAndrea Arcangeli */ 127670b50f94SAndrea Arcangeli atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, 127770b50f94SAndrea Arcangeli &page_tail->_count); 127871e3aac0SAndrea Arcangeli 127971e3aac0SAndrea Arcangeli /* after clearing PageTail the gup refcount can be released */ 128071e3aac0SAndrea Arcangeli smp_mb(); 128171e3aac0SAndrea Arcangeli 1282a6d30dddSJin Dongming /* 1283a6d30dddSJin Dongming * retain hwpoison flag of the poisoned tail page: 1284a6d30dddSJin Dongming * fix for the unsuitable process killed on Guest Machine (KVM) 1285a6d30dddSJin Dongming * by the memory-failure.
1286a6d30dddSJin Dongming */ 1287a6d30dddSJin Dongming page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; 128871e3aac0SAndrea Arcangeli page_tail->flags |= (page->flags & 128971e3aac0SAndrea Arcangeli ((1L << PG_referenced) | 129071e3aac0SAndrea Arcangeli (1L << PG_swapbacked) | 129171e3aac0SAndrea Arcangeli (1L << PG_mlocked) | 129271e3aac0SAndrea Arcangeli (1L << PG_uptodate))); 129371e3aac0SAndrea Arcangeli page_tail->flags |= (1L << PG_dirty); 129471e3aac0SAndrea Arcangeli 129570b50f94SAndrea Arcangeli /* clear PageTail before overwriting first_page */ 129671e3aac0SAndrea Arcangeli smp_wmb(); 129771e3aac0SAndrea Arcangeli 129871e3aac0SAndrea Arcangeli /* 129971e3aac0SAndrea Arcangeli * __split_huge_page_splitting() already set the 130071e3aac0SAndrea Arcangeli * splitting bit in all pmd that could map this 130171e3aac0SAndrea Arcangeli * hugepage, that will ensure no CPU can alter the 130271e3aac0SAndrea Arcangeli * mapcount on the head page. The mapcount is only 130371e3aac0SAndrea Arcangeli * accounted in the head page and it has to be 130471e3aac0SAndrea Arcangeli * transferred to all tail pages in the below code. So 130571e3aac0SAndrea Arcangeli * for this code to be safe, during the split the mapcount 130671e3aac0SAndrea Arcangeli * can't change. But that doesn't mean userland can't 130771e3aac0SAndrea Arcangeli * keep changing and reading the page contents while 130871e3aac0SAndrea Arcangeli * we transfer the mapcount, so the pmd splitting 130971e3aac0SAndrea Arcangeli * status is achieved by setting a reserved bit in the 131071e3aac0SAndrea Arcangeli * pmd, not by clearing the present bit. 131171e3aac0SAndrea Arcangeli */ 131271e3aac0SAndrea Arcangeli page_tail->_mapcount = page->_mapcount; 131371e3aac0SAndrea Arcangeli 131471e3aac0SAndrea Arcangeli BUG_ON(page_tail->mapping); 131571e3aac0SAndrea Arcangeli page_tail->mapping = page->mapping; 131671e3aac0SAndrea Arcangeli 131745676885SShaohua Li page_tail->index = page->index + i; 131871e3aac0SAndrea Arcangeli 131971e3aac0SAndrea Arcangeli BUG_ON(!PageAnon(page_tail)); 132071e3aac0SAndrea Arcangeli BUG_ON(!PageUptodate(page_tail)); 132171e3aac0SAndrea Arcangeli BUG_ON(!PageDirty(page_tail)); 132271e3aac0SAndrea Arcangeli BUG_ON(!PageSwapBacked(page_tail)); 132371e3aac0SAndrea Arcangeli 1324fa9add64SHugh Dickins lru_add_page_tail(page, page_tail, lruvec); 132571e3aac0SAndrea Arcangeli } 132670b50f94SAndrea Arcangeli atomic_sub(tail_count, &page->_count); 132770b50f94SAndrea Arcangeli BUG_ON(atomic_read(&page->_count) <= 0); 132871e3aac0SAndrea Arcangeli 1329fa9add64SHugh Dickins __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); 133079134171SAndrea Arcangeli __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 133179134171SAndrea Arcangeli 133271e3aac0SAndrea Arcangeli ClearPageCompound(page); 133371e3aac0SAndrea Arcangeli compound_unlock(page); 133471e3aac0SAndrea Arcangeli spin_unlock_irq(&zone->lru_lock); 133571e3aac0SAndrea Arcangeli 133671e3aac0SAndrea Arcangeli for (i = 1; i < HPAGE_PMD_NR; i++) { 133771e3aac0SAndrea Arcangeli struct page *page_tail = page + i; 133871e3aac0SAndrea Arcangeli BUG_ON(page_count(page_tail) <= 0); 133971e3aac0SAndrea Arcangeli /* 134071e3aac0SAndrea Arcangeli * Tail pages may be freed if there wasn't any mapping, 134171e3aac0SAndrea Arcangeli * as when add_to_swap() is running on a lru page that 134271e3aac0SAndrea Arcangeli * had its mapping zapped.
And freeing these pages 134371e3aac0SAndrea Arcangeli * requires taking the lru_lock so we do the put_page 134471e3aac0SAndrea Arcangeli * of the tail pages after the split is complete. 134571e3aac0SAndrea Arcangeli */ 134671e3aac0SAndrea Arcangeli put_page(page_tail); 134771e3aac0SAndrea Arcangeli } 134871e3aac0SAndrea Arcangeli 134971e3aac0SAndrea Arcangeli /* 135071e3aac0SAndrea Arcangeli * Only the head page (now a regular page) is required 135171e3aac0SAndrea Arcangeli * to be pinned by the caller. 135271e3aac0SAndrea Arcangeli */ 135371e3aac0SAndrea Arcangeli BUG_ON(page_count(page) <= 0); 135471e3aac0SAndrea Arcangeli } 135571e3aac0SAndrea Arcangeli 135671e3aac0SAndrea Arcangeli static int __split_huge_page_map(struct page *page, 135771e3aac0SAndrea Arcangeli struct vm_area_struct *vma, 135871e3aac0SAndrea Arcangeli unsigned long address) 135971e3aac0SAndrea Arcangeli { 136071e3aac0SAndrea Arcangeli struct mm_struct *mm = vma->vm_mm; 136171e3aac0SAndrea Arcangeli pmd_t *pmd, _pmd; 136271e3aac0SAndrea Arcangeli int ret = 0, i; 136371e3aac0SAndrea Arcangeli pgtable_t pgtable; 136471e3aac0SAndrea Arcangeli unsigned long haddr; 136571e3aac0SAndrea Arcangeli 136671e3aac0SAndrea Arcangeli spin_lock(&mm->page_table_lock); 136771e3aac0SAndrea Arcangeli pmd = page_check_address_pmd(page, mm, address, 136871e3aac0SAndrea Arcangeli PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 136971e3aac0SAndrea Arcangeli if (pmd) { 1370e3ebcf64SGerald Schaefer pgtable = pgtable_trans_huge_withdraw(mm); 137171e3aac0SAndrea Arcangeli pmd_populate(mm, &_pmd, pgtable); 137271e3aac0SAndrea Arcangeli 1373e3ebcf64SGerald Schaefer haddr = address; 1374e3ebcf64SGerald Schaefer for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 137571e3aac0SAndrea Arcangeli pte_t *pte, entry; 137671e3aac0SAndrea Arcangeli BUG_ON(PageCompound(page+i)); 137771e3aac0SAndrea Arcangeli entry = mk_pte(page + i, vma->vm_page_prot); 137871e3aac0SAndrea Arcangeli entry = maybe_mkwrite(pte_mkdirty(entry), vma); 137971e3aac0SAndrea Arcangeli if (!pmd_write(*pmd)) 138071e3aac0SAndrea Arcangeli entry = pte_wrprotect(entry); 138171e3aac0SAndrea Arcangeli else 138271e3aac0SAndrea Arcangeli BUG_ON(page_mapcount(page) != 1); 138371e3aac0SAndrea Arcangeli if (!pmd_young(*pmd)) 138471e3aac0SAndrea Arcangeli entry = pte_mkold(entry); 138571e3aac0SAndrea Arcangeli pte = pte_offset_map(&_pmd, haddr); 138671e3aac0SAndrea Arcangeli BUG_ON(!pte_none(*pte)); 138771e3aac0SAndrea Arcangeli set_pte_at(mm, haddr, pte, entry); 138871e3aac0SAndrea Arcangeli pte_unmap(pte); 138971e3aac0SAndrea Arcangeli } 139071e3aac0SAndrea Arcangeli 139171e3aac0SAndrea Arcangeli smp_wmb(); /* make pte visible before pmd */ 139271e3aac0SAndrea Arcangeli /* 139371e3aac0SAndrea Arcangeli * Up to this point the pmd is present and huge and 139471e3aac0SAndrea Arcangeli * userland has full access to the hugepage 139571e3aac0SAndrea Arcangeli * during the split (which happens in place). If we 139671e3aac0SAndrea Arcangeli * overwrite the pmd with the not-huge version 139771e3aac0SAndrea Arcangeli * pointing to the pte here (which of course we could 139871e3aac0SAndrea Arcangeli * if all CPUs were bug free), userland could trigger 139971e3aac0SAndrea Arcangeli * a small page size TLB miss on the small sized TLB 140071e3aac0SAndrea Arcangeli * while the hugepage TLB entry is still established 140171e3aac0SAndrea Arcangeli * in the huge TLB. Some CPUs don't like that.
See 140271e3aac0SAndrea Arcangeli * http://support.amd.com/us/Processor_TechDocs/41322.pdf, 140371e3aac0SAndrea Arcangeli * Erratum 383 on page 93. Intel should be safe but 140471e3aac0SAndrea Arcangeli * also warns that it's only safe if the permission 140571e3aac0SAndrea Arcangeli * and cache attributes of the two entries loaded in 140671e3aac0SAndrea Arcangeli * the two TLBs are identical (which should be the case 140771e3aac0SAndrea Arcangeli * here). But it is generally safer to never allow 140871e3aac0SAndrea Arcangeli * small and huge TLB entries for the same virtual 140971e3aac0SAndrea Arcangeli * address to be loaded simultaneously. So instead of 141071e3aac0SAndrea Arcangeli * doing "pmd_populate(); flush_tlb_range();" we first 141171e3aac0SAndrea Arcangeli * mark the current pmd notpresent (atomically because 141271e3aac0SAndrea Arcangeli * here the pmd_trans_huge and pmd_trans_splitting 141371e3aac0SAndrea Arcangeli * must remain set at all times on the pmd until the 141471e3aac0SAndrea Arcangeli * split is complete for this pmd), then we flush the 141571e3aac0SAndrea Arcangeli * SMP TLB and finally we write the non-huge version 141671e3aac0SAndrea Arcangeli * of the pmd entry with pmd_populate. 141771e3aac0SAndrea Arcangeli */ 141846dcde73SGerald Schaefer pmdp_invalidate(vma, address, pmd); 141971e3aac0SAndrea Arcangeli pmd_populate(mm, pmd, pgtable); 142071e3aac0SAndrea Arcangeli ret = 1; 142171e3aac0SAndrea Arcangeli } 142271e3aac0SAndrea Arcangeli spin_unlock(&mm->page_table_lock); 142371e3aac0SAndrea Arcangeli 142471e3aac0SAndrea Arcangeli return ret; 142571e3aac0SAndrea Arcangeli } 142671e3aac0SAndrea Arcangeli 14272b575eb6SPeter Zijlstra /* must be called with anon_vma->root->mutex held */ 142871e3aac0SAndrea Arcangeli static void __split_huge_page(struct page *page, 142971e3aac0SAndrea Arcangeli struct anon_vma *anon_vma) 143071e3aac0SAndrea Arcangeli { 143171e3aac0SAndrea Arcangeli int mapcount, mapcount2; 1432bf181b9fSMichel Lespinasse pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 143371e3aac0SAndrea Arcangeli struct anon_vma_chain *avc; 143471e3aac0SAndrea Arcangeli 143571e3aac0SAndrea Arcangeli BUG_ON(!PageHead(page)); 143671e3aac0SAndrea Arcangeli BUG_ON(PageTail(page)); 143771e3aac0SAndrea Arcangeli 143871e3aac0SAndrea Arcangeli mapcount = 0; 1439bf181b9fSMichel Lespinasse anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 144071e3aac0SAndrea Arcangeli struct vm_area_struct *vma = avc->vma; 144171e3aac0SAndrea Arcangeli unsigned long addr = vma_address(page, vma); 144271e3aac0SAndrea Arcangeli BUG_ON(is_vma_temporary_stack(vma)); 144371e3aac0SAndrea Arcangeli mapcount += __split_huge_page_splitting(page, vma, addr); 144471e3aac0SAndrea Arcangeli } 144505759d38SAndrea Arcangeli /* 144605759d38SAndrea Arcangeli * It is critical that new vmas are added to the tail of the 144705759d38SAndrea Arcangeli * anon_vma list. This guarantees that if copy_huge_pmd() runs 144805759d38SAndrea Arcangeli * and establishes a child pmd before 144905759d38SAndrea Arcangeli * __split_huge_page_splitting() freezes the parent pmd (so if 145005759d38SAndrea Arcangeli * we fail to prevent copy_huge_pmd() from running until the 145105759d38SAndrea Arcangeli * whole __split_huge_page() is complete), we will still see 145205759d38SAndrea Arcangeli * the newly established pmd of the child later during the 145305759d38SAndrea Arcangeli * walk, to be able to set it as pmd_trans_splitting too.
145405759d38SAndrea Arcangeli */ 145505759d38SAndrea Arcangeli if (mapcount != page_mapcount(page)) 145605759d38SAndrea Arcangeli printk(KERN_ERR "mapcount %d page_mapcount %d\n", 145705759d38SAndrea Arcangeli mapcount, page_mapcount(page)); 145871e3aac0SAndrea Arcangeli BUG_ON(mapcount != page_mapcount(page)); 145971e3aac0SAndrea Arcangeli 146071e3aac0SAndrea Arcangeli __split_huge_page_refcount(page); 146171e3aac0SAndrea Arcangeli 146271e3aac0SAndrea Arcangeli mapcount2 = 0; 1463bf181b9fSMichel Lespinasse anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 146471e3aac0SAndrea Arcangeli struct vm_area_struct *vma = avc->vma; 146571e3aac0SAndrea Arcangeli unsigned long addr = vma_address(page, vma); 146671e3aac0SAndrea Arcangeli BUG_ON(is_vma_temporary_stack(vma)); 146771e3aac0SAndrea Arcangeli mapcount2 += __split_huge_page_map(page, vma, addr); 146871e3aac0SAndrea Arcangeli } 146905759d38SAndrea Arcangeli if (mapcount != mapcount2) 147005759d38SAndrea Arcangeli printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", 147105759d38SAndrea Arcangeli mapcount, mapcount2, page_mapcount(page)); 147271e3aac0SAndrea Arcangeli BUG_ON(mapcount != mapcount2); 147371e3aac0SAndrea Arcangeli } 147471e3aac0SAndrea Arcangeli 147571e3aac0SAndrea Arcangeli int split_huge_page(struct page *page) 147671e3aac0SAndrea Arcangeli { 147771e3aac0SAndrea Arcangeli struct anon_vma *anon_vma; 147871e3aac0SAndrea Arcangeli int ret = 1; 147971e3aac0SAndrea Arcangeli 148071e3aac0SAndrea Arcangeli BUG_ON(!PageAnon(page)); 148171e3aac0SAndrea Arcangeli anon_vma = page_lock_anon_vma(page); 148271e3aac0SAndrea Arcangeli if (!anon_vma) 148371e3aac0SAndrea Arcangeli goto out; 148471e3aac0SAndrea Arcangeli ret = 0; 148571e3aac0SAndrea Arcangeli if (!PageCompound(page)) 148671e3aac0SAndrea Arcangeli goto out_unlock; 148771e3aac0SAndrea Arcangeli 148871e3aac0SAndrea Arcangeli BUG_ON(!PageSwapBacked(page)); 148971e3aac0SAndrea Arcangeli __split_huge_page(page, anon_vma); 149081ab4201SAndi Kleen count_vm_event(THP_SPLIT); 149171e3aac0SAndrea Arcangeli 149271e3aac0SAndrea Arcangeli BUG_ON(PageCompound(page)); 149371e3aac0SAndrea Arcangeli out_unlock: 149471e3aac0SAndrea Arcangeli page_unlock_anon_vma(anon_vma); 149571e3aac0SAndrea Arcangeli out: 149671e3aac0SAndrea Arcangeli return ret; 149771e3aac0SAndrea Arcangeli } 149871e3aac0SAndrea Arcangeli 14994b6e1e37SKonstantin Khlebnikov #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) 150078f11a25SAndrea Arcangeli 150160ab3244SAndrea Arcangeli int hugepage_madvise(struct vm_area_struct *vma, 150260ab3244SAndrea Arcangeli unsigned long *vm_flags, int advice) 15030af4e98bSAndrea Arcangeli { 15048e72033fSGerald Schaefer struct mm_struct *mm = vma->vm_mm; 15058e72033fSGerald Schaefer 1506a664b2d8SAndrea Arcangeli switch (advice) { 1507a664b2d8SAndrea Arcangeli case MADV_HUGEPAGE: 15080af4e98bSAndrea Arcangeli /* 15090af4e98bSAndrea Arcangeli * Be somewhat over-protective like KSM for now! 
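 *
 * For reference, userland reaches this path with a plain madvise()
 * call, e.g. (illustrative; addr/length are whatever page-aligned
 * range the caller wants treated this way):
 *
 *	madvise(addr, length, MADV_HUGEPAGE);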
15100af4e98bSAndrea Arcangeli */ 151178f11a25SAndrea Arcangeli if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 15120af4e98bSAndrea Arcangeli return -EINVAL; 15138e72033fSGerald Schaefer if (mm->def_flags & VM_NOHUGEPAGE) 15148e72033fSGerald Schaefer return -EINVAL; 1515a664b2d8SAndrea Arcangeli *vm_flags &= ~VM_NOHUGEPAGE; 15160af4e98bSAndrea Arcangeli *vm_flags |= VM_HUGEPAGE; 151760ab3244SAndrea Arcangeli /* 151860ab3244SAndrea Arcangeli * If the vma becomes good for khugepaged to scan, 151960ab3244SAndrea Arcangeli * register it here without waiting for a page fault that 152060ab3244SAndrea Arcangeli * may not happen any time soon. 152160ab3244SAndrea Arcangeli */ 152260ab3244SAndrea Arcangeli if (unlikely(khugepaged_enter_vma_merge(vma))) 152360ab3244SAndrea Arcangeli return -ENOMEM; 1524a664b2d8SAndrea Arcangeli break; 1525a664b2d8SAndrea Arcangeli case MADV_NOHUGEPAGE: 1526a664b2d8SAndrea Arcangeli /* 1527a664b2d8SAndrea Arcangeli * Be somewhat over-protective like KSM for now! 1528a664b2d8SAndrea Arcangeli */ 152978f11a25SAndrea Arcangeli if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) 1530a664b2d8SAndrea Arcangeli return -EINVAL; 1531a664b2d8SAndrea Arcangeli *vm_flags &= ~VM_HUGEPAGE; 1532a664b2d8SAndrea Arcangeli *vm_flags |= VM_NOHUGEPAGE; 153360ab3244SAndrea Arcangeli /* 153460ab3244SAndrea Arcangeli * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 153560ab3244SAndrea Arcangeli * this vma even if we leave the mm registered in khugepaged if 153660ab3244SAndrea Arcangeli * it got registered before VM_NOHUGEPAGE was set. 153760ab3244SAndrea Arcangeli */ 1538a664b2d8SAndrea Arcangeli break; 1539a664b2d8SAndrea Arcangeli } 15400af4e98bSAndrea Arcangeli 15410af4e98bSAndrea Arcangeli return 0; 15420af4e98bSAndrea Arcangeli } 15430af4e98bSAndrea Arcangeli 1544ba76149fSAndrea Arcangeli static int __init khugepaged_slab_init(void) 1545ba76149fSAndrea Arcangeli { 1546ba76149fSAndrea Arcangeli mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", 1547ba76149fSAndrea Arcangeli sizeof(struct mm_slot), 1548ba76149fSAndrea Arcangeli __alignof__(struct mm_slot), 0, NULL); 1549ba76149fSAndrea Arcangeli if (!mm_slot_cache) 1550ba76149fSAndrea Arcangeli return -ENOMEM; 1551ba76149fSAndrea Arcangeli 1552ba76149fSAndrea Arcangeli return 0; 1553ba76149fSAndrea Arcangeli } 1554ba76149fSAndrea Arcangeli 1555ba76149fSAndrea Arcangeli static void __init khugepaged_slab_free(void) 1556ba76149fSAndrea Arcangeli { 1557ba76149fSAndrea Arcangeli kmem_cache_destroy(mm_slot_cache); 1558ba76149fSAndrea Arcangeli mm_slot_cache = NULL; 1559ba76149fSAndrea Arcangeli } 1560ba76149fSAndrea Arcangeli 1561ba76149fSAndrea Arcangeli static inline struct mm_slot *alloc_mm_slot(void) 1562ba76149fSAndrea Arcangeli { 1563ba76149fSAndrea Arcangeli if (!mm_slot_cache) /* initialization failed */ 1564ba76149fSAndrea Arcangeli return NULL; 1565ba76149fSAndrea Arcangeli return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 1566ba76149fSAndrea Arcangeli } 1567ba76149fSAndrea Arcangeli 1568ba76149fSAndrea Arcangeli static inline void free_mm_slot(struct mm_slot *mm_slot) 1569ba76149fSAndrea Arcangeli { 1570ba76149fSAndrea Arcangeli kmem_cache_free(mm_slot_cache, mm_slot); 1571ba76149fSAndrea Arcangeli } 1572ba76149fSAndrea Arcangeli 1573ba76149fSAndrea Arcangeli static int __init mm_slots_hash_init(void) 1574ba76149fSAndrea Arcangeli { 1575ba76149fSAndrea Arcangeli mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), 1576ba76149fSAndrea Arcangeli GFP_KERNEL); 1577ba76149fSAndrea Arcangeli if (!mm_slots_hash)
1578ba76149fSAndrea Arcangeli return -ENOMEM; 1579ba76149fSAndrea Arcangeli return 0; 1580ba76149fSAndrea Arcangeli } 1581ba76149fSAndrea Arcangeli 1582ba76149fSAndrea Arcangeli #if 0 1583ba76149fSAndrea Arcangeli static void __init mm_slots_hash_free(void) 1584ba76149fSAndrea Arcangeli { 1585ba76149fSAndrea Arcangeli kfree(mm_slots_hash); 1586ba76149fSAndrea Arcangeli mm_slots_hash = NULL; 1587ba76149fSAndrea Arcangeli } 1588ba76149fSAndrea Arcangeli #endif 1589ba76149fSAndrea Arcangeli 1590ba76149fSAndrea Arcangeli static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1591ba76149fSAndrea Arcangeli { 1592ba76149fSAndrea Arcangeli struct mm_slot *mm_slot; 1593ba76149fSAndrea Arcangeli struct hlist_head *bucket; 1594ba76149fSAndrea Arcangeli struct hlist_node *node; 1595ba76149fSAndrea Arcangeli 1596ba76149fSAndrea Arcangeli bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1597ba76149fSAndrea Arcangeli % MM_SLOTS_HASH_HEADS]; 1598ba76149fSAndrea Arcangeli hlist_for_each_entry(mm_slot, node, bucket, hash) { 1599ba76149fSAndrea Arcangeli if (mm == mm_slot->mm) 1600ba76149fSAndrea Arcangeli return mm_slot; 1601ba76149fSAndrea Arcangeli } 1602ba76149fSAndrea Arcangeli return NULL; 1603ba76149fSAndrea Arcangeli } 1604ba76149fSAndrea Arcangeli 1605ba76149fSAndrea Arcangeli static void insert_to_mm_slots_hash(struct mm_struct *mm, 1606ba76149fSAndrea Arcangeli struct mm_slot *mm_slot) 1607ba76149fSAndrea Arcangeli { 1608ba76149fSAndrea Arcangeli struct hlist_head *bucket; 1609ba76149fSAndrea Arcangeli 1610ba76149fSAndrea Arcangeli bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1611ba76149fSAndrea Arcangeli % MM_SLOTS_HASH_HEADS]; 1612ba76149fSAndrea Arcangeli mm_slot->mm = mm; 1613ba76149fSAndrea Arcangeli hlist_add_head(&mm_slot->hash, bucket); 1614ba76149fSAndrea Arcangeli } 1615ba76149fSAndrea Arcangeli 1616ba76149fSAndrea Arcangeli static inline int khugepaged_test_exit(struct mm_struct *mm) 1617ba76149fSAndrea Arcangeli { 1618ba76149fSAndrea Arcangeli return atomic_read(&mm->mm_users) == 0; 1619ba76149fSAndrea Arcangeli } 1620ba76149fSAndrea Arcangeli 1621ba76149fSAndrea Arcangeli int __khugepaged_enter(struct mm_struct *mm) 1622ba76149fSAndrea Arcangeli { 1623ba76149fSAndrea Arcangeli struct mm_slot *mm_slot; 1624ba76149fSAndrea Arcangeli int wakeup; 1625ba76149fSAndrea Arcangeli 1626ba76149fSAndrea Arcangeli mm_slot = alloc_mm_slot(); 1627ba76149fSAndrea Arcangeli if (!mm_slot) 1628ba76149fSAndrea Arcangeli return -ENOMEM; 1629ba76149fSAndrea Arcangeli 1630ba76149fSAndrea Arcangeli /* __khugepaged_exit() must not run from under us */ 1631ba76149fSAndrea Arcangeli VM_BUG_ON(khugepaged_test_exit(mm)); 1632ba76149fSAndrea Arcangeli if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 1633ba76149fSAndrea Arcangeli free_mm_slot(mm_slot); 1634ba76149fSAndrea Arcangeli return 0; 1635ba76149fSAndrea Arcangeli } 1636ba76149fSAndrea Arcangeli 1637ba76149fSAndrea Arcangeli spin_lock(&khugepaged_mm_lock); 1638ba76149fSAndrea Arcangeli insert_to_mm_slots_hash(mm, mm_slot); 1639ba76149fSAndrea Arcangeli /* 1640ba76149fSAndrea Arcangeli * Insert just behind the scanning cursor, to let the area settle 1641ba76149fSAndrea Arcangeli * down a little. 
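 *
 * (list_add_tail() below is what implements this: the new slot goes
 * to the end of khugepaged_scan.mm_head, i.e. behind the cursor.)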
1642ba76149fSAndrea Arcangeli */ 1643ba76149fSAndrea Arcangeli wakeup = list_empty(&khugepaged_scan.mm_head); 1644ba76149fSAndrea Arcangeli list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); 1645ba76149fSAndrea Arcangeli spin_unlock(&khugepaged_mm_lock); 1646ba76149fSAndrea Arcangeli 1647ba76149fSAndrea Arcangeli atomic_inc(&mm->mm_count); 1648ba76149fSAndrea Arcangeli if (wakeup) 1649ba76149fSAndrea Arcangeli wake_up_interruptible(&khugepaged_wait); 1650ba76149fSAndrea Arcangeli 1651ba76149fSAndrea Arcangeli return 0; 1652ba76149fSAndrea Arcangeli } 1653ba76149fSAndrea Arcangeli 1654ba76149fSAndrea Arcangeli int khugepaged_enter_vma_merge(struct vm_area_struct *vma) 1655ba76149fSAndrea Arcangeli { 1656ba76149fSAndrea Arcangeli unsigned long hstart, hend; 1657ba76149fSAndrea Arcangeli if (!vma->anon_vma) 1658ba76149fSAndrea Arcangeli /* 1659ba76149fSAndrea Arcangeli * Not yet faulted in so we will register later in the 1660ba76149fSAndrea Arcangeli * page fault if needed. 1661ba76149fSAndrea Arcangeli */ 1662ba76149fSAndrea Arcangeli return 0; 166378f11a25SAndrea Arcangeli if (vma->vm_ops) 1664ba76149fSAndrea Arcangeli /* khugepaged not yet working on file or special mappings */ 1665ba76149fSAndrea Arcangeli return 0; 1666b3b9c293SKonstantin Khlebnikov VM_BUG_ON(vma->vm_flags & VM_NO_THP); 1667ba76149fSAndrea Arcangeli hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1668ba76149fSAndrea Arcangeli hend = vma->vm_end & HPAGE_PMD_MASK; 1669ba76149fSAndrea Arcangeli if (hstart < hend) 1670ba76149fSAndrea Arcangeli return khugepaged_enter(vma); 1671ba76149fSAndrea Arcangeli return 0; 1672ba76149fSAndrea Arcangeli } 1673ba76149fSAndrea Arcangeli 1674ba76149fSAndrea Arcangeli void __khugepaged_exit(struct mm_struct *mm) 1675ba76149fSAndrea Arcangeli { 1676ba76149fSAndrea Arcangeli struct mm_slot *mm_slot; 1677ba76149fSAndrea Arcangeli int free = 0; 1678ba76149fSAndrea Arcangeli 1679ba76149fSAndrea Arcangeli spin_lock(&khugepaged_mm_lock); 1680ba76149fSAndrea Arcangeli mm_slot = get_mm_slot(mm); 1681ba76149fSAndrea Arcangeli if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1682ba76149fSAndrea Arcangeli hlist_del(&mm_slot->hash); 1683ba76149fSAndrea Arcangeli list_del(&mm_slot->mm_node); 1684ba76149fSAndrea Arcangeli free = 1; 1685ba76149fSAndrea Arcangeli } 1686d788e80aSChris Wright spin_unlock(&khugepaged_mm_lock); 1687ba76149fSAndrea Arcangeli 1688ba76149fSAndrea Arcangeli if (free) { 1689ba76149fSAndrea Arcangeli clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1690ba76149fSAndrea Arcangeli free_mm_slot(mm_slot); 1691ba76149fSAndrea Arcangeli mmdrop(mm); 1692ba76149fSAndrea Arcangeli } else if (mm_slot) { 1693ba76149fSAndrea Arcangeli /* 1694ba76149fSAndrea Arcangeli * This is required to serialize against 1695ba76149fSAndrea Arcangeli * khugepaged_test_exit() (which is guaranteed to run 1696ba76149fSAndrea Arcangeli * under mmap sem read mode). Stop here (after we 1697ba76149fSAndrea Arcangeli * return all pagetables will be destroyed) until 1698ba76149fSAndrea Arcangeli * khugepaged has finished working on the pagetables 1699ba76149fSAndrea Arcangeli * under the mmap_sem. 
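 *
 * The empty down_write()/up_write() pair below acts as a barrier:
 * it can only be taken once any khugepaged holder of mmap_sem in
 * read mode has dropped it, so khugepaged is done with this mm
 * by the time we return.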
1700ba76149fSAndrea Arcangeli */ 1701ba76149fSAndrea Arcangeli down_write(&mm->mmap_sem); 1702ba76149fSAndrea Arcangeli up_write(&mm->mmap_sem); 1703d788e80aSChris Wright } 1704ba76149fSAndrea Arcangeli } 1705ba76149fSAndrea Arcangeli 1706ba76149fSAndrea Arcangeli static void release_pte_page(struct page *page) 1707ba76149fSAndrea Arcangeli { 1708ba76149fSAndrea Arcangeli /* 0 stands for page_is_file_cache(page) == false */ 1709ba76149fSAndrea Arcangeli dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 1710ba76149fSAndrea Arcangeli unlock_page(page); 1711ba76149fSAndrea Arcangeli putback_lru_page(page); 1712ba76149fSAndrea Arcangeli } 1713ba76149fSAndrea Arcangeli 1714ba76149fSAndrea Arcangeli static void release_pte_pages(pte_t *pte, pte_t *_pte) 1715ba76149fSAndrea Arcangeli { 1716ba76149fSAndrea Arcangeli while (--_pte >= pte) { 1717ba76149fSAndrea Arcangeli pte_t pteval = *_pte; 1718ba76149fSAndrea Arcangeli if (!pte_none(pteval)) 1719ba76149fSAndrea Arcangeli release_pte_page(pte_page(pteval)); 1720ba76149fSAndrea Arcangeli } 1721ba76149fSAndrea Arcangeli } 1722ba76149fSAndrea Arcangeli 1723ba76149fSAndrea Arcangeli static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 1724ba76149fSAndrea Arcangeli unsigned long address, 1725ba76149fSAndrea Arcangeli pte_t *pte) 1726ba76149fSAndrea Arcangeli { 1727ba76149fSAndrea Arcangeli struct page *page; 1728ba76149fSAndrea Arcangeli pte_t *_pte; 1729344aa35cSBob Liu int referenced = 0, none = 0; 1730ba76149fSAndrea Arcangeli for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 1731ba76149fSAndrea Arcangeli _pte++, address += PAGE_SIZE) { 1732ba76149fSAndrea Arcangeli pte_t pteval = *_pte; 1733ba76149fSAndrea Arcangeli if (pte_none(pteval)) { 1734ba76149fSAndrea Arcangeli if (++none <= khugepaged_max_ptes_none) 1735ba76149fSAndrea Arcangeli continue; 1736344aa35cSBob Liu else 1737ba76149fSAndrea Arcangeli goto out; 1738ba76149fSAndrea Arcangeli } 1739344aa35cSBob Liu if (!pte_present(pteval) || !pte_write(pteval)) 1740ba76149fSAndrea Arcangeli goto out; 1741ba76149fSAndrea Arcangeli page = vm_normal_page(vma, address, pteval); 1742344aa35cSBob Liu if (unlikely(!page)) 1743ba76149fSAndrea Arcangeli goto out; 1744344aa35cSBob Liu 1745ba76149fSAndrea Arcangeli VM_BUG_ON(PageCompound(page)); 1746ba76149fSAndrea Arcangeli BUG_ON(!PageAnon(page)); 1747ba76149fSAndrea Arcangeli VM_BUG_ON(!PageSwapBacked(page)); 1748ba76149fSAndrea Arcangeli 1749ba76149fSAndrea Arcangeli /* cannot use mapcount: can't collapse if there's a gup pin */ 1750344aa35cSBob Liu if (page_count(page) != 1) 1751ba76149fSAndrea Arcangeli goto out; 1752ba76149fSAndrea Arcangeli /* 1753ba76149fSAndrea Arcangeli * We can do it before isolate_lru_page because the 1754ba76149fSAndrea Arcangeli * page can't be freed from under us. NOTE: PG_lock 1755ba76149fSAndrea Arcangeli * is needed to serialize against split_huge_page 1756ba76149fSAndrea Arcangeli * when invoked from the VM. 1757ba76149fSAndrea Arcangeli */ 1758344aa35cSBob Liu if (!trylock_page(page)) 1759ba76149fSAndrea Arcangeli goto out; 1760ba76149fSAndrea Arcangeli /* 1761ba76149fSAndrea Arcangeli * Isolate the page to avoid collapsing a hugepage 1762ba76149fSAndrea Arcangeli * currently in use by the VM.
1763ba76149fSAndrea Arcangeli */ 1764ba76149fSAndrea Arcangeli if (isolate_lru_page(page)) { 1765ba76149fSAndrea Arcangeli unlock_page(page); 1766ba76149fSAndrea Arcangeli goto out; 1767ba76149fSAndrea Arcangeli } 1768ba76149fSAndrea Arcangeli /* 0 stands for page_is_file_cache(page) == false */ 1769ba76149fSAndrea Arcangeli inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 1770ba76149fSAndrea Arcangeli VM_BUG_ON(!PageLocked(page)); 1771ba76149fSAndrea Arcangeli VM_BUG_ON(PageLRU(page)); 1772ba76149fSAndrea Arcangeli 1773ba76149fSAndrea Arcangeli /* If no mapped pte is young, don't collapse the page */ 17748ee53820SAndrea Arcangeli if (pte_young(pteval) || PageReferenced(page) || 17758ee53820SAndrea Arcangeli mmu_notifier_test_young(vma->vm_mm, address)) 1776ba76149fSAndrea Arcangeli referenced = 1; 1777ba76149fSAndrea Arcangeli } 1778344aa35cSBob Liu if (likely(referenced)) 1779344aa35cSBob Liu return 1; 1780ba76149fSAndrea Arcangeli out: 1781344aa35cSBob Liu release_pte_pages(pte, _pte); 1782344aa35cSBob Liu return 0; 1783ba76149fSAndrea Arcangeli } 1784ba76149fSAndrea Arcangeli 1785ba76149fSAndrea Arcangeli static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 1786ba76149fSAndrea Arcangeli struct vm_area_struct *vma, 1787ba76149fSAndrea Arcangeli unsigned long address, 1788ba76149fSAndrea Arcangeli spinlock_t *ptl) 1789ba76149fSAndrea Arcangeli { 1790ba76149fSAndrea Arcangeli pte_t *_pte; 1791ba76149fSAndrea Arcangeli for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 1792ba76149fSAndrea Arcangeli pte_t pteval = *_pte; 1793ba76149fSAndrea Arcangeli struct page *src_page; 1794ba76149fSAndrea Arcangeli 1795ba76149fSAndrea Arcangeli if (pte_none(pteval)) { 1796ba76149fSAndrea Arcangeli clear_user_highpage(page, address); 1797ba76149fSAndrea Arcangeli add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 1798ba76149fSAndrea Arcangeli } else { 1799ba76149fSAndrea Arcangeli src_page = pte_page(pteval); 1800ba76149fSAndrea Arcangeli copy_user_highpage(page, src_page, address, vma); 1801ba76149fSAndrea Arcangeli VM_BUG_ON(page_mapcount(src_page) != 1); 1802ba76149fSAndrea Arcangeli release_pte_page(src_page); 1803ba76149fSAndrea Arcangeli /* 1804ba76149fSAndrea Arcangeli * ptl mostly unnecessary, but preempt has to 1805ba76149fSAndrea Arcangeli * be disabled to update the per-cpu stats 1806ba76149fSAndrea Arcangeli * inside page_remove_rmap(). 1807ba76149fSAndrea Arcangeli */ 1808ba76149fSAndrea Arcangeli spin_lock(ptl); 1809ba76149fSAndrea Arcangeli /* 1810ba76149fSAndrea Arcangeli * paravirt calls inside pte_clear here are 1811ba76149fSAndrea Arcangeli * superfluous.
1812ba76149fSAndrea Arcangeli */ 1813ba76149fSAndrea Arcangeli pte_clear(vma->vm_mm, address, _pte); 1814ba76149fSAndrea Arcangeli page_remove_rmap(src_page); 1815ba76149fSAndrea Arcangeli spin_unlock(ptl); 1816ba76149fSAndrea Arcangeli free_page_and_swap_cache(src_page); 1817ba76149fSAndrea Arcangeli } 1818ba76149fSAndrea Arcangeli 1819ba76149fSAndrea Arcangeli address += PAGE_SIZE; 1820ba76149fSAndrea Arcangeli page++; 1821ba76149fSAndrea Arcangeli } 1822ba76149fSAndrea Arcangeli } 1823ba76149fSAndrea Arcangeli 182426234f36SXiao Guangrong static void khugepaged_alloc_sleep(void) 182526234f36SXiao Guangrong { 182626234f36SXiao Guangrong wait_event_freezable_timeout(khugepaged_wait, false, 182726234f36SXiao Guangrong msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 182826234f36SXiao Guangrong } 182926234f36SXiao Guangrong 183026234f36SXiao Guangrong #ifdef CONFIG_NUMA 183126234f36SXiao Guangrong static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 183226234f36SXiao Guangrong { 183326234f36SXiao Guangrong if (IS_ERR(*hpage)) { 183426234f36SXiao Guangrong if (!*wait) 183526234f36SXiao Guangrong return false; 183626234f36SXiao Guangrong 183726234f36SXiao Guangrong *wait = false; 1838e3b4126cSXiao Guangrong *hpage = NULL; 183926234f36SXiao Guangrong khugepaged_alloc_sleep(); 184026234f36SXiao Guangrong } else if (*hpage) { 184126234f36SXiao Guangrong put_page(*hpage); 184226234f36SXiao Guangrong *hpage = NULL; 184326234f36SXiao Guangrong } 184426234f36SXiao Guangrong 184526234f36SXiao Guangrong return true; 184626234f36SXiao Guangrong } 184726234f36SXiao Guangrong 184826234f36SXiao Guangrong static struct page 184926234f36SXiao Guangrong *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 185026234f36SXiao Guangrong struct vm_area_struct *vma, unsigned long address, 185126234f36SXiao Guangrong int node) 185226234f36SXiao Guangrong { 185326234f36SXiao Guangrong VM_BUG_ON(*hpage); 185426234f36SXiao Guangrong /* 185526234f36SXiao Guangrong * Allocate the page while the vma is still valid and under 185626234f36SXiao Guangrong * the mmap_sem read mode so there is no memory allocation 185726234f36SXiao Guangrong * later when we take the mmap_sem in write mode. This is more 185826234f36SXiao Guangrong * friendly behavior (OTOH it may actually hide bugs) to 185926234f36SXiao Guangrong * filesystems in userland with daemons allocating memory in 186026234f36SXiao Guangrong * the userland I/O paths. Allocating memory with the 186126234f36SXiao Guangrong * mmap_sem in read mode is a good idea also to allow greater 186226234f36SXiao Guangrong * scalability. 186326234f36SXiao Guangrong */ 186426234f36SXiao Guangrong *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 186526234f36SXiao Guangrong node, __GFP_OTHER_NODE); 186626234f36SXiao Guangrong 186726234f36SXiao Guangrong /* 186826234f36SXiao Guangrong * After allocating the hugepage, release the mmap_sem read lock in 186926234f36SXiao Guangrong * preparation for taking it in write mode.
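 *
 * Note that the vma may be gone by the time mmap_sem is retaken
 * in write mode; collapse_huge_page() therefore revalidates it
 * with find_vma() before touching any pagetables.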
187026234f36SXiao Guangrong */ 187126234f36SXiao Guangrong up_read(&mm->mmap_sem); 187226234f36SXiao Guangrong if (unlikely(!*hpage)) { 187326234f36SXiao Guangrong count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 187426234f36SXiao Guangrong *hpage = ERR_PTR(-ENOMEM); 187526234f36SXiao Guangrong return NULL; 187626234f36SXiao Guangrong } 187726234f36SXiao Guangrong 187826234f36SXiao Guangrong count_vm_event(THP_COLLAPSE_ALLOC); 187926234f36SXiao Guangrong return *hpage; 188026234f36SXiao Guangrong } 188126234f36SXiao Guangrong #else 188226234f36SXiao Guangrong static struct page *khugepaged_alloc_hugepage(bool *wait) 188326234f36SXiao Guangrong { 188426234f36SXiao Guangrong struct page *hpage; 188526234f36SXiao Guangrong 188626234f36SXiao Guangrong do { 188726234f36SXiao Guangrong hpage = alloc_hugepage(khugepaged_defrag()); 188826234f36SXiao Guangrong if (!hpage) { 188926234f36SXiao Guangrong count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 189026234f36SXiao Guangrong if (!*wait) 189126234f36SXiao Guangrong return NULL; 189226234f36SXiao Guangrong 189326234f36SXiao Guangrong *wait = false; 189426234f36SXiao Guangrong khugepaged_alloc_sleep(); 189526234f36SXiao Guangrong } else 189626234f36SXiao Guangrong count_vm_event(THP_COLLAPSE_ALLOC); 189726234f36SXiao Guangrong } while (unlikely(!hpage) && likely(khugepaged_enabled())); 189826234f36SXiao Guangrong 189926234f36SXiao Guangrong return hpage; 190026234f36SXiao Guangrong } 190126234f36SXiao Guangrong 190226234f36SXiao Guangrong static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 190326234f36SXiao Guangrong { 190426234f36SXiao Guangrong if (!*hpage) 190526234f36SXiao Guangrong *hpage = khugepaged_alloc_hugepage(wait); 190626234f36SXiao Guangrong 190726234f36SXiao Guangrong if (unlikely(!*hpage)) 190826234f36SXiao Guangrong return false; 190926234f36SXiao Guangrong 191026234f36SXiao Guangrong return true; 191126234f36SXiao Guangrong } 191226234f36SXiao Guangrong 191326234f36SXiao Guangrong static struct page 191426234f36SXiao Guangrong *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 191526234f36SXiao Guangrong struct vm_area_struct *vma, unsigned long address, 191626234f36SXiao Guangrong int node) 191726234f36SXiao Guangrong { 191826234f36SXiao Guangrong up_read(&mm->mmap_sem); 191926234f36SXiao Guangrong VM_BUG_ON(!*hpage); 192026234f36SXiao Guangrong return *hpage; 192126234f36SXiao Guangrong } 192226234f36SXiao Guangrong #endif 192326234f36SXiao Guangrong 1924fa475e51SBob Liu static bool hugepage_vma_check(struct vm_area_struct *vma) 1925fa475e51SBob Liu { 1926fa475e51SBob Liu if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 1927fa475e51SBob Liu (vma->vm_flags & VM_NOHUGEPAGE)) 1928fa475e51SBob Liu return false; 1929fa475e51SBob Liu 1930fa475e51SBob Liu if (!vma->anon_vma || vma->vm_ops) 1931fa475e51SBob Liu return false; 1932fa475e51SBob Liu if (is_vma_temporary_stack(vma)) 1933fa475e51SBob Liu return false; 1934fa475e51SBob Liu VM_BUG_ON(vma->vm_flags & VM_NO_THP); 1935fa475e51SBob Liu return true; 1936fa475e51SBob Liu } 1937fa475e51SBob Liu 1938ba76149fSAndrea Arcangeli static void collapse_huge_page(struct mm_struct *mm, 1939ba76149fSAndrea Arcangeli unsigned long address, 1940ce83d217SAndrea Arcangeli struct page **hpage, 19415c4b4be3SAndi Kleen struct vm_area_struct *vma, 19425c4b4be3SAndi Kleen int node) 1943ba76149fSAndrea Arcangeli { 1944ba76149fSAndrea Arcangeli pmd_t *pmd, _pmd; 1945ba76149fSAndrea Arcangeli pte_t *pte; 1946ba76149fSAndrea Arcangeli pgtable_t pgtable; 1947ba76149fSAndrea 
Arcangeli struct page *new_page; 1948ba76149fSAndrea Arcangeli spinlock_t *ptl; 1949ba76149fSAndrea Arcangeli int isolated; 1950ba76149fSAndrea Arcangeli unsigned long hstart, hend; 19512ec74c3eSSagi Grimberg unsigned long mmun_start; /* For mmu_notifiers */ 19522ec74c3eSSagi Grimberg unsigned long mmun_end; /* For mmu_notifiers */ 1953ba76149fSAndrea Arcangeli 1954ba76149fSAndrea Arcangeli VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1955692e0b35SAndrea Arcangeli 195626234f36SXiao Guangrong /* release the mmap_sem read lock. */ 195726234f36SXiao Guangrong new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 195826234f36SXiao Guangrong if (!new_page) 1959ce83d217SAndrea Arcangeli return; 1960ce83d217SAndrea Arcangeli 1961420256efSXiao Guangrong if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) 1962692e0b35SAndrea Arcangeli return; 1963ba76149fSAndrea Arcangeli 1964ba76149fSAndrea Arcangeli /* 1965ba76149fSAndrea Arcangeli * Prevent all access to pagetables with the exception of 1966ba76149fSAndrea Arcangeli * gup_fast later handled by the ptep_clear_flush and the VM 1967ba76149fSAndrea Arcangeli * handled by the anon_vma lock + PG_lock. 1968ba76149fSAndrea Arcangeli */ 1969ba76149fSAndrea Arcangeli down_write(&mm->mmap_sem); 1970ba76149fSAndrea Arcangeli if (unlikely(khugepaged_test_exit(mm))) 1971ba76149fSAndrea Arcangeli goto out; 1972ba76149fSAndrea Arcangeli 1973ba76149fSAndrea Arcangeli vma = find_vma(mm, address); 1974ba76149fSAndrea Arcangeli hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1975ba76149fSAndrea Arcangeli hend = vma->vm_end & HPAGE_PMD_MASK; 1976ba76149fSAndrea Arcangeli if (address < hstart || address + HPAGE_PMD_SIZE > hend) 1977ba76149fSAndrea Arcangeli goto out; 1978fa475e51SBob Liu if (!hugepage_vma_check(vma)) 1979ba76149fSAndrea Arcangeli goto out; 19806219049aSBob Liu pmd = mm_find_pmd(mm, address); 19816219049aSBob Liu if (!pmd) 1982ba76149fSAndrea Arcangeli goto out; 19836219049aSBob Liu if (pmd_trans_huge(*pmd)) 1984ba76149fSAndrea Arcangeli goto out; 1985ba76149fSAndrea Arcangeli 1986ba76149fSAndrea Arcangeli anon_vma_lock(vma->anon_vma); 1987ba76149fSAndrea Arcangeli 1988ba76149fSAndrea Arcangeli pte = pte_offset_map(pmd, address); 1989ba76149fSAndrea Arcangeli ptl = pte_lockptr(mm, pmd); 1990ba76149fSAndrea Arcangeli 19912ec74c3eSSagi Grimberg mmun_start = address; 19922ec74c3eSSagi Grimberg mmun_end = address + HPAGE_PMD_SIZE; 19932ec74c3eSSagi Grimberg mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1994ba76149fSAndrea Arcangeli spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1995ba76149fSAndrea Arcangeli /* 1996ba76149fSAndrea Arcangeli * After this gup_fast can't run anymore. This also removes 1997ba76149fSAndrea Arcangeli * any huge TLB entry from the CPU so we won't allow 1998ba76149fSAndrea Arcangeli * huge and small TLB entries for the same virtual address 1999ba76149fSAndrea Arcangeli * to avoid the risk of CPU bugs in that area.
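 *
 * (pmdp_clear_flush() below enforces this: it clears the pmd and
 * flushes the TLB in a single step, so no huge TLB entry can
 * linger while the collapse proceeds.)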
2000ba76149fSAndrea Arcangeli */ 20012ec74c3eSSagi Grimberg _pmd = pmdp_clear_flush(vma, address, pmd); 2002ba76149fSAndrea Arcangeli spin_unlock(&mm->page_table_lock); 20032ec74c3eSSagi Grimberg mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2004ba76149fSAndrea Arcangeli 2005ba76149fSAndrea Arcangeli spin_lock(ptl); 2006ba76149fSAndrea Arcangeli isolated = __collapse_huge_page_isolate(vma, address, pte); 2007ba76149fSAndrea Arcangeli spin_unlock(ptl); 2008ba76149fSAndrea Arcangeli 2009ba76149fSAndrea Arcangeli if (unlikely(!isolated)) { 2010453c7192SJohannes Weiner pte_unmap(pte); 2011ba76149fSAndrea Arcangeli spin_lock(&mm->page_table_lock); 2012ba76149fSAndrea Arcangeli BUG_ON(!pmd_none(*pmd)); 2013ba76149fSAndrea Arcangeli set_pmd_at(mm, address, pmd, _pmd); 2014ba76149fSAndrea Arcangeli spin_unlock(&mm->page_table_lock); 2015ba76149fSAndrea Arcangeli anon_vma_unlock(vma->anon_vma); 2016ce83d217SAndrea Arcangeli goto out; 2017ba76149fSAndrea Arcangeli } 2018ba76149fSAndrea Arcangeli 2019ba76149fSAndrea Arcangeli /* 2020ba76149fSAndrea Arcangeli * All pages are isolated and locked so anon_vma rmap 2021ba76149fSAndrea Arcangeli * can't run anymore. 2022ba76149fSAndrea Arcangeli */ 2023ba76149fSAndrea Arcangeli anon_vma_unlock(vma->anon_vma); 2024ba76149fSAndrea Arcangeli 2025ba76149fSAndrea Arcangeli __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2026453c7192SJohannes Weiner pte_unmap(pte); 2027ba76149fSAndrea Arcangeli __SetPageUptodate(new_page); 2028ba76149fSAndrea Arcangeli pgtable = pmd_pgtable(_pmd); 2029ba76149fSAndrea Arcangeli 2030b3092b3bSBob Liu _pmd = mk_huge_pmd(new_page, vma); 2031ba76149fSAndrea Arcangeli 2032ba76149fSAndrea Arcangeli /* 2033ba76149fSAndrea Arcangeli * spin_lock() below is not the equivalent of smp_wmb(), so 2034ba76149fSAndrea Arcangeli * this is needed to prevent the copy_huge_page writes from becoming 2035ba76149fSAndrea Arcangeli * visible after the set_pmd_at() write.
2036ba76149fSAndrea Arcangeli */ 2037ba76149fSAndrea Arcangeli smp_wmb(); 2038ba76149fSAndrea Arcangeli 2039ba76149fSAndrea Arcangeli spin_lock(&mm->page_table_lock); 2040ba76149fSAndrea Arcangeli BUG_ON(!pmd_none(*pmd)); 2041ba76149fSAndrea Arcangeli page_add_new_anon_rmap(new_page, vma, address); 2042ba76149fSAndrea Arcangeli set_pmd_at(mm, address, pmd, _pmd); 2043b113da65SDavid Miller update_mmu_cache_pmd(vma, address, pmd); 2044e3ebcf64SGerald Schaefer pgtable_trans_huge_deposit(mm, pgtable); 2045ba76149fSAndrea Arcangeli spin_unlock(&mm->page_table_lock); 2046ba76149fSAndrea Arcangeli 2047ba76149fSAndrea Arcangeli *hpage = NULL; 2048420256efSXiao Guangrong 2049ba76149fSAndrea Arcangeli khugepaged_pages_collapsed++; 2050ce83d217SAndrea Arcangeli out_up_write: 2051ba76149fSAndrea Arcangeli up_write(&mm->mmap_sem); 20520bbbc0b3SAndrea Arcangeli return; 20530bbbc0b3SAndrea Arcangeli 2054ce83d217SAndrea Arcangeli out: 2055678ff896SKAMEZAWA Hiroyuki mem_cgroup_uncharge_page(new_page); 2056ce83d217SAndrea Arcangeli goto out_up_write; 2057ba76149fSAndrea Arcangeli } 2058ba76149fSAndrea Arcangeli 2059ba76149fSAndrea Arcangeli static int khugepaged_scan_pmd(struct mm_struct *mm, 2060ba76149fSAndrea Arcangeli struct vm_area_struct *vma, 2061ba76149fSAndrea Arcangeli unsigned long address, 2062ba76149fSAndrea Arcangeli struct page **hpage) 2063ba76149fSAndrea Arcangeli { 2064ba76149fSAndrea Arcangeli pmd_t *pmd; 2065ba76149fSAndrea Arcangeli pte_t *pte, *_pte; 2066ba76149fSAndrea Arcangeli int ret = 0, referenced = 0, none = 0; 2067ba76149fSAndrea Arcangeli struct page *page; 2068ba76149fSAndrea Arcangeli unsigned long _address; 2069ba76149fSAndrea Arcangeli spinlock_t *ptl; 20705c4b4be3SAndi Kleen int node = -1; 2071ba76149fSAndrea Arcangeli 2072ba76149fSAndrea Arcangeli VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2073ba76149fSAndrea Arcangeli 20746219049aSBob Liu pmd = mm_find_pmd(mm, address); 20756219049aSBob Liu if (!pmd) 2076ba76149fSAndrea Arcangeli goto out; 20776219049aSBob Liu if (pmd_trans_huge(*pmd)) 2078ba76149fSAndrea Arcangeli goto out; 2079ba76149fSAndrea Arcangeli 2080ba76149fSAndrea Arcangeli pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2081ba76149fSAndrea Arcangeli for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2082ba76149fSAndrea Arcangeli _pte++, _address += PAGE_SIZE) { 2083ba76149fSAndrea Arcangeli pte_t pteval = *_pte; 2084ba76149fSAndrea Arcangeli if (pte_none(pteval)) { 2085ba76149fSAndrea Arcangeli if (++none <= khugepaged_max_ptes_none) 2086ba76149fSAndrea Arcangeli continue; 2087ba76149fSAndrea Arcangeli else 2088ba76149fSAndrea Arcangeli goto out_unmap; 2089ba76149fSAndrea Arcangeli } 2090ba76149fSAndrea Arcangeli if (!pte_present(pteval) || !pte_write(pteval)) 2091ba76149fSAndrea Arcangeli goto out_unmap; 2092ba76149fSAndrea Arcangeli page = vm_normal_page(vma, _address, pteval); 2093ba76149fSAndrea Arcangeli if (unlikely(!page)) 2094ba76149fSAndrea Arcangeli goto out_unmap; 20955c4b4be3SAndi Kleen /* 20965c4b4be3SAndi Kleen * Choose the node of the first page. This could 20975c4b4be3SAndi Kleen * be more sophisticated and look at more pages, 20985c4b4be3SAndi Kleen * but isn't for now.
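 *
 * A more sophisticated policy could e.g. tally page_to_nid()
 * across all HPAGE_PMD_NR ptes and pick the majority node; as
 * written, the node of the first mapped page simply wins.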
20995c4b4be3SAndi Kleen */ 21005c4b4be3SAndi Kleen if (node == -1) 21015c4b4be3SAndi Kleen node = page_to_nid(page); 2102ba76149fSAndrea Arcangeli VM_BUG_ON(PageCompound(page)); 2103ba76149fSAndrea Arcangeli if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2104ba76149fSAndrea Arcangeli goto out_unmap; 2105ba76149fSAndrea Arcangeli /* cannot use mapcount: can't collapse if there's a gup pin */ 2106ba76149fSAndrea Arcangeli if (page_count(page) != 1) 2107ba76149fSAndrea Arcangeli goto out_unmap; 21088ee53820SAndrea Arcangeli if (pte_young(pteval) || PageReferenced(page) || 21098ee53820SAndrea Arcangeli mmu_notifier_test_young(vma->vm_mm, address)) 2110ba76149fSAndrea Arcangeli referenced = 1; 2111ba76149fSAndrea Arcangeli } 2112ba76149fSAndrea Arcangeli if (referenced) 2113ba76149fSAndrea Arcangeli ret = 1; 2114ba76149fSAndrea Arcangeli out_unmap: 2115ba76149fSAndrea Arcangeli pte_unmap_unlock(pte, ptl); 2116ce83d217SAndrea Arcangeli if (ret) 2117ce83d217SAndrea Arcangeli /* collapse_huge_page will return with the mmap_sem released */ 21185c4b4be3SAndi Kleen collapse_huge_page(mm, address, hpage, vma, node); 2119ba76149fSAndrea Arcangeli out: 2120ba76149fSAndrea Arcangeli return ret; 2121ba76149fSAndrea Arcangeli } 2122ba76149fSAndrea Arcangeli 2123ba76149fSAndrea Arcangeli static void collect_mm_slot(struct mm_slot *mm_slot) 2124ba76149fSAndrea Arcangeli { 2125ba76149fSAndrea Arcangeli struct mm_struct *mm = mm_slot->mm; 2126ba76149fSAndrea Arcangeli 2127b9980cdcSHugh Dickins VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2128ba76149fSAndrea Arcangeli 2129ba76149fSAndrea Arcangeli if (khugepaged_test_exit(mm)) { 2130ba76149fSAndrea Arcangeli /* free mm_slot */ 2131ba76149fSAndrea Arcangeli hlist_del(&mm_slot->hash); 2132ba76149fSAndrea Arcangeli list_del(&mm_slot->mm_node); 2133ba76149fSAndrea Arcangeli 2134ba76149fSAndrea Arcangeli /* 2135ba76149fSAndrea Arcangeli * Not strictly needed because the mm exited already. 
2136ba76149fSAndrea Arcangeli * 2137ba76149fSAndrea Arcangeli * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2138ba76149fSAndrea Arcangeli */ 2139ba76149fSAndrea Arcangeli 2140ba76149fSAndrea Arcangeli /* khugepaged_mm_lock actually not necessary for the below */ 2141ba76149fSAndrea Arcangeli free_mm_slot(mm_slot); 2142ba76149fSAndrea Arcangeli mmdrop(mm); 2143ba76149fSAndrea Arcangeli } 2144ba76149fSAndrea Arcangeli } 2145ba76149fSAndrea Arcangeli 2146ba76149fSAndrea Arcangeli static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2147ba76149fSAndrea Arcangeli struct page **hpage) 21482f1da642SH Hartley Sweeten __releases(&khugepaged_mm_lock) 21492f1da642SH Hartley Sweeten __acquires(&khugepaged_mm_lock) 2150ba76149fSAndrea Arcangeli { 2151ba76149fSAndrea Arcangeli struct mm_slot *mm_slot; 2152ba76149fSAndrea Arcangeli struct mm_struct *mm; 2153ba76149fSAndrea Arcangeli struct vm_area_struct *vma; 2154ba76149fSAndrea Arcangeli int progress = 0; 2155ba76149fSAndrea Arcangeli 2156ba76149fSAndrea Arcangeli VM_BUG_ON(!pages); 2157b9980cdcSHugh Dickins VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2158ba76149fSAndrea Arcangeli 2159ba76149fSAndrea Arcangeli if (khugepaged_scan.mm_slot) 2160ba76149fSAndrea Arcangeli mm_slot = khugepaged_scan.mm_slot; 2161ba76149fSAndrea Arcangeli else { 2162ba76149fSAndrea Arcangeli mm_slot = list_entry(khugepaged_scan.mm_head.next, 2163ba76149fSAndrea Arcangeli struct mm_slot, mm_node); 2164ba76149fSAndrea Arcangeli khugepaged_scan.address = 0; 2165ba76149fSAndrea Arcangeli khugepaged_scan.mm_slot = mm_slot; 2166ba76149fSAndrea Arcangeli } 2167ba76149fSAndrea Arcangeli spin_unlock(&khugepaged_mm_lock); 2168ba76149fSAndrea Arcangeli 2169ba76149fSAndrea Arcangeli mm = mm_slot->mm; 2170ba76149fSAndrea Arcangeli down_read(&mm->mmap_sem); 2171ba76149fSAndrea Arcangeli if (unlikely(khugepaged_test_exit(mm))) 2172ba76149fSAndrea Arcangeli vma = NULL; 2173ba76149fSAndrea Arcangeli else 2174ba76149fSAndrea Arcangeli vma = find_vma(mm, khugepaged_scan.address); 2175ba76149fSAndrea Arcangeli 2176ba76149fSAndrea Arcangeli progress++; 2177ba76149fSAndrea Arcangeli for (; vma; vma = vma->vm_next) { 2178ba76149fSAndrea Arcangeli unsigned long hstart, hend; 2179ba76149fSAndrea Arcangeli 2180ba76149fSAndrea Arcangeli cond_resched(); 2181ba76149fSAndrea Arcangeli if (unlikely(khugepaged_test_exit(mm))) { 2182ba76149fSAndrea Arcangeli progress++; 2183ba76149fSAndrea Arcangeli break; 2184ba76149fSAndrea Arcangeli } 2185fa475e51SBob Liu if (!hugepage_vma_check(vma)) { 2186a7d6e4ecSAndrea Arcangeli skip: 2187ba76149fSAndrea Arcangeli progress++; 2188ba76149fSAndrea Arcangeli continue; 2189ba76149fSAndrea Arcangeli } 2190ba76149fSAndrea Arcangeli hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2191ba76149fSAndrea Arcangeli hend = vma->vm_end & HPAGE_PMD_MASK; 2192a7d6e4ecSAndrea Arcangeli if (hstart >= hend) 2193a7d6e4ecSAndrea Arcangeli goto skip; 2194a7d6e4ecSAndrea Arcangeli if (khugepaged_scan.address > hend) 2195a7d6e4ecSAndrea Arcangeli goto skip; 2196ba76149fSAndrea Arcangeli if (khugepaged_scan.address < hstart) 2197ba76149fSAndrea Arcangeli khugepaged_scan.address = hstart; 2198a7d6e4ecSAndrea Arcangeli VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2199ba76149fSAndrea Arcangeli 2200ba76149fSAndrea Arcangeli while (khugepaged_scan.address < hend) { 2201ba76149fSAndrea Arcangeli int ret; 2202ba76149fSAndrea Arcangeli cond_resched(); 2203ba76149fSAndrea Arcangeli if (unlikely(khugepaged_test_exit(mm))) 2204ba76149fSAndrea Arcangeli 
goto breakouterloop; 2205ba76149fSAndrea Arcangeli 2206ba76149fSAndrea Arcangeli VM_BUG_ON(khugepaged_scan.address < hstart || 2207ba76149fSAndrea Arcangeli khugepaged_scan.address + HPAGE_PMD_SIZE > 2208ba76149fSAndrea Arcangeli hend); 2209ba76149fSAndrea Arcangeli ret = khugepaged_scan_pmd(mm, vma, 2210ba76149fSAndrea Arcangeli khugepaged_scan.address, 2211ba76149fSAndrea Arcangeli hpage); 2212ba76149fSAndrea Arcangeli /* move to next address */ 2213ba76149fSAndrea Arcangeli khugepaged_scan.address += HPAGE_PMD_SIZE; 2214ba76149fSAndrea Arcangeli progress += HPAGE_PMD_NR; 2215ba76149fSAndrea Arcangeli if (ret) 2216ba76149fSAndrea Arcangeli /* we released mmap_sem so break loop */ 2217ba76149fSAndrea Arcangeli goto breakouterloop_mmap_sem; 2218ba76149fSAndrea Arcangeli if (progress >= pages) 2219ba76149fSAndrea Arcangeli goto breakouterloop; 2220ba76149fSAndrea Arcangeli } 2221ba76149fSAndrea Arcangeli } 2222ba76149fSAndrea Arcangeli breakouterloop: 2223ba76149fSAndrea Arcangeli up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2224ba76149fSAndrea Arcangeli breakouterloop_mmap_sem: 2225ba76149fSAndrea Arcangeli 2226ba76149fSAndrea Arcangeli spin_lock(&khugepaged_mm_lock); 2227a7d6e4ecSAndrea Arcangeli VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2228ba76149fSAndrea Arcangeli /* 2229ba76149fSAndrea Arcangeli * Release the current mm_slot if this mm is about to die, or 2230ba76149fSAndrea Arcangeli * if we scanned all vmas of this mm. 2231ba76149fSAndrea Arcangeli */ 2232ba76149fSAndrea Arcangeli if (khugepaged_test_exit(mm) || !vma) { 2233ba76149fSAndrea Arcangeli /* 2234ba76149fSAndrea Arcangeli * Make sure that if mm_users is reaching zero while 2235ba76149fSAndrea Arcangeli * khugepaged runs here, khugepaged_exit will find 2236ba76149fSAndrea Arcangeli * mm_slot not pointing to the exiting mm. 
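 * (The cursor is moved to the next slot, or cleared, before
 * collect_mm_slot() can free the current one below.)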
2237ba76149fSAndrea Arcangeli */ 2238ba76149fSAndrea Arcangeli if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2239ba76149fSAndrea Arcangeli khugepaged_scan.mm_slot = list_entry( 2240ba76149fSAndrea Arcangeli mm_slot->mm_node.next, 2241ba76149fSAndrea Arcangeli struct mm_slot, mm_node); 2242ba76149fSAndrea Arcangeli khugepaged_scan.address = 0; 2243ba76149fSAndrea Arcangeli } else { 2244ba76149fSAndrea Arcangeli khugepaged_scan.mm_slot = NULL; 2245ba76149fSAndrea Arcangeli khugepaged_full_scans++; 2246ba76149fSAndrea Arcangeli } 2247ba76149fSAndrea Arcangeli 2248ba76149fSAndrea Arcangeli collect_mm_slot(mm_slot); 2249ba76149fSAndrea Arcangeli } 2250ba76149fSAndrea Arcangeli 2251ba76149fSAndrea Arcangeli return progress; 2252ba76149fSAndrea Arcangeli } 2253ba76149fSAndrea Arcangeli 2254ba76149fSAndrea Arcangeli static int khugepaged_has_work(void) 2255ba76149fSAndrea Arcangeli { 2256ba76149fSAndrea Arcangeli return !list_empty(&khugepaged_scan.mm_head) && 2257ba76149fSAndrea Arcangeli khugepaged_enabled(); 2258ba76149fSAndrea Arcangeli } 2259ba76149fSAndrea Arcangeli 2260ba76149fSAndrea Arcangeli static int khugepaged_wait_event(void) 2261ba76149fSAndrea Arcangeli { 2262ba76149fSAndrea Arcangeli return !list_empty(&khugepaged_scan.mm_head) || 22632017c0bfSXiao Guangrong kthread_should_stop(); 2264ba76149fSAndrea Arcangeli } 2265ba76149fSAndrea Arcangeli 2266d516904bSXiao Guangrong static void khugepaged_do_scan(void) 2267d516904bSXiao Guangrong { 2268d516904bSXiao Guangrong struct page *hpage = NULL; 2269ba76149fSAndrea Arcangeli unsigned int progress = 0, pass_through_head = 0; 2270ba76149fSAndrea Arcangeli unsigned int pages = khugepaged_pages_to_scan; 2271d516904bSXiao Guangrong bool wait = true; 2272ba76149fSAndrea Arcangeli 2273ba76149fSAndrea Arcangeli barrier(); /* write khugepaged_pages_to_scan to local stack */ 2274ba76149fSAndrea Arcangeli 2275ba76149fSAndrea Arcangeli while (progress < pages) { 227626234f36SXiao Guangrong if (!khugepaged_prealloc_page(&hpage, &wait)) 227726234f36SXiao Guangrong break; 2278d516904bSXiao Guangrong 2279420256efSXiao Guangrong cond_resched(); 2280ba76149fSAndrea Arcangeli 2281878aee7dSAndrea Arcangeli if (unlikely(kthread_should_stop() || freezing(current))) 2282878aee7dSAndrea Arcangeli break; 2283878aee7dSAndrea Arcangeli 2284ba76149fSAndrea Arcangeli spin_lock(&khugepaged_mm_lock); 2285ba76149fSAndrea Arcangeli if (!khugepaged_scan.mm_slot) 2286ba76149fSAndrea Arcangeli pass_through_head++; 2287ba76149fSAndrea Arcangeli if (khugepaged_has_work() && 2288ba76149fSAndrea Arcangeli pass_through_head < 2) 2289ba76149fSAndrea Arcangeli progress += khugepaged_scan_mm_slot(pages - progress, 2290d516904bSXiao Guangrong &hpage); 2291ba76149fSAndrea Arcangeli else 2292ba76149fSAndrea Arcangeli progress = pages; 2293ba76149fSAndrea Arcangeli spin_unlock(&khugepaged_mm_lock); 2294ba76149fSAndrea Arcangeli } 2295ba76149fSAndrea Arcangeli 2296d516904bSXiao Guangrong if (!IS_ERR_OR_NULL(hpage)) 2297d516904bSXiao Guangrong put_page(hpage); 2298ba76149fSAndrea Arcangeli } 22990bbbc0b3SAndrea Arcangeli 23002017c0bfSXiao Guangrong static void khugepaged_wait_work(void) 23012017c0bfSXiao Guangrong { 23022017c0bfSXiao Guangrong try_to_freeze(); 23032017c0bfSXiao Guangrong 23042017c0bfSXiao Guangrong if (khugepaged_has_work()) { 23052017c0bfSXiao Guangrong if (!khugepaged_scan_sleep_millisecs) 23062017c0bfSXiao Guangrong return; 23072017c0bfSXiao Guangrong 23082017c0bfSXiao Guangrong wait_event_freezable_timeout(khugepaged_wait, 23092017c0bfSXiao Guangrong 
kthread_should_stop(), 23102017c0bfSXiao Guangrong msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); 23112017c0bfSXiao Guangrong return; 23122017c0bfSXiao Guangrong } 23132017c0bfSXiao Guangrong 23142017c0bfSXiao Guangrong if (khugepaged_enabled()) 23152017c0bfSXiao Guangrong wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 23162017c0bfSXiao Guangrong } 23172017c0bfSXiao Guangrong 2318ba76149fSAndrea Arcangeli static int khugepaged(void *none) 2319ba76149fSAndrea Arcangeli { 2320ba76149fSAndrea Arcangeli struct mm_slot *mm_slot; 2321ba76149fSAndrea Arcangeli 2322878aee7dSAndrea Arcangeli set_freezable(); 2323ba76149fSAndrea Arcangeli set_user_nice(current, 19); 2324ba76149fSAndrea Arcangeli 2325b7231789SXiao Guangrong while (!kthread_should_stop()) { 2326b7231789SXiao Guangrong khugepaged_do_scan(); 2327b7231789SXiao Guangrong khugepaged_wait_work(); 2328b7231789SXiao Guangrong } 2329ba76149fSAndrea Arcangeli 2330ba76149fSAndrea Arcangeli spin_lock(&khugepaged_mm_lock); 2331ba76149fSAndrea Arcangeli mm_slot = khugepaged_scan.mm_slot; 2332ba76149fSAndrea Arcangeli khugepaged_scan.mm_slot = NULL; 2333ba76149fSAndrea Arcangeli if (mm_slot) 2334ba76149fSAndrea Arcangeli collect_mm_slot(mm_slot); 2335ba76149fSAndrea Arcangeli spin_unlock(&khugepaged_mm_lock); 2336ba76149fSAndrea Arcangeli return 0; 2337ba76149fSAndrea Arcangeli } 2338ba76149fSAndrea Arcangeli 233971e3aac0SAndrea Arcangeli void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) 234071e3aac0SAndrea Arcangeli { 234171e3aac0SAndrea Arcangeli struct page *page; 234271e3aac0SAndrea Arcangeli 234371e3aac0SAndrea Arcangeli spin_lock(&mm->page_table_lock); 234471e3aac0SAndrea Arcangeli if (unlikely(!pmd_trans_huge(*pmd))) { 234571e3aac0SAndrea Arcangeli spin_unlock(&mm->page_table_lock); 234671e3aac0SAndrea Arcangeli return; 234771e3aac0SAndrea Arcangeli } 234871e3aac0SAndrea Arcangeli page = pmd_page(*pmd); 234971e3aac0SAndrea Arcangeli VM_BUG_ON(!page_count(page)); 235071e3aac0SAndrea Arcangeli get_page(page); 235171e3aac0SAndrea Arcangeli spin_unlock(&mm->page_table_lock); 235271e3aac0SAndrea Arcangeli 235371e3aac0SAndrea Arcangeli split_huge_page(page); 235471e3aac0SAndrea Arcangeli 235571e3aac0SAndrea Arcangeli put_page(page); 235671e3aac0SAndrea Arcangeli BUG_ON(pmd_trans_huge(*pmd)); 235771e3aac0SAndrea Arcangeli } 235894fcc585SAndrea Arcangeli 235994fcc585SAndrea Arcangeli static void split_huge_page_address(struct mm_struct *mm, 236094fcc585SAndrea Arcangeli unsigned long address) 236194fcc585SAndrea Arcangeli { 236294fcc585SAndrea Arcangeli pmd_t *pmd; 236394fcc585SAndrea Arcangeli 236494fcc585SAndrea Arcangeli VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 236594fcc585SAndrea Arcangeli 23666219049aSBob Liu pmd = mm_find_pmd(mm, address); 23676219049aSBob Liu if (!pmd) 236894fcc585SAndrea Arcangeli return; 236994fcc585SAndrea Arcangeli /* 237094fcc585SAndrea Arcangeli * Caller holds the mmap_sem write mode, so a huge pmd cannot 237194fcc585SAndrea Arcangeli * materialize from under us. 
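 * (Page faults install huge pmds only with mmap_sem held for read,
 * so holding it for write here is sufficient exclusion.)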
237294fcc585SAndrea Arcangeli */ 237394fcc585SAndrea Arcangeli split_huge_page_pmd(mm, pmd); 237494fcc585SAndrea Arcangeli } 237594fcc585SAndrea Arcangeli 237694fcc585SAndrea Arcangeli void __vma_adjust_trans_huge(struct vm_area_struct *vma, 237794fcc585SAndrea Arcangeli unsigned long start, 237894fcc585SAndrea Arcangeli unsigned long end, 237994fcc585SAndrea Arcangeli long adjust_next) 238094fcc585SAndrea Arcangeli { 238194fcc585SAndrea Arcangeli /* 238294fcc585SAndrea Arcangeli * If the new start address isn't hpage aligned and it could 238394fcc585SAndrea Arcangeli * previously contain a hugepage: check if we need to split 238494fcc585SAndrea Arcangeli * a huge pmd. 238594fcc585SAndrea Arcangeli */ 238694fcc585SAndrea Arcangeli if (start & ~HPAGE_PMD_MASK && 238794fcc585SAndrea Arcangeli (start & HPAGE_PMD_MASK) >= vma->vm_start && 238894fcc585SAndrea Arcangeli (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 238994fcc585SAndrea Arcangeli split_huge_page_address(vma->vm_mm, start); 239094fcc585SAndrea Arcangeli 239194fcc585SAndrea Arcangeli /* 239294fcc585SAndrea Arcangeli * If the new end address isn't hpage aligned and it could 239394fcc585SAndrea Arcangeli * previously contain a hugepage: check if we need to split 239494fcc585SAndrea Arcangeli * a huge pmd. 239594fcc585SAndrea Arcangeli */ 239694fcc585SAndrea Arcangeli if (end & ~HPAGE_PMD_MASK && 239794fcc585SAndrea Arcangeli (end & HPAGE_PMD_MASK) >= vma->vm_start && 239894fcc585SAndrea Arcangeli (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 239994fcc585SAndrea Arcangeli split_huge_page_address(vma->vm_mm, end); 240094fcc585SAndrea Arcangeli 240194fcc585SAndrea Arcangeli /* 240294fcc585SAndrea Arcangeli * If we're also updating vma->vm_next->vm_start, and the new 240394fcc585SAndrea Arcangeli * vm_next->vm_start isn't hpage aligned and it could previously 240494fcc585SAndrea Arcangeli * contain a hugepage: check if we need to split a huge pmd. 240594fcc585SAndrea Arcangeli */ 240694fcc585SAndrea Arcangeli if (adjust_next > 0) { 240794fcc585SAndrea Arcangeli struct vm_area_struct *next = vma->vm_next; 240894fcc585SAndrea Arcangeli unsigned long nstart = next->vm_start; 240994fcc585SAndrea Arcangeli nstart += adjust_next << PAGE_SHIFT; 241094fcc585SAndrea Arcangeli if (nstart & ~HPAGE_PMD_MASK && 241194fcc585SAndrea Arcangeli (nstart & HPAGE_PMD_MASK) >= next->vm_start && 241294fcc585SAndrea Arcangeli (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 241394fcc585SAndrea Arcangeli split_huge_page_address(next->vm_mm, nstart); 241494fcc585SAndrea Arcangeli } 241594fcc585SAndrea Arcangeli }
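/*
 * A minimal userspace sketch, not part of this file, illustrating the
 * hugepage alignment arithmetic used by khugepaged_scan_mm_slot() and
 * __vma_adjust_trans_huge() above. It assumes 2MB PMD hugepages as on
 * x86_64; the constants and example addresses are hypothetical and
 * redefined locally for illustration only.
 */
#include <stdio.h>

#define HPAGE_PMD_SHIFT	21	/* assumed 2MB hugepage */
#define HPAGE_PMD_SIZE	(1UL << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x00601000;	/* hypothetical vma bounds */
	unsigned long vm_end = 0x00e00000;
	unsigned long new_start = 0x00823000;	/* hypothetical new boundary */
	unsigned long hstart, hend;

	/*
	 * Round the start up and the end down to hugepage boundaries,
	 * exactly as khugepaged_scan_mm_slot() computes hstart/hend
	 * before scanning a vma.
	 */
	hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vm_end & HPAGE_PMD_MASK;

	if (hstart < hend)
		printf("scan %#lx-%#lx: %lu hugepage(s)\n", hstart, hend,
		       (hend - hstart) >> HPAGE_PMD_SHIFT);
	else
		printf("vma cannot hold a whole hugepage\n");

	/*
	 * __vma_adjust_trans_huge() uses the complementary test: a new
	 * boundary that isn't hugepage aligned may cut through a huge
	 * pmd that the old layout allowed, forcing a split.
	 */
	if (new_start & ~HPAGE_PMD_MASK)
		printf("%#lx is not hugepage aligned: split needed\n",
		       new_start);
	return 0;
}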