#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/swapops.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_EXCEED_NONE_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PAGE_RO,
	SCAN_NO_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_SWAP_CACHE_PAGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_EXCEED_SWAP_PTE
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
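
/*
 * Note: the SCAN_* codes above are reported through the tracepoints
 * defined in the header included above (mm_collapse_huge_page,
 * mm_collapse_huge_page_isolate, mm_collapse_huge_page_swapin), which
 * makes it possible to see why a particular collapse attempt was
 * skipped or failed.
 */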

/* by default, scan 8*512 ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default: collapse a hugepage if at least one pte is mapped, just as
 * would have happened if the vma had been large enough at page fault
 * time.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};

static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = kstrtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);

/*
 * max_ptes_none controls whether khugepaged should collapse hugepages
 * over any unmapped ptes, in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
 * reduce the available free memory in the system as it runs.
 * Increasing max_ptes_none will instead potentially reduce the free
 * memory in the system during the khugepaged scan.
 */
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
}

static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_swap;

	err = kstrtoul(buf, 10, &max_ptes_swap);
	if (err || max_ptes_swap > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_swap = max_ptes_swap;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
	       khugepaged_max_ptes_swap_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	NULL,
};

struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
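
/*
 * The attribute group above is registered by the THP sysfs setup code
 * (see mm/huge_memory.c), so these knobs appear under
 * /sys/kernel/mm/transparent_hugepage/khugepaged/.  For example:
 *
 *	echo 0 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
 *
 * makes khugepaged collapse only fully populated PMD ranges, so that a
 * collapse never increases a process's memory footprint.
 */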

#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)

int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes eligible for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
				khugepaged_enter_vma_merge(vma, *vm_flags))
			return -ENOMEM;
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}
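
/*
 * Typical userspace usage (illustrative): an allocator that wants THP
 * for a region it has just mapped calls
 *
 *	madvise(addr, length, MADV_HUGEPAGE);
 *
 * which is expected to reach this function via the madvise() path,
 * setting VM_HUGEPAGE on the vma and, if the vma is eligible,
 * registering the mm with khugepaged right away instead of waiting for
 * the next page fault.
 */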

int __init khugepaged_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;

	return 0;
}

void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;

	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
		if (mm == mm_slot->mm)
			return mm_slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}

static inline int khugepaged_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

int __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		free_mm_slot(mm_slot);
		return 0;
	}

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	atomic_inc(&mm->mm_count);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);

	return 0;
}

int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	unsigned long hstart, hend;
	if (!vma->anon_vma)
		/*
		 * Not yet faulted in so we will register later in the
		 * page fault if needed.
		 */
		return 0;
	if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED))
		/* khugepaged not yet working on file or special mappings */
		return 0;
	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (hstart < hend)
		return khugepaged_enter(vma, vm_flags);
	return 0;
}
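
/*
 * Worked example of the hstart/hend rounding above, assuming 2MB huge
 * pages (HPAGE_PMD_SIZE == 2MB, the common x86-64 case): a vma spanning
 * [0x200000, 0x600000) gives hstart = 0x200000 and hend = 0x600000, so
 * it is registered; a vma spanning [0x201000, 0x400000) rounds to
 * hstart = 0x400000 and hend = 0x400000, leaving no PMD-aligned range
 * large enough to collapse, so it is skipped.
 */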

void __khugepaged_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		free_mm_slot(mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * khugepaged_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we
		 * return all pagetables will be destroyed) until
		 * khugepaged has finished working on the pagetables
		 * under the mmap_sem.
		 */
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

static void release_pte_page(struct page *page)
{
	/* 0 stands for page_is_file_cache(page) == false */
	dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
	unlock_page(page);
	putback_lru_page(page);
}

static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
	while (--_pte >= pte) {
		pte_t pteval = *_pte;
		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
			release_pte_page(pte_page(pteval));
	}
}
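
/*
 * __collapse_huge_page_isolate() takes every small page in the PMD range
 * off the LRU, with the page lock held, so the pages cannot be reclaimed
 * or split while they are copied into the new huge page.  It returns 1
 * and leaves all pages locked and isolated on success; on failure it
 * releases any already-isolated pages and returns 0, recording the
 * reason in one of the SCAN_* result codes below.
 */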
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
					pte_t *pte)
{
	struct page *page = NULL;
	pte_t *_pte;
	int none_or_zero = 0, result = 0;
	bool referenced = false, writable = false;

	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;
		if (pte_none(pteval) || (pte_present(pteval) &&
				is_zero_pfn(pte_pfn(pteval)))) {
			if (!userfaultfd_armed(vma) &&
			    ++none_or_zero <= khugepaged_max_ptes_none) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				goto out;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out;
		}
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page)) {
			result = SCAN_PAGE_NULL;
			goto out;
		}

		VM_BUG_ON_PAGE(PageCompound(page), page);
		VM_BUG_ON_PAGE(!PageAnon(page), page);
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

		/*
		 * We can do it before isolate_lru_page because the
		 * page can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!trylock_page(page)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}

		/*
		 * cannot use mapcount: can't collapse if there's a gup pin.
		 * The page must only be referenced by the scanned process
		 * and page swap cache.
		 */
		if (page_count(page) != 1 + !!PageSwapCache(page)) {
			unlock_page(page);
			result = SCAN_PAGE_COUNT;
			goto out;
		}
		if (pte_write(pteval)) {
			writable = true;
		} else {
			if (PageSwapCache(page) &&
			    !reuse_swap_page(page, NULL)) {
				unlock_page(page);
				result = SCAN_SWAP_CACHE_PAGE;
				goto out;
			}
			/*
			 * Page is not in the swap cache. It can be collapsed
			 * into a THP.
			 */
		}

		/*
		 * Isolate the page to avoid collapsing a hugepage
		 * currently in use by the VM.
		 */
		if (isolate_lru_page(page)) {
			unlock_page(page);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		/* 0 stands for page_is_file_cache(page) == false */
		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(PageLRU(page), page);

		/* If there is no mapped pte young don't collapse the page */
		if (pte_young(pteval) ||
		    page_is_young(page) || PageReferenced(page) ||
		    mmu_notifier_test_young(vma->vm_mm, address))
			referenced = true;
	}
	if (likely(writable)) {
		if (likely(referenced)) {
			result = SCAN_SUCCEED;
			trace_mm_collapse_huge_page_isolate(page, none_or_zero,
							    referenced, writable, result);
			return 1;
		}
	} else {
		result = SCAN_PAGE_RO;
	}

out:
	release_pte_pages(pte, _pte);
	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
					    referenced, writable, result);
	return 0;
}
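
/*
 * __collapse_huge_page_copy() copies the contents of each isolated small
 * page into the corresponding subpage of the new huge page, clears the
 * old ptes, and drops the old pages (rmap, LRU and swap cache
 * references).  pte_none/zero-pfn entries are handled by clearing the
 * destination subpage instead of copying.
 */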
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long address,
				      spinlock_t *ptl)
{
	pte_t *_pte;
	for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
		pte_t pteval = *_pte;
		struct page *src_page;

		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, address);
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			if (is_zero_pfn(pte_pfn(pteval))) {
				/*
				 * ptl mostly unnecessary.
				 */
				spin_lock(ptl);
				/*
				 * paravirt calls inside pte_clear here are
				 * superfluous.
				 */
				pte_clear(vma->vm_mm, address, _pte);
				spin_unlock(ptl);
			}
		} else {
			src_page = pte_page(pteval);
			copy_user_highpage(page, src_page, address, vma);
			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
			release_pte_page(src_page);
			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside page_remove_rmap().
			 */
			spin_lock(ptl);
			/*
			 * paravirt calls inside pte_clear here are
			 * superfluous.
			 */
			pte_clear(vma->vm_mm, address, _pte);
			page_remove_rmap(src_page, false);
			spin_unlock(ptl);
			free_page_and_swap_cache(src_page);
		}

		address += PAGE_SIZE;
		page++;
	}
}

static void khugepaged_alloc_sleep(void)
{
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	freezable_schedule_timeout_interruptible(
		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

static int khugepaged_node_load[MAX_NUMNODES];

static bool khugepaged_scan_abort(int nid)
{
	int i;

	/*
	 * If zone_reclaim_mode is disabled, then no extra effort is made to
	 * allocate memory locally.
	 */
	if (!zone_reclaim_mode)
		return false;

	/* If there is a count for this node already, it must be acceptable */
	if (khugepaged_node_load[nid])
		return false;

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (!khugepaged_node_load[i])
			continue;
		if (node_distance(nid, i) > RECLAIM_DISTANCE)
			return true;
	}
	return false;
}
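
/*
 * Example of the abort heuristic above: with zone_reclaim_mode enabled,
 * if the pages scanned so far all sit on node 0 and the next page turns
 * out to live on a node whose distance from node 0 exceeds
 * RECLAIM_DISTANCE, the scan of this PMD range is abandoned
 * (SCAN_SCAN_ABORT) rather than risk allocating a hugepage far from
 * most of the data.
 */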

/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}

#ifdef CONFIG_NUMA
static int khugepaged_find_target_node(void)
{
	static int last_khugepaged_target_node = NUMA_NO_NODE;
	int nid, target_node = 0, max_value = 0;

	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (khugepaged_node_load[nid] > max_value) {
			max_value = khugepaged_node_load[nid];
			target_node = nid;
		}

	/* do some balance if several nodes have the same hit record */
	if (target_node <= last_khugepaged_target_node)
		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
				nid++)
			if (max_value == khugepaged_node_load[nid]) {
				target_node = nid;
				break;
			}

	last_khugepaged_target_node = target_node;
	return target_node;
}
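
/*
 * For instance, if a scan recorded khugepaged_node_load = {0: 384, 1: 128},
 * the hugepage is allocated on node 0, where most of the small pages
 * already live.  When several nodes tie for the maximum, the
 * last_khugepaged_target_node bookkeeping above round-robins between
 * them across successive collapses.
 */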

static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
	if (IS_ERR(*hpage)) {
		if (!*wait)
			return false;

		*wait = false;
		*hpage = NULL;
		khugepaged_alloc_sleep();
	} else if (*hpage) {
		put_page(*hpage);
		*hpage = NULL;
	}

	return true;
}

static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
		      unsigned long address, int node)
{
	VM_BUG_ON_PAGE(*hpage, *hpage);

	/*
	 * Before allocating the hugepage, release the mmap_sem read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_sem during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	up_read(&mm->mmap_sem);

	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
	if (unlikely(!*hpage)) {
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		*hpage = ERR_PTR(-ENOMEM);
		return NULL;
	}

	prep_transhuge_page(*hpage);
	count_vm_event(THP_COLLAPSE_ALLOC);
	return *hpage;
}
#else
static int khugepaged_find_target_node(void)
{
	return 0;
}

static inline struct page *alloc_khugepaged_hugepage(void)
{
	struct page *page;

	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
			   HPAGE_PMD_ORDER);
	if (page)
		prep_transhuge_page(page);
	return page;
}

static struct page *khugepaged_alloc_hugepage(bool *wait)
{
	struct page *hpage;

	do {
		hpage = alloc_khugepaged_hugepage();
		if (!hpage) {
			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
			if (!*wait)
				return NULL;

			*wait = false;
			khugepaged_alloc_sleep();
		} else
			count_vm_event(THP_COLLAPSE_ALLOC);
	} while (unlikely(!hpage) && likely(khugepaged_enabled()));

	return hpage;
}

static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
	if (!*hpage)
		*hpage = khugepaged_alloc_hugepage(wait);

	if (unlikely(!*hpage))
		return false;

	return true;
}

static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
		      unsigned long address, int node)
{
	up_read(&mm->mmap_sem);
	VM_BUG_ON(!*hpage);

	return *hpage;
}
#endif

static bool hugepage_vma_check(struct vm_area_struct *vma)
{
	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
	    (vma->vm_flags & VM_NOHUGEPAGE))
		return false;
	if (!vma->anon_vma || vma->vm_ops)
		return false;
	if (is_vma_temporary_stack(vma))
		return false;
	return !(vma->vm_flags & VM_NO_KHUGEPAGED);
}

/*
 * If mmap_sem was temporarily dropped, revalidate the vma
 * before taking mmap_sem again.
 * Returns 0 on success, otherwise a non-zero value (scan code).
 */

static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
{
	struct vm_area_struct *vma;
	unsigned long hstart, hend;

	if (unlikely(khugepaged_test_exit(mm)))
		return SCAN_ANY_PROCESS;

	vma = find_vma(mm, address);
	if (!vma)
		return SCAN_VMA_NULL;

	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
		return SCAN_ADDRESS_RANGE;
	if (!hugepage_vma_check(vma))
		return SCAN_VMA_CHECK;
	return 0;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if khugepaged_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held,
 * but with mmap_sem held to protect against vma changes.
 */

static bool __collapse_huge_page_swapin(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmd)
{
	pte_t pteval;
	int swapped_in = 0, ret = 0;
	struct fault_env fe = {
		.vma = vma,
		.address = address,
		.flags = FAULT_FLAG_ALLOW_RETRY,
		.pmd = pmd,
	};

	fe.pte = pte_offset_map(pmd, address);
	for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
			fe.pte++, fe.address += PAGE_SIZE) {
		pteval = *fe.pte;
		if (!is_swap_pte(pteval))
			continue;
		swapped_in++;
		ret = do_swap_page(&fe, pteval);
		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
		if (ret & VM_FAULT_RETRY) {
			down_read(&mm->mmap_sem);
			/* vma is no longer available, don't continue to swapin */
			if (hugepage_vma_revalidate(mm, address))
				return false;
			/* check if the pmd is still valid */
			if (mm_find_pmd(mm, address) != pmd)
				return false;
		}
		if (ret & VM_FAULT_ERROR) {
			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
			return false;
		}
		/* pte is unmapped now, we need to map it */
		fe.pte = pte_offset_map(pmd, fe.address);
	}
	fe.pte--;
	pte_unmap(fe.pte);
	trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
	return true;
}

static void collapse_huge_page(struct mm_struct *mm,
			       unsigned long address,
			       struct page **hpage,
			       struct vm_area_struct *vma,
			       int node)
{
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct page *new_page;
	spinlock_t *pmd_ptl, *pte_ptl;
	int isolated = 0, result = 0;
	struct mem_cgroup *memcg;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	gfp_t gfp;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/* Only allocate from the target node */
	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;

	/* release the mmap_sem read lock. */
	new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
	if (!new_page) {
		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
		goto out_nolock;
	}

	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
		result = SCAN_CGROUP_CHARGE_FAIL;
		goto out_nolock;
	}

	down_read(&mm->mmap_sem);
	result = hugepage_vma_revalidate(mm, address);
	if (result) {
		mem_cgroup_cancel_charge(new_page, memcg, true);
		up_read(&mm->mmap_sem);
		goto out_nolock;
	}

	pmd = mm_find_pmd(mm, address);
	if (!pmd) {
		result = SCAN_PMD_NULL;
		mem_cgroup_cancel_charge(new_page, memcg, true);
		up_read(&mm->mmap_sem);
		goto out_nolock;
	}

	/*
	 * __collapse_huge_page_swapin always returns with mmap_sem locked.
	 * If it fails, release mmap_sem and jump directly out.
	 * Continuing to collapse causes inconsistency.
	 */
	if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
		mem_cgroup_cancel_charge(new_page, memcg, true);
		up_read(&mm->mmap_sem);
		goto out_nolock;
	}

	up_read(&mm->mmap_sem);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 */
	down_write(&mm->mmap_sem);
	result = hugepage_vma_revalidate(mm, address);
	if (result)
		goto out;
	/* check if the pmd is still valid */
	if (mm_find_pmd(mm, address) != pmd)
		goto out;

	anon_vma_lock_write(vma->anon_vma);

	pte = pte_offset_map(pmd, address);
	pte_ptl = pte_lockptr(mm, pmd);

	mmun_start = address;
	mmun_end   = address + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * After this gup_fast can't run anymore. This also removes
	 * any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address
	 * to avoid the risk of CPU bugs in that area.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	spin_lock(pte_ptl);
	isolated = __collapse_huge_page_isolate(vma, address, pte);
	spin_unlock(pte_ptl);

	if (unlikely(!isolated)) {
		pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * points to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		result = SCAN_FAIL;
		goto out;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
	pte_unmap(pte);
	__SetPageUptodate(new_page);
	pgtable = pmd_pgtable(_pmd);

	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

	/*
	 * spin_lock() below is not the equivalent of smp_wmb(), so
	 * this is needed to avoid the copy_huge_page writes to become
	 * visible after the set_pmd_at() write.
	 */
	smp_wmb();

	spin_lock(pmd_ptl);
	BUG_ON(!pmd_none(*pmd));
	page_add_new_anon_rmap(new_page, vma, address, true);
	mem_cgroup_commit_charge(new_page, memcg, false, true);
	lru_cache_add_active_or_unevictable(new_page, vma);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, address, pmd, _pmd);
	update_mmu_cache_pmd(vma, address, pmd);
	spin_unlock(pmd_ptl);

	*hpage = NULL;
	khugepaged_pages_collapsed++;
	result = SCAN_SUCCEED;
out_up_write:
	up_write(&mm->mmap_sem);
out_nolock:
	trace_mm_collapse_huge_page(mm, isolated, result);
	return;
out:
	mem_cgroup_cancel_charge(new_page, memcg, true);
	goto out_up_write;
}

static int khugepaged_scan_pmd(struct mm_struct *mm,
			       struct vm_area_struct *vma,
			       unsigned long address,
			       struct page **hpage)
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int ret = 0, none_or_zero = 0, result = 0;
	struct page *page = NULL;
	unsigned long _address;
	spinlock_t *ptl;
	int node = NUMA_NO_NODE, unmapped = 0;
	bool writable = false, referenced = false;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	pmd = mm_find_pmd(mm, address);
	if (!pmd) {
		result = SCAN_PMD_NULL;
		goto out;
	}

	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
	     _pte++, _address += PAGE_SIZE) {
		pte_t pteval = *_pte;
		if (is_swap_pte(pteval)) {
			if (++unmapped <= khugepaged_max_ptes_swap) {
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				goto out_unmap;
			}
		}
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			if (!userfaultfd_armed(vma) &&
			    ++none_or_zero <= khugepaged_max_ptes_none) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				goto out_unmap;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out_unmap;
		}
		if (pte_write(pteval))
			writable = true;

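		/*
		 * The checks below only admit normal, pte-mapped anonymous
		 * pages that sit on the LRU, are unlocked and carry no
		 * extra references (e.g. gup pins); young/referenced state
		 * is tracked so only ranges that are actually used get
		 * collapsed.
		 */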
		page = vm_normal_page(vma, _address, pteval);
		if (unlikely(!page)) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}

		/* TODO: teach khugepaged to collapse THP mapped with pte */
		if (PageCompound(page)) {
			result = SCAN_PAGE_COMPOUND;
			goto out_unmap;
		}

		/*
		 * Record which node the original page is from and save this
		 * information to khugepaged_node_load[].
		 * Khugepaged will allocate a hugepage from the node that has
		 * the max hit record.
		 */
		node = page_to_nid(page);
		if (khugepaged_scan_abort(node)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		khugepaged_node_load[node]++;
		if (!PageLRU(page)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (PageLocked(page)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}
		if (!PageAnon(page)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}

		/*
		 * cannot use mapcount: can't collapse if there's a gup pin.
		 * The page must only be referenced by the scanned process
		 * and page swap cache.
		 */
		if (page_count(page) != 1 + !!PageSwapCache(page)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}
		if (pte_young(pteval) ||
		    page_is_young(page) || PageReferenced(page) ||
		    mmu_notifier_test_young(vma->vm_mm, address))
			referenced = true;
	}
	if (writable) {
		if (referenced) {
			result = SCAN_SUCCEED;
			ret = 1;
		} else {
			result = SCAN_NO_REFERENCED_PAGE;
		}
	} else {
		result = SCAN_PAGE_RO;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (ret) {
		node = khugepaged_find_target_node();
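		/*
		 * The target node is the one that accumulated the most
		 * hits in khugepaged_node_load[] above (node 0 on !NUMA
		 * builds).
		 */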
		/* collapse_huge_page will return with the mmap_sem released */
		collapse_huge_page(mm, address, hpage, vma, node);
	}
out:
	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
				     none_or_zero, result, unmapped);
	return ret;
}

static void collect_mm_slot(struct mm_slot *mm_slot)
{
	struct mm_struct *mm = mm_slot->mm;

	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));

	if (khugepaged_test_exit(mm)) {
		/* free mm_slot */
		hash_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);

		/*
		 * Not strictly needed because the mm exited already.
		 *
		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		 */

		/* khugepaged_mm_lock actually not necessary for the below */
		free_mm_slot(mm_slot);
		mmdrop(mm);
	}
}

static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
					    struct page **hpage)
	__releases(&khugepaged_mm_lock)
	__acquires(&khugepaged_mm_lock)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int progress = 0;

	VM_BUG_ON(!pages);
	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));

	if (khugepaged_scan.mm_slot)
		mm_slot = khugepaged_scan.mm_slot;
	else {
		mm_slot = list_entry(khugepaged_scan.mm_head.next,
				     struct mm_slot, mm_node);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot = mm_slot;
	}
	spin_unlock(&khugepaged_mm_lock);

	mm = mm_slot->mm;
	down_read(&mm->mmap_sem);
	if (unlikely(khugepaged_test_exit(mm)))
		vma = NULL;
	else
		vma = find_vma(mm, khugepaged_scan.address);

	progress++;
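	/*
	 * Walk the vmas from the saved cursor; within each THP-eligible
	 * vma, scan one pmd-aligned, HPAGE_PMD_SIZE range at a time and
	 * charge HPAGE_PMD_NR ptes of progress against the caller's
	 * "pages" budget.
	 */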
	for (; vma; vma = vma->vm_next) {
		unsigned long hstart, hend;

		cond_resched();
		if (unlikely(khugepaged_test_exit(mm))) {
			progress++;
			break;
		}
		if (!hugepage_vma_check(vma)) {
skip:
			progress++;
			continue;
		}
		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
		hend = vma->vm_end & HPAGE_PMD_MASK;
		if (hstart >= hend)
			goto skip;
		if (khugepaged_scan.address > hend)
			goto skip;
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			int ret;
			cond_resched();
			if (unlikely(khugepaged_test_exit(mm)))
				goto breakouterloop;

			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
				  hend);
			ret = khugepaged_scan_pmd(mm, vma,
						  khugepaged_scan.address,
						  hpage);
			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress += HPAGE_PMD_NR;
			if (ret)
				/* we released mmap_sem so break loop */
				goto breakouterloop_mmap_sem;
			if (progress >= pages)
				goto breakouterloop;
		}
	}
breakouterloop:
	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_sem:

	spin_lock(&khugepaged_mm_lock);
	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (khugepaged_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
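		/*
		 * Move the cursor to the next mm on the scan list or, if
		 * this was the last one, drop it and account one completed
		 * pass over all registered mms in khugepaged_full_scans.
		 */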
		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
			khugepaged_scan.mm_slot = list_entry(
				mm_slot->mm_node.next,
				struct mm_slot, mm_node);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}

		collect_mm_slot(mm_slot);
	}

	return progress;
}

static int khugepaged_has_work(void)
{
	return !list_empty(&khugepaged_scan.mm_head) &&
		khugepaged_enabled();
}

static int khugepaged_wait_event(void)
{
	return !list_empty(&khugepaged_scan.mm_head) ||
		kthread_should_stop();
}

static void khugepaged_do_scan(void)
{
	struct page *hpage = NULL;
	unsigned int progress = 0, pass_through_head = 0;
	unsigned int pages = khugepaged_pages_to_scan;
	bool wait = true;

	barrier(); /* ensure khugepaged_pages_to_scan is read once into the local "pages" */

	while (progress < pages) {
		if (!khugepaged_prealloc_page(&hpage, &wait))
			break;

		cond_resched();

		if (unlikely(kthread_should_stop() || try_to_freeze()))
			break;

		spin_lock(&khugepaged_mm_lock);
		if (!khugepaged_scan.mm_slot)
			pass_through_head++;
		if (khugepaged_has_work() &&
		    pass_through_head < 2)
			progress += khugepaged_scan_mm_slot(pages - progress,
							    &hpage);
		else
			progress = pages;
		spin_unlock(&khugepaged_mm_lock);
	}

	if (!IS_ERR_OR_NULL(hpage))
		put_page(hpage);
}

static bool khugepaged_should_wakeup(void)
{
	return kthread_should_stop() ||
	       time_after_eq(jiffies, khugepaged_sleep_expire);
}

static void khugepaged_wait_work(void)
{
	if (khugepaged_has_work()) {
		const unsigned long scan_sleep_jiffies =
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);

		if (!scan_sleep_jiffies)
			return;

		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
		wait_event_freezable_timeout(khugepaged_wait,
					     khugepaged_should_wakeup(),
					     scan_sleep_jiffies);
		return;
	}

	if (khugepaged_enabled())
		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}

static int khugepaged(void *none)
{
	struct mm_slot *mm_slot;

	set_freezable();
	set_user_nice(current, MAX_NICE);

	while (!kthread_should_stop()) {
		khugepaged_do_scan();
		khugepaged_wait_work();
	}

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}

static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	for_each_populated_zone(zone)
		nr_zones++;

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types. There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

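	/*
	 * For example, with the usual THP pageblock of 512 base pages
	 * (2MB with 4kB pages) and MIGRATE_PCPTYPES == 3, this adds up
	 * to (2 + 3 * 3) * 512 pages, i.e. roughly 22MB per populated
	 * zone, before the lowmem cap below is applied.
	 */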
	/* never allow more than 5% of the lowmem to be reserved */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}
	setup_per_zone_wmarks();
}

int start_stop_khugepaged(void)
{
	static struct task_struct *khugepaged_thread __read_mostly;
	static DEFINE_MUTEX(khugepaged_mutex);
	int err = 0;

	mutex_lock(&khugepaged_mutex);
	if (khugepaged_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);

		set_recommended_min_free_kbytes();
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
fail:
	mutex_unlock(&khugepaged_mutex);
	return err;
}