// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PAGE_RO,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_SWAP_CACHE_PAGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

/* default: scan 8*512 ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * By default, collapse a hugepage if there is at least one pte mapped
 * the way it would have been mapped had the vma been large enough at
 * page-fault time.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

#define MAX_PTE_MAPPED_THP 8

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 * @nr_pte_mapped_thp: number of pte-mapped THPs in this mm
 * @pte_mapped_thp: addresses of the pte-mapped THPs in this mm
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;

	/* pte-mapped THP in this mm */
	int nr_pte_mapped_thp;
	unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};

static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

#ifdef CONFIG_SYSFS
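/*
 * The tunables below are exported as files under
 * /sys/kernel/mm/transparent_hugepage/khugepaged/ via the
 * khugepaged_attr_group defined at the end of this block.
 */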
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = kstrtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);

/*
 * max_ptes_none controls whether khugepaged should collapse hugepages
 * over ranges containing unmapped ptes, potentially increasing the
 * memory footprint of the vmas. When max_ptes_none is 0 khugepaged
 * will not reduce the available free memory in the system as it runs.
 * Increasing max_ptes_none will instead potentially reduce the free
 * memory in the system during the khugepaged scan.
 */
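/*
 * Concretely: with the x86-64 defaults of HPAGE_PMD_NR == 512 and
 * max_ptes_none == HPAGE_PMD_NR - 1 (511, set in khugepaged_init()),
 * a single present pte in an otherwise empty, hugepage-aligned 2MB
 * range is enough for khugepaged to collapse that range.
 */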
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
}

static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_swap;

	err = kstrtoul(buf, 10, &max_ptes_swap);
	if (err || max_ptes_swap > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_swap = max_ptes_swap;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
	       khugepaged_max_ptes_swap_store);
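
/*
 * max_ptes_shared bounds how many of the HPAGE_PMD_NR ptes may map
 * pages that are also mapped by another process (page_mapcount() > 1).
 * Collapsing such pages copies them, so this limits how much memory a
 * collapse may duplicate across fork()ed processes.
 */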
static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
					       struct kobj_attribute *attr,
					       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
}

static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
						struct kobj_attribute *attr,
						const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_shared;

	err = kstrtoul(buf, 10, &max_ptes_shared);
	if (err || max_ptes_shared > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_shared = max_ptes_shared;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
	       khugepaged_max_ptes_shared_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
#endif /* CONFIG_SYSFS */

int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes suitable for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
				khugepaged_enter_vma_merge(vma, *vm_flags))
			return -ENOMEM;
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}

int __init khugepaged_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}

void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;

	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
		if (mm == mm_slot->mm)
			return mm_slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}

static inline int khugepaged_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}
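
/*
 * A vma is suitable for khugepaged when THP is enabled for it (globally
 * "always", or "madvise" plus VM_HUGEPAGE), it has not opted out via
 * VM_NOHUGEPAGE or MMF_DISABLE_THP, and it is either a hugepage-aligned
 * shmem (or, with CONFIG_READ_ONLY_THP_FOR_FS, read-only file) mapping,
 * or a plain anonymous mapping without special vm_ops.
 */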
static bool hugepage_vma_check(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
	    (vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return false;

	if (shmem_file(vma->vm_file) ||
	    (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
	     vma->vm_file &&
	     (vm_flags & VM_DENYWRITE))) {
		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
				HPAGE_PMD_NR);
	}
	if (!vma->anon_vma || vma->vm_ops)
		return false;
	if (vma_is_temporary_stack(vma))
		return false;
	return !(vm_flags & VM_NO_KHUGEPAGED);
}

int __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		free_mm_slot(mm_slot);
		return 0;
	}

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);

	return 0;
}

int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	unsigned long hstart, hend;

	/*
	 * khugepaged only supports read-only files for non-shmem files.
	 * khugepaged does not yet work on special mappings. And
	 * file-private shmem THP is not supported.
	 */
	if (!hugepage_vma_check(vma, vm_flags))
		return 0;

	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (hstart < hend)
		return khugepaged_enter(vma, vm_flags);
	return 0;
}

void __khugepaged_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		free_mm_slot(mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * khugepaged_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we
		 * return all pagetables will be destroyed) until
		 * khugepaged has finished working on the pagetables
		 * under the mmap_sem.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}

static void release_pte_page(struct page *page)
{
	mod_node_page_state(page_pgdat(page),
			NR_ISOLATED_ANON + page_is_file_lru(page),
			-compound_nr(page));
	unlock_page(page);
	putback_lru_page(page);
}

static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct page *page, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = *_pte;

		page = pte_page(pteval);
		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
				!PageCompound(page))
			release_pte_page(page);
	}

	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
		list_del(&page->lru);
		release_pte_page(page);
	}
}
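
/*
 * A page holding only the expected references (one per mapping, plus
 * the swap cache references while it still sits in the swap cache) can
 * be collapsed; any extra reference means somebody else, e.g. GUP,
 * holds a pin on it.
 */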
static bool is_refcount_suitable(struct page *page)
{
	int expected_refcount;

	expected_refcount = total_mapcount(page);
	if (PageSwapCache(page))
		expected_refcount += compound_nr(page);

	return page_count(page) == expected_refcount;
}

static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
					pte_t *pte,
					struct list_head *compound_pagelist)
{
	struct page *page = NULL;
	pte_t *_pte;
	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
	bool writable = false;

	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;
		if (pte_none(pteval) || (pte_present(pteval) &&
				is_zero_pfn(pte_pfn(pteval)))) {
			if (!userfaultfd_armed(vma) &&
			    ++none_or_zero <= khugepaged_max_ptes_none) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				goto out;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out;
		}
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page)) {
			result = SCAN_PAGE_NULL;
			goto out;
		}

		VM_BUG_ON_PAGE(!PageAnon(page), page);

		if (page_mapcount(page) > 1 &&
				++shared > khugepaged_max_ptes_shared) {
			result = SCAN_EXCEED_SHARED_PTE;
			goto out;
		}

		if (PageCompound(page)) {
			struct page *p;
			page = compound_head(page);

			/*
			 * Check if we have dealt with the compound page
			 * already
			 */
			list_for_each_entry(p, compound_pagelist, lru) {
				if (page == p)
					goto next;
			}
		}

		/*
		 * We can do it before isolate_lru_page because the
		 * page can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!trylock_page(page)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * The page table that maps the page has been already unlinked
		 * from the page table tree and this process cannot get
		 * an additional pin on the page.
		 *
		 * New pins can come later if the page is shared across fork,
		 * but not from this process. The other process cannot write to
		 * the page, only trigger CoW.
		 */
		if (!is_refcount_suitable(page)) {
			unlock_page(page);
			result = SCAN_PAGE_COUNT;
			goto out;
		}
		if (!pte_write(pteval) && PageSwapCache(page) &&
				!reuse_swap_page(page, NULL)) {
			/*
			 * Page is in the swap cache and cannot be re-used.
			 * It cannot be collapsed into a THP.
			 */
			unlock_page(page);
			result = SCAN_SWAP_CACHE_PAGE;
			goto out;
		}

		/*
		 * Isolate the page to avoid collapsing a hugepage
		 * currently in use by the VM.
		 */
		if (isolate_lru_page(page)) {
			unlock_page(page);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		mod_node_page_state(page_pgdat(page),
				NR_ISOLATED_ANON + page_is_file_lru(page),
				compound_nr(page));
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(PageLRU(page), page);

		if (PageCompound(page))
			list_add_tail(&page->lru, compound_pagelist);
next:
		/* There should be enough young ptes to collapse the page */
		if (pte_young(pteval) ||
		    page_is_young(page) || PageReferenced(page) ||
		    mmu_notifier_test_young(vma->vm_mm, address))
			referenced++;

		if (pte_write(pteval))
			writable = true;
	}
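
	/*
	 * Succeed only if at least one pte was writable and enough ptes
	 * were recently referenced; otherwise back out and put every
	 * isolated page back on the LRU.
	 */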
	if (likely(writable)) {
		if (likely(referenced)) {
			result = SCAN_SUCCEED;
			trace_mm_collapse_huge_page_isolate(page, none_or_zero,
							    referenced, writable, result);
			return 1;
		}
	} else {
		result = SCAN_PAGE_RO;
	}

out:
	release_pte_pages(pte, _pte, compound_pagelist);
	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
					    referenced, writable, result);
	return 0;
}

static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long address,
				      spinlock_t *ptl,
				      struct list_head *compound_pagelist)
{
	struct page *src_page, *tmp;
	pte_t *_pte;
	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
				_pte++, page++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;

		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, address);
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			if (is_zero_pfn(pte_pfn(pteval))) {
				/*
				 * ptl mostly unnecessary.
				 */
				spin_lock(ptl);
				/*
				 * paravirt calls inside pte_clear here are
				 * superfluous.
				 */
				pte_clear(vma->vm_mm, address, _pte);
				spin_unlock(ptl);
			}
		} else {
			src_page = pte_page(pteval);
			copy_user_highpage(page, src_page, address, vma);
			if (!PageCompound(src_page))
				release_pte_page(src_page);
			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside page_remove_rmap().
			 */
			spin_lock(ptl);
			/*
			 * paravirt calls inside pte_clear here are
			 * superfluous.
			 */
			pte_clear(vma->vm_mm, address, _pte);
			page_remove_rmap(src_page, false);
			spin_unlock(ptl);
			free_page_and_swap_cache(src_page);
		}
	}
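
	/*
	 * Compound source pages were deferred to compound_pagelist above
	 * so that each one is unlocked and put back exactly once, after
	 * all of its subpages have been copied.
	 */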
	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
		list_del(&src_page->lru);
		release_pte_page(src_page);
	}
}

static void khugepaged_alloc_sleep(void)
{
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	freezable_schedule_timeout_interruptible(
		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

static int khugepaged_node_load[MAX_NUMNODES];

static bool khugepaged_scan_abort(int nid)
{
	int i;

	/*
	 * If node_reclaim_mode is disabled, then no extra effort is made to
	 * allocate memory locally.
	 */
	if (!node_reclaim_mode)
		return false;

	/* If there is a count for this node already, it must be acceptable */
	if (khugepaged_node_load[nid])
		return false;

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (!khugepaged_node_load[i])
			continue;
		if (node_distance(nid, i) > node_reclaim_distance)
			return true;
	}
	return false;
}

/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
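/*
 * khugepaged_node_load[] is filled during the pte scan in
 * khugepaged_scan_pmd(); the collapsed page is then allocated on the
 * node that backed the most small pages, with ties broken round-robin
 * by khugepaged_find_target_node() below.
 */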
static int khugepaged_find_target_node(void)
{
	static int last_khugepaged_target_node = NUMA_NO_NODE;
	int nid, target_node = 0, max_value = 0;

	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (khugepaged_node_load[nid] > max_value) {
			max_value = khugepaged_node_load[nid];
			target_node = nid;
		}

	/* do some balance if several nodes have the same hit record */
	if (target_node <= last_khugepaged_target_node)
		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
				nid++)
			if (max_value == khugepaged_node_load[nid]) {
				target_node = nid;
				break;
			}

	last_khugepaged_target_node = target_node;
	return target_node;
}

static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
	if (IS_ERR(*hpage)) {
		if (!*wait)
			return false;

		*wait = false;
		*hpage = NULL;
		khugepaged_alloc_sleep();
	} else if (*hpage) {
		put_page(*hpage);
		*hpage = NULL;
	}

	return true;
}

static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
	VM_BUG_ON_PAGE(*hpage, *hpage);

	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
	if (unlikely(!*hpage)) {
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		*hpage = ERR_PTR(-ENOMEM);
		return NULL;
	}

	prep_transhuge_page(*hpage);
	count_vm_event(THP_COLLAPSE_ALLOC);
	return *hpage;
}
#else
static int khugepaged_find_target_node(void)
{
	return 0;
}

static inline struct page *alloc_khugepaged_hugepage(void)
{
	struct page *page;

	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
			   HPAGE_PMD_ORDER);
	if (page)
		prep_transhuge_page(page);
	return page;
}

static struct page *khugepaged_alloc_hugepage(bool *wait)
{
	struct page *hpage;

	do {
		hpage = alloc_khugepaged_hugepage();
		if (!hpage) {
			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
			if (!*wait)
				return NULL;

			*wait = false;
			khugepaged_alloc_sleep();
		} else
			count_vm_event(THP_COLLAPSE_ALLOC);
	} while (unlikely(!hpage) && likely(khugepaged_enabled()));

	return hpage;
}

static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
	if (!*hpage)
		*hpage = khugepaged_alloc_hugepage(wait);

	if (unlikely(!*hpage))
		return false;

	return true;
}

static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
	VM_BUG_ON(!*hpage);

	return *hpage;
}
#endif

/*
 * If mmap_sem was temporarily dropped, revalidate the vma
 * before taking mmap_sem again.
 * Return 0 if it succeeds, otherwise return a non-zero
 * value (scan code).
 */
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
		struct vm_area_struct **vmap)
{
	struct vm_area_struct *vma;
	unsigned long hstart, hend;

	if (unlikely(khugepaged_test_exit(mm)))
		return SCAN_ANY_PROCESS;

	*vmap = vma = find_vma(mm, address);
	if (!vma)
		return SCAN_VMA_NULL;

	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
		return SCAN_ADDRESS_RANGE;
	if (!hugepage_vma_check(vma, vma->vm_flags))
		return SCAN_VMA_CHECK;
	return 0;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if khugepaged_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held,
 * but with mmap_sem held to protect against vma changes.
 */
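/*
 * Returns true only if every swap pte in the PMD range was faulted in;
 * returns false (and gives up on the collapse) as soon as the vma or
 * pmd changes under us or do_swap_page() fails.
 */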
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmd,
					int referenced)
{
	int swapped_in = 0;
	vm_fault_t ret = 0;
	struct vm_fault vmf = {
		.vma = vma,
		.address = address,
		.flags = FAULT_FLAG_ALLOW_RETRY,
		.pmd = pmd,
		.pgoff = linear_page_index(vma, address),
	};

	vmf.pte = pte_offset_map(pmd, address);
	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
			vmf.pte++, vmf.address += PAGE_SIZE) {
		vmf.orig_pte = *vmf.pte;
		if (!is_swap_pte(vmf.orig_pte))
			continue;
		swapped_in++;
		ret = do_swap_page(&vmf);

		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
		if (ret & VM_FAULT_RETRY) {
			mmap_read_lock(mm);
			if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
				/* vma is no longer available, don't continue to swapin */
				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
				return false;
			}
			/* check if the pmd is still valid */
			if (mm_find_pmd(mm, address) != pmd) {
				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
				return false;
			}
		}
		if (ret & VM_FAULT_ERROR) {
			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
			return false;
		}
		/* pte is unmapped now, we need to map it */
		vmf.pte = pte_offset_map(pmd, vmf.address);
	}
	vmf.pte--;
	pte_unmap(vmf.pte);

	/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
	if (swapped_in)
		lru_add_drain();

	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
	return true;
}
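
/*
 * collapse_huge_page() is entered with mmap_sem held for read and
 * juggles it deliberately: the huge page is allocated and charged with
 * no lock held, the swap-in runs under the read lock, and only the
 * final teardown of the old page table and installation of the huge
 * pmd happens under the write lock.
 */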
static void collapse_huge_page(struct mm_struct *mm,
			       unsigned long address,
			       struct page **hpage,
			       int node, int referenced, int unmapped)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct page *new_page;
	spinlock_t *pmd_ptl, *pte_ptl;
	int isolated = 0, result = 0;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;
	gfp_t gfp;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/* Only allocate from the target node */
	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;

	/*
	 * Before allocating the hugepage, release the mmap_sem read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_sem during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);
	new_page = khugepaged_alloc_page(hpage, gfp, node);
	if (!new_page) {
		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
		goto out_nolock;
	}

	if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
		result = SCAN_CGROUP_CHARGE_FAIL;
		goto out_nolock;
	}
	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);

	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, &vma);
	if (result) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	pmd = mm_find_pmd(mm, address);
	if (!pmd) {
		result = SCAN_PMD_NULL;
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	/*
	 * __collapse_huge_page_swapin always returns with mmap_sem locked.
	 * If it fails, we release mmap_sem and jump out_nolock.
	 * Continuing to collapse causes inconsistency.
	 */
	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
						     pmd, referenced)) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	mmap_read_unlock(mm);
1098b46e756fSKirill A. Shutemov */ 1099*d8ed45c5SMichel Lespinasse mmap_write_lock(mm); 110059ea6d06SAndrea Arcangeli result = SCAN_ANY_PROCESS; 110159ea6d06SAndrea Arcangeli if (!mmget_still_valid(mm)) 110259ea6d06SAndrea Arcangeli goto out; 1103c131f751SKirill A. Shutemov result = hugepage_vma_revalidate(mm, address, &vma); 1104b46e756fSKirill A. Shutemov if (result) 1105b46e756fSKirill A. Shutemov goto out; 1106b46e756fSKirill A. Shutemov /* check if the pmd is still valid */ 1107b46e756fSKirill A. Shutemov if (mm_find_pmd(mm, address) != pmd) 1108b46e756fSKirill A. Shutemov goto out; 1109b46e756fSKirill A. Shutemov 1110b46e756fSKirill A. Shutemov anon_vma_lock_write(vma->anon_vma); 1111b46e756fSKirill A. Shutemov 11127269f999SJérôme Glisse mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, 11136f4f13e8SJérôme Glisse address, address + HPAGE_PMD_SIZE); 1114ac46d4f3SJérôme Glisse mmu_notifier_invalidate_range_start(&range); 1115ec649c9dSVille Syrjälä 1116ec649c9dSVille Syrjälä pte = pte_offset_map(pmd, address); 1117ec649c9dSVille Syrjälä pte_ptl = pte_lockptr(mm, pmd); 1118ec649c9dSVille Syrjälä 1119b46e756fSKirill A. Shutemov pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 1120b46e756fSKirill A. Shutemov /* 1121b46e756fSKirill A. Shutemov * After this gup_fast can't run anymore. This also removes 1122b46e756fSKirill A. Shutemov * any huge TLB entry from the CPU so we won't allow 1123b46e756fSKirill A. Shutemov * huge and small TLB entries for the same virtual address 1124b46e756fSKirill A. Shutemov * to avoid the risk of CPU bugs in that area. 1125b46e756fSKirill A. Shutemov */ 1126b46e756fSKirill A. Shutemov _pmd = pmdp_collapse_flush(vma, address, pmd); 1127b46e756fSKirill A. Shutemov spin_unlock(pmd_ptl); 1128ac46d4f3SJérôme Glisse mmu_notifier_invalidate_range_end(&range); 1129b46e756fSKirill A. Shutemov 1130b46e756fSKirill A. Shutemov spin_lock(pte_ptl); 11315503fbf2SKirill A. Shutemov isolated = __collapse_huge_page_isolate(vma, address, pte, 11325503fbf2SKirill A. Shutemov &compound_pagelist); 1133b46e756fSKirill A. Shutemov spin_unlock(pte_ptl); 1134b46e756fSKirill A. Shutemov 1135b46e756fSKirill A. Shutemov if (unlikely(!isolated)) { 1136b46e756fSKirill A. Shutemov pte_unmap(pte); 1137b46e756fSKirill A. Shutemov spin_lock(pmd_ptl); 1138b46e756fSKirill A. Shutemov BUG_ON(!pmd_none(*pmd)); 1139b46e756fSKirill A. Shutemov /* 1140b46e756fSKirill A. Shutemov * We can only use set_pmd_at when establishing 1141b46e756fSKirill A. Shutemov * hugepmds and never for establishing regular pmds that 1142b46e756fSKirill A. Shutemov * point to regular pagetables. Use pmd_populate for that. 1143b46e756fSKirill A. Shutemov */ 1144b46e756fSKirill A. Shutemov pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 1145b46e756fSKirill A. Shutemov spin_unlock(pmd_ptl); 1146b46e756fSKirill A. Shutemov anon_vma_unlock_write(vma->anon_vma); 1147b46e756fSKirill A. Shutemov result = SCAN_FAIL; 1148b46e756fSKirill A. Shutemov goto out; 1149b46e756fSKirill A. Shutemov } 1150b46e756fSKirill A. Shutemov 1151b46e756fSKirill A. Shutemov /* 1152b46e756fSKirill A. Shutemov * All pages are isolated and locked so anon_vma rmap 1153b46e756fSKirill A. Shutemov * can't run anymore. 1154b46e756fSKirill A. Shutemov */ 1155b46e756fSKirill A. Shutemov anon_vma_unlock_write(vma->anon_vma); 1156b46e756fSKirill A. Shutemov 11575503fbf2SKirill A. Shutemov __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, 11585503fbf2SKirill A. Shutemov &compound_pagelist); 1159b46e756fSKirill A.
Shutemov pte_unmap(pte); 1160b46e756fSKirill A. Shutemov __SetPageUptodate(new_page); 1161b46e756fSKirill A. Shutemov pgtable = pmd_pgtable(_pmd); 1162b46e756fSKirill A. Shutemov 1163b46e756fSKirill A. Shutemov _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); 1164f55e1014SLinus Torvalds _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 1165b46e756fSKirill A. Shutemov 1166b46e756fSKirill A. Shutemov /* 1167b46e756fSKirill A. Shutemov * spin_lock() below is not the equivalent of smp_wmb(), so 1168b46e756fSKirill A. Shutemov * this is needed to prevent the copy_huge_page writes from becoming 1169b46e756fSKirill A. Shutemov * visible after the set_pmd_at() write. 1170b46e756fSKirill A. Shutemov */ 1171b46e756fSKirill A. Shutemov smp_wmb(); 1172b46e756fSKirill A. Shutemov 1173b46e756fSKirill A. Shutemov spin_lock(pmd_ptl); 1174b46e756fSKirill A. Shutemov BUG_ON(!pmd_none(*pmd)); 1175be5d0a74SJohannes Weiner page_add_new_anon_rmap(new_page, vma, address, true); 1176b46e756fSKirill A. Shutemov lru_cache_add_active_or_unevictable(new_page, vma); 1177b46e756fSKirill A. Shutemov pgtable_trans_huge_deposit(mm, pmd, pgtable); 1178b46e756fSKirill A. Shutemov set_pmd_at(mm, address, pmd, _pmd); 1179b46e756fSKirill A. Shutemov update_mmu_cache_pmd(vma, address, pmd); 1180b46e756fSKirill A. Shutemov spin_unlock(pmd_ptl); 1181b46e756fSKirill A. Shutemov 1182b46e756fSKirill A. Shutemov *hpage = NULL; 1183b46e756fSKirill A. Shutemov 1184b46e756fSKirill A. Shutemov khugepaged_pages_collapsed++; 1185b46e756fSKirill A. Shutemov result = SCAN_SUCCEED; 1186b46e756fSKirill A. Shutemov out_up_write: 1187*d8ed45c5SMichel Lespinasse mmap_write_unlock(mm); 1188b46e756fSKirill A. Shutemov out_nolock: 11899d82c694SJohannes Weiner if (!IS_ERR_OR_NULL(*hpage)) 11909d82c694SJohannes Weiner mem_cgroup_uncharge(*hpage); 1191b46e756fSKirill A. Shutemov trace_mm_collapse_huge_page(mm, isolated, result); 1192b46e756fSKirill A. Shutemov return; 1193b46e756fSKirill A. Shutemov out: 1194b46e756fSKirill A. Shutemov goto out_up_write; 1195b46e756fSKirill A. Shutemov } 1196b46e756fSKirill A. Shutemov 1197b46e756fSKirill A. Shutemov static int khugepaged_scan_pmd(struct mm_struct *mm, 1198b46e756fSKirill A. Shutemov struct vm_area_struct *vma, 1199b46e756fSKirill A. Shutemov unsigned long address, 1200b46e756fSKirill A. Shutemov struct page **hpage) 1201b46e756fSKirill A. Shutemov { 1202b46e756fSKirill A. Shutemov pmd_t *pmd; 1203b46e756fSKirill A. Shutemov pte_t *pte, *_pte; 120471a2c112SKirill A. Shutemov int ret = 0, result = 0, referenced = 0; 120571a2c112SKirill A. Shutemov int none_or_zero = 0, shared = 0; 1206b46e756fSKirill A. Shutemov struct page *page = NULL; 1207b46e756fSKirill A. Shutemov unsigned long _address; 1208b46e756fSKirill A. Shutemov spinlock_t *ptl; 1209b46e756fSKirill A. Shutemov int node = NUMA_NO_NODE, unmapped = 0; 12100db501f7SEbru Akagunduz bool writable = false; 1211b46e756fSKirill A. Shutemov 1212b46e756fSKirill A. Shutemov VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1213b46e756fSKirill A. Shutemov 1214b46e756fSKirill A. Shutemov pmd = mm_find_pmd(mm, address); 1215b46e756fSKirill A. Shutemov if (!pmd) { 1216b46e756fSKirill A. Shutemov result = SCAN_PMD_NULL; 1217b46e756fSKirill A. Shutemov goto out; 1218b46e756fSKirill A. Shutemov } 1219b46e756fSKirill A. Shutemov 1220b46e756fSKirill A. Shutemov memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 1221b46e756fSKirill A. Shutemov pte = pte_offset_map_lock(mm, pmd, address, &ptl); 1222b46e756fSKirill A.
Shutemov for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 1223b46e756fSKirill A. Shutemov _pte++, _address += PAGE_SIZE) { 1224b46e756fSKirill A. Shutemov pte_t pteval = *_pte; 1225b46e756fSKirill A. Shutemov if (is_swap_pte(pteval)) { 1226b46e756fSKirill A. Shutemov if (++unmapped <= khugepaged_max_ptes_swap) { 1227e1e267c7SPeter Xu /* 1228e1e267c7SPeter Xu * Always be strict with uffd-wp 1229e1e267c7SPeter Xu * enabled swap entries. Please see 1230e1e267c7SPeter Xu * comment below for pte_uffd_wp(). 1231e1e267c7SPeter Xu */ 1232e1e267c7SPeter Xu if (pte_swp_uffd_wp(pteval)) { 1233e1e267c7SPeter Xu result = SCAN_PTE_UFFD_WP; 1234e1e267c7SPeter Xu goto out_unmap; 1235e1e267c7SPeter Xu } 1236b46e756fSKirill A. Shutemov continue; 1237b46e756fSKirill A. Shutemov } else { 1238b46e756fSKirill A. Shutemov result = SCAN_EXCEED_SWAP_PTE; 1239b46e756fSKirill A. Shutemov goto out_unmap; 1240b46e756fSKirill A. Shutemov } 1241b46e756fSKirill A. Shutemov } 1242b46e756fSKirill A. Shutemov if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 1243b46e756fSKirill A. Shutemov if (!userfaultfd_armed(vma) && 1244b46e756fSKirill A. Shutemov ++none_or_zero <= khugepaged_max_ptes_none) { 1245b46e756fSKirill A. Shutemov continue; 1246b46e756fSKirill A. Shutemov } else { 1247b46e756fSKirill A. Shutemov result = SCAN_EXCEED_NONE_PTE; 1248b46e756fSKirill A. Shutemov goto out_unmap; 1249b46e756fSKirill A. Shutemov } 1250b46e756fSKirill A. Shutemov } 1251b46e756fSKirill A. Shutemov if (!pte_present(pteval)) { 1252b46e756fSKirill A. Shutemov result = SCAN_PTE_NON_PRESENT; 1253b46e756fSKirill A. Shutemov goto out_unmap; 1254b46e756fSKirill A. Shutemov } 1255e1e267c7SPeter Xu if (pte_uffd_wp(pteval)) { 1256e1e267c7SPeter Xu /* 1257e1e267c7SPeter Xu * Don't collapse the page if any of the small 1258e1e267c7SPeter Xu * PTEs are armed with uffd write protection. 1259e1e267c7SPeter Xu * Here we could also mark the new huge pmd as 1260e1e267c7SPeter Xu * write protected if any of the small ones is 1261e1e267c7SPeter Xu * marked, but that could bring unknown 1262e1e267c7SPeter Xu * userfault messages that fall outside of 1263e1e267c7SPeter Xu * the registered range. So, just keep it simple. 1264e1e267c7SPeter Xu */ 1265e1e267c7SPeter Xu result = SCAN_PTE_UFFD_WP; 1266e1e267c7SPeter Xu goto out_unmap; 1267e1e267c7SPeter Xu } 1268b46e756fSKirill A. Shutemov if (pte_write(pteval)) 1269b46e756fSKirill A. Shutemov writable = true; 1270b46e756fSKirill A. Shutemov 1271b46e756fSKirill A. Shutemov page = vm_normal_page(vma, _address, pteval); 1272b46e756fSKirill A. Shutemov if (unlikely(!page)) { 1273b46e756fSKirill A. Shutemov result = SCAN_PAGE_NULL; 1274b46e756fSKirill A. Shutemov goto out_unmap; 1275b46e756fSKirill A. Shutemov } 1276b46e756fSKirill A. Shutemov 127771a2c112SKirill A. Shutemov if (page_mapcount(page) > 1 && 127871a2c112SKirill A. Shutemov ++shared > khugepaged_max_ptes_shared) { 127971a2c112SKirill A. Shutemov result = SCAN_EXCEED_SHARED_PTE; 128071a2c112SKirill A. Shutemov goto out_unmap; 128171a2c112SKirill A. Shutemov } 128271a2c112SKirill A. Shutemov 12835503fbf2SKirill A. Shutemov page = compound_head(page); 1284b46e756fSKirill A. Shutemov 1285b46e756fSKirill A. Shutemov /* 1286b46e756fSKirill A. Shutemov * Record which node the original page is from and save this 1287b46e756fSKirill A. Shutemov * information to khugepaged_node_load[]. 1288b46e756fSKirill A. Shutemov * Khugepaged will allocate the hugepage from the node that has 1289b46e756fSKirill A. Shutemov * the max hit record. 1290b46e756fSKirill A.
Shutemov */ 1291b46e756fSKirill A. Shutemov node = page_to_nid(page); 1292b46e756fSKirill A. Shutemov if (khugepaged_scan_abort(node)) { 1293b46e756fSKirill A. Shutemov result = SCAN_SCAN_ABORT; 1294b46e756fSKirill A. Shutemov goto out_unmap; 1295b46e756fSKirill A. Shutemov } 1296b46e756fSKirill A. Shutemov khugepaged_node_load[node]++; 1297b46e756fSKirill A. Shutemov if (!PageLRU(page)) { 1298b46e756fSKirill A. Shutemov result = SCAN_PAGE_LRU; 1299b46e756fSKirill A. Shutemov goto out_unmap; 1300b46e756fSKirill A. Shutemov } 1301b46e756fSKirill A. Shutemov if (PageLocked(page)) { 1302b46e756fSKirill A. Shutemov result = SCAN_PAGE_LOCK; 1303b46e756fSKirill A. Shutemov goto out_unmap; 1304b46e756fSKirill A. Shutemov } 1305b46e756fSKirill A. Shutemov if (!PageAnon(page)) { 1306b46e756fSKirill A. Shutemov result = SCAN_PAGE_ANON; 1307b46e756fSKirill A. Shutemov goto out_unmap; 1308b46e756fSKirill A. Shutemov } 1309b46e756fSKirill A. Shutemov 1310b46e756fSKirill A. Shutemov /* 13119445689fSKirill A. Shutemov * Check if the page has any GUP (or other external) pins. 13129445689fSKirill A. Shutemov * 13139445689fSKirill A. Shutemov * Here the check is racy: it may see total_mapcount > refcount 13149445689fSKirill A. Shutemov * in some cases. 13159445689fSKirill A. Shutemov * For example, one process with one forked child process. 13169445689fSKirill A. Shutemov * The parent has the PMD split due to MADV_DONTNEED, then 13179445689fSKirill A. Shutemov * the child tries to unmap the whole PMD, but khugepaged 13189445689fSKirill A. Shutemov * may be scanning the parent between when the child has the 13199445689fSKirill A. Shutemov * PageDoubleMap flag cleared and when it decrements the 13209445689fSKirill A. Shutemov * mapcount. So khugepaged may see total_mapcount > refcount. 13219445689fSKirill A. Shutemov * 13229445689fSKirill A. Shutemov * But such a case is ephemeral; we could always retry the 13239445689fSKirill A. Shutemov * collapse later. However it may report a false positive if the 13249445689fSKirill A. Shutemov * page has excessive GUP pins (i.e. 512). Anyway the same check 13259445689fSKirill A. Shutemov * will be done again later; the risk seems low. 1326b46e756fSKirill A. Shutemov */ 13279445689fSKirill A. Shutemov if (!is_refcount_suitable(page)) { 1328b46e756fSKirill A. Shutemov result = SCAN_PAGE_COUNT; 1329b46e756fSKirill A. Shutemov goto out_unmap; 1330b46e756fSKirill A. Shutemov } 1331b46e756fSKirill A. Shutemov if (pte_young(pteval) || 1332b46e756fSKirill A. Shutemov page_is_young(page) || PageReferenced(page) || 1333b46e756fSKirill A. Shutemov mmu_notifier_test_young(vma->vm_mm, address)) 13340db501f7SEbru Akagunduz referenced++; 1335b46e756fSKirill A. Shutemov } 1336ffe945e6SKirill A. Shutemov if (!writable) { 1337ffe945e6SKirill A. Shutemov result = SCAN_PAGE_RO; 1338ffe945e6SKirill A. Shutemov } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { 1339ffe945e6SKirill A. Shutemov result = SCAN_LACK_REFERENCED_PAGE; 1340ffe945e6SKirill A. Shutemov } else { 1341b46e756fSKirill A. Shutemov result = SCAN_SUCCEED; 1342b46e756fSKirill A. Shutemov ret = 1; 1343b46e756fSKirill A. Shutemov } 1344b46e756fSKirill A. Shutemov out_unmap: 1345b46e756fSKirill A. Shutemov pte_unmap_unlock(pte, ptl); 1346b46e756fSKirill A. Shutemov if (ret) { 1347b46e756fSKirill A. Shutemov node = khugepaged_find_target_node(); 1348b46e756fSKirill A. Shutemov /* collapse_huge_page will return with the mmap_sem released */ 1349ffe945e6SKirill A. Shutemov collapse_huge_page(mm, address, hpage, node, 1350ffe945e6SKirill A.
Shutemov referenced, unmapped); 1351b46e756fSKirill A. Shutemov } 1352b46e756fSKirill A. Shutemov out: 1353b46e756fSKirill A. Shutemov trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, 1354b46e756fSKirill A. Shutemov none_or_zero, result, unmapped); 1355b46e756fSKirill A. Shutemov return ret; 1356b46e756fSKirill A. Shutemov } 1357b46e756fSKirill A. Shutemov 1358b46e756fSKirill A. Shutemov static void collect_mm_slot(struct mm_slot *mm_slot) 1359b46e756fSKirill A. Shutemov { 1360b46e756fSKirill A. Shutemov struct mm_struct *mm = mm_slot->mm; 1361b46e756fSKirill A. Shutemov 136235f3aa39SLance Roy lockdep_assert_held(&khugepaged_mm_lock); 1363b46e756fSKirill A. Shutemov 1364b46e756fSKirill A. Shutemov if (khugepaged_test_exit(mm)) { 1365b46e756fSKirill A. Shutemov /* free mm_slot */ 1366b46e756fSKirill A. Shutemov hash_del(&mm_slot->hash); 1367b46e756fSKirill A. Shutemov list_del(&mm_slot->mm_node); 1368b46e756fSKirill A. Shutemov 1369b46e756fSKirill A. Shutemov /* 1370b46e756fSKirill A. Shutemov * Not strictly needed because the mm exited already. 1371b46e756fSKirill A. Shutemov * 1372b46e756fSKirill A. Shutemov * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1373b46e756fSKirill A. Shutemov */ 1374b46e756fSKirill A. Shutemov 1375b46e756fSKirill A. Shutemov /* khugepaged_mm_lock actually not necessary for the below */ 1376b46e756fSKirill A. Shutemov free_mm_slot(mm_slot); 1377b46e756fSKirill A. Shutemov mmdrop(mm); 1378b46e756fSKirill A. Shutemov } 1379b46e756fSKirill A. Shutemov } 1380b46e756fSKirill A. Shutemov 1381396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_SHMEM 138227e1f827SSong Liu /* 138327e1f827SSong Liu * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then 138427e1f827SSong Liu * khugepaged should try to collapse the page table. 138527e1f827SSong Liu */ 138627e1f827SSong Liu static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, 138727e1f827SSong Liu unsigned long addr) 138827e1f827SSong Liu { 138927e1f827SSong Liu struct mm_slot *mm_slot; 139027e1f827SSong Liu 139127e1f827SSong Liu VM_BUG_ON(addr & ~HPAGE_PMD_MASK); 139227e1f827SSong Liu 139327e1f827SSong Liu spin_lock(&khugepaged_mm_lock); 139427e1f827SSong Liu mm_slot = get_mm_slot(mm); 139527e1f827SSong Liu if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) 139627e1f827SSong Liu mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; 139727e1f827SSong Liu spin_unlock(&khugepaged_mm_lock); 139827e1f827SSong Liu return 0; 139927e1f827SSong Liu } 140027e1f827SSong Liu 140127e1f827SSong Liu /** 140227e1f827SSong Liu * Try to collapse a pte-mapped THP for mm at address haddr. 140327e1f827SSong Liu * 140427e1f827SSong Liu * This function checks whether all the PTEs in the PMD are pointing to the 140527e1f827SSong Liu * right THP. If so, retract the page table so the THP can refault in 140627e1f827SSong Liu * as pmd-mapped.
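 * (The function below proceeds in four steps: check that every mapped PTE points at the right subpage of one THP; drop the per-subpage rmap; fix up the refcount and mm counters; then collapse the PMD itself.)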
140727e1f827SSong Liu */ 140827e1f827SSong Liu void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) 140927e1f827SSong Liu { 141027e1f827SSong Liu unsigned long haddr = addr & HPAGE_PMD_MASK; 141127e1f827SSong Liu struct vm_area_struct *vma = find_vma(mm, haddr); 141227e1f827SSong Liu struct page *hpage = NULL; 141327e1f827SSong Liu pte_t *start_pte, *pte; 141427e1f827SSong Liu pmd_t *pmd, _pmd; 141527e1f827SSong Liu spinlock_t *ptl; 141627e1f827SSong Liu int count = 0; 141727e1f827SSong Liu int i; 141827e1f827SSong Liu 141927e1f827SSong Liu if (!vma || !vma->vm_file || 142027e1f827SSong Liu vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) 142127e1f827SSong Liu return; 142227e1f827SSong Liu 142327e1f827SSong Liu /* 142427e1f827SSong Liu * The vm_flags may not have VM_HUGEPAGE if the page was not 142527e1f827SSong Liu * collapsed by this mm, but we can still collapse if the page is 142627e1f827SSong Liu * a valid THP. Add an extra VM_HUGEPAGE so hugepage_vma_check() 142727e1f827SSong Liu * will not fail the vma for a missing VM_HUGEPAGE. 142827e1f827SSong Liu */ 142927e1f827SSong Liu if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) 143027e1f827SSong Liu return; 143127e1f827SSong Liu 143227e1f827SSong Liu pmd = mm_find_pmd(mm, haddr); 143327e1f827SSong Liu if (!pmd) 143427e1f827SSong Liu return; 143527e1f827SSong Liu 143627e1f827SSong Liu start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); 143727e1f827SSong Liu 143827e1f827SSong Liu /* step 1: check all mapped PTEs are to the right huge page */ 143927e1f827SSong Liu for (i = 0, addr = haddr, pte = start_pte; 144027e1f827SSong Liu i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 144127e1f827SSong Liu struct page *page; 144227e1f827SSong Liu 144327e1f827SSong Liu /* empty pte, skip */ 144427e1f827SSong Liu if (pte_none(*pte)) 144527e1f827SSong Liu continue; 144627e1f827SSong Liu 144727e1f827SSong Liu /* page swapped out, abort */ 144827e1f827SSong Liu if (!pte_present(*pte)) 144927e1f827SSong Liu goto abort; 145027e1f827SSong Liu 145127e1f827SSong Liu page = vm_normal_page(vma, addr, *pte); 145227e1f827SSong Liu 145327e1f827SSong Liu if (!page || !PageCompound(page)) 145427e1f827SSong Liu goto abort; 145527e1f827SSong Liu 145627e1f827SSong Liu if (!hpage) { 145727e1f827SSong Liu hpage = compound_head(page); 145827e1f827SSong Liu /* 145927e1f827SSong Liu * The mapping of the THP should not change. 146027e1f827SSong Liu * 146127e1f827SSong Liu * Note that uprobe, debugger, or MAP_PRIVATE may 146227e1f827SSong Liu * change the page table, but the new page will 146327e1f827SSong Liu * not pass PageCompound() check. 146427e1f827SSong Liu */ 146527e1f827SSong Liu if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping)) 146627e1f827SSong Liu goto abort; 146727e1f827SSong Liu } 146827e1f827SSong Liu 146927e1f827SSong Liu /* 147027e1f827SSong Liu * Confirm the page maps to the correct subpage. 147127e1f827SSong Liu * 147227e1f827SSong Liu * Note that uprobe, debugger, or MAP_PRIVATE may change 147327e1f827SSong Liu * the page table, but the new page will not pass 147427e1f827SSong Liu * PageCompound() check.
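 * (The WARN_ON(hpage + i != page) comparison below enforces this.)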
147527e1f827SSong Liu */ 147627e1f827SSong Liu if (WARN_ON(hpage + i != page)) 147727e1f827SSong Liu goto abort; 147827e1f827SSong Liu count++; 147927e1f827SSong Liu } 148027e1f827SSong Liu 148127e1f827SSong Liu /* step 2: adjust rmap */ 148227e1f827SSong Liu for (i = 0, addr = haddr, pte = start_pte; 148327e1f827SSong Liu i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 148427e1f827SSong Liu struct page *page; 148527e1f827SSong Liu 148627e1f827SSong Liu if (pte_none(*pte)) 148727e1f827SSong Liu continue; 148827e1f827SSong Liu page = vm_normal_page(vma, addr, *pte); 148927e1f827SSong Liu page_remove_rmap(page, false); 149027e1f827SSong Liu } 149127e1f827SSong Liu 149227e1f827SSong Liu pte_unmap_unlock(start_pte, ptl); 149327e1f827SSong Liu 149427e1f827SSong Liu /* step 3: set proper refcount and mm_counters. */ 149527e1f827SSong Liu if (hpage) { 149627e1f827SSong Liu page_ref_sub(hpage, count); 149727e1f827SSong Liu add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); 149827e1f827SSong Liu } 149927e1f827SSong Liu 150027e1f827SSong Liu /* step 4: collapse pmd */ 150127e1f827SSong Liu ptl = pmd_lock(vma->vm_mm, pmd); 150227e1f827SSong Liu _pmd = pmdp_collapse_flush(vma, addr, pmd); 150327e1f827SSong Liu spin_unlock(ptl); 150427e1f827SSong Liu mm_dec_nr_ptes(mm); 150527e1f827SSong Liu pte_free(mm, pmd_pgtable(_pmd)); 150627e1f827SSong Liu return; 150727e1f827SSong Liu 150827e1f827SSong Liu abort: 150927e1f827SSong Liu pte_unmap_unlock(start_pte, ptl); 151027e1f827SSong Liu } 151127e1f827SSong Liu 151227e1f827SSong Liu static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) 151327e1f827SSong Liu { 151427e1f827SSong Liu struct mm_struct *mm = mm_slot->mm; 151527e1f827SSong Liu int i; 151627e1f827SSong Liu 151727e1f827SSong Liu if (likely(mm_slot->nr_pte_mapped_thp == 0)) 151827e1f827SSong Liu return 0; 151927e1f827SSong Liu 1520*d8ed45c5SMichel Lespinasse if (!mmap_write_trylock(mm)) 152127e1f827SSong Liu return -EBUSY; 152227e1f827SSong Liu 152327e1f827SSong Liu if (unlikely(khugepaged_test_exit(mm))) 152427e1f827SSong Liu goto out; 152527e1f827SSong Liu 152627e1f827SSong Liu for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) 152727e1f827SSong Liu collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); 152827e1f827SSong Liu 152927e1f827SSong Liu out: 153027e1f827SSong Liu mm_slot->nr_pte_mapped_thp = 0; 1531*d8ed45c5SMichel Lespinasse mmap_write_unlock(mm); 153227e1f827SSong Liu return 0; 153327e1f827SSong Liu } 153427e1f827SSong Liu 1535f3f0e1d2SKirill A. Shutemov static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 1536f3f0e1d2SKirill A. Shutemov { 1537f3f0e1d2SKirill A. Shutemov struct vm_area_struct *vma; 1538f3f0e1d2SKirill A. Shutemov unsigned long addr; 1539f3f0e1d2SKirill A. Shutemov pmd_t *pmd, _pmd; 1540f3f0e1d2SKirill A. Shutemov 1541f3f0e1d2SKirill A. Shutemov i_mmap_lock_write(mapping); 1542f3f0e1d2SKirill A. Shutemov vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 154327e1f827SSong Liu /* 154427e1f827SSong Liu * Check vma->anon_vma to exclude MAP_PRIVATE mappings that 154527e1f827SSong Liu * got written to. These VMAs are likely not worth the cost of 154627e1f827SSong Liu * down_write(mmap_sem) as the PMD-mapping is likely to be split 154727e1f827SSong Liu * later. 154827e1f827SSong Liu * 154927e1f827SSong Liu * Note that the vma->anon_vma check is racy: it can be set up by 155027e1f827SSong Liu * the fault path after the check but before we take mmap_sem.
155127e1f827SSong Liu * But the page lock would prevent establishing any new ptes of the 155227e1f827SSong Liu * page, so we are safe. 155327e1f827SSong Liu * 155427e1f827SSong Liu * An alternative would be to drop the check, but to check that the 155527e1f827SSong Liu * page table is clear before calling pmdp_collapse_flush() under 155627e1f827SSong Liu * ptl. It has a higher chance to recover the THP for the VMA, but 155727e1f827SSong Liu * also a higher cost. 155827e1f827SSong Liu */ 1559f3f0e1d2SKirill A. Shutemov if (vma->anon_vma) 1560f3f0e1d2SKirill A. Shutemov continue; 1561f3f0e1d2SKirill A. Shutemov addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 1562f3f0e1d2SKirill A. Shutemov if (addr & ~HPAGE_PMD_MASK) 1563f3f0e1d2SKirill A. Shutemov continue; 1564f3f0e1d2SKirill A. Shutemov if (vma->vm_end < addr + HPAGE_PMD_SIZE) 1565f3f0e1d2SKirill A. Shutemov continue; 1566f3f0e1d2SKirill A. Shutemov pmd = mm_find_pmd(vma->vm_mm, addr); 1567f3f0e1d2SKirill A. Shutemov if (!pmd) 1568f3f0e1d2SKirill A. Shutemov continue; 1569f3f0e1d2SKirill A. Shutemov /* 1570f3f0e1d2SKirill A. Shutemov * We need exclusive mmap_sem to retract the page table. 157127e1f827SSong Liu * 157227e1f827SSong Liu * We use trylock due to lock inversion: we need to acquire 157327e1f827SSong Liu * mmap_sem while holding the page lock. The fault path does it in 157427e1f827SSong Liu * reverse order. Trylock is a way to avoid the deadlock. 1575f3f0e1d2SKirill A. Shutemov */ 1576*d8ed45c5SMichel Lespinasse if (mmap_write_trylock(vma->vm_mm)) { 1577f3f0e1d2SKirill A. Shutemov spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); 1578f3f0e1d2SKirill A. Shutemov /* assume page table is clear */ 1579f3f0e1d2SKirill A. Shutemov _pmd = pmdp_collapse_flush(vma, addr, pmd); 1580f3f0e1d2SKirill A. Shutemov spin_unlock(ptl); 1581*d8ed45c5SMichel Lespinasse mmap_write_unlock(vma->vm_mm); 1582c4812909SKirill A. Shutemov mm_dec_nr_ptes(vma->vm_mm); 1583f3f0e1d2SKirill A. Shutemov pte_free(vma->vm_mm, pmd_pgtable(_pmd)); 158427e1f827SSong Liu } else { 158527e1f827SSong Liu /* Try again later */ 158627e1f827SSong Liu khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); 1587f3f0e1d2SKirill A. Shutemov } 1588f3f0e1d2SKirill A. Shutemov } 1589f3f0e1d2SKirill A. Shutemov i_mmap_unlock_write(mapping); 1590f3f0e1d2SKirill A. Shutemov } 1591f3f0e1d2SKirill A. Shutemov 1592f3f0e1d2SKirill A. Shutemov /** 159399cb0dbdSSong Liu * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one. 1594f3f0e1d2SKirill A. Shutemov * 1595f3f0e1d2SKirill A. Shutemov * Basic scheme is simple, details are more complex: 159687c460a0SHugh Dickins * - allocate and lock a new huge page; 159777da9389SMatthew Wilcox * - scan page cache, replacing old pages with the new one 159899cb0dbdSSong Liu * + swap/gup in pages if necessary; 1599f3f0e1d2SKirill A. Shutemov * + fill in gaps; 160077da9389SMatthew Wilcox * + keep old pages around in case rollback is required; 160177da9389SMatthew Wilcox * - if replacing succeeds: 1602f3f0e1d2SKirill A. Shutemov * + copy data over; 1603f3f0e1d2SKirill A. Shutemov * + free old pages; 160487c460a0SHugh Dickins * + unlock huge page; 1605f3f0e1d2SKirill A. Shutemov * - if replacing failed: 1606f3f0e1d2SKirill A. Shutemov * + put all pages back and unfreeze them; 160777da9389SMatthew Wilcox * + restore gaps in the page cache; 160887c460a0SHugh Dickins * + unlock and free huge page; 1609f3f0e1d2SKirill A. Shutemov */ 1610579c571eSSong Liu static void collapse_file(struct mm_struct *mm, 1611579c571eSSong Liu struct file *file, pgoff_t start, 1612f3f0e1d2SKirill A.
Shutemov struct page **hpage, int node) 1613f3f0e1d2SKirill A. Shutemov { 1614579c571eSSong Liu struct address_space *mapping = file->f_mapping; 1615f3f0e1d2SKirill A. Shutemov gfp_t gfp; 161677da9389SMatthew Wilcox struct page *new_page; 1617f3f0e1d2SKirill A. Shutemov pgoff_t index, end = start + HPAGE_PMD_NR; 1618f3f0e1d2SKirill A. Shutemov LIST_HEAD(pagelist); 161977da9389SMatthew Wilcox XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 1620f3f0e1d2SKirill A. Shutemov int nr_none = 0, result = SCAN_SUCCEED; 162199cb0dbdSSong Liu bool is_shmem = shmem_file(file); 1622f3f0e1d2SKirill A. Shutemov 162399cb0dbdSSong Liu VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); 1624f3f0e1d2SKirill A. Shutemov VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1625f3f0e1d2SKirill A. Shutemov 1626f3f0e1d2SKirill A. Shutemov /* Only allocate from the target node */ 162741b6167eSMichal Hocko gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; 1628f3f0e1d2SKirill A. Shutemov 1629f3f0e1d2SKirill A. Shutemov new_page = khugepaged_alloc_page(hpage, gfp, node); 1630f3f0e1d2SKirill A. Shutemov if (!new_page) { 1631f3f0e1d2SKirill A. Shutemov result = SCAN_ALLOC_HUGE_PAGE_FAIL; 1632f3f0e1d2SKirill A. Shutemov goto out; 1633f3f0e1d2SKirill A. Shutemov } 1634f3f0e1d2SKirill A. Shutemov 1635d9eb1ea2SJohannes Weiner if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { 1636f3f0e1d2SKirill A. Shutemov result = SCAN_CGROUP_CHARGE_FAIL; 1637f3f0e1d2SKirill A. Shutemov goto out; 1638f3f0e1d2SKirill A. Shutemov } 16399d82c694SJohannes Weiner count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); 1640f3f0e1d2SKirill A. Shutemov 164195feeabbSHugh Dickins /* This will be less messy when we use multi-index entries */ 164295feeabbSHugh Dickins do { 164395feeabbSHugh Dickins xas_lock_irq(&xas); 164495feeabbSHugh Dickins xas_create_range(&xas); 164595feeabbSHugh Dickins if (!xas_error(&xas)) 164695feeabbSHugh Dickins break; 164795feeabbSHugh Dickins xas_unlock_irq(&xas); 164895feeabbSHugh Dickins if (!xas_nomem(&xas, GFP_KERNEL)) { 164995feeabbSHugh Dickins result = SCAN_FAIL; 165095feeabbSHugh Dickins goto out; 165195feeabbSHugh Dickins } 165295feeabbSHugh Dickins } while (1); 165395feeabbSHugh Dickins 1654042a3082SHugh Dickins __SetPageLocked(new_page); 165599cb0dbdSSong Liu if (is_shmem) 1656042a3082SHugh Dickins __SetPageSwapBacked(new_page); 1657f3f0e1d2SKirill A. Shutemov new_page->index = start; 1658f3f0e1d2SKirill A. Shutemov new_page->mapping = mapping; 1659f3f0e1d2SKirill A. Shutemov 1660f3f0e1d2SKirill A. Shutemov /* 166187c460a0SHugh Dickins * At this point the new_page is locked and not up-to-date. 166287c460a0SHugh Dickins * It's safe to insert it into the page cache, because nobody would 166387c460a0SHugh Dickins * be able to map it or use it in another way until we unlock it. 1664f3f0e1d2SKirill A. Shutemov */ 1665f3f0e1d2SKirill A. Shutemov 166677da9389SMatthew Wilcox xas_set(&xas, start); 166777da9389SMatthew Wilcox for (index = start; index < end; index++) { 166877da9389SMatthew Wilcox struct page *page = xas_next(&xas); 166977da9389SMatthew Wilcox 167077da9389SMatthew Wilcox VM_BUG_ON(index != xas.xa_index); 167199cb0dbdSSong Liu if (is_shmem) { 167277da9389SMatthew Wilcox if (!page) { 1673701270faSHugh Dickins /* 167499cb0dbdSSong Liu * Stop if extent has been truncated or 167599cb0dbdSSong Liu * hole-punched, and is now completely 167699cb0dbdSSong Liu * empty. 
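 * (The xas_next_entry() check below looks for any remaining entry in the range to detect this.)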
1677701270faSHugh Dickins */ 1678701270faSHugh Dickins if (index == start) { 1679701270faSHugh Dickins if (!xas_next_entry(&xas, end - 1)) { 1680701270faSHugh Dickins result = SCAN_TRUNCATED; 1681042a3082SHugh Dickins goto xa_locked; 1682701270faSHugh Dickins } 1683701270faSHugh Dickins xas_set(&xas, index); 1684701270faSHugh Dickins } 168577da9389SMatthew Wilcox if (!shmem_charge(mapping->host, 1)) { 1686f3f0e1d2SKirill A. Shutemov result = SCAN_FAIL; 1687042a3082SHugh Dickins goto xa_locked; 1688f3f0e1d2SKirill A. Shutemov } 16894101196bSMatthew Wilcox (Oracle) xas_store(&xas, new_page); 169077da9389SMatthew Wilcox nr_none++; 169177da9389SMatthew Wilcox continue; 1692f3f0e1d2SKirill A. Shutemov } 1693f3f0e1d2SKirill A. Shutemov 16943159f943SMatthew Wilcox if (xa_is_value(page) || !PageUptodate(page)) { 169577da9389SMatthew Wilcox xas_unlock_irq(&xas); 1696f3f0e1d2SKirill A. Shutemov /* swap in or instantiate fallocated page */ 1697f3f0e1d2SKirill A. Shutemov if (shmem_getpage(mapping->host, index, &page, 1698f3f0e1d2SKirill A. Shutemov SGP_NOHUGE)) { 1699f3f0e1d2SKirill A. Shutemov result = SCAN_FAIL; 170077da9389SMatthew Wilcox goto xa_unlocked; 1701f3f0e1d2SKirill A. Shutemov } 1702f3f0e1d2SKirill A. Shutemov } else if (trylock_page(page)) { 1703f3f0e1d2SKirill A. Shutemov get_page(page); 1704042a3082SHugh Dickins xas_unlock_irq(&xas); 1705f3f0e1d2SKirill A. Shutemov } else { 1706f3f0e1d2SKirill A. Shutemov result = SCAN_PAGE_LOCK; 1707042a3082SHugh Dickins goto xa_locked; 1708f3f0e1d2SKirill A. Shutemov } 170999cb0dbdSSong Liu } else { /* !is_shmem */ 171099cb0dbdSSong Liu if (!page || xa_is_value(page)) { 171199cb0dbdSSong Liu xas_unlock_irq(&xas); 171299cb0dbdSSong Liu page_cache_sync_readahead(mapping, &file->f_ra, 171399cb0dbdSSong Liu file, index, 171499cb0dbdSSong Liu PAGE_SIZE); 171599cb0dbdSSong Liu /* drain pagevecs to help isolate_lru_page() */ 171699cb0dbdSSong Liu lru_add_drain(); 171799cb0dbdSSong Liu page = find_lock_page(mapping, index); 171899cb0dbdSSong Liu if (unlikely(page == NULL)) { 171999cb0dbdSSong Liu result = SCAN_FAIL; 172099cb0dbdSSong Liu goto xa_unlocked; 172199cb0dbdSSong Liu } 172275f36069SSong Liu } else if (PageDirty(page)) { 172375f36069SSong Liu /* 172475f36069SSong Liu * khugepaged only works on a read-only fd, 172575f36069SSong Liu * so this page is dirty because it hasn't 172675f36069SSong Liu * been flushed since the first write. There 172775f36069SSong Liu * won't be new dirty pages. 172875f36069SSong Liu * 172975f36069SSong Liu * Trigger async flush here and hope the 173075f36069SSong Liu * writeback is done when khugepaged 173175f36069SSong Liu * revisits this page. 173275f36069SSong Liu * 173375f36069SSong Liu * This is a one-off situation. We are not 173475f36069SSong Liu * forcing writeback in a loop. 173575f36069SSong Liu */ 173675f36069SSong Liu xas_unlock_irq(&xas); 173775f36069SSong Liu filemap_flush(mapping); 173875f36069SSong Liu result = SCAN_FAIL; 173975f36069SSong Liu goto xa_unlocked; 174099cb0dbdSSong Liu } else if (trylock_page(page)) { 174199cb0dbdSSong Liu get_page(page); 174299cb0dbdSSong Liu xas_unlock_irq(&xas); 174399cb0dbdSSong Liu } else { 174499cb0dbdSSong Liu result = SCAN_PAGE_LOCK; 174599cb0dbdSSong Liu goto xa_locked; 174699cb0dbdSSong Liu } 174799cb0dbdSSong Liu } 1748f3f0e1d2SKirill A. Shutemov 1749f3f0e1d2SKirill A. Shutemov /* 1750b93b0163SMatthew Wilcox * The page must be locked, so we can drop the i_pages lock 1751f3f0e1d2SKirill A. Shutemov * without racing with truncate. 1752f3f0e1d2SKirill A.
Shutemov */ 1753f3f0e1d2SKirill A. Shutemov VM_BUG_ON_PAGE(!PageLocked(page), page); 17544655e5e5SSong Liu 17554655e5e5SSong Liu /* make sure the page is up to date */ 17564655e5e5SSong Liu if (unlikely(!PageUptodate(page))) { 17574655e5e5SSong Liu result = SCAN_FAIL; 17584655e5e5SSong Liu goto out_unlock; 17594655e5e5SSong Liu } 176006a5e126SHugh Dickins 176106a5e126SHugh Dickins /* 176206a5e126SHugh Dickins * If the file was truncated then extended, or hole-punched, before 176306a5e126SHugh Dickins * we locked the first page, then a THP might be there already. 176406a5e126SHugh Dickins */ 176506a5e126SHugh Dickins if (PageTransCompound(page)) { 176606a5e126SHugh Dickins result = SCAN_PAGE_COMPOUND; 176706a5e126SHugh Dickins goto out_unlock; 176806a5e126SHugh Dickins } 1769f3f0e1d2SKirill A. Shutemov 1770f3f0e1d2SKirill A. Shutemov if (page_mapping(page) != mapping) { 1771f3f0e1d2SKirill A. Shutemov result = SCAN_TRUNCATED; 1772f3f0e1d2SKirill A. Shutemov goto out_unlock; 1773f3f0e1d2SKirill A. Shutemov } 1774f3f0e1d2SKirill A. Shutemov 17754655e5e5SSong Liu if (!is_shmem && PageDirty(page)) { 17764655e5e5SSong Liu /* 17774655e5e5SSong Liu * khugepaged only works on a read-only fd, so this 17784655e5e5SSong Liu * page is dirty because it hasn't been flushed 17794655e5e5SSong Liu * since the first write. 17804655e5e5SSong Liu */ 17814655e5e5SSong Liu result = SCAN_FAIL; 17824655e5e5SSong Liu goto out_unlock; 17834655e5e5SSong Liu } 17844655e5e5SSong Liu 1785f3f0e1d2SKirill A. Shutemov if (isolate_lru_page(page)) { 1786f3f0e1d2SKirill A. Shutemov result = SCAN_DEL_PAGE_LRU; 1787042a3082SHugh Dickins goto out_unlock; 1788f3f0e1d2SKirill A. Shutemov } 1789f3f0e1d2SKirill A. Shutemov 179099cb0dbdSSong Liu if (page_has_private(page) && 179199cb0dbdSSong Liu !try_to_release_page(page, GFP_KERNEL)) { 179299cb0dbdSSong Liu result = SCAN_PAGE_HAS_PRIVATE; 17932f33a706SHugh Dickins putback_lru_page(page); 179499cb0dbdSSong Liu goto out_unlock; 179599cb0dbdSSong Liu } 179699cb0dbdSSong Liu 1797f3f0e1d2SKirill A. Shutemov if (page_mapped(page)) 1798977fbdcdSMatthew Wilcox unmap_mapping_pages(mapping, index, 1, false); 1799f3f0e1d2SKirill A. Shutemov 180077da9389SMatthew Wilcox xas_lock_irq(&xas); 180177da9389SMatthew Wilcox xas_set(&xas, index); 1802f3f0e1d2SKirill A. Shutemov 180377da9389SMatthew Wilcox VM_BUG_ON_PAGE(page != xas_load(&xas), page); 1804f3f0e1d2SKirill A. Shutemov VM_BUG_ON_PAGE(page_mapped(page), page); 1805f3f0e1d2SKirill A. Shutemov 1806f3f0e1d2SKirill A. Shutemov /* 1807f3f0e1d2SKirill A. Shutemov * The page is expected to have page_count() == 3: 1808f3f0e1d2SKirill A. Shutemov * - we hold a pin on it; 180977da9389SMatthew Wilcox * - one reference from page cache; 1810f3f0e1d2SKirill A. Shutemov * - one from isolate_lru_page; 1811f3f0e1d2SKirill A. Shutemov */ 1812f3f0e1d2SKirill A. Shutemov if (!page_ref_freeze(page, 3)) { 1813f3f0e1d2SKirill A. Shutemov result = SCAN_PAGE_COUNT; 1814042a3082SHugh Dickins xas_unlock_irq(&xas); 1815042a3082SHugh Dickins putback_lru_page(page); 1816042a3082SHugh Dickins goto out_unlock; 1817f3f0e1d2SKirill A. Shutemov } 1818f3f0e1d2SKirill A. Shutemov 1819f3f0e1d2SKirill A. Shutemov /* 1820f3f0e1d2SKirill A. Shutemov * Add the page to the list to be able to undo the collapse if 1821f3f0e1d2SKirill A. Shutemov * something goes wrong. 1822f3f0e1d2SKirill A. Shutemov */ 1823f3f0e1d2SKirill A. Shutemov list_add_tail(&page->lru, &pagelist); 1824f3f0e1d2SKirill A. Shutemov 1825f3f0e1d2SKirill A. Shutemov /* Finally, replace with the new page.
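 * (Old pages stay ref-frozen on the pagelist until the copy succeeds, so the store can be rolled back.)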
*/ 18264101196bSMatthew Wilcox (Oracle) xas_store(&xas, new_page); 1827f3f0e1d2SKirill A. Shutemov continue; 1828f3f0e1d2SKirill A. Shutemov out_unlock: 1829f3f0e1d2SKirill A. Shutemov unlock_page(page); 1830f3f0e1d2SKirill A. Shutemov put_page(page); 1831042a3082SHugh Dickins goto xa_unlocked; 1832f3f0e1d2SKirill A. Shutemov } 1833f3f0e1d2SKirill A. Shutemov 183499cb0dbdSSong Liu if (is_shmem) 1835042a3082SHugh Dickins __inc_node_page_state(new_page, NR_SHMEM_THPS); 183609d91cdaSSong Liu else { 183799cb0dbdSSong Liu __inc_node_page_state(new_page, NR_FILE_THPS); 183809d91cdaSSong Liu filemap_nr_thps_inc(mapping); 183909d91cdaSSong Liu } 184099cb0dbdSSong Liu 1841042a3082SHugh Dickins if (nr_none) { 18429d82c694SJohannes Weiner __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); 184399cb0dbdSSong Liu if (is_shmem) 18449d82c694SJohannes Weiner __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); 1845042a3082SHugh Dickins } 1846042a3082SHugh Dickins 1847042a3082SHugh Dickins xa_locked: 1848042a3082SHugh Dickins xas_unlock_irq(&xas); 184977da9389SMatthew Wilcox xa_unlocked: 1850042a3082SHugh Dickins 1851f3f0e1d2SKirill A. Shutemov if (result == SCAN_SUCCEED) { 185277da9389SMatthew Wilcox struct page *page, *tmp; 1853f3f0e1d2SKirill A. Shutemov 1854f3f0e1d2SKirill A. Shutemov /* 185577da9389SMatthew Wilcox * Replacing the old pages with the new one has succeeded; now 185677da9389SMatthew Wilcox * we need to copy the content and free the old pages. 1857f3f0e1d2SKirill A. Shutemov */ 18582af8ff29SHugh Dickins index = start; 1859f3f0e1d2SKirill A. Shutemov list_for_each_entry_safe(page, tmp, &pagelist, lru) { 18602af8ff29SHugh Dickins while (index < page->index) { 18612af8ff29SHugh Dickins clear_highpage(new_page + (index % HPAGE_PMD_NR)); 18622af8ff29SHugh Dickins index++; 18632af8ff29SHugh Dickins } 1864f3f0e1d2SKirill A. Shutemov copy_highpage(new_page + (page->index % HPAGE_PMD_NR), 1865f3f0e1d2SKirill A. Shutemov page); 1866f3f0e1d2SKirill A. Shutemov list_del(&page->lru); 1867f3f0e1d2SKirill A. Shutemov page->mapping = NULL; 1868042a3082SHugh Dickins page_ref_unfreeze(page, 1); 1869f3f0e1d2SKirill A. Shutemov ClearPageActive(page); 1870f3f0e1d2SKirill A. Shutemov ClearPageUnevictable(page); 1871042a3082SHugh Dickins unlock_page(page); 1872f3f0e1d2SKirill A. Shutemov put_page(page); 18732af8ff29SHugh Dickins index++; 18742af8ff29SHugh Dickins } 18752af8ff29SHugh Dickins while (index < end) { 18762af8ff29SHugh Dickins clear_highpage(new_page + (index % HPAGE_PMD_NR)); 18772af8ff29SHugh Dickins index++; 1878f3f0e1d2SKirill A. Shutemov } 1879f3f0e1d2SKirill A. Shutemov 1880f3f0e1d2SKirill A. Shutemov SetPageUptodate(new_page); 188187c460a0SHugh Dickins page_ref_add(new_page, HPAGE_PMD_NR - 1); 18826058eaecSJohannes Weiner if (is_shmem) 188399cb0dbdSSong Liu set_page_dirty(new_page); 18846058eaecSJohannes Weiner lru_cache_add(new_page); 1885f3f0e1d2SKirill A. Shutemov 1886042a3082SHugh Dickins /* 1887042a3082SHugh Dickins * Remove pte page tables, so we can re-fault the page as huge. 1888042a3082SHugh Dickins */ 1889042a3082SHugh Dickins retract_page_tables(mapping, start); 1890f3f0e1d2SKirill A. Shutemov *hpage = NULL; 189187aa7529SYang Shi 189287aa7529SYang Shi khugepaged_pages_collapsed++; 1893f3f0e1d2SKirill A.
Shutemov } else { 189477da9389SMatthew Wilcox struct page *page; 1895aaa52e34SHugh Dickins 189677da9389SMatthew Wilcox /* Something went wrong: roll back page cache changes */ 189777da9389SMatthew Wilcox xas_lock_irq(&xas); 1898aaa52e34SHugh Dickins mapping->nrpages -= nr_none; 189999cb0dbdSSong Liu 190099cb0dbdSSong Liu if (is_shmem) 1901aaa52e34SHugh Dickins shmem_uncharge(mapping->host, nr_none); 1902aaa52e34SHugh Dickins 190377da9389SMatthew Wilcox xas_set(&xas, start); 190477da9389SMatthew Wilcox xas_for_each(&xas, page, end - 1) { 1905f3f0e1d2SKirill A. Shutemov page = list_first_entry_or_null(&pagelist, 1906f3f0e1d2SKirill A. Shutemov struct page, lru); 190777da9389SMatthew Wilcox if (!page || xas.xa_index < page->index) { 1908f3f0e1d2SKirill A. Shutemov if (!nr_none) 1909f3f0e1d2SKirill A. Shutemov break; 1910f3f0e1d2SKirill A. Shutemov nr_none--; 191159749e6cSJohannes Weiner /* Put holes back where they were */ 191277da9389SMatthew Wilcox xas_store(&xas, NULL); 1913f3f0e1d2SKirill A. Shutemov continue; 1914f3f0e1d2SKirill A. Shutemov } 1915f3f0e1d2SKirill A. Shutemov 191677da9389SMatthew Wilcox VM_BUG_ON_PAGE(page->index != xas.xa_index, page); 1917f3f0e1d2SKirill A. Shutemov 1918f3f0e1d2SKirill A. Shutemov /* Unfreeze the page. */ 1919f3f0e1d2SKirill A. Shutemov list_del(&page->lru); 1920f3f0e1d2SKirill A. Shutemov page_ref_unfreeze(page, 2); 192177da9389SMatthew Wilcox xas_store(&xas, page); 192277da9389SMatthew Wilcox xas_pause(&xas); 192377da9389SMatthew Wilcox xas_unlock_irq(&xas); 1924f3f0e1d2SKirill A. Shutemov unlock_page(page); 1925042a3082SHugh Dickins putback_lru_page(page); 192677da9389SMatthew Wilcox xas_lock_irq(&xas); 1927f3f0e1d2SKirill A. Shutemov } 1928f3f0e1d2SKirill A. Shutemov VM_BUG_ON(nr_none); 192977da9389SMatthew Wilcox xas_unlock_irq(&xas); 1930f3f0e1d2SKirill A. Shutemov 1931f3f0e1d2SKirill A. Shutemov new_page->mapping = NULL; 1932f3f0e1d2SKirill A. Shutemov } 1933042a3082SHugh Dickins 1934042a3082SHugh Dickins unlock_page(new_page); 1935f3f0e1d2SKirill A. Shutemov out: 1936f3f0e1d2SKirill A. Shutemov VM_BUG_ON(!list_empty(&pagelist)); 19379d82c694SJohannes Weiner if (!IS_ERR_OR_NULL(*hpage)) 19389d82c694SJohannes Weiner mem_cgroup_uncharge(*hpage); 1939f3f0e1d2SKirill A. Shutemov /* TODO: tracepoints */ 1940f3f0e1d2SKirill A. Shutemov } 1941f3f0e1d2SKirill A. Shutemov 1942579c571eSSong Liu static void khugepaged_scan_file(struct mm_struct *mm, 1943579c571eSSong Liu struct file *file, pgoff_t start, struct page **hpage) 1944f3f0e1d2SKirill A. Shutemov { 1945f3f0e1d2SKirill A. Shutemov struct page *page = NULL; 1946579c571eSSong Liu struct address_space *mapping = file->f_mapping; 194785b392dbSMatthew Wilcox XA_STATE(xas, &mapping->i_pages, start); 1948f3f0e1d2SKirill A. Shutemov int present, swap; 1949f3f0e1d2SKirill A. Shutemov int node = NUMA_NO_NODE; 1950f3f0e1d2SKirill A. Shutemov int result = SCAN_SUCCEED; 1951f3f0e1d2SKirill A. Shutemov 1952f3f0e1d2SKirill A. Shutemov present = 0; 1953f3f0e1d2SKirill A. Shutemov swap = 0; 1954f3f0e1d2SKirill A. Shutemov memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 1955f3f0e1d2SKirill A. Shutemov rcu_read_lock(); 195685b392dbSMatthew Wilcox xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { 195785b392dbSMatthew Wilcox if (xas_retry(&xas, page)) 1958f3f0e1d2SKirill A. Shutemov continue; 1959f3f0e1d2SKirill A. Shutemov 196085b392dbSMatthew Wilcox if (xa_is_value(page)) { 1961f3f0e1d2SKirill A. Shutemov if (++swap > khugepaged_max_ptes_swap) { 1962f3f0e1d2SKirill A. 
Shutemov result = SCAN_EXCEED_SWAP_PTE; 1963f3f0e1d2SKirill A. Shutemov break; 1964f3f0e1d2SKirill A. Shutemov } 1965f3f0e1d2SKirill A. Shutemov continue; 1966f3f0e1d2SKirill A. Shutemov } 1967f3f0e1d2SKirill A. Shutemov 1968f3f0e1d2SKirill A. Shutemov if (PageTransCompound(page)) { 1969f3f0e1d2SKirill A. Shutemov result = SCAN_PAGE_COMPOUND; 1970f3f0e1d2SKirill A. Shutemov break; 1971f3f0e1d2SKirill A. Shutemov } 1972f3f0e1d2SKirill A. Shutemov 1973f3f0e1d2SKirill A. Shutemov node = page_to_nid(page); 1974f3f0e1d2SKirill A. Shutemov if (khugepaged_scan_abort(node)) { 1975f3f0e1d2SKirill A. Shutemov result = SCAN_SCAN_ABORT; 1976f3f0e1d2SKirill A. Shutemov break; 1977f3f0e1d2SKirill A. Shutemov } 1978f3f0e1d2SKirill A. Shutemov khugepaged_node_load[node]++; 1979f3f0e1d2SKirill A. Shutemov 1980f3f0e1d2SKirill A. Shutemov if (!PageLRU(page)) { 1981f3f0e1d2SKirill A. Shutemov result = SCAN_PAGE_LRU; 1982f3f0e1d2SKirill A. Shutemov break; 1983f3f0e1d2SKirill A. Shutemov } 1984f3f0e1d2SKirill A. Shutemov 198599cb0dbdSSong Liu if (page_count(page) != 198699cb0dbdSSong Liu 1 + page_mapcount(page) + page_has_private(page)) { 1987f3f0e1d2SKirill A. Shutemov result = SCAN_PAGE_COUNT; 1988f3f0e1d2SKirill A. Shutemov break; 1989f3f0e1d2SKirill A. Shutemov } 1990f3f0e1d2SKirill A. Shutemov 1991f3f0e1d2SKirill A. Shutemov /* 1992f3f0e1d2SKirill A. Shutemov * We probably should check if the page is referenced here, but 1993f3f0e1d2SKirill A. Shutemov * nobody would transfer pte_young() to PageReferenced() for us. 1994f3f0e1d2SKirill A. Shutemov * And rmap walk here is just too costly... 1995f3f0e1d2SKirill A. Shutemov */ 1996f3f0e1d2SKirill A. Shutemov 1997f3f0e1d2SKirill A. Shutemov present++; 1998f3f0e1d2SKirill A. Shutemov 1999f3f0e1d2SKirill A. Shutemov if (need_resched()) { 200085b392dbSMatthew Wilcox xas_pause(&xas); 2001f3f0e1d2SKirill A. Shutemov cond_resched_rcu(); 2002f3f0e1d2SKirill A. Shutemov } 2003f3f0e1d2SKirill A. Shutemov } 2004f3f0e1d2SKirill A. Shutemov rcu_read_unlock(); 2005f3f0e1d2SKirill A. Shutemov 2006f3f0e1d2SKirill A. Shutemov if (result == SCAN_SUCCEED) { 2007f3f0e1d2SKirill A. Shutemov if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { 2008f3f0e1d2SKirill A. Shutemov result = SCAN_EXCEED_NONE_PTE; 2009f3f0e1d2SKirill A. Shutemov } else { 2010f3f0e1d2SKirill A. Shutemov node = khugepaged_find_target_node(); 2011579c571eSSong Liu collapse_file(mm, file, start, hpage, node); 2012f3f0e1d2SKirill A. Shutemov } 2013f3f0e1d2SKirill A. Shutemov } 2014f3f0e1d2SKirill A. Shutemov 2015f3f0e1d2SKirill A. Shutemov /* TODO: tracepoints */ 2016f3f0e1d2SKirill A. Shutemov } 2017f3f0e1d2SKirill A. Shutemov #else 2018579c571eSSong Liu static void khugepaged_scan_file(struct mm_struct *mm, 2019579c571eSSong Liu struct file *file, pgoff_t start, struct page **hpage) 2020f3f0e1d2SKirill A. Shutemov { 2021f3f0e1d2SKirill A. Shutemov BUILD_BUG(); 2022f3f0e1d2SKirill A. Shutemov } 202327e1f827SSong Liu 202427e1f827SSong Liu static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) 202527e1f827SSong Liu { 202627e1f827SSong Liu return 0; 202727e1f827SSong Liu } 2028f3f0e1d2SKirill A. Shutemov #endif 2029f3f0e1d2SKirill A. Shutemov 2030b46e756fSKirill A. Shutemov static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2031b46e756fSKirill A. Shutemov struct page **hpage) 2032b46e756fSKirill A. Shutemov __releases(&khugepaged_mm_lock) 2033b46e756fSKirill A. Shutemov __acquires(&khugepaged_mm_lock) 2034b46e756fSKirill A. Shutemov { 2035b46e756fSKirill A. 
Shutemov struct mm_slot *mm_slot; 2036b46e756fSKirill A. Shutemov struct mm_struct *mm; 2037b46e756fSKirill A. Shutemov struct vm_area_struct *vma; 2038b46e756fSKirill A. Shutemov int progress = 0; 2039b46e756fSKirill A. Shutemov 2040b46e756fSKirill A. Shutemov VM_BUG_ON(!pages); 204135f3aa39SLance Roy lockdep_assert_held(&khugepaged_mm_lock); 2042b46e756fSKirill A. Shutemov 2043b46e756fSKirill A. Shutemov if (khugepaged_scan.mm_slot) 2044b46e756fSKirill A. Shutemov mm_slot = khugepaged_scan.mm_slot; 2045b46e756fSKirill A. Shutemov else { 2046b46e756fSKirill A. Shutemov mm_slot = list_entry(khugepaged_scan.mm_head.next, 2047b46e756fSKirill A. Shutemov struct mm_slot, mm_node); 2048b46e756fSKirill A. Shutemov khugepaged_scan.address = 0; 2049b46e756fSKirill A. Shutemov khugepaged_scan.mm_slot = mm_slot; 2050b46e756fSKirill A. Shutemov } 2051b46e756fSKirill A. Shutemov spin_unlock(&khugepaged_mm_lock); 205227e1f827SSong Liu khugepaged_collapse_pte_mapped_thps(mm_slot); 2053b46e756fSKirill A. Shutemov 2054b46e756fSKirill A. Shutemov mm = mm_slot->mm; 20553b454ad3SYang Shi /* 20563b454ad3SYang Shi * Don't wait for semaphore (to avoid long wait times). Just move to 20573b454ad3SYang Shi * the next mm on the list. 20583b454ad3SYang Shi */ 2059b46e756fSKirill A. Shutemov vma = NULL; 2060*d8ed45c5SMichel Lespinasse if (unlikely(!mmap_read_trylock(mm))) 20613b454ad3SYang Shi goto breakouterloop_mmap_sem; 20623b454ad3SYang Shi if (likely(!khugepaged_test_exit(mm))) 2063b46e756fSKirill A. Shutemov vma = find_vma(mm, khugepaged_scan.address); 2064b46e756fSKirill A. Shutemov 2065b46e756fSKirill A. Shutemov progress++; 2066b46e756fSKirill A. Shutemov for (; vma; vma = vma->vm_next) { 2067b46e756fSKirill A. Shutemov unsigned long hstart, hend; 2068b46e756fSKirill A. Shutemov 2069b46e756fSKirill A. Shutemov cond_resched(); 2070b46e756fSKirill A. Shutemov if (unlikely(khugepaged_test_exit(mm))) { 2071b46e756fSKirill A. Shutemov progress++; 2072b46e756fSKirill A. Shutemov break; 2073b46e756fSKirill A. Shutemov } 207450f8b92fSSong Liu if (!hugepage_vma_check(vma, vma->vm_flags)) { 2075b46e756fSKirill A. Shutemov skip: 2076b46e756fSKirill A. Shutemov progress++; 2077b46e756fSKirill A. Shutemov continue; 2078b46e756fSKirill A. Shutemov } 2079b46e756fSKirill A. Shutemov hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2080b46e756fSKirill A. Shutemov hend = vma->vm_end & HPAGE_PMD_MASK; 2081b46e756fSKirill A. Shutemov if (hstart >= hend) 2082b46e756fSKirill A. Shutemov goto skip; 2083b46e756fSKirill A. Shutemov if (khugepaged_scan.address > hend) 2084b46e756fSKirill A. Shutemov goto skip; 2085b46e756fSKirill A. Shutemov if (khugepaged_scan.address < hstart) 2086b46e756fSKirill A. Shutemov khugepaged_scan.address = hstart; 2087b46e756fSKirill A. Shutemov VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2088396bcc52SMatthew Wilcox (Oracle) if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) 2089396bcc52SMatthew Wilcox (Oracle) goto skip; 2090b46e756fSKirill A. Shutemov 2091b46e756fSKirill A. Shutemov while (khugepaged_scan.address < hend) { 2092b46e756fSKirill A. Shutemov int ret; 2093b46e756fSKirill A. Shutemov cond_resched(); 2094b46e756fSKirill A. Shutemov if (unlikely(khugepaged_test_exit(mm))) 2095b46e756fSKirill A. Shutemov goto breakouterloop; 2096b46e756fSKirill A. Shutemov 2097b46e756fSKirill A. Shutemov VM_BUG_ON(khugepaged_scan.address < hstart || 2098b46e756fSKirill A. Shutemov khugepaged_scan.address + HPAGE_PMD_SIZE > 2099b46e756fSKirill A. 
Shutemov hend); 210099cb0dbdSSong Liu if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { 2101396bcc52SMatthew Wilcox (Oracle) struct file *file = get_file(vma->vm_file); 2102f3f0e1d2SKirill A. Shutemov pgoff_t pgoff = linear_page_index(vma, 2103f3f0e1d2SKirill A. Shutemov khugepaged_scan.address); 210499cb0dbdSSong Liu 2105*d8ed45c5SMichel Lespinasse mmap_read_unlock(mm); 2106f3f0e1d2SKirill A. Shutemov ret = 1; 2107579c571eSSong Liu khugepaged_scan_file(mm, file, pgoff, hpage); 2108f3f0e1d2SKirill A. Shutemov fput(file); 2109f3f0e1d2SKirill A. Shutemov } else { 2110b46e756fSKirill A. Shutemov ret = khugepaged_scan_pmd(mm, vma, 2111b46e756fSKirill A. Shutemov khugepaged_scan.address, 2112b46e756fSKirill A. Shutemov hpage); 2113f3f0e1d2SKirill A. Shutemov } 2114b46e756fSKirill A. Shutemov /* move to next address */ 2115b46e756fSKirill A. Shutemov khugepaged_scan.address += HPAGE_PMD_SIZE; 2116b46e756fSKirill A. Shutemov progress += HPAGE_PMD_NR; 2117b46e756fSKirill A. Shutemov if (ret) 2118b46e756fSKirill A. Shutemov /* we released mmap_sem so break loop */ 2119b46e756fSKirill A. Shutemov goto breakouterloop_mmap_sem; 2120b46e756fSKirill A. Shutemov if (progress >= pages) 2121b46e756fSKirill A. Shutemov goto breakouterloop; 2122b46e756fSKirill A. Shutemov } 2123b46e756fSKirill A. Shutemov } 2124b46e756fSKirill A. Shutemov breakouterloop: 2125*d8ed45c5SMichel Lespinasse mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ 2126b46e756fSKirill A. Shutemov breakouterloop_mmap_sem: 2127b46e756fSKirill A. Shutemov 2128b46e756fSKirill A. Shutemov spin_lock(&khugepaged_mm_lock); 2129b46e756fSKirill A. Shutemov VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2130b46e756fSKirill A. Shutemov /* 2131b46e756fSKirill A. Shutemov * Release the current mm_slot if this mm is about to die, or 2132b46e756fSKirill A. Shutemov * if we scanned all vmas of this mm. 2133b46e756fSKirill A. Shutemov */ 2134b46e756fSKirill A. Shutemov if (khugepaged_test_exit(mm) || !vma) { 2135b46e756fSKirill A. Shutemov /* 2136b46e756fSKirill A. Shutemov * Make sure that if mm_users is reaching zero while 2137b46e756fSKirill A. Shutemov * khugepaged runs here, khugepaged_exit will find 2138b46e756fSKirill A. Shutemov * mm_slot not pointing to the exiting mm. 2139b46e756fSKirill A. Shutemov */ 2140b46e756fSKirill A. Shutemov if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2141b46e756fSKirill A. Shutemov khugepaged_scan.mm_slot = list_entry( 2142b46e756fSKirill A. Shutemov mm_slot->mm_node.next, 2143b46e756fSKirill A. Shutemov struct mm_slot, mm_node); 2144b46e756fSKirill A. Shutemov khugepaged_scan.address = 0; 2145b46e756fSKirill A. Shutemov } else { 2146b46e756fSKirill A. Shutemov khugepaged_scan.mm_slot = NULL; 2147b46e756fSKirill A. Shutemov khugepaged_full_scans++; 2148b46e756fSKirill A. Shutemov } 2149b46e756fSKirill A. Shutemov 2150b46e756fSKirill A. Shutemov collect_mm_slot(mm_slot); 2151b46e756fSKirill A. Shutemov } 2152b46e756fSKirill A. Shutemov 2153b46e756fSKirill A. Shutemov return progress; 2154b46e756fSKirill A. Shutemov } 2155b46e756fSKirill A. Shutemov 2156b46e756fSKirill A. Shutemov static int khugepaged_has_work(void) 2157b46e756fSKirill A. Shutemov { 2158b46e756fSKirill A. Shutemov return !list_empty(&khugepaged_scan.mm_head) && 2159b46e756fSKirill A. Shutemov khugepaged_enabled(); 2160b46e756fSKirill A. Shutemov } 2161b46e756fSKirill A. Shutemov 2162b46e756fSKirill A. Shutemov static int khugepaged_wait_event(void) 2163b46e756fSKirill A. Shutemov { 2164b46e756fSKirill A. 
Shutemov return !list_empty(&khugepaged_scan.mm_head) || 2165b46e756fSKirill A. Shutemov kthread_should_stop(); 2166b46e756fSKirill A. Shutemov } 2167b46e756fSKirill A. Shutemov 2168b46e756fSKirill A. Shutemov static void khugepaged_do_scan(void) 2169b46e756fSKirill A. Shutemov { 2170b46e756fSKirill A. Shutemov struct page *hpage = NULL; 2171b46e756fSKirill A. Shutemov unsigned int progress = 0, pass_through_head = 0; 2172b46e756fSKirill A. Shutemov unsigned int pages = khugepaged_pages_to_scan; 2173b46e756fSKirill A. Shutemov bool wait = true; 2174b46e756fSKirill A. Shutemov 2175b46e756fSKirill A. Shutemov barrier(); /* write khugepaged_pages_to_scan to local stack */ 2176b46e756fSKirill A. Shutemov 2177a980df33SKirill A. Shutemov lru_add_drain_all(); 2178a980df33SKirill A. Shutemov 2179b46e756fSKirill A. Shutemov while (progress < pages) { 2180b46e756fSKirill A. Shutemov if (!khugepaged_prealloc_page(&hpage, &wait)) 2181b46e756fSKirill A. Shutemov break; 2182b46e756fSKirill A. Shutemov 2183b46e756fSKirill A. Shutemov cond_resched(); 2184b46e756fSKirill A. Shutemov 2185b46e756fSKirill A. Shutemov if (unlikely(kthread_should_stop() || try_to_freeze())) 2186b46e756fSKirill A. Shutemov break; 2187b46e756fSKirill A. Shutemov 2188b46e756fSKirill A. Shutemov spin_lock(&khugepaged_mm_lock); 2189b46e756fSKirill A. Shutemov if (!khugepaged_scan.mm_slot) 2190b46e756fSKirill A. Shutemov pass_through_head++; 2191b46e756fSKirill A. Shutemov if (khugepaged_has_work() && 2192b46e756fSKirill A. Shutemov pass_through_head < 2) 2193b46e756fSKirill A. Shutemov progress += khugepaged_scan_mm_slot(pages - progress, 2194b46e756fSKirill A. Shutemov &hpage); 2195b46e756fSKirill A. Shutemov else 2196b46e756fSKirill A. Shutemov progress = pages; 2197b46e756fSKirill A. Shutemov spin_unlock(&khugepaged_mm_lock); 2198b46e756fSKirill A. Shutemov } 2199b46e756fSKirill A. Shutemov 2200b46e756fSKirill A. Shutemov if (!IS_ERR_OR_NULL(hpage)) 2201b46e756fSKirill A. Shutemov put_page(hpage); 2202b46e756fSKirill A. Shutemov } 2203b46e756fSKirill A. Shutemov 2204b46e756fSKirill A. Shutemov static bool khugepaged_should_wakeup(void) 2205b46e756fSKirill A. Shutemov { 2206b46e756fSKirill A. Shutemov return kthread_should_stop() || 2207b46e756fSKirill A. Shutemov time_after_eq(jiffies, khugepaged_sleep_expire); 2208b46e756fSKirill A. Shutemov } 2209b46e756fSKirill A. Shutemov 2210b46e756fSKirill A. Shutemov static void khugepaged_wait_work(void) 2211b46e756fSKirill A. Shutemov { 2212b46e756fSKirill A. Shutemov if (khugepaged_has_work()) { 2213b46e756fSKirill A. Shutemov const unsigned long scan_sleep_jiffies = 2214b46e756fSKirill A. Shutemov msecs_to_jiffies(khugepaged_scan_sleep_millisecs); 2215b46e756fSKirill A. Shutemov 2216b46e756fSKirill A. Shutemov if (!scan_sleep_jiffies) 2217b46e756fSKirill A. Shutemov return; 2218b46e756fSKirill A. Shutemov 2219b46e756fSKirill A. Shutemov khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; 2220b46e756fSKirill A. Shutemov wait_event_freezable_timeout(khugepaged_wait, 2221b46e756fSKirill A. Shutemov khugepaged_should_wakeup(), 2222b46e756fSKirill A. Shutemov scan_sleep_jiffies); 2223b46e756fSKirill A. Shutemov return; 2224b46e756fSKirill A. Shutemov } 2225b46e756fSKirill A. Shutemov 2226b46e756fSKirill A. Shutemov if (khugepaged_enabled()) 2227b46e756fSKirill A. Shutemov wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2228b46e756fSKirill A. Shutemov } 2229b46e756fSKirill A. Shutemov 2230b46e756fSKirill A. Shutemov static int khugepaged(void *none) 2231b46e756fSKirill A. 
2230b46e756fSKirill A. Shutemov static int khugepaged(void *none)
2231b46e756fSKirill A. Shutemov {
2232b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
2233b46e756fSKirill A. Shutemov 
2234b46e756fSKirill A. Shutemov 	set_freezable();
2235b46e756fSKirill A. Shutemov 	set_user_nice(current, MAX_NICE);
2236b46e756fSKirill A. Shutemov 
2237b46e756fSKirill A. Shutemov 	while (!kthread_should_stop()) {
2238b46e756fSKirill A. Shutemov 		khugepaged_do_scan();
2239b46e756fSKirill A. Shutemov 		khugepaged_wait_work();
2240b46e756fSKirill A. Shutemov 	}
2241b46e756fSKirill A. Shutemov 
2242b46e756fSKirill A. Shutemov 	spin_lock(&khugepaged_mm_lock);
2243b46e756fSKirill A. Shutemov 	mm_slot = khugepaged_scan.mm_slot;
2244b46e756fSKirill A. Shutemov 	khugepaged_scan.mm_slot = NULL;
2245b46e756fSKirill A. Shutemov 	if (mm_slot)
2246b46e756fSKirill A. Shutemov 		collect_mm_slot(mm_slot);
2247b46e756fSKirill A. Shutemov 	spin_unlock(&khugepaged_mm_lock);
2248b46e756fSKirill A. Shutemov 	return 0;
2249b46e756fSKirill A. Shutemov }
2250b46e756fSKirill A. Shutemov 
2251b46e756fSKirill A. Shutemov static void set_recommended_min_free_kbytes(void)
2252b46e756fSKirill A. Shutemov {
2253b46e756fSKirill A. Shutemov 	struct zone *zone;
2254b46e756fSKirill A. Shutemov 	int nr_zones = 0;
2255b46e756fSKirill A. Shutemov 	unsigned long recommended_min;
2256b46e756fSKirill A. Shutemov 
2257b7d349c7SJoonsoo Kim 	for_each_populated_zone(zone) {
2258b7d349c7SJoonsoo Kim 		/*
2259b7d349c7SJoonsoo Kim 		 * We don't need to worry about fragmentation of
2260b7d349c7SJoonsoo Kim 		 * ZONE_MOVABLE since it only has movable pages.
2261b7d349c7SJoonsoo Kim 		 */
2262b7d349c7SJoonsoo Kim 		if (zone_idx(zone) > gfp_zone(GFP_USER))
2263b7d349c7SJoonsoo Kim 			continue;
2264b7d349c7SJoonsoo Kim 
2265b46e756fSKirill A. Shutemov 		nr_zones++;
2266b7d349c7SJoonsoo Kim 	}
2267b46e756fSKirill A. Shutemov 
2268b46e756fSKirill A. Shutemov 	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
2269b46e756fSKirill A. Shutemov 	recommended_min = pageblock_nr_pages * nr_zones * 2;
2270b46e756fSKirill A. Shutemov 
2271b46e756fSKirill A. Shutemov 	/*
2272b46e756fSKirill A. Shutemov 	 * Make sure that on average at least two pageblocks are almost free
2273b46e756fSKirill A. Shutemov 	 * of another type, one for a migratetype to fall back to and a
2274b46e756fSKirill A. Shutemov 	 * second to avoid subsequent fallbacks of other types. There are 3
2275b46e756fSKirill A. Shutemov 	 * MIGRATE_TYPES we care about.
2276b46e756fSKirill A. Shutemov 	 */
2277b46e756fSKirill A. Shutemov 	recommended_min += pageblock_nr_pages * nr_zones *
2278b46e756fSKirill A. Shutemov 			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
2279b46e756fSKirill A. Shutemov 
2280b46e756fSKirill A. Shutemov 	/* never allow reserving more than 5% of the lowmem */
2281b46e756fSKirill A. Shutemov 	recommended_min = min(recommended_min,
2282b46e756fSKirill A. Shutemov 			      (unsigned long) nr_free_buffer_pages() / 20);
2283b46e756fSKirill A. Shutemov 	recommended_min <<= (PAGE_SHIFT-10);
2284b46e756fSKirill A. Shutemov 
2285b46e756fSKirill A. Shutemov 	if (recommended_min > min_free_kbytes) {
2286b46e756fSKirill A. Shutemov 		if (user_min_free_kbytes >= 0)
2287b46e756fSKirill A. Shutemov 			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2288b46e756fSKirill A. Shutemov 				min_free_kbytes, recommended_min);
2289b46e756fSKirill A. Shutemov 
2290b46e756fSKirill A. Shutemov 		min_free_kbytes = recommended_min;
2291b46e756fSKirill A. Shutemov 	}
2292b46e756fSKirill A. Shutemov 	setup_per_zone_wmarks();
2293b46e756fSKirill A. Shutemov }
2294b46e756fSKirill A. Shutemov 
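/*
 * Worked example, not part of the kernel source above: on a typical
 * x86-64 machine with 4 KiB pages and 2 MiB pageblocks
 * (pageblock_nr_pages == 512) and MIGRATE_PCPTYPES == 3, and assuming
 * two usable zones for the sake of the arithmetic (the real count
 * depends on the memory layout), set_recommended_min_free_kbytes()
 * computes:
 *
 *	recommended_min  = 512 * 2 * 2		=  2048 pages
 *	recommended_min += 512 * 2 * 3 * 3	=  9216 pages
 *						= 11264 pages total
 *	11264 << (PAGE_SHIFT - 10) = 11264 << 2	= 45056 kB (~44 MiB)
 *
 * subject to the 5%-of-lowmem cap, and only raises min_free_kbytes,
 * never lowers it.
 */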
2295b46e756fSKirill A. Shutemov int start_stop_khugepaged(void)
2296b46e756fSKirill A. Shutemov {
2297b46e756fSKirill A. Shutemov 	static struct task_struct *khugepaged_thread __read_mostly;
2298b46e756fSKirill A. Shutemov 	static DEFINE_MUTEX(khugepaged_mutex);
2299b46e756fSKirill A. Shutemov 	int err = 0;
2300b46e756fSKirill A. Shutemov 
2301b46e756fSKirill A. Shutemov 	mutex_lock(&khugepaged_mutex);
2302b46e756fSKirill A. Shutemov 	if (khugepaged_enabled()) {
2303b46e756fSKirill A. Shutemov 		if (!khugepaged_thread)
2304b46e756fSKirill A. Shutemov 			khugepaged_thread = kthread_run(khugepaged, NULL,
2305b46e756fSKirill A. Shutemov 							"khugepaged");
2306b46e756fSKirill A. Shutemov 		if (IS_ERR(khugepaged_thread)) {
2307b46e756fSKirill A. Shutemov 			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2308b46e756fSKirill A. Shutemov 			err = PTR_ERR(khugepaged_thread);
2309b46e756fSKirill A. Shutemov 			khugepaged_thread = NULL;
2310b46e756fSKirill A. Shutemov 			goto fail;
2311b46e756fSKirill A. Shutemov 		}
2312b46e756fSKirill A. Shutemov 
2313b46e756fSKirill A. Shutemov 		if (!list_empty(&khugepaged_scan.mm_head))
2314b46e756fSKirill A. Shutemov 			wake_up_interruptible(&khugepaged_wait);
2315b46e756fSKirill A. Shutemov 
2316b46e756fSKirill A. Shutemov 		set_recommended_min_free_kbytes();
2317b46e756fSKirill A. Shutemov 	} else if (khugepaged_thread) {
2318b46e756fSKirill A. Shutemov 		kthread_stop(khugepaged_thread);
2319b46e756fSKirill A. Shutemov 		khugepaged_thread = NULL;
2320b46e756fSKirill A. Shutemov 	}
2321b46e756fSKirill A. Shutemov fail:
2322b46e756fSKirill A. Shutemov 	mutex_unlock(&khugepaged_mutex);
2323b46e756fSKirill A. Shutemov 	return err;
2324b46e756fSKirill A. Shutemov }
2325
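/*
 * Illustration, not part of the kernel source above:
 * start_stop_khugepaged() follows the standard kthread lifecycle
 * pattern. A minimal, self-contained module sketch of the same pattern;
 * the names "demo_worker", "demo_thread" and "kdemo" are made up, while
 * kthread_run()/kthread_stop()/kthread_should_stop() are the real APIs
 * used exactly as in the function above:
 */
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>

static struct task_struct *demo_thread;

static int demo_worker(void *unused)
{
	while (!kthread_should_stop()) {
		/* do one unit of background work, then nap */
		msleep_interruptible(1000);
	}
	return 0;	/* returned to the caller of kthread_stop() */
}

static int __init demo_init(void)
{
	demo_thread = kthread_run(demo_worker, NULL, "kdemo");
	if (IS_ERR(demo_thread))
		return PTR_ERR(demo_thread);
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_thread);	/* blocks until demo_worker() returns */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");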