xref: /linux/mm/huge_memory.c (revision 4b471e8898c3d0f5c97a3c73ac32d0549fe01c87)
171e3aac0SAndrea Arcangeli /*
271e3aac0SAndrea Arcangeli  *  Copyright (C) 2009  Red Hat, Inc.
371e3aac0SAndrea Arcangeli  *
471e3aac0SAndrea Arcangeli  *  This work is licensed under the terms of the GNU GPL, version 2. See
571e3aac0SAndrea Arcangeli  *  the COPYING file in the top-level directory.
671e3aac0SAndrea Arcangeli  */
771e3aac0SAndrea Arcangeli 
8ae3a8c1cSAndrew Morton #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9ae3a8c1cSAndrew Morton 
1071e3aac0SAndrea Arcangeli #include <linux/mm.h>
1171e3aac0SAndrea Arcangeli #include <linux/sched.h>
1271e3aac0SAndrea Arcangeli #include <linux/highmem.h>
1371e3aac0SAndrea Arcangeli #include <linux/hugetlb.h>
1471e3aac0SAndrea Arcangeli #include <linux/mmu_notifier.h>
1571e3aac0SAndrea Arcangeli #include <linux/rmap.h>
1671e3aac0SAndrea Arcangeli #include <linux/swap.h>
1797ae1749SKirill A. Shutemov #include <linux/shrinker.h>
18ba76149fSAndrea Arcangeli #include <linux/mm_inline.h>
194897c765SMatthew Wilcox #include <linux/dax.h>
20ba76149fSAndrea Arcangeli #include <linux/kthread.h>
21ba76149fSAndrea Arcangeli #include <linux/khugepaged.h>
22878aee7dSAndrea Arcangeli #include <linux/freezer.h>
23a664b2d8SAndrea Arcangeli #include <linux/mman.h>
24325adeb5SRalf Baechle #include <linux/pagemap.h>
254daae3b4SMel Gorman #include <linux/migrate.h>
2643b5fbbdSSasha Levin #include <linux/hashtable.h>
276b251fc9SAndrea Arcangeli #include <linux/userfaultfd_k.h>
2833c3fc71SVladimir Davydov #include <linux/page_idle.h>
2997ae1749SKirill A. Shutemov 
3071e3aac0SAndrea Arcangeli #include <asm/tlb.h>
3171e3aac0SAndrea Arcangeli #include <asm/pgalloc.h>
3271e3aac0SAndrea Arcangeli #include "internal.h"
3371e3aac0SAndrea Arcangeli 
347d2eba05SEbru Akagunduz enum scan_result {
357d2eba05SEbru Akagunduz 	SCAN_FAIL,
367d2eba05SEbru Akagunduz 	SCAN_SUCCEED,
377d2eba05SEbru Akagunduz 	SCAN_PMD_NULL,
387d2eba05SEbru Akagunduz 	SCAN_EXCEED_NONE_PTE,
397d2eba05SEbru Akagunduz 	SCAN_PTE_NON_PRESENT,
407d2eba05SEbru Akagunduz 	SCAN_PAGE_RO,
417d2eba05SEbru Akagunduz 	SCAN_NO_REFERENCED_PAGE,
427d2eba05SEbru Akagunduz 	SCAN_PAGE_NULL,
437d2eba05SEbru Akagunduz 	SCAN_SCAN_ABORT,
447d2eba05SEbru Akagunduz 	SCAN_PAGE_COUNT,
457d2eba05SEbru Akagunduz 	SCAN_PAGE_LRU,
467d2eba05SEbru Akagunduz 	SCAN_PAGE_LOCK,
477d2eba05SEbru Akagunduz 	SCAN_PAGE_ANON,
48b1caa957SKirill A. Shutemov 	SCAN_PAGE_COMPOUND,
497d2eba05SEbru Akagunduz 	SCAN_ANY_PROCESS,
507d2eba05SEbru Akagunduz 	SCAN_VMA_NULL,
517d2eba05SEbru Akagunduz 	SCAN_VMA_CHECK,
527d2eba05SEbru Akagunduz 	SCAN_ADDRESS_RANGE,
537d2eba05SEbru Akagunduz 	SCAN_SWAP_CACHE_PAGE,
547d2eba05SEbru Akagunduz 	SCAN_DEL_PAGE_LRU,
557d2eba05SEbru Akagunduz 	SCAN_ALLOC_HUGE_PAGE_FAIL,
567d2eba05SEbru Akagunduz 	SCAN_CGROUP_CHARGE_FAIL
577d2eba05SEbru Akagunduz };
587d2eba05SEbru Akagunduz 
597d2eba05SEbru Akagunduz #define CREATE_TRACE_POINTS
607d2eba05SEbru Akagunduz #include <trace/events/huge_memory.h>
617d2eba05SEbru Akagunduz 
62ba76149fSAndrea Arcangeli /*
638bfa3f9aSJianguo Wu  * By default transparent hugepage support is disabled in order that avoid
648bfa3f9aSJianguo Wu  * to risk increase the memory footprint of applications without a guaranteed
658bfa3f9aSJianguo Wu  * benefit. When transparent hugepage support is enabled, is for all mappings,
668bfa3f9aSJianguo Wu  * and khugepaged scans all mappings.
678bfa3f9aSJianguo Wu  * Defrag is invoked by khugepaged hugepage allocations and by page faults
688bfa3f9aSJianguo Wu  * for all hugepage allocations.
69ba76149fSAndrea Arcangeli  */
7071e3aac0SAndrea Arcangeli unsigned long transparent_hugepage_flags __read_mostly =
7113ece886SAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
72ba76149fSAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
7313ece886SAndrea Arcangeli #endif
7413ece886SAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
7513ece886SAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
7613ece886SAndrea Arcangeli #endif
77d39d33c3SAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
7879da5407SKirill A. Shutemov 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
7979da5407SKirill A. Shutemov 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
80ba76149fSAndrea Arcangeli 
81ba76149fSAndrea Arcangeli /* default scan 8*512 pte (or vmas) every 30 second */
82ba76149fSAndrea Arcangeli static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
83ba76149fSAndrea Arcangeli static unsigned int khugepaged_pages_collapsed;
84ba76149fSAndrea Arcangeli static unsigned int khugepaged_full_scans;
85ba76149fSAndrea Arcangeli static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
86ba76149fSAndrea Arcangeli /* during fragmentation poll the hugepage allocator once every minute */
87ba76149fSAndrea Arcangeli static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
88ba76149fSAndrea Arcangeli static struct task_struct *khugepaged_thread __read_mostly;
89ba76149fSAndrea Arcangeli static DEFINE_MUTEX(khugepaged_mutex);
90ba76149fSAndrea Arcangeli static DEFINE_SPINLOCK(khugepaged_mm_lock);
91ba76149fSAndrea Arcangeli static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
92ba76149fSAndrea Arcangeli /*
93ba76149fSAndrea Arcangeli  * default collapse hugepages if there is at least one pte mapped like
94ba76149fSAndrea Arcangeli  * it would have happened if the vma was large enough during page
95ba76149fSAndrea Arcangeli  * fault.
96ba76149fSAndrea Arcangeli  */
97ba76149fSAndrea Arcangeli static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
98ba76149fSAndrea Arcangeli 
99ba76149fSAndrea Arcangeli static int khugepaged(void *none);
100ba76149fSAndrea Arcangeli static int khugepaged_slab_init(void);
10165ebb64fSKirill A. Shutemov static void khugepaged_slab_exit(void);
102ba76149fSAndrea Arcangeli 
10343b5fbbdSSasha Levin #define MM_SLOTS_HASH_BITS 10
10443b5fbbdSSasha Levin static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
10543b5fbbdSSasha Levin 
106ba76149fSAndrea Arcangeli static struct kmem_cache *mm_slot_cache __read_mostly;
107ba76149fSAndrea Arcangeli 
108ba76149fSAndrea Arcangeli /**
109ba76149fSAndrea Arcangeli  * struct mm_slot - hash lookup from mm to mm_slot
110ba76149fSAndrea Arcangeli  * @hash: hash collision list
111ba76149fSAndrea Arcangeli  * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
112ba76149fSAndrea Arcangeli  * @mm: the mm that this information is valid for
113ba76149fSAndrea Arcangeli  */
114ba76149fSAndrea Arcangeli struct mm_slot {
115ba76149fSAndrea Arcangeli 	struct hlist_node hash;
116ba76149fSAndrea Arcangeli 	struct list_head mm_node;
117ba76149fSAndrea Arcangeli 	struct mm_struct *mm;
118ba76149fSAndrea Arcangeli };
119ba76149fSAndrea Arcangeli 
120ba76149fSAndrea Arcangeli /**
121ba76149fSAndrea Arcangeli  * struct khugepaged_scan - cursor for scanning
122ba76149fSAndrea Arcangeli  * @mm_head: the head of the mm list to scan
123ba76149fSAndrea Arcangeli  * @mm_slot: the current mm_slot we are scanning
124ba76149fSAndrea Arcangeli  * @address: the next address inside that to be scanned
125ba76149fSAndrea Arcangeli  *
126ba76149fSAndrea Arcangeli  * There is only the one khugepaged_scan instance of this cursor structure.
127ba76149fSAndrea Arcangeli  */
128ba76149fSAndrea Arcangeli struct khugepaged_scan {
129ba76149fSAndrea Arcangeli 	struct list_head mm_head;
130ba76149fSAndrea Arcangeli 	struct mm_slot *mm_slot;
131ba76149fSAndrea Arcangeli 	unsigned long address;
1322f1da642SH Hartley Sweeten };
1332f1da642SH Hartley Sweeten static struct khugepaged_scan khugepaged_scan = {
134ba76149fSAndrea Arcangeli 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
135ba76149fSAndrea Arcangeli };
136ba76149fSAndrea Arcangeli 
137f000565aSAndrea Arcangeli 
1382c0b80d4SNicholas Krause static void set_recommended_min_free_kbytes(void)
139f000565aSAndrea Arcangeli {
140f000565aSAndrea Arcangeli 	struct zone *zone;
141f000565aSAndrea Arcangeli 	int nr_zones = 0;
142f000565aSAndrea Arcangeli 	unsigned long recommended_min;
143f000565aSAndrea Arcangeli 
144f000565aSAndrea Arcangeli 	for_each_populated_zone(zone)
145f000565aSAndrea Arcangeli 		nr_zones++;
146f000565aSAndrea Arcangeli 
147974a786eSMel Gorman 	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
148f000565aSAndrea Arcangeli 	recommended_min = pageblock_nr_pages * nr_zones * 2;
149f000565aSAndrea Arcangeli 
150f000565aSAndrea Arcangeli 	/*
151f000565aSAndrea Arcangeli 	 * Make sure that on average at least two pageblocks are almost free
152f000565aSAndrea Arcangeli 	 * of another type, one for a migratetype to fall back to and a
153f000565aSAndrea Arcangeli 	 * second to avoid subsequent fallbacks of other types There are 3
154f000565aSAndrea Arcangeli 	 * MIGRATE_TYPES we care about.
155f000565aSAndrea Arcangeli 	 */
156f000565aSAndrea Arcangeli 	recommended_min += pageblock_nr_pages * nr_zones *
157f000565aSAndrea Arcangeli 			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
158f000565aSAndrea Arcangeli 
159f000565aSAndrea Arcangeli 	/* don't ever allow to reserve more than 5% of the lowmem */
160f000565aSAndrea Arcangeli 	recommended_min = min(recommended_min,
161f000565aSAndrea Arcangeli 			      (unsigned long) nr_free_buffer_pages() / 20);
162f000565aSAndrea Arcangeli 	recommended_min <<= (PAGE_SHIFT-10);
163f000565aSAndrea Arcangeli 
16442aa83cbSHan Pingtian 	if (recommended_min > min_free_kbytes) {
16542aa83cbSHan Pingtian 		if (user_min_free_kbytes >= 0)
16642aa83cbSHan Pingtian 			pr_info("raising min_free_kbytes from %d to %lu "
16742aa83cbSHan Pingtian 				"to help transparent hugepage allocations\n",
16842aa83cbSHan Pingtian 				min_free_kbytes, recommended_min);
16942aa83cbSHan Pingtian 
170f000565aSAndrea Arcangeli 		min_free_kbytes = recommended_min;
17142aa83cbSHan Pingtian 	}
172f000565aSAndrea Arcangeli 	setup_per_zone_wmarks();
173f000565aSAndrea Arcangeli }
174f000565aSAndrea Arcangeli 
17579553da2SKirill A. Shutemov static int start_stop_khugepaged(void)
176ba76149fSAndrea Arcangeli {
177ba76149fSAndrea Arcangeli 	int err = 0;
178ba76149fSAndrea Arcangeli 	if (khugepaged_enabled()) {
179ba76149fSAndrea Arcangeli 		if (!khugepaged_thread)
180ba76149fSAndrea Arcangeli 			khugepaged_thread = kthread_run(khugepaged, NULL,
181ba76149fSAndrea Arcangeli 							"khugepaged");
18218e8e5c7SViresh Kumar 		if (IS_ERR(khugepaged_thread)) {
183ae3a8c1cSAndrew Morton 			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
184ba76149fSAndrea Arcangeli 			err = PTR_ERR(khugepaged_thread);
185ba76149fSAndrea Arcangeli 			khugepaged_thread = NULL;
18679553da2SKirill A. Shutemov 			goto fail;
187ba76149fSAndrea Arcangeli 		}
188911891afSXiao Guangrong 
189911891afSXiao Guangrong 		if (!list_empty(&khugepaged_scan.mm_head))
190ba76149fSAndrea Arcangeli 			wake_up_interruptible(&khugepaged_wait);
191f000565aSAndrea Arcangeli 
192f000565aSAndrea Arcangeli 		set_recommended_min_free_kbytes();
193911891afSXiao Guangrong 	} else if (khugepaged_thread) {
194911891afSXiao Guangrong 		kthread_stop(khugepaged_thread);
195911891afSXiao Guangrong 		khugepaged_thread = NULL;
196911891afSXiao Guangrong 	}
19779553da2SKirill A. Shutemov fail:
198ba76149fSAndrea Arcangeli 	return err;
199ba76149fSAndrea Arcangeli }
20071e3aac0SAndrea Arcangeli 
20197ae1749SKirill A. Shutemov static atomic_t huge_zero_refcount;
20256873f43SWang, Yalin struct page *huge_zero_page __read_mostly;
2034a6c1297SKirill A. Shutemov 
204fc437044SMatthew Wilcox struct page *get_huge_zero_page(void)
20597ae1749SKirill A. Shutemov {
20697ae1749SKirill A. Shutemov 	struct page *zero_page;
20797ae1749SKirill A. Shutemov retry:
20897ae1749SKirill A. Shutemov 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
2094db0c3c2SJason Low 		return READ_ONCE(huge_zero_page);
21097ae1749SKirill A. Shutemov 
21197ae1749SKirill A. Shutemov 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
21297ae1749SKirill A. Shutemov 			HPAGE_PMD_ORDER);
213d8a8e1f0SKirill A. Shutemov 	if (!zero_page) {
214d8a8e1f0SKirill A. Shutemov 		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
2155918d10aSKirill A. Shutemov 		return NULL;
216d8a8e1f0SKirill A. Shutemov 	}
217d8a8e1f0SKirill A. Shutemov 	count_vm_event(THP_ZERO_PAGE_ALLOC);
21897ae1749SKirill A. Shutemov 	preempt_disable();
2195918d10aSKirill A. Shutemov 	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
22097ae1749SKirill A. Shutemov 		preempt_enable();
2215ddacbe9SYu Zhao 		__free_pages(zero_page, compound_order(zero_page));
22297ae1749SKirill A. Shutemov 		goto retry;
22397ae1749SKirill A. Shutemov 	}
22497ae1749SKirill A. Shutemov 
22597ae1749SKirill A. Shutemov 	/* We take additional reference here. It will be put back by shrinker */
22697ae1749SKirill A. Shutemov 	atomic_set(&huge_zero_refcount, 2);
22797ae1749SKirill A. Shutemov 	preempt_enable();
2284db0c3c2SJason Low 	return READ_ONCE(huge_zero_page);
22997ae1749SKirill A. Shutemov }
23097ae1749SKirill A. Shutemov 
23197ae1749SKirill A. Shutemov static void put_huge_zero_page(void)
23297ae1749SKirill A. Shutemov {
23397ae1749SKirill A. Shutemov 	/*
23497ae1749SKirill A. Shutemov 	 * Counter should never go to zero here. Only shrinker can put
23597ae1749SKirill A. Shutemov 	 * last reference.
23697ae1749SKirill A. Shutemov 	 */
23797ae1749SKirill A. Shutemov 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
23897ae1749SKirill A. Shutemov }
23997ae1749SKirill A. Shutemov 
24048896466SGlauber Costa static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
24197ae1749SKirill A. Shutemov 					struct shrink_control *sc)
24297ae1749SKirill A. Shutemov {
24397ae1749SKirill A. Shutemov 	/* we can free zero page only if last reference remains */
24497ae1749SKirill A. Shutemov 	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
24548896466SGlauber Costa }
24697ae1749SKirill A. Shutemov 
24748896466SGlauber Costa static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
24848896466SGlauber Costa 				       struct shrink_control *sc)
24948896466SGlauber Costa {
25097ae1749SKirill A. Shutemov 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
2515918d10aSKirill A. Shutemov 		struct page *zero_page = xchg(&huge_zero_page, NULL);
2525918d10aSKirill A. Shutemov 		BUG_ON(zero_page == NULL);
2535ddacbe9SYu Zhao 		__free_pages(zero_page, compound_order(zero_page));
25448896466SGlauber Costa 		return HPAGE_PMD_NR;
25597ae1749SKirill A. Shutemov 	}
25697ae1749SKirill A. Shutemov 
25797ae1749SKirill A. Shutemov 	return 0;
25897ae1749SKirill A. Shutemov }
25997ae1749SKirill A. Shutemov 
26097ae1749SKirill A. Shutemov static struct shrinker huge_zero_page_shrinker = {
26148896466SGlauber Costa 	.count_objects = shrink_huge_zero_page_count,
26248896466SGlauber Costa 	.scan_objects = shrink_huge_zero_page_scan,
26397ae1749SKirill A. Shutemov 	.seeks = DEFAULT_SEEKS,
26497ae1749SKirill A. Shutemov };
26597ae1749SKirill A. Shutemov 
26671e3aac0SAndrea Arcangeli #ifdef CONFIG_SYSFS
267ba76149fSAndrea Arcangeli 
26871e3aac0SAndrea Arcangeli static ssize_t double_flag_show(struct kobject *kobj,
26971e3aac0SAndrea Arcangeli 				struct kobj_attribute *attr, char *buf,
27071e3aac0SAndrea Arcangeli 				enum transparent_hugepage_flag enabled,
27171e3aac0SAndrea Arcangeli 				enum transparent_hugepage_flag req_madv)
27271e3aac0SAndrea Arcangeli {
27371e3aac0SAndrea Arcangeli 	if (test_bit(enabled, &transparent_hugepage_flags)) {
27471e3aac0SAndrea Arcangeli 		VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
27571e3aac0SAndrea Arcangeli 		return sprintf(buf, "[always] madvise never\n");
27671e3aac0SAndrea Arcangeli 	} else if (test_bit(req_madv, &transparent_hugepage_flags))
27771e3aac0SAndrea Arcangeli 		return sprintf(buf, "always [madvise] never\n");
27871e3aac0SAndrea Arcangeli 	else
27971e3aac0SAndrea Arcangeli 		return sprintf(buf, "always madvise [never]\n");
28071e3aac0SAndrea Arcangeli }
28171e3aac0SAndrea Arcangeli static ssize_t double_flag_store(struct kobject *kobj,
28271e3aac0SAndrea Arcangeli 				 struct kobj_attribute *attr,
28371e3aac0SAndrea Arcangeli 				 const char *buf, size_t count,
28471e3aac0SAndrea Arcangeli 				 enum transparent_hugepage_flag enabled,
28571e3aac0SAndrea Arcangeli 				 enum transparent_hugepage_flag req_madv)
28671e3aac0SAndrea Arcangeli {
28771e3aac0SAndrea Arcangeli 	if (!memcmp("always", buf,
28871e3aac0SAndrea Arcangeli 		    min(sizeof("always")-1, count))) {
28971e3aac0SAndrea Arcangeli 		set_bit(enabled, &transparent_hugepage_flags);
29071e3aac0SAndrea Arcangeli 		clear_bit(req_madv, &transparent_hugepage_flags);
29171e3aac0SAndrea Arcangeli 	} else if (!memcmp("madvise", buf,
29271e3aac0SAndrea Arcangeli 			   min(sizeof("madvise")-1, count))) {
29371e3aac0SAndrea Arcangeli 		clear_bit(enabled, &transparent_hugepage_flags);
29471e3aac0SAndrea Arcangeli 		set_bit(req_madv, &transparent_hugepage_flags);
29571e3aac0SAndrea Arcangeli 	} else if (!memcmp("never", buf,
29671e3aac0SAndrea Arcangeli 			   min(sizeof("never")-1, count))) {
29771e3aac0SAndrea Arcangeli 		clear_bit(enabled, &transparent_hugepage_flags);
29871e3aac0SAndrea Arcangeli 		clear_bit(req_madv, &transparent_hugepage_flags);
29971e3aac0SAndrea Arcangeli 	} else
30071e3aac0SAndrea Arcangeli 		return -EINVAL;
30171e3aac0SAndrea Arcangeli 
30271e3aac0SAndrea Arcangeli 	return count;
30371e3aac0SAndrea Arcangeli }
30471e3aac0SAndrea Arcangeli 
30571e3aac0SAndrea Arcangeli static ssize_t enabled_show(struct kobject *kobj,
30671e3aac0SAndrea Arcangeli 			    struct kobj_attribute *attr, char *buf)
30771e3aac0SAndrea Arcangeli {
30871e3aac0SAndrea Arcangeli 	return double_flag_show(kobj, attr, buf,
30971e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_FLAG,
31071e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
31171e3aac0SAndrea Arcangeli }
31271e3aac0SAndrea Arcangeli static ssize_t enabled_store(struct kobject *kobj,
31371e3aac0SAndrea Arcangeli 			     struct kobj_attribute *attr,
31471e3aac0SAndrea Arcangeli 			     const char *buf, size_t count)
31571e3aac0SAndrea Arcangeli {
316ba76149fSAndrea Arcangeli 	ssize_t ret;
317ba76149fSAndrea Arcangeli 
318ba76149fSAndrea Arcangeli 	ret = double_flag_store(kobj, attr, buf, count,
31971e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_FLAG,
32071e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
321ba76149fSAndrea Arcangeli 
322ba76149fSAndrea Arcangeli 	if (ret > 0) {
323911891afSXiao Guangrong 		int err;
324911891afSXiao Guangrong 
325911891afSXiao Guangrong 		mutex_lock(&khugepaged_mutex);
32679553da2SKirill A. Shutemov 		err = start_stop_khugepaged();
327911891afSXiao Guangrong 		mutex_unlock(&khugepaged_mutex);
328911891afSXiao Guangrong 
329ba76149fSAndrea Arcangeli 		if (err)
330ba76149fSAndrea Arcangeli 			ret = err;
331ba76149fSAndrea Arcangeli 	}
332ba76149fSAndrea Arcangeli 
333ba76149fSAndrea Arcangeli 	return ret;
33471e3aac0SAndrea Arcangeli }
33571e3aac0SAndrea Arcangeli static struct kobj_attribute enabled_attr =
33671e3aac0SAndrea Arcangeli 	__ATTR(enabled, 0644, enabled_show, enabled_store);
33771e3aac0SAndrea Arcangeli 
33871e3aac0SAndrea Arcangeli static ssize_t single_flag_show(struct kobject *kobj,
33971e3aac0SAndrea Arcangeli 				struct kobj_attribute *attr, char *buf,
34071e3aac0SAndrea Arcangeli 				enum transparent_hugepage_flag flag)
34171e3aac0SAndrea Arcangeli {
342e27e6151SBen Hutchings 	return sprintf(buf, "%d\n",
343e27e6151SBen Hutchings 		       !!test_bit(flag, &transparent_hugepage_flags));
34471e3aac0SAndrea Arcangeli }
345e27e6151SBen Hutchings 
34671e3aac0SAndrea Arcangeli static ssize_t single_flag_store(struct kobject *kobj,
34771e3aac0SAndrea Arcangeli 				 struct kobj_attribute *attr,
34871e3aac0SAndrea Arcangeli 				 const char *buf, size_t count,
34971e3aac0SAndrea Arcangeli 				 enum transparent_hugepage_flag flag)
35071e3aac0SAndrea Arcangeli {
351e27e6151SBen Hutchings 	unsigned long value;
352e27e6151SBen Hutchings 	int ret;
353e27e6151SBen Hutchings 
354e27e6151SBen Hutchings 	ret = kstrtoul(buf, 10, &value);
355e27e6151SBen Hutchings 	if (ret < 0)
356e27e6151SBen Hutchings 		return ret;
357e27e6151SBen Hutchings 	if (value > 1)
35871e3aac0SAndrea Arcangeli 		return -EINVAL;
35971e3aac0SAndrea Arcangeli 
360e27e6151SBen Hutchings 	if (value)
361e27e6151SBen Hutchings 		set_bit(flag, &transparent_hugepage_flags);
362e27e6151SBen Hutchings 	else
363e27e6151SBen Hutchings 		clear_bit(flag, &transparent_hugepage_flags);
364e27e6151SBen Hutchings 
36571e3aac0SAndrea Arcangeli 	return count;
36671e3aac0SAndrea Arcangeli }
36771e3aac0SAndrea Arcangeli 
36871e3aac0SAndrea Arcangeli /*
36971e3aac0SAndrea Arcangeli  * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
37071e3aac0SAndrea Arcangeli  * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
37171e3aac0SAndrea Arcangeli  * memory just to allocate one more hugepage.
37271e3aac0SAndrea Arcangeli  */
37371e3aac0SAndrea Arcangeli static ssize_t defrag_show(struct kobject *kobj,
37471e3aac0SAndrea Arcangeli 			   struct kobj_attribute *attr, char *buf)
37571e3aac0SAndrea Arcangeli {
37671e3aac0SAndrea Arcangeli 	return double_flag_show(kobj, attr, buf,
37771e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
37871e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
37971e3aac0SAndrea Arcangeli }
38071e3aac0SAndrea Arcangeli static ssize_t defrag_store(struct kobject *kobj,
38171e3aac0SAndrea Arcangeli 			    struct kobj_attribute *attr,
38271e3aac0SAndrea Arcangeli 			    const char *buf, size_t count)
38371e3aac0SAndrea Arcangeli {
38471e3aac0SAndrea Arcangeli 	return double_flag_store(kobj, attr, buf, count,
38571e3aac0SAndrea Arcangeli 				 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
38671e3aac0SAndrea Arcangeli 				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
38771e3aac0SAndrea Arcangeli }
38871e3aac0SAndrea Arcangeli static struct kobj_attribute defrag_attr =
38971e3aac0SAndrea Arcangeli 	__ATTR(defrag, 0644, defrag_show, defrag_store);
39071e3aac0SAndrea Arcangeli 
39179da5407SKirill A. Shutemov static ssize_t use_zero_page_show(struct kobject *kobj,
39279da5407SKirill A. Shutemov 		struct kobj_attribute *attr, char *buf)
39379da5407SKirill A. Shutemov {
39479da5407SKirill A. Shutemov 	return single_flag_show(kobj, attr, buf,
39579da5407SKirill A. Shutemov 				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
39679da5407SKirill A. Shutemov }
39779da5407SKirill A. Shutemov static ssize_t use_zero_page_store(struct kobject *kobj,
39879da5407SKirill A. Shutemov 		struct kobj_attribute *attr, const char *buf, size_t count)
39979da5407SKirill A. Shutemov {
40079da5407SKirill A. Shutemov 	return single_flag_store(kobj, attr, buf, count,
40179da5407SKirill A. Shutemov 				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
40279da5407SKirill A. Shutemov }
40379da5407SKirill A. Shutemov static struct kobj_attribute use_zero_page_attr =
40479da5407SKirill A. Shutemov 	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
40571e3aac0SAndrea Arcangeli #ifdef CONFIG_DEBUG_VM
40671e3aac0SAndrea Arcangeli static ssize_t debug_cow_show(struct kobject *kobj,
40771e3aac0SAndrea Arcangeli 				struct kobj_attribute *attr, char *buf)
40871e3aac0SAndrea Arcangeli {
40971e3aac0SAndrea Arcangeli 	return single_flag_show(kobj, attr, buf,
41071e3aac0SAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
41171e3aac0SAndrea Arcangeli }
41271e3aac0SAndrea Arcangeli static ssize_t debug_cow_store(struct kobject *kobj,
41371e3aac0SAndrea Arcangeli 			       struct kobj_attribute *attr,
41471e3aac0SAndrea Arcangeli 			       const char *buf, size_t count)
41571e3aac0SAndrea Arcangeli {
41671e3aac0SAndrea Arcangeli 	return single_flag_store(kobj, attr, buf, count,
41771e3aac0SAndrea Arcangeli 				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
41871e3aac0SAndrea Arcangeli }
41971e3aac0SAndrea Arcangeli static struct kobj_attribute debug_cow_attr =
42071e3aac0SAndrea Arcangeli 	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
42171e3aac0SAndrea Arcangeli #endif /* CONFIG_DEBUG_VM */
42271e3aac0SAndrea Arcangeli 
42371e3aac0SAndrea Arcangeli static struct attribute *hugepage_attr[] = {
42471e3aac0SAndrea Arcangeli 	&enabled_attr.attr,
42571e3aac0SAndrea Arcangeli 	&defrag_attr.attr,
42679da5407SKirill A. Shutemov 	&use_zero_page_attr.attr,
42771e3aac0SAndrea Arcangeli #ifdef CONFIG_DEBUG_VM
42871e3aac0SAndrea Arcangeli 	&debug_cow_attr.attr,
42971e3aac0SAndrea Arcangeli #endif
43071e3aac0SAndrea Arcangeli 	NULL,
43171e3aac0SAndrea Arcangeli };
43271e3aac0SAndrea Arcangeli 
43371e3aac0SAndrea Arcangeli static struct attribute_group hugepage_attr_group = {
43471e3aac0SAndrea Arcangeli 	.attrs = hugepage_attr,
435ba76149fSAndrea Arcangeli };
436ba76149fSAndrea Arcangeli 
437ba76149fSAndrea Arcangeli static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
438ba76149fSAndrea Arcangeli 					 struct kobj_attribute *attr,
439ba76149fSAndrea Arcangeli 					 char *buf)
440ba76149fSAndrea Arcangeli {
441ba76149fSAndrea Arcangeli 	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
442ba76149fSAndrea Arcangeli }
443ba76149fSAndrea Arcangeli 
444ba76149fSAndrea Arcangeli static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
445ba76149fSAndrea Arcangeli 					  struct kobj_attribute *attr,
446ba76149fSAndrea Arcangeli 					  const char *buf, size_t count)
447ba76149fSAndrea Arcangeli {
448ba76149fSAndrea Arcangeli 	unsigned long msecs;
449ba76149fSAndrea Arcangeli 	int err;
450ba76149fSAndrea Arcangeli 
4513dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &msecs);
452ba76149fSAndrea Arcangeli 	if (err || msecs > UINT_MAX)
453ba76149fSAndrea Arcangeli 		return -EINVAL;
454ba76149fSAndrea Arcangeli 
455ba76149fSAndrea Arcangeli 	khugepaged_scan_sleep_millisecs = msecs;
456ba76149fSAndrea Arcangeli 	wake_up_interruptible(&khugepaged_wait);
457ba76149fSAndrea Arcangeli 
458ba76149fSAndrea Arcangeli 	return count;
459ba76149fSAndrea Arcangeli }
460ba76149fSAndrea Arcangeli static struct kobj_attribute scan_sleep_millisecs_attr =
461ba76149fSAndrea Arcangeli 	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
462ba76149fSAndrea Arcangeli 	       scan_sleep_millisecs_store);
463ba76149fSAndrea Arcangeli 
464ba76149fSAndrea Arcangeli static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
465ba76149fSAndrea Arcangeli 					  struct kobj_attribute *attr,
466ba76149fSAndrea Arcangeli 					  char *buf)
467ba76149fSAndrea Arcangeli {
468ba76149fSAndrea Arcangeli 	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
469ba76149fSAndrea Arcangeli }
470ba76149fSAndrea Arcangeli 
471ba76149fSAndrea Arcangeli static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
472ba76149fSAndrea Arcangeli 					   struct kobj_attribute *attr,
473ba76149fSAndrea Arcangeli 					   const char *buf, size_t count)
474ba76149fSAndrea Arcangeli {
475ba76149fSAndrea Arcangeli 	unsigned long msecs;
476ba76149fSAndrea Arcangeli 	int err;
477ba76149fSAndrea Arcangeli 
4783dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &msecs);
479ba76149fSAndrea Arcangeli 	if (err || msecs > UINT_MAX)
480ba76149fSAndrea Arcangeli 		return -EINVAL;
481ba76149fSAndrea Arcangeli 
482ba76149fSAndrea Arcangeli 	khugepaged_alloc_sleep_millisecs = msecs;
483ba76149fSAndrea Arcangeli 	wake_up_interruptible(&khugepaged_wait);
484ba76149fSAndrea Arcangeli 
485ba76149fSAndrea Arcangeli 	return count;
486ba76149fSAndrea Arcangeli }
487ba76149fSAndrea Arcangeli static struct kobj_attribute alloc_sleep_millisecs_attr =
488ba76149fSAndrea Arcangeli 	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
489ba76149fSAndrea Arcangeli 	       alloc_sleep_millisecs_store);
490ba76149fSAndrea Arcangeli 
491ba76149fSAndrea Arcangeli static ssize_t pages_to_scan_show(struct kobject *kobj,
492ba76149fSAndrea Arcangeli 				  struct kobj_attribute *attr,
493ba76149fSAndrea Arcangeli 				  char *buf)
494ba76149fSAndrea Arcangeli {
495ba76149fSAndrea Arcangeli 	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
496ba76149fSAndrea Arcangeli }
497ba76149fSAndrea Arcangeli static ssize_t pages_to_scan_store(struct kobject *kobj,
498ba76149fSAndrea Arcangeli 				   struct kobj_attribute *attr,
499ba76149fSAndrea Arcangeli 				   const char *buf, size_t count)
500ba76149fSAndrea Arcangeli {
501ba76149fSAndrea Arcangeli 	int err;
502ba76149fSAndrea Arcangeli 	unsigned long pages;
503ba76149fSAndrea Arcangeli 
5043dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &pages);
505ba76149fSAndrea Arcangeli 	if (err || !pages || pages > UINT_MAX)
506ba76149fSAndrea Arcangeli 		return -EINVAL;
507ba76149fSAndrea Arcangeli 
508ba76149fSAndrea Arcangeli 	khugepaged_pages_to_scan = pages;
509ba76149fSAndrea Arcangeli 
510ba76149fSAndrea Arcangeli 	return count;
511ba76149fSAndrea Arcangeli }
512ba76149fSAndrea Arcangeli static struct kobj_attribute pages_to_scan_attr =
513ba76149fSAndrea Arcangeli 	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
514ba76149fSAndrea Arcangeli 	       pages_to_scan_store);
515ba76149fSAndrea Arcangeli 
516ba76149fSAndrea Arcangeli static ssize_t pages_collapsed_show(struct kobject *kobj,
517ba76149fSAndrea Arcangeli 				    struct kobj_attribute *attr,
518ba76149fSAndrea Arcangeli 				    char *buf)
519ba76149fSAndrea Arcangeli {
520ba76149fSAndrea Arcangeli 	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
521ba76149fSAndrea Arcangeli }
522ba76149fSAndrea Arcangeli static struct kobj_attribute pages_collapsed_attr =
523ba76149fSAndrea Arcangeli 	__ATTR_RO(pages_collapsed);
524ba76149fSAndrea Arcangeli 
525ba76149fSAndrea Arcangeli static ssize_t full_scans_show(struct kobject *kobj,
526ba76149fSAndrea Arcangeli 			       struct kobj_attribute *attr,
527ba76149fSAndrea Arcangeli 			       char *buf)
528ba76149fSAndrea Arcangeli {
529ba76149fSAndrea Arcangeli 	return sprintf(buf, "%u\n", khugepaged_full_scans);
530ba76149fSAndrea Arcangeli }
531ba76149fSAndrea Arcangeli static struct kobj_attribute full_scans_attr =
532ba76149fSAndrea Arcangeli 	__ATTR_RO(full_scans);
533ba76149fSAndrea Arcangeli 
534ba76149fSAndrea Arcangeli static ssize_t khugepaged_defrag_show(struct kobject *kobj,
535ba76149fSAndrea Arcangeli 				      struct kobj_attribute *attr, char *buf)
536ba76149fSAndrea Arcangeli {
537ba76149fSAndrea Arcangeli 	return single_flag_show(kobj, attr, buf,
538ba76149fSAndrea Arcangeli 				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
539ba76149fSAndrea Arcangeli }
540ba76149fSAndrea Arcangeli static ssize_t khugepaged_defrag_store(struct kobject *kobj,
541ba76149fSAndrea Arcangeli 				       struct kobj_attribute *attr,
542ba76149fSAndrea Arcangeli 				       const char *buf, size_t count)
543ba76149fSAndrea Arcangeli {
544ba76149fSAndrea Arcangeli 	return single_flag_store(kobj, attr, buf, count,
545ba76149fSAndrea Arcangeli 				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
546ba76149fSAndrea Arcangeli }
547ba76149fSAndrea Arcangeli static struct kobj_attribute khugepaged_defrag_attr =
548ba76149fSAndrea Arcangeli 	__ATTR(defrag, 0644, khugepaged_defrag_show,
549ba76149fSAndrea Arcangeli 	       khugepaged_defrag_store);
550ba76149fSAndrea Arcangeli 
551ba76149fSAndrea Arcangeli /*
552ba76149fSAndrea Arcangeli  * max_ptes_none controls if khugepaged should collapse hugepages over
553ba76149fSAndrea Arcangeli  * any unmapped ptes in turn potentially increasing the memory
554ba76149fSAndrea Arcangeli  * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
555ba76149fSAndrea Arcangeli  * reduce the available free memory in the system as it
556ba76149fSAndrea Arcangeli  * runs. Increasing max_ptes_none will instead potentially reduce the
557ba76149fSAndrea Arcangeli  * free memory in the system during the khugepaged scan.
558ba76149fSAndrea Arcangeli  */
559ba76149fSAndrea Arcangeli static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
560ba76149fSAndrea Arcangeli 					     struct kobj_attribute *attr,
561ba76149fSAndrea Arcangeli 					     char *buf)
562ba76149fSAndrea Arcangeli {
563ba76149fSAndrea Arcangeli 	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
564ba76149fSAndrea Arcangeli }
565ba76149fSAndrea Arcangeli static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
566ba76149fSAndrea Arcangeli 					      struct kobj_attribute *attr,
567ba76149fSAndrea Arcangeli 					      const char *buf, size_t count)
568ba76149fSAndrea Arcangeli {
569ba76149fSAndrea Arcangeli 	int err;
570ba76149fSAndrea Arcangeli 	unsigned long max_ptes_none;
571ba76149fSAndrea Arcangeli 
5723dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &max_ptes_none);
573ba76149fSAndrea Arcangeli 	if (err || max_ptes_none > HPAGE_PMD_NR-1)
574ba76149fSAndrea Arcangeli 		return -EINVAL;
575ba76149fSAndrea Arcangeli 
576ba76149fSAndrea Arcangeli 	khugepaged_max_ptes_none = max_ptes_none;
577ba76149fSAndrea Arcangeli 
578ba76149fSAndrea Arcangeli 	return count;
579ba76149fSAndrea Arcangeli }
580ba76149fSAndrea Arcangeli static struct kobj_attribute khugepaged_max_ptes_none_attr =
581ba76149fSAndrea Arcangeli 	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
582ba76149fSAndrea Arcangeli 	       khugepaged_max_ptes_none_store);
583ba76149fSAndrea Arcangeli 
584ba76149fSAndrea Arcangeli static struct attribute *khugepaged_attr[] = {
585ba76149fSAndrea Arcangeli 	&khugepaged_defrag_attr.attr,
586ba76149fSAndrea Arcangeli 	&khugepaged_max_ptes_none_attr.attr,
587ba76149fSAndrea Arcangeli 	&pages_to_scan_attr.attr,
588ba76149fSAndrea Arcangeli 	&pages_collapsed_attr.attr,
589ba76149fSAndrea Arcangeli 	&full_scans_attr.attr,
590ba76149fSAndrea Arcangeli 	&scan_sleep_millisecs_attr.attr,
591ba76149fSAndrea Arcangeli 	&alloc_sleep_millisecs_attr.attr,
592ba76149fSAndrea Arcangeli 	NULL,
593ba76149fSAndrea Arcangeli };
594ba76149fSAndrea Arcangeli 
595ba76149fSAndrea Arcangeli static struct attribute_group khugepaged_attr_group = {
596ba76149fSAndrea Arcangeli 	.attrs = khugepaged_attr,
597ba76149fSAndrea Arcangeli 	.name = "khugepaged",
59871e3aac0SAndrea Arcangeli };
599569e5590SShaohua Li 
600569e5590SShaohua Li static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
601569e5590SShaohua Li {
602569e5590SShaohua Li 	int err;
603569e5590SShaohua Li 
604569e5590SShaohua Li 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
605569e5590SShaohua Li 	if (unlikely(!*hugepage_kobj)) {
606ae3a8c1cSAndrew Morton 		pr_err("failed to create transparent hugepage kobject\n");
607569e5590SShaohua Li 		return -ENOMEM;
608569e5590SShaohua Li 	}
609569e5590SShaohua Li 
610569e5590SShaohua Li 	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
611569e5590SShaohua Li 	if (err) {
612ae3a8c1cSAndrew Morton 		pr_err("failed to register transparent hugepage group\n");
613569e5590SShaohua Li 		goto delete_obj;
614569e5590SShaohua Li 	}
615569e5590SShaohua Li 
616569e5590SShaohua Li 	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
617569e5590SShaohua Li 	if (err) {
618ae3a8c1cSAndrew Morton 		pr_err("failed to register transparent hugepage group\n");
619569e5590SShaohua Li 		goto remove_hp_group;
620569e5590SShaohua Li 	}
621569e5590SShaohua Li 
622569e5590SShaohua Li 	return 0;
623569e5590SShaohua Li 
624569e5590SShaohua Li remove_hp_group:
625569e5590SShaohua Li 	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
626569e5590SShaohua Li delete_obj:
627569e5590SShaohua Li 	kobject_put(*hugepage_kobj);
628569e5590SShaohua Li 	return err;
629569e5590SShaohua Li }
630569e5590SShaohua Li 
631569e5590SShaohua Li static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
632569e5590SShaohua Li {
633569e5590SShaohua Li 	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
634569e5590SShaohua Li 	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
635569e5590SShaohua Li 	kobject_put(hugepage_kobj);
636569e5590SShaohua Li }
637569e5590SShaohua Li #else
638569e5590SShaohua Li static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
639569e5590SShaohua Li {
640569e5590SShaohua Li 	return 0;
641569e5590SShaohua Li }
642569e5590SShaohua Li 
643569e5590SShaohua Li static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
644569e5590SShaohua Li {
645569e5590SShaohua Li }
64671e3aac0SAndrea Arcangeli #endif /* CONFIG_SYSFS */
64771e3aac0SAndrea Arcangeli 
64871e3aac0SAndrea Arcangeli static int __init hugepage_init(void)
64971e3aac0SAndrea Arcangeli {
65071e3aac0SAndrea Arcangeli 	int err;
651569e5590SShaohua Li 	struct kobject *hugepage_kobj;
65271e3aac0SAndrea Arcangeli 
6534b7167b9SAndrea Arcangeli 	if (!has_transparent_hugepage()) {
6544b7167b9SAndrea Arcangeli 		transparent_hugepage_flags = 0;
655569e5590SShaohua Li 		return -EINVAL;
6564b7167b9SAndrea Arcangeli 	}
6574b7167b9SAndrea Arcangeli 
658569e5590SShaohua Li 	err = hugepage_init_sysfs(&hugepage_kobj);
659569e5590SShaohua Li 	if (err)
66065ebb64fSKirill A. Shutemov 		goto err_sysfs;
661ba76149fSAndrea Arcangeli 
662ba76149fSAndrea Arcangeli 	err = khugepaged_slab_init();
663ba76149fSAndrea Arcangeli 	if (err)
66465ebb64fSKirill A. Shutemov 		goto err_slab;
665ba76149fSAndrea Arcangeli 
66665ebb64fSKirill A. Shutemov 	err = register_shrinker(&huge_zero_page_shrinker);
66765ebb64fSKirill A. Shutemov 	if (err)
66865ebb64fSKirill A. Shutemov 		goto err_hzp_shrinker;
66997ae1749SKirill A. Shutemov 
67097562cd2SRik van Riel 	/*
67197562cd2SRik van Riel 	 * By default disable transparent hugepages on smaller systems,
67297562cd2SRik van Riel 	 * where the extra memory used could hurt more than TLB overhead
67397562cd2SRik van Riel 	 * is likely to save.  The admin can still enable it through /sys.
67497562cd2SRik van Riel 	 */
67579553da2SKirill A. Shutemov 	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
67697562cd2SRik van Riel 		transparent_hugepage_flags = 0;
67779553da2SKirill A. Shutemov 		return 0;
67879553da2SKirill A. Shutemov 	}
67997562cd2SRik van Riel 
68079553da2SKirill A. Shutemov 	err = start_stop_khugepaged();
68165ebb64fSKirill A. Shutemov 	if (err)
68265ebb64fSKirill A. Shutemov 		goto err_khugepaged;
683ba76149fSAndrea Arcangeli 
684569e5590SShaohua Li 	return 0;
68565ebb64fSKirill A. Shutemov err_khugepaged:
68665ebb64fSKirill A. Shutemov 	unregister_shrinker(&huge_zero_page_shrinker);
68765ebb64fSKirill A. Shutemov err_hzp_shrinker:
68865ebb64fSKirill A. Shutemov 	khugepaged_slab_exit();
68965ebb64fSKirill A. Shutemov err_slab:
690569e5590SShaohua Li 	hugepage_exit_sysfs(hugepage_kobj);
69165ebb64fSKirill A. Shutemov err_sysfs:
692ba76149fSAndrea Arcangeli 	return err;
69371e3aac0SAndrea Arcangeli }
694a64fb3cdSPaul Gortmaker subsys_initcall(hugepage_init);
69571e3aac0SAndrea Arcangeli 
69671e3aac0SAndrea Arcangeli static int __init setup_transparent_hugepage(char *str)
69771e3aac0SAndrea Arcangeli {
69871e3aac0SAndrea Arcangeli 	int ret = 0;
69971e3aac0SAndrea Arcangeli 	if (!str)
70071e3aac0SAndrea Arcangeli 		goto out;
70171e3aac0SAndrea Arcangeli 	if (!strcmp(str, "always")) {
70271e3aac0SAndrea Arcangeli 		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
70371e3aac0SAndrea Arcangeli 			&transparent_hugepage_flags);
70471e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
70571e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
70671e3aac0SAndrea Arcangeli 		ret = 1;
70771e3aac0SAndrea Arcangeli 	} else if (!strcmp(str, "madvise")) {
70871e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
70971e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
71071e3aac0SAndrea Arcangeli 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
71171e3aac0SAndrea Arcangeli 			&transparent_hugepage_flags);
71271e3aac0SAndrea Arcangeli 		ret = 1;
71371e3aac0SAndrea Arcangeli 	} else if (!strcmp(str, "never")) {
71471e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
71571e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
71671e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
71771e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
71871e3aac0SAndrea Arcangeli 		ret = 1;
71971e3aac0SAndrea Arcangeli 	}
72071e3aac0SAndrea Arcangeli out:
72171e3aac0SAndrea Arcangeli 	if (!ret)
722ae3a8c1cSAndrew Morton 		pr_warn("transparent_hugepage= cannot parse, ignored\n");
72371e3aac0SAndrea Arcangeli 	return ret;
72471e3aac0SAndrea Arcangeli }
72571e3aac0SAndrea Arcangeli __setup("transparent_hugepage=", setup_transparent_hugepage);
72671e3aac0SAndrea Arcangeli 
727b32967ffSMel Gorman pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
72871e3aac0SAndrea Arcangeli {
72971e3aac0SAndrea Arcangeli 	if (likely(vma->vm_flags & VM_WRITE))
73071e3aac0SAndrea Arcangeli 		pmd = pmd_mkwrite(pmd);
73171e3aac0SAndrea Arcangeli 	return pmd;
73271e3aac0SAndrea Arcangeli }
73371e3aac0SAndrea Arcangeli 
7343122359aSKirill A. Shutemov static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
735b3092b3bSBob Liu {
736b3092b3bSBob Liu 	pmd_t entry;
7373122359aSKirill A. Shutemov 	entry = mk_pmd(page, prot);
738b3092b3bSBob Liu 	entry = pmd_mkhuge(entry);
739b3092b3bSBob Liu 	return entry;
740b3092b3bSBob Liu }
741b3092b3bSBob Liu 
74271e3aac0SAndrea Arcangeli static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
74371e3aac0SAndrea Arcangeli 					struct vm_area_struct *vma,
744230c92a8SAndrea Arcangeli 					unsigned long address, pmd_t *pmd,
7456b251fc9SAndrea Arcangeli 					struct page *page, gfp_t gfp,
7466b251fc9SAndrea Arcangeli 					unsigned int flags)
74771e3aac0SAndrea Arcangeli {
74800501b53SJohannes Weiner 	struct mem_cgroup *memcg;
74971e3aac0SAndrea Arcangeli 	pgtable_t pgtable;
750c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
751230c92a8SAndrea Arcangeli 	unsigned long haddr = address & HPAGE_PMD_MASK;
75271e3aac0SAndrea Arcangeli 
753309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageCompound(page), page);
75400501b53SJohannes Weiner 
755f627c2f5SKirill A. Shutemov 	if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
7566b251fc9SAndrea Arcangeli 		put_page(page);
7576b251fc9SAndrea Arcangeli 		count_vm_event(THP_FAULT_FALLBACK);
7586b251fc9SAndrea Arcangeli 		return VM_FAULT_FALLBACK;
7596b251fc9SAndrea Arcangeli 	}
76071e3aac0SAndrea Arcangeli 
76100501b53SJohannes Weiner 	pgtable = pte_alloc_one(mm, haddr);
76200501b53SJohannes Weiner 	if (unlikely(!pgtable)) {
763f627c2f5SKirill A. Shutemov 		mem_cgroup_cancel_charge(page, memcg, true);
7646b251fc9SAndrea Arcangeli 		put_page(page);
76500501b53SJohannes Weiner 		return VM_FAULT_OOM;
76600501b53SJohannes Weiner 	}
76700501b53SJohannes Weiner 
76871e3aac0SAndrea Arcangeli 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
76952f37629SMinchan Kim 	/*
77052f37629SMinchan Kim 	 * The memory barrier inside __SetPageUptodate makes sure that
77152f37629SMinchan Kim 	 * clear_huge_page writes become visible before the set_pmd_at()
77252f37629SMinchan Kim 	 * write.
77352f37629SMinchan Kim 	 */
77471e3aac0SAndrea Arcangeli 	__SetPageUptodate(page);
77571e3aac0SAndrea Arcangeli 
776c4088ebdSKirill A. Shutemov 	ptl = pmd_lock(mm, pmd);
77771e3aac0SAndrea Arcangeli 	if (unlikely(!pmd_none(*pmd))) {
778c4088ebdSKirill A. Shutemov 		spin_unlock(ptl);
779f627c2f5SKirill A. Shutemov 		mem_cgroup_cancel_charge(page, memcg, true);
78071e3aac0SAndrea Arcangeli 		put_page(page);
78171e3aac0SAndrea Arcangeli 		pte_free(mm, pgtable);
78271e3aac0SAndrea Arcangeli 	} else {
78371e3aac0SAndrea Arcangeli 		pmd_t entry;
7846b251fc9SAndrea Arcangeli 
7856b251fc9SAndrea Arcangeli 		/* Deliver the page fault to userland */
7866b251fc9SAndrea Arcangeli 		if (userfaultfd_missing(vma)) {
7876b251fc9SAndrea Arcangeli 			int ret;
7886b251fc9SAndrea Arcangeli 
7896b251fc9SAndrea Arcangeli 			spin_unlock(ptl);
790f627c2f5SKirill A. Shutemov 			mem_cgroup_cancel_charge(page, memcg, true);
7916b251fc9SAndrea Arcangeli 			put_page(page);
7926b251fc9SAndrea Arcangeli 			pte_free(mm, pgtable);
793230c92a8SAndrea Arcangeli 			ret = handle_userfault(vma, address, flags,
7946b251fc9SAndrea Arcangeli 					       VM_UFFD_MISSING);
7956b251fc9SAndrea Arcangeli 			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
7966b251fc9SAndrea Arcangeli 			return ret;
7976b251fc9SAndrea Arcangeli 		}
7986b251fc9SAndrea Arcangeli 
7993122359aSKirill A. Shutemov 		entry = mk_huge_pmd(page, vma->vm_page_prot);
8003122359aSKirill A. Shutemov 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
801d281ee61SKirill A. Shutemov 		page_add_new_anon_rmap(page, vma, haddr, true);
802f627c2f5SKirill A. Shutemov 		mem_cgroup_commit_charge(page, memcg, false, true);
80300501b53SJohannes Weiner 		lru_cache_add_active_or_unevictable(page, vma);
8046b0b50b0SAneesh Kumar K.V 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
80571e3aac0SAndrea Arcangeli 		set_pmd_at(mm, haddr, pmd, entry);
80671e3aac0SAndrea Arcangeli 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
807e1f56c89SKirill A. Shutemov 		atomic_long_inc(&mm->nr_ptes);
808c4088ebdSKirill A. Shutemov 		spin_unlock(ptl);
8096b251fc9SAndrea Arcangeli 		count_vm_event(THP_FAULT_ALLOC);
81071e3aac0SAndrea Arcangeli 	}
81171e3aac0SAndrea Arcangeli 
812aa2e878eSDavid Rientjes 	return 0;
81371e3aac0SAndrea Arcangeli }
81471e3aac0SAndrea Arcangeli 
815cc5d462fSAndi Kleen static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
8160bbbc0b3SAndrea Arcangeli {
81771baba4bSMel Gorman 	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
8180bbbc0b3SAndrea Arcangeli }
8190bbbc0b3SAndrea Arcangeli 
820c4088ebdSKirill A. Shutemov /* Caller must hold page table lock. */
821d295e341SKirill A. Shutemov static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
82297ae1749SKirill A. Shutemov 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
8235918d10aSKirill A. Shutemov 		struct page *zero_page)
824fc9fe822SKirill A. Shutemov {
825fc9fe822SKirill A. Shutemov 	pmd_t entry;
8267c414164SAndrew Morton 	if (!pmd_none(*pmd))
8277c414164SAndrew Morton 		return false;
8285918d10aSKirill A. Shutemov 	entry = mk_pmd(zero_page, vma->vm_page_prot);
829fc9fe822SKirill A. Shutemov 	entry = pmd_mkhuge(entry);
8306b0b50b0SAneesh Kumar K.V 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
831fc9fe822SKirill A. Shutemov 	set_pmd_at(mm, haddr, pmd, entry);
832e1f56c89SKirill A. Shutemov 	atomic_long_inc(&mm->nr_ptes);
8337c414164SAndrew Morton 	return true;
834fc9fe822SKirill A. Shutemov }
835fc9fe822SKirill A. Shutemov 
83671e3aac0SAndrea Arcangeli int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
83771e3aac0SAndrea Arcangeli 			       unsigned long address, pmd_t *pmd,
83871e3aac0SAndrea Arcangeli 			       unsigned int flags)
83971e3aac0SAndrea Arcangeli {
840077fcf11SAneesh Kumar K.V 	gfp_t gfp;
84171e3aac0SAndrea Arcangeli 	struct page *page;
84271e3aac0SAndrea Arcangeli 	unsigned long haddr = address & HPAGE_PMD_MASK;
84371e3aac0SAndrea Arcangeli 
844128ec037SKirill A. Shutemov 	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
845c0292554SKirill A. Shutemov 		return VM_FAULT_FALLBACK;
8467479df6dSKirill A. Shutemov 	if (vma->vm_flags & VM_LOCKED)
8477479df6dSKirill A. Shutemov 		return VM_FAULT_FALLBACK;
84871e3aac0SAndrea Arcangeli 	if (unlikely(anon_vma_prepare(vma)))
84971e3aac0SAndrea Arcangeli 		return VM_FAULT_OOM;
8506d50e60cSDavid Rientjes 	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
851ba76149fSAndrea Arcangeli 		return VM_FAULT_OOM;
852593befa6SDominik Dingel 	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
85379da5407SKirill A. Shutemov 			transparent_hugepage_use_zero_page()) {
854c4088ebdSKirill A. Shutemov 		spinlock_t *ptl;
85580371957SKirill A. Shutemov 		pgtable_t pgtable;
8565918d10aSKirill A. Shutemov 		struct page *zero_page;
8573ea41e62SKirill A. Shutemov 		bool set;
8586b251fc9SAndrea Arcangeli 		int ret;
85980371957SKirill A. Shutemov 		pgtable = pte_alloc_one(mm, haddr);
86080371957SKirill A. Shutemov 		if (unlikely(!pgtable))
86180371957SKirill A. Shutemov 			return VM_FAULT_OOM;
8625918d10aSKirill A. Shutemov 		zero_page = get_huge_zero_page();
8635918d10aSKirill A. Shutemov 		if (unlikely(!zero_page)) {
86497ae1749SKirill A. Shutemov 			pte_free(mm, pgtable);
86597ae1749SKirill A. Shutemov 			count_vm_event(THP_FAULT_FALLBACK);
866c0292554SKirill A. Shutemov 			return VM_FAULT_FALLBACK;
86797ae1749SKirill A. Shutemov 		}
868c4088ebdSKirill A. Shutemov 		ptl = pmd_lock(mm, pmd);
8696b251fc9SAndrea Arcangeli 		ret = 0;
8706b251fc9SAndrea Arcangeli 		set = false;
8716b251fc9SAndrea Arcangeli 		if (pmd_none(*pmd)) {
8726b251fc9SAndrea Arcangeli 			if (userfaultfd_missing(vma)) {
8736b251fc9SAndrea Arcangeli 				spin_unlock(ptl);
874230c92a8SAndrea Arcangeli 				ret = handle_userfault(vma, address, flags,
8756b251fc9SAndrea Arcangeli 						       VM_UFFD_MISSING);
8766b251fc9SAndrea Arcangeli 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
8776b251fc9SAndrea Arcangeli 			} else {
8786b251fc9SAndrea Arcangeli 				set_huge_zero_page(pgtable, mm, vma,
8796b251fc9SAndrea Arcangeli 						   haddr, pmd,
8805918d10aSKirill A. Shutemov 						   zero_page);
881c4088ebdSKirill A. Shutemov 				spin_unlock(ptl);
8826b251fc9SAndrea Arcangeli 				set = true;
8836b251fc9SAndrea Arcangeli 			}
8846b251fc9SAndrea Arcangeli 		} else
8856b251fc9SAndrea Arcangeli 			spin_unlock(ptl);
8863ea41e62SKirill A. Shutemov 		if (!set) {
8873ea41e62SKirill A. Shutemov 			pte_free(mm, pgtable);
8883ea41e62SKirill A. Shutemov 			put_huge_zero_page();
8893ea41e62SKirill A. Shutemov 		}
8906b251fc9SAndrea Arcangeli 		return ret;
89180371957SKirill A. Shutemov 	}
892077fcf11SAneesh Kumar K.V 	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
893077fcf11SAneesh Kumar K.V 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
89481ab4201SAndi Kleen 	if (unlikely(!page)) {
89581ab4201SAndi Kleen 		count_vm_event(THP_FAULT_FALLBACK);
896c0292554SKirill A. Shutemov 		return VM_FAULT_FALLBACK;
89781ab4201SAndi Kleen 	}
898230c92a8SAndrea Arcangeli 	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
899230c92a8SAndrea Arcangeli 					    flags);
90071e3aac0SAndrea Arcangeli }
90171e3aac0SAndrea Arcangeli 
902ae18d6dcSMatthew Wilcox static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
9035cad465dSMatthew Wilcox 		pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
9045cad465dSMatthew Wilcox {
9055cad465dSMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
9065cad465dSMatthew Wilcox 	pmd_t entry;
9075cad465dSMatthew Wilcox 	spinlock_t *ptl;
9085cad465dSMatthew Wilcox 
9095cad465dSMatthew Wilcox 	ptl = pmd_lock(mm, pmd);
9105cad465dSMatthew Wilcox 	if (pmd_none(*pmd)) {
9115cad465dSMatthew Wilcox 		entry = pmd_mkhuge(pfn_pmd(pfn, prot));
9125cad465dSMatthew Wilcox 		if (write) {
9135cad465dSMatthew Wilcox 			entry = pmd_mkyoung(pmd_mkdirty(entry));
9145cad465dSMatthew Wilcox 			entry = maybe_pmd_mkwrite(entry, vma);
9155cad465dSMatthew Wilcox 		}
9165cad465dSMatthew Wilcox 		set_pmd_at(mm, addr, pmd, entry);
9175cad465dSMatthew Wilcox 		update_mmu_cache_pmd(vma, addr, pmd);
9185cad465dSMatthew Wilcox 	}
9195cad465dSMatthew Wilcox 	spin_unlock(ptl);
9205cad465dSMatthew Wilcox }
9215cad465dSMatthew Wilcox 
9225cad465dSMatthew Wilcox int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
9235cad465dSMatthew Wilcox 			pmd_t *pmd, unsigned long pfn, bool write)
9245cad465dSMatthew Wilcox {
9255cad465dSMatthew Wilcox 	pgprot_t pgprot = vma->vm_page_prot;
9265cad465dSMatthew Wilcox 	/*
9275cad465dSMatthew Wilcox 	 * If we had pmd_special, we could avoid all these restrictions,
9285cad465dSMatthew Wilcox 	 * but we need to be consistent with PTEs and architectures that
9295cad465dSMatthew Wilcox 	 * can't support a 'special' bit.
9305cad465dSMatthew Wilcox 	 */
9315cad465dSMatthew Wilcox 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
9325cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
9335cad465dSMatthew Wilcox 						(VM_PFNMAP|VM_MIXEDMAP));
9345cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
9355cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
9365cad465dSMatthew Wilcox 
9375cad465dSMatthew Wilcox 	if (addr < vma->vm_start || addr >= vma->vm_end)
9385cad465dSMatthew Wilcox 		return VM_FAULT_SIGBUS;
9395cad465dSMatthew Wilcox 	if (track_pfn_insert(vma, &pgprot, pfn))
9405cad465dSMatthew Wilcox 		return VM_FAULT_SIGBUS;
941ae18d6dcSMatthew Wilcox 	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
942ae18d6dcSMatthew Wilcox 	return VM_FAULT_NOPAGE;
9435cad465dSMatthew Wilcox }
9445cad465dSMatthew Wilcox 
94571e3aac0SAndrea Arcangeli int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
94671e3aac0SAndrea Arcangeli 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
94771e3aac0SAndrea Arcangeli 		  struct vm_area_struct *vma)
94871e3aac0SAndrea Arcangeli {
949c4088ebdSKirill A. Shutemov 	spinlock_t *dst_ptl, *src_ptl;
95071e3aac0SAndrea Arcangeli 	struct page *src_page;
95171e3aac0SAndrea Arcangeli 	pmd_t pmd;
95271e3aac0SAndrea Arcangeli 	pgtable_t pgtable;
95371e3aac0SAndrea Arcangeli 	int ret;
95471e3aac0SAndrea Arcangeli 
95571e3aac0SAndrea Arcangeli 	ret = -ENOMEM;
95671e3aac0SAndrea Arcangeli 	pgtable = pte_alloc_one(dst_mm, addr);
95771e3aac0SAndrea Arcangeli 	if (unlikely(!pgtable))
95871e3aac0SAndrea Arcangeli 		goto out;
95971e3aac0SAndrea Arcangeli 
960c4088ebdSKirill A. Shutemov 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
961c4088ebdSKirill A. Shutemov 	src_ptl = pmd_lockptr(src_mm, src_pmd);
962c4088ebdSKirill A. Shutemov 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
96371e3aac0SAndrea Arcangeli 
96471e3aac0SAndrea Arcangeli 	ret = -EAGAIN;
96571e3aac0SAndrea Arcangeli 	pmd = *src_pmd;
96671e3aac0SAndrea Arcangeli 	if (unlikely(!pmd_trans_huge(pmd))) {
96771e3aac0SAndrea Arcangeli 		pte_free(dst_mm, pgtable);
96871e3aac0SAndrea Arcangeli 		goto out_unlock;
96971e3aac0SAndrea Arcangeli 	}
970fc9fe822SKirill A. Shutemov 	/*
971c4088ebdSKirill A. Shutemov 	 * When page table lock is held, the huge zero pmd should not be
972fc9fe822SKirill A. Shutemov 	 * under splitting since we don't split the page itself, only pmd to
973fc9fe822SKirill A. Shutemov 	 * a page table.
974fc9fe822SKirill A. Shutemov 	 */
975fc9fe822SKirill A. Shutemov 	if (is_huge_zero_pmd(pmd)) {
9765918d10aSKirill A. Shutemov 		struct page *zero_page;
97797ae1749SKirill A. Shutemov 		/*
97897ae1749SKirill A. Shutemov 		 * get_huge_zero_page() will never allocate a new page here,
97997ae1749SKirill A. Shutemov 		 * since we already have a zero page to copy. It just takes a
98097ae1749SKirill A. Shutemov 		 * reference.
98197ae1749SKirill A. Shutemov 		 */
9825918d10aSKirill A. Shutemov 		zero_page = get_huge_zero_page();
9836b251fc9SAndrea Arcangeli 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
9845918d10aSKirill A. Shutemov 				zero_page);
985fc9fe822SKirill A. Shutemov 		ret = 0;
986fc9fe822SKirill A. Shutemov 		goto out_unlock;
987fc9fe822SKirill A. Shutemov 	}
988de466bd6SMel Gorman 
98971e3aac0SAndrea Arcangeli 	src_page = pmd_page(pmd);
990309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
99171e3aac0SAndrea Arcangeli 	get_page(src_page);
99271e3aac0SAndrea Arcangeli 	page_dup_rmap(src_page);
99371e3aac0SAndrea Arcangeli 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
99471e3aac0SAndrea Arcangeli 
99571e3aac0SAndrea Arcangeli 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
99671e3aac0SAndrea Arcangeli 	pmd = pmd_mkold(pmd_wrprotect(pmd));
9976b0b50b0SAneesh Kumar K.V 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
99871e3aac0SAndrea Arcangeli 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
999e1f56c89SKirill A. Shutemov 	atomic_long_inc(&dst_mm->nr_ptes);
100071e3aac0SAndrea Arcangeli 
100171e3aac0SAndrea Arcangeli 	ret = 0;
100271e3aac0SAndrea Arcangeli out_unlock:
1003c4088ebdSKirill A. Shutemov 	spin_unlock(src_ptl);
1004c4088ebdSKirill A. Shutemov 	spin_unlock(dst_ptl);
100571e3aac0SAndrea Arcangeli out:
100671e3aac0SAndrea Arcangeli 	return ret;
100771e3aac0SAndrea Arcangeli }
100871e3aac0SAndrea Arcangeli 
1009a1dd450bSWill Deacon void huge_pmd_set_accessed(struct mm_struct *mm,
1010a1dd450bSWill Deacon 			   struct vm_area_struct *vma,
1011a1dd450bSWill Deacon 			   unsigned long address,
1012a1dd450bSWill Deacon 			   pmd_t *pmd, pmd_t orig_pmd,
1013a1dd450bSWill Deacon 			   int dirty)
1014a1dd450bSWill Deacon {
1015c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
1016a1dd450bSWill Deacon 	pmd_t entry;
1017a1dd450bSWill Deacon 	unsigned long haddr;
1018a1dd450bSWill Deacon 
1019c4088ebdSKirill A. Shutemov 	ptl = pmd_lock(mm, pmd);
1020a1dd450bSWill Deacon 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
1021a1dd450bSWill Deacon 		goto unlock;
1022a1dd450bSWill Deacon 
1023a1dd450bSWill Deacon 	entry = pmd_mkyoung(orig_pmd);
1024a1dd450bSWill Deacon 	haddr = address & HPAGE_PMD_MASK;
1025a1dd450bSWill Deacon 	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
1026a1dd450bSWill Deacon 		update_mmu_cache_pmd(vma, address, pmd);
1027a1dd450bSWill Deacon 
1028a1dd450bSWill Deacon unlock:
1029c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
1030a1dd450bSWill Deacon }
1031a1dd450bSWill Deacon 
103271e3aac0SAndrea Arcangeli static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
103371e3aac0SAndrea Arcangeli 					struct vm_area_struct *vma,
103471e3aac0SAndrea Arcangeli 					unsigned long address,
103571e3aac0SAndrea Arcangeli 					pmd_t *pmd, pmd_t orig_pmd,
103671e3aac0SAndrea Arcangeli 					struct page *page,
103771e3aac0SAndrea Arcangeli 					unsigned long haddr)
103871e3aac0SAndrea Arcangeli {
103900501b53SJohannes Weiner 	struct mem_cgroup *memcg;
1040c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
104171e3aac0SAndrea Arcangeli 	pgtable_t pgtable;
104271e3aac0SAndrea Arcangeli 	pmd_t _pmd;
104371e3aac0SAndrea Arcangeli 	int ret = 0, i;
104471e3aac0SAndrea Arcangeli 	struct page **pages;
10452ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
10462ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
104771e3aac0SAndrea Arcangeli 
104871e3aac0SAndrea Arcangeli 	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
104971e3aac0SAndrea Arcangeli 			GFP_KERNEL);
105071e3aac0SAndrea Arcangeli 	if (unlikely(!pages)) {
105171e3aac0SAndrea Arcangeli 		ret |= VM_FAULT_OOM;
105271e3aac0SAndrea Arcangeli 		goto out;
105371e3aac0SAndrea Arcangeli 	}
105471e3aac0SAndrea Arcangeli 
105571e3aac0SAndrea Arcangeli 	for (i = 0; i < HPAGE_PMD_NR; i++) {
1056cc5d462fSAndi Kleen 		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
1057cc5d462fSAndi Kleen 					       __GFP_OTHER_NODE,
105819ee151eSAndi Kleen 					       vma, address, page_to_nid(page));
1059b9bbfbe3SAndrea Arcangeli 		if (unlikely(!pages[i] ||
106000501b53SJohannes Weiner 			     mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
1061f627c2f5SKirill A. Shutemov 						   &memcg, false))) {
1062b9bbfbe3SAndrea Arcangeli 			if (pages[i])
106371e3aac0SAndrea Arcangeli 				put_page(pages[i]);
1064b9bbfbe3SAndrea Arcangeli 			while (--i >= 0) {
106500501b53SJohannes Weiner 				memcg = (void *)page_private(pages[i]);
106600501b53SJohannes Weiner 				set_page_private(pages[i], 0);
1067f627c2f5SKirill A. Shutemov 				mem_cgroup_cancel_charge(pages[i], memcg,
1068f627c2f5SKirill A. Shutemov 						false);
1069b9bbfbe3SAndrea Arcangeli 				put_page(pages[i]);
1070b9bbfbe3SAndrea Arcangeli 			}
107171e3aac0SAndrea Arcangeli 			kfree(pages);
107271e3aac0SAndrea Arcangeli 			ret |= VM_FAULT_OOM;
107371e3aac0SAndrea Arcangeli 			goto out;
107471e3aac0SAndrea Arcangeli 		}
107500501b53SJohannes Weiner 		set_page_private(pages[i], (unsigned long)memcg);
107671e3aac0SAndrea Arcangeli 	}
107771e3aac0SAndrea Arcangeli 
107871e3aac0SAndrea Arcangeli 	for (i = 0; i < HPAGE_PMD_NR; i++) {
107971e3aac0SAndrea Arcangeli 		copy_user_highpage(pages[i], page + i,
10800089e485SHillf Danton 				   haddr + PAGE_SIZE * i, vma);
108171e3aac0SAndrea Arcangeli 		__SetPageUptodate(pages[i]);
108271e3aac0SAndrea Arcangeli 		cond_resched();
108371e3aac0SAndrea Arcangeli 	}
108471e3aac0SAndrea Arcangeli 
10852ec74c3eSSagi Grimberg 	mmun_start = haddr;
10862ec74c3eSSagi Grimberg 	mmun_end   = haddr + HPAGE_PMD_SIZE;
10872ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
10882ec74c3eSSagi Grimberg 
1089c4088ebdSKirill A. Shutemov 	ptl = pmd_lock(mm, pmd);
109071e3aac0SAndrea Arcangeli 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
109171e3aac0SAndrea Arcangeli 		goto out_free_pages;
1092309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
109371e3aac0SAndrea Arcangeli 
10948809aa2dSAneesh Kumar K.V 	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
109571e3aac0SAndrea Arcangeli 	/* leave pmd empty until pte is filled */
109671e3aac0SAndrea Arcangeli 
10976b0b50b0SAneesh Kumar K.V 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
109871e3aac0SAndrea Arcangeli 	pmd_populate(mm, &_pmd, pgtable);
109971e3aac0SAndrea Arcangeli 
110071e3aac0SAndrea Arcangeli 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
110171e3aac0SAndrea Arcangeli 		pte_t *pte, entry;
110271e3aac0SAndrea Arcangeli 		entry = mk_pte(pages[i], vma->vm_page_prot);
110371e3aac0SAndrea Arcangeli 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
110400501b53SJohannes Weiner 		memcg = (void *)page_private(pages[i]);
110500501b53SJohannes Weiner 		set_page_private(pages[i], 0);
1106d281ee61SKirill A. Shutemov 		page_add_new_anon_rmap(pages[i], vma, haddr, false);
1107f627c2f5SKirill A. Shutemov 		mem_cgroup_commit_charge(pages[i], memcg, false, false);
110800501b53SJohannes Weiner 		lru_cache_add_active_or_unevictable(pages[i], vma);
110971e3aac0SAndrea Arcangeli 		pte = pte_offset_map(&_pmd, haddr);
111071e3aac0SAndrea Arcangeli 		VM_BUG_ON(!pte_none(*pte));
111171e3aac0SAndrea Arcangeli 		set_pte_at(mm, haddr, pte, entry);
111271e3aac0SAndrea Arcangeli 		pte_unmap(pte);
111371e3aac0SAndrea Arcangeli 	}
111471e3aac0SAndrea Arcangeli 	kfree(pages);
111571e3aac0SAndrea Arcangeli 
111671e3aac0SAndrea Arcangeli 	smp_wmb(); /* make pte visible before pmd */
111771e3aac0SAndrea Arcangeli 	pmd_populate(mm, pmd, pgtable);
1118d281ee61SKirill A. Shutemov 	page_remove_rmap(page, true);
1119c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
112071e3aac0SAndrea Arcangeli 
11212ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
11222ec74c3eSSagi Grimberg 
112371e3aac0SAndrea Arcangeli 	ret |= VM_FAULT_WRITE;
112471e3aac0SAndrea Arcangeli 	put_page(page);
112571e3aac0SAndrea Arcangeli 
112671e3aac0SAndrea Arcangeli out:
112771e3aac0SAndrea Arcangeli 	return ret;
112871e3aac0SAndrea Arcangeli 
112971e3aac0SAndrea Arcangeli out_free_pages:
1130c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
11312ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1132b9bbfbe3SAndrea Arcangeli 	for (i = 0; i < HPAGE_PMD_NR; i++) {
113300501b53SJohannes Weiner 		memcg = (void *)page_private(pages[i]);
113400501b53SJohannes Weiner 		set_page_private(pages[i], 0);
1135f627c2f5SKirill A. Shutemov 		mem_cgroup_cancel_charge(pages[i], memcg, false);
113671e3aac0SAndrea Arcangeli 		put_page(pages[i]);
1137b9bbfbe3SAndrea Arcangeli 	}
113871e3aac0SAndrea Arcangeli 	kfree(pages);
113971e3aac0SAndrea Arcangeli 	goto out;
114071e3aac0SAndrea Arcangeli }
114171e3aac0SAndrea Arcangeli 
114271e3aac0SAndrea Arcangeli int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
114371e3aac0SAndrea Arcangeli 			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
114471e3aac0SAndrea Arcangeli {
1145c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
114671e3aac0SAndrea Arcangeli 	int ret = 0;
114793b4796dSKirill A. Shutemov 	struct page *page = NULL, *new_page;
114800501b53SJohannes Weiner 	struct mem_cgroup *memcg;
114971e3aac0SAndrea Arcangeli 	unsigned long haddr;
11502ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
11512ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
11523b363692SMichal Hocko 	gfp_t huge_gfp;			/* for allocation and charge */
115371e3aac0SAndrea Arcangeli 
1154c4088ebdSKirill A. Shutemov 	ptl = pmd_lockptr(mm, pmd);
115581d1b09cSSasha Levin 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
115693b4796dSKirill A. Shutemov 	haddr = address & HPAGE_PMD_MASK;
115793b4796dSKirill A. Shutemov 	if (is_huge_zero_pmd(orig_pmd))
115893b4796dSKirill A. Shutemov 		goto alloc;
1159c4088ebdSKirill A. Shutemov 	spin_lock(ptl);
116071e3aac0SAndrea Arcangeli 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
116171e3aac0SAndrea Arcangeli 		goto out_unlock;
116271e3aac0SAndrea Arcangeli 
116371e3aac0SAndrea Arcangeli 	page = pmd_page(orig_pmd);
1164309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
11651f25fe20SKirill A. Shutemov 	/*
11661f25fe20SKirill A. Shutemov 	 * We can only reuse the page if nobody else maps the huge page or it's
11671f25fe20SKirill A. Shutemov 	 * part. We can do it by checking page_mapcount() on each sub-page, but
11681f25fe20SKirill A. Shutemov 	 * it's expensive.
11691f25fe20SKirill A. Shutemov 	 * The cheaper way is to check page_count() to be equal 1: every
11701f25fe20SKirill A. Shutemov 	 * mapcount takes page reference reference, so this way we can
11711f25fe20SKirill A. Shutemov 	 * guarantee, that the PMD is the only mapping.
11721f25fe20SKirill A. Shutemov 	 * This can give false negative if somebody pinned the page, but that's
11731f25fe20SKirill A. Shutemov 	 * fine.
11741f25fe20SKirill A. Shutemov 	 */
11751f25fe20SKirill A. Shutemov 	if (page_mapcount(page) == 1 && page_count(page) == 1) {
117671e3aac0SAndrea Arcangeli 		pmd_t entry;
117771e3aac0SAndrea Arcangeli 		entry = pmd_mkyoung(orig_pmd);
117871e3aac0SAndrea Arcangeli 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
117971e3aac0SAndrea Arcangeli 		if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
1180b113da65SDavid Miller 			update_mmu_cache_pmd(vma, address, pmd);
118171e3aac0SAndrea Arcangeli 		ret |= VM_FAULT_WRITE;
118271e3aac0SAndrea Arcangeli 		goto out_unlock;
118371e3aac0SAndrea Arcangeli 	}
1184ddc58f27SKirill A. Shutemov 	get_page(page);
1185c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
118693b4796dSKirill A. Shutemov alloc:
118771e3aac0SAndrea Arcangeli 	if (transparent_hugepage_enabled(vma) &&
1188077fcf11SAneesh Kumar K.V 	    !transparent_hugepage_debug_cow()) {
11893b363692SMichal Hocko 		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
11903b363692SMichal Hocko 		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1191077fcf11SAneesh Kumar K.V 	} else
119271e3aac0SAndrea Arcangeli 		new_page = NULL;
119371e3aac0SAndrea Arcangeli 
119471e3aac0SAndrea Arcangeli 	if (unlikely(!new_page)) {
1195eecc1e42SHugh Dickins 		if (!page) {
119678ddc534SKirill A. Shutemov 			split_huge_pmd(vma, pmd, address);
1197e9b71ca9SKirill A. Shutemov 			ret |= VM_FAULT_FALLBACK;
119893b4796dSKirill A. Shutemov 		} else {
119971e3aac0SAndrea Arcangeli 			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
120071e3aac0SAndrea Arcangeli 					pmd, orig_pmd, page, haddr);
12019845cbbdSKirill A. Shutemov 			if (ret & VM_FAULT_OOM) {
120278ddc534SKirill A. Shutemov 				split_huge_pmd(vma, pmd, address);
12039845cbbdSKirill A. Shutemov 				ret |= VM_FAULT_FALLBACK;
12049845cbbdSKirill A. Shutemov 			}
1205ddc58f27SKirill A. Shutemov 			put_page(page);
120693b4796dSKirill A. Shutemov 		}
120717766ddeSDavid Rientjes 		count_vm_event(THP_FAULT_FALLBACK);
120871e3aac0SAndrea Arcangeli 		goto out;
120971e3aac0SAndrea Arcangeli 	}
121071e3aac0SAndrea Arcangeli 
1211f627c2f5SKirill A. Shutemov 	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
1212f627c2f5SKirill A. Shutemov 					   true))) {
1213b9bbfbe3SAndrea Arcangeli 		put_page(new_page);
121493b4796dSKirill A. Shutemov 		if (page) {
121578ddc534SKirill A. Shutemov 			split_huge_pmd(vma, pmd, address);
1216ddc58f27SKirill A. Shutemov 			put_page(page);
12179845cbbdSKirill A. Shutemov 		} else
121878ddc534SKirill A. Shutemov 			split_huge_pmd(vma, pmd, address);
12199845cbbdSKirill A. Shutemov 		ret |= VM_FAULT_FALLBACK;
122017766ddeSDavid Rientjes 		count_vm_event(THP_FAULT_FALLBACK);
1221b9bbfbe3SAndrea Arcangeli 		goto out;
1222b9bbfbe3SAndrea Arcangeli 	}
1223b9bbfbe3SAndrea Arcangeli 
122417766ddeSDavid Rientjes 	count_vm_event(THP_FAULT_ALLOC);
122517766ddeSDavid Rientjes 
1226eecc1e42SHugh Dickins 	if (!page)
122793b4796dSKirill A. Shutemov 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
122893b4796dSKirill A. Shutemov 	else
122971e3aac0SAndrea Arcangeli 		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
123071e3aac0SAndrea Arcangeli 	__SetPageUptodate(new_page);
123171e3aac0SAndrea Arcangeli 
12322ec74c3eSSagi Grimberg 	mmun_start = haddr;
12332ec74c3eSSagi Grimberg 	mmun_end   = haddr + HPAGE_PMD_SIZE;
12342ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
12352ec74c3eSSagi Grimberg 
1236c4088ebdSKirill A. Shutemov 	spin_lock(ptl);
123793b4796dSKirill A. Shutemov 	if (page)
1238ddc58f27SKirill A. Shutemov 		put_page(page);
1239b9bbfbe3SAndrea Arcangeli 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1240c4088ebdSKirill A. Shutemov 		spin_unlock(ptl);
1241f627c2f5SKirill A. Shutemov 		mem_cgroup_cancel_charge(new_page, memcg, true);
124271e3aac0SAndrea Arcangeli 		put_page(new_page);
12432ec74c3eSSagi Grimberg 		goto out_mn;
1244b9bbfbe3SAndrea Arcangeli 	} else {
124571e3aac0SAndrea Arcangeli 		pmd_t entry;
12463122359aSKirill A. Shutemov 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
12473122359aSKirill A. Shutemov 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
12488809aa2dSAneesh Kumar K.V 		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1249d281ee61SKirill A. Shutemov 		page_add_new_anon_rmap(new_page, vma, haddr, true);
1250f627c2f5SKirill A. Shutemov 		mem_cgroup_commit_charge(new_page, memcg, false, true);
125100501b53SJohannes Weiner 		lru_cache_add_active_or_unevictable(new_page, vma);
125271e3aac0SAndrea Arcangeli 		set_pmd_at(mm, haddr, pmd, entry);
1253b113da65SDavid Miller 		update_mmu_cache_pmd(vma, address, pmd);
1254eecc1e42SHugh Dickins 		if (!page) {
125593b4796dSKirill A. Shutemov 			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
125697ae1749SKirill A. Shutemov 			put_huge_zero_page();
125797ae1749SKirill A. Shutemov 		} else {
1258309381feSSasha Levin 			VM_BUG_ON_PAGE(!PageHead(page), page);
1259d281ee61SKirill A. Shutemov 			page_remove_rmap(page, true);
126071e3aac0SAndrea Arcangeli 			put_page(page);
126193b4796dSKirill A. Shutemov 		}
126271e3aac0SAndrea Arcangeli 		ret |= VM_FAULT_WRITE;
126371e3aac0SAndrea Arcangeli 	}
1264c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
12652ec74c3eSSagi Grimberg out_mn:
12662ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
12672ec74c3eSSagi Grimberg out:
12682ec74c3eSSagi Grimberg 	return ret;
126971e3aac0SAndrea Arcangeli out_unlock:
1270c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
127171e3aac0SAndrea Arcangeli 	return ret;
127271e3aac0SAndrea Arcangeli }
127371e3aac0SAndrea Arcangeli 
1274b676b293SDavid Rientjes struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
127571e3aac0SAndrea Arcangeli 				   unsigned long addr,
127671e3aac0SAndrea Arcangeli 				   pmd_t *pmd,
127771e3aac0SAndrea Arcangeli 				   unsigned int flags)
127871e3aac0SAndrea Arcangeli {
1279b676b293SDavid Rientjes 	struct mm_struct *mm = vma->vm_mm;
128071e3aac0SAndrea Arcangeli 	struct page *page = NULL;
128171e3aac0SAndrea Arcangeli 
1282c4088ebdSKirill A. Shutemov 	assert_spin_locked(pmd_lockptr(mm, pmd));
128371e3aac0SAndrea Arcangeli 
128471e3aac0SAndrea Arcangeli 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
128571e3aac0SAndrea Arcangeli 		goto out;
128671e3aac0SAndrea Arcangeli 
128785facf25SKirill A. Shutemov 	/* Avoid dumping huge zero page */
128885facf25SKirill A. Shutemov 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
128985facf25SKirill A. Shutemov 		return ERR_PTR(-EFAULT);
129085facf25SKirill A. Shutemov 
12912b4847e7SMel Gorman 	/* Full NUMA hinting faults to serialise migration in fault paths */
12928a0516edSMel Gorman 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
12932b4847e7SMel Gorman 		goto out;
12942b4847e7SMel Gorman 
129571e3aac0SAndrea Arcangeli 	page = pmd_page(*pmd);
1296309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
129771e3aac0SAndrea Arcangeli 	if (flags & FOLL_TOUCH) {
129871e3aac0SAndrea Arcangeli 		pmd_t _pmd;
129971e3aac0SAndrea Arcangeli 		/*
130071e3aac0SAndrea Arcangeli 		 * We should set the dirty bit only for FOLL_WRITE but
130171e3aac0SAndrea Arcangeli 		 * for now the dirty bit in the pmd is meaningless.
130271e3aac0SAndrea Arcangeli 		 * And if the dirty bit will become meaningful and
130371e3aac0SAndrea Arcangeli 		 * we'll only set it with FOLL_WRITE, an atomic
130471e3aac0SAndrea Arcangeli 		 * set_bit will be required on the pmd to set the
130571e3aac0SAndrea Arcangeli 		 * young bit, instead of the current set_pmd_at.
130671e3aac0SAndrea Arcangeli 		 */
130771e3aac0SAndrea Arcangeli 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
13088663890aSAneesh Kumar K.V 		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
13098663890aSAneesh Kumar K.V 					  pmd, _pmd,  1))
13108663890aSAneesh Kumar K.V 			update_mmu_cache_pmd(vma, addr, pmd);
131171e3aac0SAndrea Arcangeli 	}
1312de60f5f1SEric B Munson 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1313b676b293SDavid Rientjes 		if (page->mapping && trylock_page(page)) {
1314b676b293SDavid Rientjes 			lru_add_drain();
1315b676b293SDavid Rientjes 			if (page->mapping)
1316b676b293SDavid Rientjes 				mlock_vma_page(page);
1317b676b293SDavid Rientjes 			unlock_page(page);
1318b676b293SDavid Rientjes 		}
1319b676b293SDavid Rientjes 	}
132071e3aac0SAndrea Arcangeli 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1321309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageCompound(page), page);
132271e3aac0SAndrea Arcangeli 	if (flags & FOLL_GET)
1323ddc58f27SKirill A. Shutemov 		get_page(page);
132471e3aac0SAndrea Arcangeli 
132571e3aac0SAndrea Arcangeli out:
132671e3aac0SAndrea Arcangeli 	return page;
132771e3aac0SAndrea Arcangeli }
132871e3aac0SAndrea Arcangeli 
1329d10e63f2SMel Gorman /* NUMA hinting page fault entry point for trans huge pmds */
13304daae3b4SMel Gorman int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
13314daae3b4SMel Gorman 				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1332d10e63f2SMel Gorman {
1333c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
1334b8916634SMel Gorman 	struct anon_vma *anon_vma = NULL;
1335b32967ffSMel Gorman 	struct page *page;
1336d10e63f2SMel Gorman 	unsigned long haddr = addr & HPAGE_PMD_MASK;
13378191acbdSMel Gorman 	int page_nid = -1, this_nid = numa_node_id();
133890572890SPeter Zijlstra 	int target_nid, last_cpupid = -1;
13398191acbdSMel Gorman 	bool page_locked;
13408191acbdSMel Gorman 	bool migrated = false;
1341b191f9b1SMel Gorman 	bool was_writable;
13426688cc05SPeter Zijlstra 	int flags = 0;
1343d10e63f2SMel Gorman 
1344c0e7cad9SMel Gorman 	/* A PROT_NONE fault should not end up here */
1345c0e7cad9SMel Gorman 	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1346c0e7cad9SMel Gorman 
1347c4088ebdSKirill A. Shutemov 	ptl = pmd_lock(mm, pmdp);
1348d10e63f2SMel Gorman 	if (unlikely(!pmd_same(pmd, *pmdp)))
1349d10e63f2SMel Gorman 		goto out_unlock;
1350d10e63f2SMel Gorman 
1351de466bd6SMel Gorman 	/*
1352de466bd6SMel Gorman 	 * If there are potential migrations, wait for completion and retry
1353de466bd6SMel Gorman 	 * without disrupting NUMA hinting information. Do not relock and
1354de466bd6SMel Gorman 	 * check_same as the page may no longer be mapped.
1355de466bd6SMel Gorman 	 */
1356de466bd6SMel Gorman 	if (unlikely(pmd_trans_migrating(*pmdp))) {
13575d833062SMel Gorman 		page = pmd_page(*pmdp);
1358de466bd6SMel Gorman 		spin_unlock(ptl);
13595d833062SMel Gorman 		wait_on_page_locked(page);
1360de466bd6SMel Gorman 		goto out;
1361de466bd6SMel Gorman 	}
1362de466bd6SMel Gorman 
1363d10e63f2SMel Gorman 	page = pmd_page(pmd);
1364a1a46184SMel Gorman 	BUG_ON(is_huge_zero_page(page));
13658191acbdSMel Gorman 	page_nid = page_to_nid(page);
136690572890SPeter Zijlstra 	last_cpupid = page_cpupid_last(page);
136703c5a6e1SMel Gorman 	count_vm_numa_event(NUMA_HINT_FAULTS);
136804bb2f94SRik van Riel 	if (page_nid == this_nid) {
136903c5a6e1SMel Gorman 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
137004bb2f94SRik van Riel 		flags |= TNF_FAULT_LOCAL;
137104bb2f94SRik van Riel 	}
13724daae3b4SMel Gorman 
1373bea66fbdSMel Gorman 	/* See similar comment in do_numa_page for explanation */
1374bea66fbdSMel Gorman 	if (!(vma->vm_flags & VM_WRITE))
13756688cc05SPeter Zijlstra 		flags |= TNF_NO_GROUP;
13766688cc05SPeter Zijlstra 
13776688cc05SPeter Zijlstra 	/*
1378ff9042b1SMel Gorman 	 * Acquire the page lock to serialise THP migrations but avoid dropping
1379ff9042b1SMel Gorman 	 * page_table_lock if at all possible
1380ff9042b1SMel Gorman 	 */
1381b8916634SMel Gorman 	page_locked = trylock_page(page);
1382b8916634SMel Gorman 	target_nid = mpol_misplaced(page, vma, haddr);
1383b8916634SMel Gorman 	if (target_nid == -1) {
1384b8916634SMel Gorman 		/* If the page was locked, there are no parallel migrations */
1385a54a407fSMel Gorman 		if (page_locked)
1386b8916634SMel Gorman 			goto clear_pmdnuma;
13872b4847e7SMel Gorman 	}
1388cbee9f88SPeter Zijlstra 
1389de466bd6SMel Gorman 	/* Migration could have started since the pmd_trans_migrating check */
13902b4847e7SMel Gorman 	if (!page_locked) {
1391c4088ebdSKirill A. Shutemov 		spin_unlock(ptl);
1392b8916634SMel Gorman 		wait_on_page_locked(page);
1393a54a407fSMel Gorman 		page_nid = -1;
1394b8916634SMel Gorman 		goto out;
1395b8916634SMel Gorman 	}
1396b8916634SMel Gorman 
13972b4847e7SMel Gorman 	/*
13982b4847e7SMel Gorman 	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
13992b4847e7SMel Gorman 	 * to serialises splits
14002b4847e7SMel Gorman 	 */
1401b8916634SMel Gorman 	get_page(page);
1402c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
1403b8916634SMel Gorman 	anon_vma = page_lock_anon_vma_read(page);
1404b32967ffSMel Gorman 
1405c69307d5SPeter Zijlstra 	/* Confirm the PMD did not change while page_table_lock was released */
1406c4088ebdSKirill A. Shutemov 	spin_lock(ptl);
1407b32967ffSMel Gorman 	if (unlikely(!pmd_same(pmd, *pmdp))) {
1408b32967ffSMel Gorman 		unlock_page(page);
1409b32967ffSMel Gorman 		put_page(page);
1410a54a407fSMel Gorman 		page_nid = -1;
1411b32967ffSMel Gorman 		goto out_unlock;
1412b32967ffSMel Gorman 	}
1413ff9042b1SMel Gorman 
1414c3a489caSMel Gorman 	/* Bail if we fail to protect against THP splits for any reason */
1415c3a489caSMel Gorman 	if (unlikely(!anon_vma)) {
1416c3a489caSMel Gorman 		put_page(page);
1417c3a489caSMel Gorman 		page_nid = -1;
1418c3a489caSMel Gorman 		goto clear_pmdnuma;
1419c3a489caSMel Gorman 	}
1420c3a489caSMel Gorman 
1421a54a407fSMel Gorman 	/*
1422a54a407fSMel Gorman 	 * Migrate the THP to the requested node, returns with page unlocked
14238a0516edSMel Gorman 	 * and access rights restored.
1424a54a407fSMel Gorman 	 */
1425c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
1426b32967ffSMel Gorman 	migrated = migrate_misplaced_transhuge_page(mm, vma,
1427340ef390SHugh Dickins 				pmdp, pmd, addr, page, target_nid);
14286688cc05SPeter Zijlstra 	if (migrated) {
14296688cc05SPeter Zijlstra 		flags |= TNF_MIGRATED;
14308191acbdSMel Gorman 		page_nid = target_nid;
1431074c2381SMel Gorman 	} else
1432074c2381SMel Gorman 		flags |= TNF_MIGRATE_FAIL;
1433b32967ffSMel Gorman 
14348191acbdSMel Gorman 	goto out;
14354daae3b4SMel Gorman clear_pmdnuma:
1436a54a407fSMel Gorman 	BUG_ON(!PageLocked(page));
1437b191f9b1SMel Gorman 	was_writable = pmd_write(pmd);
14384d942466SMel Gorman 	pmd = pmd_modify(pmd, vma->vm_page_prot);
1439b7b04004SMel Gorman 	pmd = pmd_mkyoung(pmd);
1440b191f9b1SMel Gorman 	if (was_writable)
1441b191f9b1SMel Gorman 		pmd = pmd_mkwrite(pmd);
1442d10e63f2SMel Gorman 	set_pmd_at(mm, haddr, pmdp, pmd);
1443d10e63f2SMel Gorman 	update_mmu_cache_pmd(vma, addr, pmdp);
1444a54a407fSMel Gorman 	unlock_page(page);
1445d10e63f2SMel Gorman out_unlock:
1446c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
1447b8916634SMel Gorman 
1448b8916634SMel Gorman out:
1449b8916634SMel Gorman 	if (anon_vma)
1450b8916634SMel Gorman 		page_unlock_anon_vma_read(anon_vma);
1451b8916634SMel Gorman 
14528191acbdSMel Gorman 	if (page_nid != -1)
14536688cc05SPeter Zijlstra 		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
14548191acbdSMel Gorman 
1455d10e63f2SMel Gorman 	return 0;
1456d10e63f2SMel Gorman }
1457d10e63f2SMel Gorman 
145871e3aac0SAndrea Arcangeli int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1459f21760b1SShaohua Li 		 pmd_t *pmd, unsigned long addr)
146071e3aac0SAndrea Arcangeli {
1461f5c8ad47SDavid Miller 	pmd_t orig_pmd;
1462da146769SKirill A. Shutemov 	spinlock_t *ptl;
1463da146769SKirill A. Shutemov 
1464*4b471e88SKirill A. Shutemov 	if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
1465da146769SKirill A. Shutemov 		return 0;
1466a6bf2bb0SAneesh Kumar K.V 	/*
1467a6bf2bb0SAneesh Kumar K.V 	 * For architectures like ppc64 we look at deposited pgtable
14688809aa2dSAneesh Kumar K.V 	 * when calling pmdp_huge_get_and_clear. So do the
1469a6bf2bb0SAneesh Kumar K.V 	 * pgtable_trans_huge_withdraw after finishing pmdp related
1470a6bf2bb0SAneesh Kumar K.V 	 * operations.
1471a6bf2bb0SAneesh Kumar K.V 	 */
14728809aa2dSAneesh Kumar K.V 	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1473fcbe08d6SMartin Schwidefsky 			tlb->fullmm);
1474f21760b1SShaohua Li 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
14754897c765SMatthew Wilcox 	if (vma_is_dax(vma)) {
14764897c765SMatthew Wilcox 		spin_unlock(ptl);
1477da146769SKirill A. Shutemov 		if (is_huge_zero_pmd(orig_pmd))
1478da146769SKirill A. Shutemov 			put_huge_zero_page();
1479da146769SKirill A. Shutemov 	} else if (is_huge_zero_pmd(orig_pmd)) {
1480da146769SKirill A. Shutemov 		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1481e1f56c89SKirill A. Shutemov 		atomic_long_dec(&tlb->mm->nr_ptes);
1482bf929152SKirill A. Shutemov 		spin_unlock(ptl);
148397ae1749SKirill A. Shutemov 		put_huge_zero_page();
1484479f0abbSKirill A. Shutemov 	} else {
14854897c765SMatthew Wilcox 		struct page *page = pmd_page(orig_pmd);
1486d281ee61SKirill A. Shutemov 		page_remove_rmap(page, true);
1487309381feSSasha Levin 		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
148871e3aac0SAndrea Arcangeli 		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1489309381feSSasha Levin 		VM_BUG_ON_PAGE(!PageHead(page), page);
1490da146769SKirill A. Shutemov 		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1491e1f56c89SKirill A. Shutemov 		atomic_long_dec(&tlb->mm->nr_ptes);
1492bf929152SKirill A. Shutemov 		spin_unlock(ptl);
149371e3aac0SAndrea Arcangeli 		tlb_remove_page(tlb, page);
1494479f0abbSKirill A. Shutemov 	}
1495da146769SKirill A. Shutemov 	return 1;
149671e3aac0SAndrea Arcangeli }
149771e3aac0SAndrea Arcangeli 
1498*4b471e88SKirill A. Shutemov bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
149937a1c49aSAndrea Arcangeli 		  unsigned long old_addr,
150037a1c49aSAndrea Arcangeli 		  unsigned long new_addr, unsigned long old_end,
150137a1c49aSAndrea Arcangeli 		  pmd_t *old_pmd, pmd_t *new_pmd)
150237a1c49aSAndrea Arcangeli {
1503bf929152SKirill A. Shutemov 	spinlock_t *old_ptl, *new_ptl;
150437a1c49aSAndrea Arcangeli 	pmd_t pmd;
150537a1c49aSAndrea Arcangeli 
150637a1c49aSAndrea Arcangeli 	struct mm_struct *mm = vma->vm_mm;
150737a1c49aSAndrea Arcangeli 
150837a1c49aSAndrea Arcangeli 	if ((old_addr & ~HPAGE_PMD_MASK) ||
150937a1c49aSAndrea Arcangeli 	    (new_addr & ~HPAGE_PMD_MASK) ||
151037a1c49aSAndrea Arcangeli 	    old_end - old_addr < HPAGE_PMD_SIZE ||
151137a1c49aSAndrea Arcangeli 	    (new_vma->vm_flags & VM_NOHUGEPAGE))
1512*4b471e88SKirill A. Shutemov 		return false;
151337a1c49aSAndrea Arcangeli 
151437a1c49aSAndrea Arcangeli 	/*
151537a1c49aSAndrea Arcangeli 	 * The destination pmd shouldn't be established, free_pgtables()
151637a1c49aSAndrea Arcangeli 	 * should have release it.
151737a1c49aSAndrea Arcangeli 	 */
151837a1c49aSAndrea Arcangeli 	if (WARN_ON(!pmd_none(*new_pmd))) {
151937a1c49aSAndrea Arcangeli 		VM_BUG_ON(pmd_trans_huge(*new_pmd));
1520*4b471e88SKirill A. Shutemov 		return false;
152137a1c49aSAndrea Arcangeli 	}
152237a1c49aSAndrea Arcangeli 
1523bf929152SKirill A. Shutemov 	/*
1524bf929152SKirill A. Shutemov 	 * We don't have to worry about the ordering of src and dst
1525bf929152SKirill A. Shutemov 	 * ptlocks because exclusive mmap_sem prevents deadlock.
1526bf929152SKirill A. Shutemov 	 */
1527*4b471e88SKirill A. Shutemov 	if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
1528bf929152SKirill A. Shutemov 		new_ptl = pmd_lockptr(mm, new_pmd);
1529bf929152SKirill A. Shutemov 		if (new_ptl != old_ptl)
1530bf929152SKirill A. Shutemov 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
15318809aa2dSAneesh Kumar K.V 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
153237a1c49aSAndrea Arcangeli 		VM_BUG_ON(!pmd_none(*new_pmd));
15333592806cSKirill A. Shutemov 
1534b3084f4dSAneesh Kumar K.V 		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
1535b3084f4dSAneesh Kumar K.V 			pgtable_t pgtable;
15363592806cSKirill A. Shutemov 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
15373592806cSKirill A. Shutemov 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
15383592806cSKirill A. Shutemov 		}
1539b3084f4dSAneesh Kumar K.V 		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1540b3084f4dSAneesh Kumar K.V 		if (new_ptl != old_ptl)
1541b3084f4dSAneesh Kumar K.V 			spin_unlock(new_ptl);
1542bf929152SKirill A. Shutemov 		spin_unlock(old_ptl);
1543*4b471e88SKirill A. Shutemov 		return true;
154437a1c49aSAndrea Arcangeli 	}
1545*4b471e88SKirill A. Shutemov 	return false;
154637a1c49aSAndrea Arcangeli }
154737a1c49aSAndrea Arcangeli 
1548f123d74aSMel Gorman /*
1549f123d74aSMel Gorman  * Returns
1550f123d74aSMel Gorman  *  - 0 if PMD could not be locked
1551f123d74aSMel Gorman  *  - 1 if PMD was locked but protections unchange and TLB flush unnecessary
1552f123d74aSMel Gorman  *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
1553f123d74aSMel Gorman  */
1554cd7548abSJohannes Weiner int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1555e944fd67SMel Gorman 		unsigned long addr, pgprot_t newprot, int prot_numa)
1556cd7548abSJohannes Weiner {
1557cd7548abSJohannes Weiner 	struct mm_struct *mm = vma->vm_mm;
1558bf929152SKirill A. Shutemov 	spinlock_t *ptl;
1559cd7548abSJohannes Weiner 	int ret = 0;
1560cd7548abSJohannes Weiner 
1561*4b471e88SKirill A. Shutemov 	if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
1562cd7548abSJohannes Weiner 		pmd_t entry;
1563b191f9b1SMel Gorman 		bool preserve_write = prot_numa && pmd_write(*pmd);
1564ba68bc01SMel Gorman 		ret = 1;
1565e944fd67SMel Gorman 
1566e944fd67SMel Gorman 		/*
1567e944fd67SMel Gorman 		 * Avoid trapping faults against the zero page. The read-only
1568e944fd67SMel Gorman 		 * data is likely to be read-cached on the local CPU and
1569e944fd67SMel Gorman 		 * local/remote hits to the zero page are not interesting.
1570e944fd67SMel Gorman 		 */
1571e944fd67SMel Gorman 		if (prot_numa && is_huge_zero_pmd(*pmd)) {
1572e944fd67SMel Gorman 			spin_unlock(ptl);
1573ba68bc01SMel Gorman 			return ret;
1574e944fd67SMel Gorman 		}
1575e944fd67SMel Gorman 
157610c1045fSMel Gorman 		if (!prot_numa || !pmd_protnone(*pmd)) {
15778809aa2dSAneesh Kumar K.V 			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
1578cd7548abSJohannes Weiner 			entry = pmd_modify(entry, newprot);
1579b191f9b1SMel Gorman 			if (preserve_write)
1580b191f9b1SMel Gorman 				entry = pmd_mkwrite(entry);
1581f123d74aSMel Gorman 			ret = HPAGE_PMD_NR;
158256eecdb9SAneesh Kumar K.V 			set_pmd_at(mm, addr, pmd, entry);
1583b191f9b1SMel Gorman 			BUG_ON(!preserve_write && pmd_write(entry));
158410c1045fSMel Gorman 		}
1585bf929152SKirill A. Shutemov 		spin_unlock(ptl);
1586cd7548abSJohannes Weiner 	}
1587cd7548abSJohannes Weiner 
1588cd7548abSJohannes Weiner 	return ret;
1589cd7548abSJohannes Weiner }
1590cd7548abSJohannes Weiner 
1591025c5b24SNaoya Horiguchi /*
1592*4b471e88SKirill A. Shutemov  * Returns true if a given pmd maps a thp, false otherwise.
1593025c5b24SNaoya Horiguchi  *
1594*4b471e88SKirill A. Shutemov  * Note that if it returns true, this routine returns without unlocking page
1595*4b471e88SKirill A. Shutemov  * table lock. So callers must unlock it.
1596025c5b24SNaoya Horiguchi  */
1597*4b471e88SKirill A. Shutemov bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
1598bf929152SKirill A. Shutemov 		spinlock_t **ptl)
1599025c5b24SNaoya Horiguchi {
1600bf929152SKirill A. Shutemov 	*ptl = pmd_lock(vma->vm_mm, pmd);
1601*4b471e88SKirill A. Shutemov 	if (likely(pmd_trans_huge(*pmd)))
1602*4b471e88SKirill A. Shutemov 		return true;
1603bf929152SKirill A. Shutemov 	spin_unlock(*ptl);
1604*4b471e88SKirill A. Shutemov 	return false;
1605025c5b24SNaoya Horiguchi }
1606025c5b24SNaoya Horiguchi 
1607117b0791SKirill A. Shutemov /*
1608117b0791SKirill A. Shutemov  * This function returns whether a given @page is mapped onto the @address
1609117b0791SKirill A. Shutemov  * in the virtual space of @mm.
1610117b0791SKirill A. Shutemov  *
1611117b0791SKirill A. Shutemov  * When it's true, this function returns *pmd with holding the page table lock
1612117b0791SKirill A. Shutemov  * and passing it back to the caller via @ptl.
1613117b0791SKirill A. Shutemov  * If it's false, returns NULL without holding the page table lock.
1614117b0791SKirill A. Shutemov  */
161571e3aac0SAndrea Arcangeli pmd_t *page_check_address_pmd(struct page *page,
161671e3aac0SAndrea Arcangeli 			      struct mm_struct *mm,
161771e3aac0SAndrea Arcangeli 			      unsigned long address,
1618117b0791SKirill A. Shutemov 			      spinlock_t **ptl)
161971e3aac0SAndrea Arcangeli {
1620b5a8cad3SKirill A. Shutemov 	pgd_t *pgd;
1621b5a8cad3SKirill A. Shutemov 	pud_t *pud;
1622117b0791SKirill A. Shutemov 	pmd_t *pmd;
162371e3aac0SAndrea Arcangeli 
162471e3aac0SAndrea Arcangeli 	if (address & ~HPAGE_PMD_MASK)
1625117b0791SKirill A. Shutemov 		return NULL;
162671e3aac0SAndrea Arcangeli 
1627b5a8cad3SKirill A. Shutemov 	pgd = pgd_offset(mm, address);
1628b5a8cad3SKirill A. Shutemov 	if (!pgd_present(*pgd))
1629117b0791SKirill A. Shutemov 		return NULL;
1630b5a8cad3SKirill A. Shutemov 	pud = pud_offset(pgd, address);
1631b5a8cad3SKirill A. Shutemov 	if (!pud_present(*pud))
1632b5a8cad3SKirill A. Shutemov 		return NULL;
1633b5a8cad3SKirill A. Shutemov 	pmd = pmd_offset(pud, address);
1634b5a8cad3SKirill A. Shutemov 
1635117b0791SKirill A. Shutemov 	*ptl = pmd_lock(mm, pmd);
1636b5a8cad3SKirill A. Shutemov 	if (!pmd_present(*pmd))
1637117b0791SKirill A. Shutemov 		goto unlock;
163871e3aac0SAndrea Arcangeli 	if (pmd_page(*pmd) != page)
1639117b0791SKirill A. Shutemov 		goto unlock;
1640*4b471e88SKirill A. Shutemov 	if (pmd_trans_huge(*pmd))
1641117b0791SKirill A. Shutemov 		return pmd;
1642117b0791SKirill A. Shutemov unlock:
1643117b0791SKirill A. Shutemov 	spin_unlock(*ptl);
1644117b0791SKirill A. Shutemov 	return NULL;
164571e3aac0SAndrea Arcangeli }
164671e3aac0SAndrea Arcangeli 
16479050d7ebSVlastimil Babka #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
164878f11a25SAndrea Arcangeli 
164960ab3244SAndrea Arcangeli int hugepage_madvise(struct vm_area_struct *vma,
165060ab3244SAndrea Arcangeli 		     unsigned long *vm_flags, int advice)
16510af4e98bSAndrea Arcangeli {
1652a664b2d8SAndrea Arcangeli 	switch (advice) {
1653a664b2d8SAndrea Arcangeli 	case MADV_HUGEPAGE:
16541e1836e8SAlex Thorlton #ifdef CONFIG_S390
16551e1836e8SAlex Thorlton 		/*
16561e1836e8SAlex Thorlton 		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
16571e1836e8SAlex Thorlton 		 * can't handle this properly after s390_enable_sie, so we simply
16581e1836e8SAlex Thorlton 		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
16591e1836e8SAlex Thorlton 		 */
16601e1836e8SAlex Thorlton 		if (mm_has_pgste(vma->vm_mm))
16611e1836e8SAlex Thorlton 			return 0;
16621e1836e8SAlex Thorlton #endif
16630af4e98bSAndrea Arcangeli 		/*
16640af4e98bSAndrea Arcangeli 		 * Be somewhat over-protective like KSM for now!
16650af4e98bSAndrea Arcangeli 		 */
16661a763615SJason J. Herne 		if (*vm_flags & VM_NO_THP)
16670af4e98bSAndrea Arcangeli 			return -EINVAL;
1668a664b2d8SAndrea Arcangeli 		*vm_flags &= ~VM_NOHUGEPAGE;
16690af4e98bSAndrea Arcangeli 		*vm_flags |= VM_HUGEPAGE;
167060ab3244SAndrea Arcangeli 		/*
167160ab3244SAndrea Arcangeli 		 * If the vma become good for khugepaged to scan,
167260ab3244SAndrea Arcangeli 		 * register it here without waiting a page fault that
167360ab3244SAndrea Arcangeli 		 * may not happen any time soon.
167460ab3244SAndrea Arcangeli 		 */
16756d50e60cSDavid Rientjes 		if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
167660ab3244SAndrea Arcangeli 			return -ENOMEM;
1677a664b2d8SAndrea Arcangeli 		break;
1678a664b2d8SAndrea Arcangeli 	case MADV_NOHUGEPAGE:
1679a664b2d8SAndrea Arcangeli 		/*
1680a664b2d8SAndrea Arcangeli 		 * Be somewhat over-protective like KSM for now!
1681a664b2d8SAndrea Arcangeli 		 */
16821a763615SJason J. Herne 		if (*vm_flags & VM_NO_THP)
1683a664b2d8SAndrea Arcangeli 			return -EINVAL;
1684a664b2d8SAndrea Arcangeli 		*vm_flags &= ~VM_HUGEPAGE;
1685a664b2d8SAndrea Arcangeli 		*vm_flags |= VM_NOHUGEPAGE;
168660ab3244SAndrea Arcangeli 		/*
168760ab3244SAndrea Arcangeli 		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
168860ab3244SAndrea Arcangeli 		 * this vma even if we leave the mm registered in khugepaged if
168960ab3244SAndrea Arcangeli 		 * it got registered before VM_NOHUGEPAGE was set.
169060ab3244SAndrea Arcangeli 		 */
1691a664b2d8SAndrea Arcangeli 		break;
1692a664b2d8SAndrea Arcangeli 	}
16930af4e98bSAndrea Arcangeli 
16940af4e98bSAndrea Arcangeli 	return 0;
16950af4e98bSAndrea Arcangeli }
16960af4e98bSAndrea Arcangeli 
1697ba76149fSAndrea Arcangeli static int __init khugepaged_slab_init(void)
1698ba76149fSAndrea Arcangeli {
1699ba76149fSAndrea Arcangeli 	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1700ba76149fSAndrea Arcangeli 					  sizeof(struct mm_slot),
1701ba76149fSAndrea Arcangeli 					  __alignof__(struct mm_slot), 0, NULL);
1702ba76149fSAndrea Arcangeli 	if (!mm_slot_cache)
1703ba76149fSAndrea Arcangeli 		return -ENOMEM;
1704ba76149fSAndrea Arcangeli 
1705ba76149fSAndrea Arcangeli 	return 0;
1706ba76149fSAndrea Arcangeli }
1707ba76149fSAndrea Arcangeli 
170865ebb64fSKirill A. Shutemov static void __init khugepaged_slab_exit(void)
170965ebb64fSKirill A. Shutemov {
171065ebb64fSKirill A. Shutemov 	kmem_cache_destroy(mm_slot_cache);
171165ebb64fSKirill A. Shutemov }
171265ebb64fSKirill A. Shutemov 
1713ba76149fSAndrea Arcangeli static inline struct mm_slot *alloc_mm_slot(void)
1714ba76149fSAndrea Arcangeli {
1715ba76149fSAndrea Arcangeli 	if (!mm_slot_cache)	/* initialization failed */
1716ba76149fSAndrea Arcangeli 		return NULL;
1717ba76149fSAndrea Arcangeli 	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1718ba76149fSAndrea Arcangeli }
1719ba76149fSAndrea Arcangeli 
1720ba76149fSAndrea Arcangeli static inline void free_mm_slot(struct mm_slot *mm_slot)
1721ba76149fSAndrea Arcangeli {
1722ba76149fSAndrea Arcangeli 	kmem_cache_free(mm_slot_cache, mm_slot);
1723ba76149fSAndrea Arcangeli }
1724ba76149fSAndrea Arcangeli 
1725ba76149fSAndrea Arcangeli static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1726ba76149fSAndrea Arcangeli {
1727ba76149fSAndrea Arcangeli 	struct mm_slot *mm_slot;
1728ba76149fSAndrea Arcangeli 
1729b67bfe0dSSasha Levin 	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1730ba76149fSAndrea Arcangeli 		if (mm == mm_slot->mm)
1731ba76149fSAndrea Arcangeli 			return mm_slot;
173243b5fbbdSSasha Levin 
1733ba76149fSAndrea Arcangeli 	return NULL;
1734ba76149fSAndrea Arcangeli }
1735ba76149fSAndrea Arcangeli 
1736ba76149fSAndrea Arcangeli static void insert_to_mm_slots_hash(struct mm_struct *mm,
1737ba76149fSAndrea Arcangeli 				    struct mm_slot *mm_slot)
1738ba76149fSAndrea Arcangeli {
1739ba76149fSAndrea Arcangeli 	mm_slot->mm = mm;
174043b5fbbdSSasha Levin 	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1741ba76149fSAndrea Arcangeli }
1742ba76149fSAndrea Arcangeli 
1743ba76149fSAndrea Arcangeli static inline int khugepaged_test_exit(struct mm_struct *mm)
1744ba76149fSAndrea Arcangeli {
1745ba76149fSAndrea Arcangeli 	return atomic_read(&mm->mm_users) == 0;
1746ba76149fSAndrea Arcangeli }
1747ba76149fSAndrea Arcangeli 
1748ba76149fSAndrea Arcangeli int __khugepaged_enter(struct mm_struct *mm)
1749ba76149fSAndrea Arcangeli {
1750ba76149fSAndrea Arcangeli 	struct mm_slot *mm_slot;
1751ba76149fSAndrea Arcangeli 	int wakeup;
1752ba76149fSAndrea Arcangeli 
1753ba76149fSAndrea Arcangeli 	mm_slot = alloc_mm_slot();
1754ba76149fSAndrea Arcangeli 	if (!mm_slot)
1755ba76149fSAndrea Arcangeli 		return -ENOMEM;
1756ba76149fSAndrea Arcangeli 
1757ba76149fSAndrea Arcangeli 	/* __khugepaged_exit() must not run from under us */
175896dad67fSSasha Levin 	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
1759ba76149fSAndrea Arcangeli 	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1760ba76149fSAndrea Arcangeli 		free_mm_slot(mm_slot);
1761ba76149fSAndrea Arcangeli 		return 0;
1762ba76149fSAndrea Arcangeli 	}
1763ba76149fSAndrea Arcangeli 
1764ba76149fSAndrea Arcangeli 	spin_lock(&khugepaged_mm_lock);
1765ba76149fSAndrea Arcangeli 	insert_to_mm_slots_hash(mm, mm_slot);
1766ba76149fSAndrea Arcangeli 	/*
1767ba76149fSAndrea Arcangeli 	 * Insert just behind the scanning cursor, to let the area settle
1768ba76149fSAndrea Arcangeli 	 * down a little.
1769ba76149fSAndrea Arcangeli 	 */
1770ba76149fSAndrea Arcangeli 	wakeup = list_empty(&khugepaged_scan.mm_head);
1771ba76149fSAndrea Arcangeli 	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1772ba76149fSAndrea Arcangeli 	spin_unlock(&khugepaged_mm_lock);
1773ba76149fSAndrea Arcangeli 
1774ba76149fSAndrea Arcangeli 	atomic_inc(&mm->mm_count);
1775ba76149fSAndrea Arcangeli 	if (wakeup)
1776ba76149fSAndrea Arcangeli 		wake_up_interruptible(&khugepaged_wait);
1777ba76149fSAndrea Arcangeli 
1778ba76149fSAndrea Arcangeli 	return 0;
1779ba76149fSAndrea Arcangeli }
1780ba76149fSAndrea Arcangeli 
17816d50e60cSDavid Rientjes int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
17826d50e60cSDavid Rientjes 			       unsigned long vm_flags)
1783ba76149fSAndrea Arcangeli {
1784ba76149fSAndrea Arcangeli 	unsigned long hstart, hend;
1785ba76149fSAndrea Arcangeli 	if (!vma->anon_vma)
1786ba76149fSAndrea Arcangeli 		/*
1787ba76149fSAndrea Arcangeli 		 * Not yet faulted in so we will register later in the
1788ba76149fSAndrea Arcangeli 		 * page fault if needed.
1789ba76149fSAndrea Arcangeli 		 */
1790ba76149fSAndrea Arcangeli 		return 0;
179178f11a25SAndrea Arcangeli 	if (vma->vm_ops)
1792ba76149fSAndrea Arcangeli 		/* khugepaged not yet working on file or special mappings */
1793ba76149fSAndrea Arcangeli 		return 0;
17946d50e60cSDavid Rientjes 	VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
1795ba76149fSAndrea Arcangeli 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1796ba76149fSAndrea Arcangeli 	hend = vma->vm_end & HPAGE_PMD_MASK;
1797ba76149fSAndrea Arcangeli 	if (hstart < hend)
17986d50e60cSDavid Rientjes 		return khugepaged_enter(vma, vm_flags);
1799ba76149fSAndrea Arcangeli 	return 0;
1800ba76149fSAndrea Arcangeli }
1801ba76149fSAndrea Arcangeli 
1802ba76149fSAndrea Arcangeli void __khugepaged_exit(struct mm_struct *mm)
1803ba76149fSAndrea Arcangeli {
1804ba76149fSAndrea Arcangeli 	struct mm_slot *mm_slot;
1805ba76149fSAndrea Arcangeli 	int free = 0;
1806ba76149fSAndrea Arcangeli 
1807ba76149fSAndrea Arcangeli 	spin_lock(&khugepaged_mm_lock);
1808ba76149fSAndrea Arcangeli 	mm_slot = get_mm_slot(mm);
1809ba76149fSAndrea Arcangeli 	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
181043b5fbbdSSasha Levin 		hash_del(&mm_slot->hash);
1811ba76149fSAndrea Arcangeli 		list_del(&mm_slot->mm_node);
1812ba76149fSAndrea Arcangeli 		free = 1;
1813ba76149fSAndrea Arcangeli 	}
1814d788e80aSChris Wright 	spin_unlock(&khugepaged_mm_lock);
1815ba76149fSAndrea Arcangeli 
1816ba76149fSAndrea Arcangeli 	if (free) {
1817ba76149fSAndrea Arcangeli 		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1818ba76149fSAndrea Arcangeli 		free_mm_slot(mm_slot);
1819ba76149fSAndrea Arcangeli 		mmdrop(mm);
1820ba76149fSAndrea Arcangeli 	} else if (mm_slot) {
1821ba76149fSAndrea Arcangeli 		/*
1822ba76149fSAndrea Arcangeli 		 * This is required to serialize against
1823ba76149fSAndrea Arcangeli 		 * khugepaged_test_exit() (which is guaranteed to run
1824ba76149fSAndrea Arcangeli 		 * under mmap sem read mode). Stop here (after we
1825ba76149fSAndrea Arcangeli 		 * return all pagetables will be destroyed) until
1826ba76149fSAndrea Arcangeli 		 * khugepaged has finished working on the pagetables
1827ba76149fSAndrea Arcangeli 		 * under the mmap_sem.
1828ba76149fSAndrea Arcangeli 		 */
1829ba76149fSAndrea Arcangeli 		down_write(&mm->mmap_sem);
1830ba76149fSAndrea Arcangeli 		up_write(&mm->mmap_sem);
1831d788e80aSChris Wright 	}
1832ba76149fSAndrea Arcangeli }
1833ba76149fSAndrea Arcangeli 
1834ba76149fSAndrea Arcangeli static void release_pte_page(struct page *page)
1835ba76149fSAndrea Arcangeli {
1836ba76149fSAndrea Arcangeli 	/* 0 stands for page_is_file_cache(page) == false */
1837ba76149fSAndrea Arcangeli 	dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1838ba76149fSAndrea Arcangeli 	unlock_page(page);
1839ba76149fSAndrea Arcangeli 	putback_lru_page(page);
1840ba76149fSAndrea Arcangeli }
1841ba76149fSAndrea Arcangeli 
1842ba76149fSAndrea Arcangeli static void release_pte_pages(pte_t *pte, pte_t *_pte)
1843ba76149fSAndrea Arcangeli {
1844ba76149fSAndrea Arcangeli 	while (--_pte >= pte) {
1845ba76149fSAndrea Arcangeli 		pte_t pteval = *_pte;
1846ca0984caSEbru Akagunduz 		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
1847ba76149fSAndrea Arcangeli 			release_pte_page(pte_page(pteval));
1848ba76149fSAndrea Arcangeli 	}
1849ba76149fSAndrea Arcangeli }
1850ba76149fSAndrea Arcangeli 
1851ba76149fSAndrea Arcangeli static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1852ba76149fSAndrea Arcangeli 					unsigned long address,
1853ba76149fSAndrea Arcangeli 					pte_t *pte)
1854ba76149fSAndrea Arcangeli {
18557d2eba05SEbru Akagunduz 	struct page *page = NULL;
1856ba76149fSAndrea Arcangeli 	pte_t *_pte;
18577d2eba05SEbru Akagunduz 	int none_or_zero = 0, result = 0;
185810359213SEbru Akagunduz 	bool referenced = false, writable = false;
18597d2eba05SEbru Akagunduz 
1860ba76149fSAndrea Arcangeli 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1861ba76149fSAndrea Arcangeli 	     _pte++, address += PAGE_SIZE) {
1862ba76149fSAndrea Arcangeli 		pte_t pteval = *_pte;
186347aee4d8SMinchan Kim 		if (pte_none(pteval) || (pte_present(pteval) &&
186447aee4d8SMinchan Kim 				is_zero_pfn(pte_pfn(pteval)))) {
1865c1294d05SAndrea Arcangeli 			if (!userfaultfd_armed(vma) &&
18667d2eba05SEbru Akagunduz 			    ++none_or_zero <= khugepaged_max_ptes_none) {
1867ba76149fSAndrea Arcangeli 				continue;
18687d2eba05SEbru Akagunduz 			} else {
18697d2eba05SEbru Akagunduz 				result = SCAN_EXCEED_NONE_PTE;
1870ba76149fSAndrea Arcangeli 				goto out;
1871ba76149fSAndrea Arcangeli 			}
18727d2eba05SEbru Akagunduz 		}
18737d2eba05SEbru Akagunduz 		if (!pte_present(pteval)) {
18747d2eba05SEbru Akagunduz 			result = SCAN_PTE_NON_PRESENT;
1875ba76149fSAndrea Arcangeli 			goto out;
18767d2eba05SEbru Akagunduz 		}
1877ba76149fSAndrea Arcangeli 		page = vm_normal_page(vma, address, pteval);
18787d2eba05SEbru Akagunduz 		if (unlikely(!page)) {
18797d2eba05SEbru Akagunduz 			result = SCAN_PAGE_NULL;
1880ba76149fSAndrea Arcangeli 			goto out;
18817d2eba05SEbru Akagunduz 		}
1882344aa35cSBob Liu 
1883309381feSSasha Levin 		VM_BUG_ON_PAGE(PageCompound(page), page);
1884309381feSSasha Levin 		VM_BUG_ON_PAGE(!PageAnon(page), page);
1885309381feSSasha Levin 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1886ba76149fSAndrea Arcangeli 
1887ba76149fSAndrea Arcangeli 		/*
1888ba76149fSAndrea Arcangeli 		 * We can do it before isolate_lru_page because the
1889ba76149fSAndrea Arcangeli 		 * page can't be freed from under us. NOTE: PG_lock
1890ba76149fSAndrea Arcangeli 		 * is needed to serialize against split_huge_page
1891ba76149fSAndrea Arcangeli 		 * when invoked from the VM.
1892ba76149fSAndrea Arcangeli 		 */
18937d2eba05SEbru Akagunduz 		if (!trylock_page(page)) {
18947d2eba05SEbru Akagunduz 			result = SCAN_PAGE_LOCK;
1895ba76149fSAndrea Arcangeli 			goto out;
18967d2eba05SEbru Akagunduz 		}
189710359213SEbru Akagunduz 
189810359213SEbru Akagunduz 		/*
189910359213SEbru Akagunduz 		 * cannot use mapcount: can't collapse if there's a gup pin.
190010359213SEbru Akagunduz 		 * The page must only be referenced by the scanned process
190110359213SEbru Akagunduz 		 * and page swap cache.
190210359213SEbru Akagunduz 		 */
190310359213SEbru Akagunduz 		if (page_count(page) != 1 + !!PageSwapCache(page)) {
190410359213SEbru Akagunduz 			unlock_page(page);
19057d2eba05SEbru Akagunduz 			result = SCAN_PAGE_COUNT;
190610359213SEbru Akagunduz 			goto out;
190710359213SEbru Akagunduz 		}
190810359213SEbru Akagunduz 		if (pte_write(pteval)) {
190910359213SEbru Akagunduz 			writable = true;
191010359213SEbru Akagunduz 		} else {
191110359213SEbru Akagunduz 			if (PageSwapCache(page) && !reuse_swap_page(page)) {
191210359213SEbru Akagunduz 				unlock_page(page);
19137d2eba05SEbru Akagunduz 				result = SCAN_SWAP_CACHE_PAGE;
191410359213SEbru Akagunduz 				goto out;
191510359213SEbru Akagunduz 			}
191610359213SEbru Akagunduz 			/*
191710359213SEbru Akagunduz 			 * Page is not in the swap cache. It can be collapsed
191810359213SEbru Akagunduz 			 * into a THP.
191910359213SEbru Akagunduz 			 */
192010359213SEbru Akagunduz 		}
192110359213SEbru Akagunduz 
1922ba76149fSAndrea Arcangeli 		/*
1923ba76149fSAndrea Arcangeli 		 * Isolate the page to avoid collapsing an hugepage
1924ba76149fSAndrea Arcangeli 		 * currently in use by the VM.
1925ba76149fSAndrea Arcangeli 		 */
1926ba76149fSAndrea Arcangeli 		if (isolate_lru_page(page)) {
1927ba76149fSAndrea Arcangeli 			unlock_page(page);
19287d2eba05SEbru Akagunduz 			result = SCAN_DEL_PAGE_LRU;
1929ba76149fSAndrea Arcangeli 			goto out;
1930ba76149fSAndrea Arcangeli 		}
1931ba76149fSAndrea Arcangeli 		/* 0 stands for page_is_file_cache(page) == false */
1932ba76149fSAndrea Arcangeli 		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1933309381feSSasha Levin 		VM_BUG_ON_PAGE(!PageLocked(page), page);
1934309381feSSasha Levin 		VM_BUG_ON_PAGE(PageLRU(page), page);
1935ba76149fSAndrea Arcangeli 
1936ba76149fSAndrea Arcangeli 		/* If there is no mapped pte young don't collapse the page */
193733c3fc71SVladimir Davydov 		if (pte_young(pteval) ||
193833c3fc71SVladimir Davydov 		    page_is_young(page) || PageReferenced(page) ||
19398ee53820SAndrea Arcangeli 		    mmu_notifier_test_young(vma->vm_mm, address))
194010359213SEbru Akagunduz 			referenced = true;
1941ba76149fSAndrea Arcangeli 	}
19427d2eba05SEbru Akagunduz 	if (likely(writable)) {
19437d2eba05SEbru Akagunduz 		if (likely(referenced)) {
19447d2eba05SEbru Akagunduz 			result = SCAN_SUCCEED;
19457d2eba05SEbru Akagunduz 			trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
19467d2eba05SEbru Akagunduz 							    referenced, writable, result);
1947344aa35cSBob Liu 			return 1;
19487d2eba05SEbru Akagunduz 		}
19497d2eba05SEbru Akagunduz 	} else {
19507d2eba05SEbru Akagunduz 		result = SCAN_PAGE_RO;
19517d2eba05SEbru Akagunduz 	}
19527d2eba05SEbru Akagunduz 
1953ba76149fSAndrea Arcangeli out:
1954344aa35cSBob Liu 	release_pte_pages(pte, _pte);
19557d2eba05SEbru Akagunduz 	trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
19567d2eba05SEbru Akagunduz 					    referenced, writable, result);
1957344aa35cSBob Liu 	return 0;
1958ba76149fSAndrea Arcangeli }
1959ba76149fSAndrea Arcangeli 
1960ba76149fSAndrea Arcangeli static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1961ba76149fSAndrea Arcangeli 				      struct vm_area_struct *vma,
1962ba76149fSAndrea Arcangeli 				      unsigned long address,
1963ba76149fSAndrea Arcangeli 				      spinlock_t *ptl)
1964ba76149fSAndrea Arcangeli {
1965ba76149fSAndrea Arcangeli 	pte_t *_pte;
1966ba76149fSAndrea Arcangeli 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1967ba76149fSAndrea Arcangeli 		pte_t pteval = *_pte;
1968ba76149fSAndrea Arcangeli 		struct page *src_page;
1969ba76149fSAndrea Arcangeli 
1970ca0984caSEbru Akagunduz 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1971ba76149fSAndrea Arcangeli 			clear_user_highpage(page, address);
1972ba76149fSAndrea Arcangeli 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1973ca0984caSEbru Akagunduz 			if (is_zero_pfn(pte_pfn(pteval))) {
1974ca0984caSEbru Akagunduz 				/*
1975ca0984caSEbru Akagunduz 				 * ptl mostly unnecessary.
1976ca0984caSEbru Akagunduz 				 */
1977ca0984caSEbru Akagunduz 				spin_lock(ptl);
1978ca0984caSEbru Akagunduz 				/*
1979ca0984caSEbru Akagunduz 				 * paravirt calls inside pte_clear here are
1980ca0984caSEbru Akagunduz 				 * superfluous.
1981ca0984caSEbru Akagunduz 				 */
1982ca0984caSEbru Akagunduz 				pte_clear(vma->vm_mm, address, _pte);
1983ca0984caSEbru Akagunduz 				spin_unlock(ptl);
1984ca0984caSEbru Akagunduz 			}
1985ba76149fSAndrea Arcangeli 		} else {
1986ba76149fSAndrea Arcangeli 			src_page = pte_page(pteval);
1987ba76149fSAndrea Arcangeli 			copy_user_highpage(page, src_page, address, vma);
1988309381feSSasha Levin 			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
1989ba76149fSAndrea Arcangeli 			release_pte_page(src_page);
1990ba76149fSAndrea Arcangeli 			/*
1991ba76149fSAndrea Arcangeli 			 * ptl mostly unnecessary, but preempt has to
1992ba76149fSAndrea Arcangeli 			 * be disabled to update the per-cpu stats
1993ba76149fSAndrea Arcangeli 			 * inside page_remove_rmap().
1994ba76149fSAndrea Arcangeli 			 */
1995ba76149fSAndrea Arcangeli 			spin_lock(ptl);
1996ba76149fSAndrea Arcangeli 			/*
1997ba76149fSAndrea Arcangeli 			 * paravirt calls inside pte_clear here are
1998ba76149fSAndrea Arcangeli 			 * superfluous.
1999ba76149fSAndrea Arcangeli 			 */
2000ba76149fSAndrea Arcangeli 			pte_clear(vma->vm_mm, address, _pte);
2001d281ee61SKirill A. Shutemov 			page_remove_rmap(src_page, false);
2002ba76149fSAndrea Arcangeli 			spin_unlock(ptl);
2003ba76149fSAndrea Arcangeli 			free_page_and_swap_cache(src_page);
2004ba76149fSAndrea Arcangeli 		}
2005ba76149fSAndrea Arcangeli 
2006ba76149fSAndrea Arcangeli 		address += PAGE_SIZE;
2007ba76149fSAndrea Arcangeli 		page++;
2008ba76149fSAndrea Arcangeli 	}
2009ba76149fSAndrea Arcangeli }
2010ba76149fSAndrea Arcangeli 
201126234f36SXiao Guangrong static void khugepaged_alloc_sleep(void)
201226234f36SXiao Guangrong {
2013bde43c6cSPetr Mladek 	DEFINE_WAIT(wait);
2014bde43c6cSPetr Mladek 
2015bde43c6cSPetr Mladek 	add_wait_queue(&khugepaged_wait, &wait);
2016bde43c6cSPetr Mladek 	freezable_schedule_timeout_interruptible(
201726234f36SXiao Guangrong 		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2018bde43c6cSPetr Mladek 	remove_wait_queue(&khugepaged_wait, &wait);
201926234f36SXiao Guangrong }
202026234f36SXiao Guangrong 
20219f1b868aSBob Liu static int khugepaged_node_load[MAX_NUMNODES];
20229f1b868aSBob Liu 
202314a4e214SDavid Rientjes static bool khugepaged_scan_abort(int nid)
202414a4e214SDavid Rientjes {
202514a4e214SDavid Rientjes 	int i;
202614a4e214SDavid Rientjes 
202714a4e214SDavid Rientjes 	/*
202814a4e214SDavid Rientjes 	 * If zone_reclaim_mode is disabled, then no extra effort is made to
202914a4e214SDavid Rientjes 	 * allocate memory locally.
203014a4e214SDavid Rientjes 	 */
203114a4e214SDavid Rientjes 	if (!zone_reclaim_mode)
203214a4e214SDavid Rientjes 		return false;
203314a4e214SDavid Rientjes 
203414a4e214SDavid Rientjes 	/* If there is a count for this node already, it must be acceptable */
203514a4e214SDavid Rientjes 	if (khugepaged_node_load[nid])
203614a4e214SDavid Rientjes 		return false;
203714a4e214SDavid Rientjes 
203814a4e214SDavid Rientjes 	for (i = 0; i < MAX_NUMNODES; i++) {
203914a4e214SDavid Rientjes 		if (!khugepaged_node_load[i])
204014a4e214SDavid Rientjes 			continue;
204114a4e214SDavid Rientjes 		if (node_distance(nid, i) > RECLAIM_DISTANCE)
204214a4e214SDavid Rientjes 			return true;
204314a4e214SDavid Rientjes 	}
204414a4e214SDavid Rientjes 	return false;
204514a4e214SDavid Rientjes }
204614a4e214SDavid Rientjes 
204726234f36SXiao Guangrong #ifdef CONFIG_NUMA
20489f1b868aSBob Liu static int khugepaged_find_target_node(void)
20499f1b868aSBob Liu {
20509f1b868aSBob Liu 	static int last_khugepaged_target_node = NUMA_NO_NODE;
20519f1b868aSBob Liu 	int nid, target_node = 0, max_value = 0;
20529f1b868aSBob Liu 
20539f1b868aSBob Liu 	/* find first node with max normal pages hit */
20549f1b868aSBob Liu 	for (nid = 0; nid < MAX_NUMNODES; nid++)
20559f1b868aSBob Liu 		if (khugepaged_node_load[nid] > max_value) {
20569f1b868aSBob Liu 			max_value = khugepaged_node_load[nid];
20579f1b868aSBob Liu 			target_node = nid;
20589f1b868aSBob Liu 		}
20599f1b868aSBob Liu 
20609f1b868aSBob Liu 	/* do some balance if several nodes have the same hit record */
20619f1b868aSBob Liu 	if (target_node <= last_khugepaged_target_node)
20629f1b868aSBob Liu 		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
20639f1b868aSBob Liu 				nid++)
20649f1b868aSBob Liu 			if (max_value == khugepaged_node_load[nid]) {
20659f1b868aSBob Liu 				target_node = nid;
20669f1b868aSBob Liu 				break;
20679f1b868aSBob Liu 			}
20689f1b868aSBob Liu 
20699f1b868aSBob Liu 	last_khugepaged_target_node = target_node;
20709f1b868aSBob Liu 	return target_node;
20719f1b868aSBob Liu }
20729f1b868aSBob Liu 
207326234f36SXiao Guangrong static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
207426234f36SXiao Guangrong {
207526234f36SXiao Guangrong 	if (IS_ERR(*hpage)) {
207626234f36SXiao Guangrong 		if (!*wait)
207726234f36SXiao Guangrong 			return false;
207826234f36SXiao Guangrong 
207926234f36SXiao Guangrong 		*wait = false;
2080e3b4126cSXiao Guangrong 		*hpage = NULL;
208126234f36SXiao Guangrong 		khugepaged_alloc_sleep();
208226234f36SXiao Guangrong 	} else if (*hpage) {
208326234f36SXiao Guangrong 		put_page(*hpage);
208426234f36SXiao Guangrong 		*hpage = NULL;
208526234f36SXiao Guangrong 	}
208626234f36SXiao Guangrong 
208726234f36SXiao Guangrong 	return true;
208826234f36SXiao Guangrong }
208926234f36SXiao Guangrong 
20903b363692SMichal Hocko static struct page *
20913b363692SMichal Hocko khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2092d6669d68SAaron Tomlin 		       unsigned long address, int node)
209326234f36SXiao Guangrong {
2094309381feSSasha Levin 	VM_BUG_ON_PAGE(*hpage, *hpage);
20958b164568SVlastimil Babka 
209626234f36SXiao Guangrong 	/*
20978b164568SVlastimil Babka 	 * Before allocating the hugepage, release the mmap_sem read lock.
20988b164568SVlastimil Babka 	 * The allocation can take potentially a long time if it involves
20998b164568SVlastimil Babka 	 * sync compaction, and we do not need to hold the mmap_sem during
21008b164568SVlastimil Babka 	 * that. We will recheck the vma after taking it again in write mode.
210126234f36SXiao Guangrong 	 */
210226234f36SXiao Guangrong 	up_read(&mm->mmap_sem);
21038b164568SVlastimil Babka 
210496db800fSVlastimil Babka 	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
210526234f36SXiao Guangrong 	if (unlikely(!*hpage)) {
210626234f36SXiao Guangrong 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
210726234f36SXiao Guangrong 		*hpage = ERR_PTR(-ENOMEM);
210826234f36SXiao Guangrong 		return NULL;
210926234f36SXiao Guangrong 	}
211026234f36SXiao Guangrong 
211126234f36SXiao Guangrong 	count_vm_event(THP_COLLAPSE_ALLOC);
211226234f36SXiao Guangrong 	return *hpage;
211326234f36SXiao Guangrong }
211426234f36SXiao Guangrong #else
21159f1b868aSBob Liu static int khugepaged_find_target_node(void)
21169f1b868aSBob Liu {
21179f1b868aSBob Liu 	return 0;
21189f1b868aSBob Liu }
21199f1b868aSBob Liu 
212010dc4155SBob Liu static inline struct page *alloc_hugepage(int defrag)
212110dc4155SBob Liu {
212210dc4155SBob Liu 	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
212310dc4155SBob Liu 			   HPAGE_PMD_ORDER);
212410dc4155SBob Liu }
212510dc4155SBob Liu 
212626234f36SXiao Guangrong static struct page *khugepaged_alloc_hugepage(bool *wait)
212726234f36SXiao Guangrong {
212826234f36SXiao Guangrong 	struct page *hpage;
212926234f36SXiao Guangrong 
213026234f36SXiao Guangrong 	do {
213126234f36SXiao Guangrong 		hpage = alloc_hugepage(khugepaged_defrag());
213226234f36SXiao Guangrong 		if (!hpage) {
213326234f36SXiao Guangrong 			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
213426234f36SXiao Guangrong 			if (!*wait)
213526234f36SXiao Guangrong 				return NULL;
213626234f36SXiao Guangrong 
213726234f36SXiao Guangrong 			*wait = false;
213826234f36SXiao Guangrong 			khugepaged_alloc_sleep();
213926234f36SXiao Guangrong 		} else
214026234f36SXiao Guangrong 			count_vm_event(THP_COLLAPSE_ALLOC);
214126234f36SXiao Guangrong 	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
214226234f36SXiao Guangrong 
214326234f36SXiao Guangrong 	return hpage;
214426234f36SXiao Guangrong }
214526234f36SXiao Guangrong 
214626234f36SXiao Guangrong static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
214726234f36SXiao Guangrong {
214826234f36SXiao Guangrong 	if (!*hpage)
214926234f36SXiao Guangrong 		*hpage = khugepaged_alloc_hugepage(wait);
215026234f36SXiao Guangrong 
215126234f36SXiao Guangrong 	if (unlikely(!*hpage))
215226234f36SXiao Guangrong 		return false;
215326234f36SXiao Guangrong 
215426234f36SXiao Guangrong 	return true;
215526234f36SXiao Guangrong }
215626234f36SXiao Guangrong 
21573b363692SMichal Hocko static struct page *
21583b363692SMichal Hocko khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2159d6669d68SAaron Tomlin 		       unsigned long address, int node)
216026234f36SXiao Guangrong {
216126234f36SXiao Guangrong 	up_read(&mm->mmap_sem);
216226234f36SXiao Guangrong 	VM_BUG_ON(!*hpage);
21633b363692SMichal Hocko 
216426234f36SXiao Guangrong 	return  *hpage;
216526234f36SXiao Guangrong }
216626234f36SXiao Guangrong #endif
216726234f36SXiao Guangrong 
2168fa475e51SBob Liu static bool hugepage_vma_check(struct vm_area_struct *vma)
2169fa475e51SBob Liu {
2170fa475e51SBob Liu 	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2171fa475e51SBob Liu 	    (vma->vm_flags & VM_NOHUGEPAGE))
2172fa475e51SBob Liu 		return false;
21737479df6dSKirill A. Shutemov 	if (vma->vm_flags & VM_LOCKED)
21747479df6dSKirill A. Shutemov 		return false;
2175fa475e51SBob Liu 	if (!vma->anon_vma || vma->vm_ops)
2176fa475e51SBob Liu 		return false;
2177fa475e51SBob Liu 	if (is_vma_temporary_stack(vma))
2178fa475e51SBob Liu 		return false;
217981d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2180fa475e51SBob Liu 	return true;
2181fa475e51SBob Liu }
2182fa475e51SBob Liu 
2183ba76149fSAndrea Arcangeli static void collapse_huge_page(struct mm_struct *mm,
2184ba76149fSAndrea Arcangeli 				   unsigned long address,
2185ce83d217SAndrea Arcangeli 				   struct page **hpage,
21865c4b4be3SAndi Kleen 				   struct vm_area_struct *vma,
21875c4b4be3SAndi Kleen 				   int node)
2188ba76149fSAndrea Arcangeli {
2189ba76149fSAndrea Arcangeli 	pmd_t *pmd, _pmd;
2190ba76149fSAndrea Arcangeli 	pte_t *pte;
2191ba76149fSAndrea Arcangeli 	pgtable_t pgtable;
2192ba76149fSAndrea Arcangeli 	struct page *new_page;
2193c4088ebdSKirill A. Shutemov 	spinlock_t *pmd_ptl, *pte_ptl;
21947d2eba05SEbru Akagunduz 	int isolated, result = 0;
2195ba76149fSAndrea Arcangeli 	unsigned long hstart, hend;
219600501b53SJohannes Weiner 	struct mem_cgroup *memcg;
21972ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
21982ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
21993b363692SMichal Hocko 	gfp_t gfp;
2200ba76149fSAndrea Arcangeli 
2201ba76149fSAndrea Arcangeli 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2202692e0b35SAndrea Arcangeli 
22033b363692SMichal Hocko 	/* Only allocate from the target node */
22043b363692SMichal Hocko 	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
22053b363692SMichal Hocko 		__GFP_THISNODE;
22063b363692SMichal Hocko 
220726234f36SXiao Guangrong 	/* release the mmap_sem read lock. */
2208d6669d68SAaron Tomlin 	new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
22097d2eba05SEbru Akagunduz 	if (!new_page) {
22107d2eba05SEbru Akagunduz 		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
22117d2eba05SEbru Akagunduz 		goto out_nolock;
22127d2eba05SEbru Akagunduz 	}
2213ce83d217SAndrea Arcangeli 
2214f627c2f5SKirill A. Shutemov 	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
22157d2eba05SEbru Akagunduz 		result = SCAN_CGROUP_CHARGE_FAIL;
22167d2eba05SEbru Akagunduz 		goto out_nolock;
22177d2eba05SEbru Akagunduz 	}
2218ba76149fSAndrea Arcangeli 
2219ba76149fSAndrea Arcangeli 	/*
2220ba76149fSAndrea Arcangeli 	 * Prevent all access to pagetables with the exception of
2221ba76149fSAndrea Arcangeli 	 * gup_fast later hanlded by the ptep_clear_flush and the VM
2222ba76149fSAndrea Arcangeli 	 * handled by the anon_vma lock + PG_lock.
2223ba76149fSAndrea Arcangeli 	 */
2224ba76149fSAndrea Arcangeli 	down_write(&mm->mmap_sem);
22257d2eba05SEbru Akagunduz 	if (unlikely(khugepaged_test_exit(mm))) {
22267d2eba05SEbru Akagunduz 		result = SCAN_ANY_PROCESS;
2227ba76149fSAndrea Arcangeli 		goto out;
22287d2eba05SEbru Akagunduz 	}
2229ba76149fSAndrea Arcangeli 
2230ba76149fSAndrea Arcangeli 	vma = find_vma(mm, address);
22317d2eba05SEbru Akagunduz 	if (!vma) {
22327d2eba05SEbru Akagunduz 		result = SCAN_VMA_NULL;
2233a8f531ebSLibin 		goto out;
22347d2eba05SEbru Akagunduz 	}
2235ba76149fSAndrea Arcangeli 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2236ba76149fSAndrea Arcangeli 	hend = vma->vm_end & HPAGE_PMD_MASK;
22377d2eba05SEbru Akagunduz 	if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
22387d2eba05SEbru Akagunduz 		result = SCAN_ADDRESS_RANGE;
2239ba76149fSAndrea Arcangeli 		goto out;
22407d2eba05SEbru Akagunduz 	}
22417d2eba05SEbru Akagunduz 	if (!hugepage_vma_check(vma)) {
22427d2eba05SEbru Akagunduz 		result = SCAN_VMA_CHECK;
2243ba76149fSAndrea Arcangeli 		goto out;
22447d2eba05SEbru Akagunduz 	}
22456219049aSBob Liu 	pmd = mm_find_pmd(mm, address);
22467d2eba05SEbru Akagunduz 	if (!pmd) {
22477d2eba05SEbru Akagunduz 		result = SCAN_PMD_NULL;
2248ba76149fSAndrea Arcangeli 		goto out;
22497d2eba05SEbru Akagunduz 	}
2250ba76149fSAndrea Arcangeli 
22514fc3f1d6SIngo Molnar 	anon_vma_lock_write(vma->anon_vma);
2252ba76149fSAndrea Arcangeli 
2253ba76149fSAndrea Arcangeli 	pte = pte_offset_map(pmd, address);
2254c4088ebdSKirill A. Shutemov 	pte_ptl = pte_lockptr(mm, pmd);
2255ba76149fSAndrea Arcangeli 
22562ec74c3eSSagi Grimberg 	mmun_start = address;
22572ec74c3eSSagi Grimberg 	mmun_end   = address + HPAGE_PMD_SIZE;
22582ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2259c4088ebdSKirill A. Shutemov 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
2260ba76149fSAndrea Arcangeli 	/*
2261ba76149fSAndrea Arcangeli 	 * After this gup_fast can't run anymore. This also removes
2262ba76149fSAndrea Arcangeli 	 * any huge TLB entry from the CPU so we won't allow
2263ba76149fSAndrea Arcangeli 	 * huge and small TLB entries for the same virtual address
2264ba76149fSAndrea Arcangeli 	 * to avoid the risk of CPU bugs in that area.
2265ba76149fSAndrea Arcangeli 	 */
226615a25b2eSAneesh Kumar K.V 	_pmd = pmdp_collapse_flush(vma, address, pmd);
2267c4088ebdSKirill A. Shutemov 	spin_unlock(pmd_ptl);
22682ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2269ba76149fSAndrea Arcangeli 
2270c4088ebdSKirill A. Shutemov 	spin_lock(pte_ptl);
2271ba76149fSAndrea Arcangeli 	isolated = __collapse_huge_page_isolate(vma, address, pte);
2272c4088ebdSKirill A. Shutemov 	spin_unlock(pte_ptl);
2273ba76149fSAndrea Arcangeli 
2274ba76149fSAndrea Arcangeli 	if (unlikely(!isolated)) {
2275453c7192SJohannes Weiner 		pte_unmap(pte);
2276c4088ebdSKirill A. Shutemov 		spin_lock(pmd_ptl);
2277ba76149fSAndrea Arcangeli 		BUG_ON(!pmd_none(*pmd));
22787c342512SAneesh Kumar K.V 		/*
22797c342512SAneesh Kumar K.V 		 * We can only use set_pmd_at when establishing
22807c342512SAneesh Kumar K.V 		 * hugepmds and never for establishing regular pmds that
22817c342512SAneesh Kumar K.V 		 * points to regular pagetables. Use pmd_populate for that
22827c342512SAneesh Kumar K.V 		 */
22837c342512SAneesh Kumar K.V 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
2284c4088ebdSKirill A. Shutemov 		spin_unlock(pmd_ptl);
228508b52706SKonstantin Khlebnikov 		anon_vma_unlock_write(vma->anon_vma);
22867d2eba05SEbru Akagunduz 		result = SCAN_FAIL;
2287ce83d217SAndrea Arcangeli 		goto out;
2288ba76149fSAndrea Arcangeli 	}
2289ba76149fSAndrea Arcangeli 
2290ba76149fSAndrea Arcangeli 	/*
2291ba76149fSAndrea Arcangeli 	 * All pages are isolated and locked so anon_vma rmap
2292ba76149fSAndrea Arcangeli 	 * can't run anymore.
2293ba76149fSAndrea Arcangeli 	 */
229408b52706SKonstantin Khlebnikov 	anon_vma_unlock_write(vma->anon_vma);
2295ba76149fSAndrea Arcangeli 
2296c4088ebdSKirill A. Shutemov 	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
2297453c7192SJohannes Weiner 	pte_unmap(pte);
2298ba76149fSAndrea Arcangeli 	__SetPageUptodate(new_page);
2299ba76149fSAndrea Arcangeli 	pgtable = pmd_pgtable(_pmd);
2300ba76149fSAndrea Arcangeli 
23013122359aSKirill A. Shutemov 	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
23023122359aSKirill A. Shutemov 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2303ba76149fSAndrea Arcangeli 
2304ba76149fSAndrea Arcangeli 	/*
2305ba76149fSAndrea Arcangeli 	 * spin_lock() below is not the equivalent of smp_wmb(), so
2306ba76149fSAndrea Arcangeli 	 * this is needed to avoid the copy_huge_page writes to become
2307ba76149fSAndrea Arcangeli 	 * visible after the set_pmd_at() write.
2308ba76149fSAndrea Arcangeli 	 */
2309ba76149fSAndrea Arcangeli 	smp_wmb();
2310ba76149fSAndrea Arcangeli 
2311c4088ebdSKirill A. Shutemov 	spin_lock(pmd_ptl);
2312ba76149fSAndrea Arcangeli 	BUG_ON(!pmd_none(*pmd));
2313d281ee61SKirill A. Shutemov 	page_add_new_anon_rmap(new_page, vma, address, true);
2314f627c2f5SKirill A. Shutemov 	mem_cgroup_commit_charge(new_page, memcg, false, true);
231500501b53SJohannes Weiner 	lru_cache_add_active_or_unevictable(new_page, vma);
2316fce144b4SAneesh Kumar K.V 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
2317ba76149fSAndrea Arcangeli 	set_pmd_at(mm, address, pmd, _pmd);
2318b113da65SDavid Miller 	update_mmu_cache_pmd(vma, address, pmd);
2319c4088ebdSKirill A. Shutemov 	spin_unlock(pmd_ptl);
2320ba76149fSAndrea Arcangeli 
2321ba76149fSAndrea Arcangeli 	*hpage = NULL;
2322420256efSXiao Guangrong 
2323ba76149fSAndrea Arcangeli 	khugepaged_pages_collapsed++;
23247d2eba05SEbru Akagunduz 	result = SCAN_SUCCEED;
2325ce83d217SAndrea Arcangeli out_up_write:
2326ba76149fSAndrea Arcangeli 	up_write(&mm->mmap_sem);
23277d2eba05SEbru Akagunduz 	trace_mm_collapse_huge_page(mm, isolated, result);
23280bbbc0b3SAndrea Arcangeli 	return;
23290bbbc0b3SAndrea Arcangeli 
23307d2eba05SEbru Akagunduz out_nolock:
23317d2eba05SEbru Akagunduz 	trace_mm_collapse_huge_page(mm, isolated, result);
23327d2eba05SEbru Akagunduz 	return;
2333ce83d217SAndrea Arcangeli out:
2334f627c2f5SKirill A. Shutemov 	mem_cgroup_cancel_charge(new_page, memcg, true);
2335ce83d217SAndrea Arcangeli 	goto out_up_write;
2336ba76149fSAndrea Arcangeli }
2337ba76149fSAndrea Arcangeli 
2338ba76149fSAndrea Arcangeli static int khugepaged_scan_pmd(struct mm_struct *mm,
2339ba76149fSAndrea Arcangeli 			       struct vm_area_struct *vma,
2340ba76149fSAndrea Arcangeli 			       unsigned long address,
2341ba76149fSAndrea Arcangeli 			       struct page **hpage)
2342ba76149fSAndrea Arcangeli {
2343ba76149fSAndrea Arcangeli 	pmd_t *pmd;
2344ba76149fSAndrea Arcangeli 	pte_t *pte, *_pte;
23457d2eba05SEbru Akagunduz 	int ret = 0, none_or_zero = 0, result = 0;
23467d2eba05SEbru Akagunduz 	struct page *page = NULL;
2347ba76149fSAndrea Arcangeli 	unsigned long _address;
2348ba76149fSAndrea Arcangeli 	spinlock_t *ptl;
234900ef2d2fSDavid Rientjes 	int node = NUMA_NO_NODE;
235010359213SEbru Akagunduz 	bool writable = false, referenced = false;
2351ba76149fSAndrea Arcangeli 
2352ba76149fSAndrea Arcangeli 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2353ba76149fSAndrea Arcangeli 
23546219049aSBob Liu 	pmd = mm_find_pmd(mm, address);
23557d2eba05SEbru Akagunduz 	if (!pmd) {
23567d2eba05SEbru Akagunduz 		result = SCAN_PMD_NULL;
2357ba76149fSAndrea Arcangeli 		goto out;
23587d2eba05SEbru Akagunduz 	}
2359ba76149fSAndrea Arcangeli 
23609f1b868aSBob Liu 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
2361ba76149fSAndrea Arcangeli 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2362ba76149fSAndrea Arcangeli 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2363ba76149fSAndrea Arcangeli 	     _pte++, _address += PAGE_SIZE) {
2364ba76149fSAndrea Arcangeli 		pte_t pteval = *_pte;
2365ca0984caSEbru Akagunduz 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2366c1294d05SAndrea Arcangeli 			if (!userfaultfd_armed(vma) &&
23677d2eba05SEbru Akagunduz 			    ++none_or_zero <= khugepaged_max_ptes_none) {
2368ba76149fSAndrea Arcangeli 				continue;
23697d2eba05SEbru Akagunduz 			} else {
23707d2eba05SEbru Akagunduz 				result = SCAN_EXCEED_NONE_PTE;
2371ba76149fSAndrea Arcangeli 				goto out_unmap;
2372ba76149fSAndrea Arcangeli 			}
23737d2eba05SEbru Akagunduz 		}
23747d2eba05SEbru Akagunduz 		if (!pte_present(pteval)) {
23757d2eba05SEbru Akagunduz 			result = SCAN_PTE_NON_PRESENT;
2376ba76149fSAndrea Arcangeli 			goto out_unmap;
23777d2eba05SEbru Akagunduz 		}
237810359213SEbru Akagunduz 		if (pte_write(pteval))
237910359213SEbru Akagunduz 			writable = true;
238010359213SEbru Akagunduz 
2381ba76149fSAndrea Arcangeli 		page = vm_normal_page(vma, _address, pteval);
23827d2eba05SEbru Akagunduz 		if (unlikely(!page)) {
23837d2eba05SEbru Akagunduz 			result = SCAN_PAGE_NULL;
2384ba76149fSAndrea Arcangeli 			goto out_unmap;
23857d2eba05SEbru Akagunduz 		}
2386b1caa957SKirill A. Shutemov 
2387b1caa957SKirill A. Shutemov 		/* TODO: teach khugepaged to collapse THP mapped with pte */
2388b1caa957SKirill A. Shutemov 		if (PageCompound(page)) {
2389b1caa957SKirill A. Shutemov 			result = SCAN_PAGE_COMPOUND;
2390b1caa957SKirill A. Shutemov 			goto out_unmap;
2391b1caa957SKirill A. Shutemov 		}
2392b1caa957SKirill A. Shutemov 
23935c4b4be3SAndi Kleen 		/*
23949f1b868aSBob Liu 		 * Record which node the original page is from and save this
23959f1b868aSBob Liu 		 * information to khugepaged_node_load[].
23969f1b868aSBob Liu 		 * Khupaged will allocate hugepage from the node has the max
23979f1b868aSBob Liu 		 * hit record.
23985c4b4be3SAndi Kleen 		 */
23995c4b4be3SAndi Kleen 		node = page_to_nid(page);
24007d2eba05SEbru Akagunduz 		if (khugepaged_scan_abort(node)) {
24017d2eba05SEbru Akagunduz 			result = SCAN_SCAN_ABORT;
240214a4e214SDavid Rientjes 			goto out_unmap;
24037d2eba05SEbru Akagunduz 		}
24049f1b868aSBob Liu 		khugepaged_node_load[node]++;
24057d2eba05SEbru Akagunduz 		if (!PageLRU(page)) {
24067d2eba05SEbru Akagunduz 			result = SCAN_SCAN_ABORT;
2407ba76149fSAndrea Arcangeli 			goto out_unmap;
24087d2eba05SEbru Akagunduz 		}
24097d2eba05SEbru Akagunduz 		if (PageLocked(page)) {
24107d2eba05SEbru Akagunduz 			result = SCAN_PAGE_LOCK;
24117d2eba05SEbru Akagunduz 			goto out_unmap;
24127d2eba05SEbru Akagunduz 		}
24137d2eba05SEbru Akagunduz 		if (!PageAnon(page)) {
24147d2eba05SEbru Akagunduz 			result = SCAN_PAGE_ANON;
24157d2eba05SEbru Akagunduz 			goto out_unmap;
24167d2eba05SEbru Akagunduz 		}
24177d2eba05SEbru Akagunduz 
241810359213SEbru Akagunduz 		/*
241910359213SEbru Akagunduz 		 * cannot use mapcount: can't collapse if there's a gup pin.
242010359213SEbru Akagunduz 		 * The page must only be referenced by the scanned process
242110359213SEbru Akagunduz 		 * and page swap cache.
242210359213SEbru Akagunduz 		 */
24237d2eba05SEbru Akagunduz 		if (page_count(page) != 1 + !!PageSwapCache(page)) {
24247d2eba05SEbru Akagunduz 			result = SCAN_PAGE_COUNT;
2425ba76149fSAndrea Arcangeli 			goto out_unmap;
24267d2eba05SEbru Akagunduz 		}
242733c3fc71SVladimir Davydov 		if (pte_young(pteval) ||
242833c3fc71SVladimir Davydov 		    page_is_young(page) || PageReferenced(page) ||
24298ee53820SAndrea Arcangeli 		    mmu_notifier_test_young(vma->vm_mm, address))
243010359213SEbru Akagunduz 			referenced = true;
2431ba76149fSAndrea Arcangeli 	}
24327d2eba05SEbru Akagunduz 	if (writable) {
24337d2eba05SEbru Akagunduz 		if (referenced) {
24347d2eba05SEbru Akagunduz 			result = SCAN_SUCCEED;
2435ba76149fSAndrea Arcangeli 			ret = 1;
24367d2eba05SEbru Akagunduz 		} else {
24377d2eba05SEbru Akagunduz 			result = SCAN_NO_REFERENCED_PAGE;
24387d2eba05SEbru Akagunduz 		}
24397d2eba05SEbru Akagunduz 	} else {
24407d2eba05SEbru Akagunduz 		result = SCAN_PAGE_RO;
24417d2eba05SEbru Akagunduz 	}
2442ba76149fSAndrea Arcangeli out_unmap:
2443ba76149fSAndrea Arcangeli 	pte_unmap_unlock(pte, ptl);
24449f1b868aSBob Liu 	if (ret) {
24459f1b868aSBob Liu 		node = khugepaged_find_target_node();
2446ce83d217SAndrea Arcangeli 		/* collapse_huge_page will return with the mmap_sem released */
24475c4b4be3SAndi Kleen 		collapse_huge_page(mm, address, hpage, vma, node);
24489f1b868aSBob Liu 	}
2449ba76149fSAndrea Arcangeli out:
24507d2eba05SEbru Akagunduz 	trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
24517d2eba05SEbru Akagunduz 				     none_or_zero, result);
2452ba76149fSAndrea Arcangeli 	return ret;
2453ba76149fSAndrea Arcangeli }
2454ba76149fSAndrea Arcangeli 
2455ba76149fSAndrea Arcangeli static void collect_mm_slot(struct mm_slot *mm_slot)
2456ba76149fSAndrea Arcangeli {
2457ba76149fSAndrea Arcangeli 	struct mm_struct *mm = mm_slot->mm;
2458ba76149fSAndrea Arcangeli 
2459b9980cdcSHugh Dickins 	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2460ba76149fSAndrea Arcangeli 
2461ba76149fSAndrea Arcangeli 	if (khugepaged_test_exit(mm)) {
2462ba76149fSAndrea Arcangeli 		/* free mm_slot */
246343b5fbbdSSasha Levin 		hash_del(&mm_slot->hash);
2464ba76149fSAndrea Arcangeli 		list_del(&mm_slot->mm_node);
2465ba76149fSAndrea Arcangeli 
2466ba76149fSAndrea Arcangeli 		/*
2467ba76149fSAndrea Arcangeli 		 * Not strictly needed because the mm exited already.
2468ba76149fSAndrea Arcangeli 		 *
2469ba76149fSAndrea Arcangeli 		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2470ba76149fSAndrea Arcangeli 		 */
2471ba76149fSAndrea Arcangeli 
2472ba76149fSAndrea Arcangeli 		/* khugepaged_mm_lock actually not necessary for the below */
2473ba76149fSAndrea Arcangeli 		free_mm_slot(mm_slot);
2474ba76149fSAndrea Arcangeli 		mmdrop(mm);
2475ba76149fSAndrea Arcangeli 	}
2476ba76149fSAndrea Arcangeli }
2477ba76149fSAndrea Arcangeli 
2478ba76149fSAndrea Arcangeli static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2479ba76149fSAndrea Arcangeli 					    struct page **hpage)
24802f1da642SH Hartley Sweeten 	__releases(&khugepaged_mm_lock)
24812f1da642SH Hartley Sweeten 	__acquires(&khugepaged_mm_lock)
2482ba76149fSAndrea Arcangeli {
2483ba76149fSAndrea Arcangeli 	struct mm_slot *mm_slot;
2484ba76149fSAndrea Arcangeli 	struct mm_struct *mm;
2485ba76149fSAndrea Arcangeli 	struct vm_area_struct *vma;
2486ba76149fSAndrea Arcangeli 	int progress = 0;
2487ba76149fSAndrea Arcangeli 
2488ba76149fSAndrea Arcangeli 	VM_BUG_ON(!pages);
2489b9980cdcSHugh Dickins 	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2490ba76149fSAndrea Arcangeli 
2491ba76149fSAndrea Arcangeli 	if (khugepaged_scan.mm_slot)
2492ba76149fSAndrea Arcangeli 		mm_slot = khugepaged_scan.mm_slot;
2493ba76149fSAndrea Arcangeli 	else {
2494ba76149fSAndrea Arcangeli 		mm_slot = list_entry(khugepaged_scan.mm_head.next,
2495ba76149fSAndrea Arcangeli 				     struct mm_slot, mm_node);
2496ba76149fSAndrea Arcangeli 		khugepaged_scan.address = 0;
2497ba76149fSAndrea Arcangeli 		khugepaged_scan.mm_slot = mm_slot;
2498ba76149fSAndrea Arcangeli 	}
2499ba76149fSAndrea Arcangeli 	spin_unlock(&khugepaged_mm_lock);
2500ba76149fSAndrea Arcangeli 
2501ba76149fSAndrea Arcangeli 	mm = mm_slot->mm;
2502ba76149fSAndrea Arcangeli 	down_read(&mm->mmap_sem);
2503ba76149fSAndrea Arcangeli 	if (unlikely(khugepaged_test_exit(mm)))
2504ba76149fSAndrea Arcangeli 		vma = NULL;
2505ba76149fSAndrea Arcangeli 	else
2506ba76149fSAndrea Arcangeli 		vma = find_vma(mm, khugepaged_scan.address);
2507ba76149fSAndrea Arcangeli 
2508ba76149fSAndrea Arcangeli 	progress++;
2509ba76149fSAndrea Arcangeli 	for (; vma; vma = vma->vm_next) {
2510ba76149fSAndrea Arcangeli 		unsigned long hstart, hend;
2511ba76149fSAndrea Arcangeli 
2512ba76149fSAndrea Arcangeli 		cond_resched();
2513ba76149fSAndrea Arcangeli 		if (unlikely(khugepaged_test_exit(mm))) {
2514ba76149fSAndrea Arcangeli 			progress++;
2515ba76149fSAndrea Arcangeli 			break;
2516ba76149fSAndrea Arcangeli 		}
2517fa475e51SBob Liu 		if (!hugepage_vma_check(vma)) {
2518a7d6e4ecSAndrea Arcangeli skip:
2519ba76149fSAndrea Arcangeli 			progress++;
2520ba76149fSAndrea Arcangeli 			continue;
2521ba76149fSAndrea Arcangeli 		}
2522ba76149fSAndrea Arcangeli 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2523ba76149fSAndrea Arcangeli 		hend = vma->vm_end & HPAGE_PMD_MASK;
2524a7d6e4ecSAndrea Arcangeli 		if (hstart >= hend)
2525a7d6e4ecSAndrea Arcangeli 			goto skip;
2526a7d6e4ecSAndrea Arcangeli 		if (khugepaged_scan.address > hend)
2527a7d6e4ecSAndrea Arcangeli 			goto skip;
2528ba76149fSAndrea Arcangeli 		if (khugepaged_scan.address < hstart)
2529ba76149fSAndrea Arcangeli 			khugepaged_scan.address = hstart;
2530a7d6e4ecSAndrea Arcangeli 		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2531ba76149fSAndrea Arcangeli 
2532ba76149fSAndrea Arcangeli 		while (khugepaged_scan.address < hend) {
2533ba76149fSAndrea Arcangeli 			int ret;
2534ba76149fSAndrea Arcangeli 			cond_resched();
2535ba76149fSAndrea Arcangeli 			if (unlikely(khugepaged_test_exit(mm)))
2536ba76149fSAndrea Arcangeli 				goto breakouterloop;
2537ba76149fSAndrea Arcangeli 
2538ba76149fSAndrea Arcangeli 			VM_BUG_ON(khugepaged_scan.address < hstart ||
2539ba76149fSAndrea Arcangeli 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
2540ba76149fSAndrea Arcangeli 				  hend);
2541ba76149fSAndrea Arcangeli 			ret = khugepaged_scan_pmd(mm, vma,
2542ba76149fSAndrea Arcangeli 						  khugepaged_scan.address,
2543ba76149fSAndrea Arcangeli 						  hpage);
2544ba76149fSAndrea Arcangeli 			/* move to next address */
2545ba76149fSAndrea Arcangeli 			khugepaged_scan.address += HPAGE_PMD_SIZE;
2546ba76149fSAndrea Arcangeli 			progress += HPAGE_PMD_NR;
2547ba76149fSAndrea Arcangeli 			if (ret)
2548ba76149fSAndrea Arcangeli 				/* we released mmap_sem so break loop */
2549ba76149fSAndrea Arcangeli 				goto breakouterloop_mmap_sem;
2550ba76149fSAndrea Arcangeli 			if (progress >= pages)
2551ba76149fSAndrea Arcangeli 				goto breakouterloop;
2552ba76149fSAndrea Arcangeli 		}
2553ba76149fSAndrea Arcangeli 	}
2554ba76149fSAndrea Arcangeli breakouterloop:
2555ba76149fSAndrea Arcangeli 	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2556ba76149fSAndrea Arcangeli breakouterloop_mmap_sem:
2557ba76149fSAndrea Arcangeli 
2558ba76149fSAndrea Arcangeli 	spin_lock(&khugepaged_mm_lock);
2559a7d6e4ecSAndrea Arcangeli 	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2560ba76149fSAndrea Arcangeli 	/*
2561ba76149fSAndrea Arcangeli 	 * Release the current mm_slot if this mm is about to die, or
2562ba76149fSAndrea Arcangeli 	 * if we scanned all vmas of this mm.
2563ba76149fSAndrea Arcangeli 	 */
2564ba76149fSAndrea Arcangeli 	if (khugepaged_test_exit(mm) || !vma) {
2565ba76149fSAndrea Arcangeli 		/*
2566ba76149fSAndrea Arcangeli 		 * Make sure that if mm_users is reaching zero while
2567ba76149fSAndrea Arcangeli 		 * khugepaged runs here, khugepaged_exit will find
2568ba76149fSAndrea Arcangeli 		 * mm_slot not pointing to the exiting mm.
2569ba76149fSAndrea Arcangeli 		 */
2570ba76149fSAndrea Arcangeli 		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2571ba76149fSAndrea Arcangeli 			khugepaged_scan.mm_slot = list_entry(
2572ba76149fSAndrea Arcangeli 				mm_slot->mm_node.next,
2573ba76149fSAndrea Arcangeli 				struct mm_slot, mm_node);
2574ba76149fSAndrea Arcangeli 			khugepaged_scan.address = 0;
2575ba76149fSAndrea Arcangeli 		} else {
2576ba76149fSAndrea Arcangeli 			khugepaged_scan.mm_slot = NULL;
2577ba76149fSAndrea Arcangeli 			khugepaged_full_scans++;
2578ba76149fSAndrea Arcangeli 		}
2579ba76149fSAndrea Arcangeli 
2580ba76149fSAndrea Arcangeli 		collect_mm_slot(mm_slot);
2581ba76149fSAndrea Arcangeli 	}
2582ba76149fSAndrea Arcangeli 
2583ba76149fSAndrea Arcangeli 	return progress;
2584ba76149fSAndrea Arcangeli }
2585ba76149fSAndrea Arcangeli 
2586ba76149fSAndrea Arcangeli static int khugepaged_has_work(void)
2587ba76149fSAndrea Arcangeli {
2588ba76149fSAndrea Arcangeli 	return !list_empty(&khugepaged_scan.mm_head) &&
2589ba76149fSAndrea Arcangeli 		khugepaged_enabled();
2590ba76149fSAndrea Arcangeli }
2591ba76149fSAndrea Arcangeli 
2592ba76149fSAndrea Arcangeli static int khugepaged_wait_event(void)
2593ba76149fSAndrea Arcangeli {
2594ba76149fSAndrea Arcangeli 	return !list_empty(&khugepaged_scan.mm_head) ||
25952017c0bfSXiao Guangrong 		kthread_should_stop();
2596ba76149fSAndrea Arcangeli }
2597ba76149fSAndrea Arcangeli 
2598d516904bSXiao Guangrong static void khugepaged_do_scan(void)
2599d516904bSXiao Guangrong {
2600d516904bSXiao Guangrong 	struct page *hpage = NULL;
2601ba76149fSAndrea Arcangeli 	unsigned int progress = 0, pass_through_head = 0;
2602ba76149fSAndrea Arcangeli 	unsigned int pages = khugepaged_pages_to_scan;
2603d516904bSXiao Guangrong 	bool wait = true;
2604ba76149fSAndrea Arcangeli 
2605ba76149fSAndrea Arcangeli 	barrier(); /* write khugepaged_pages_to_scan to local stack */
2606ba76149fSAndrea Arcangeli 
2607ba76149fSAndrea Arcangeli 	while (progress < pages) {
260826234f36SXiao Guangrong 		if (!khugepaged_prealloc_page(&hpage, &wait))
260926234f36SXiao Guangrong 			break;
2610d516904bSXiao Guangrong 
2611420256efSXiao Guangrong 		cond_resched();
2612ba76149fSAndrea Arcangeli 
2613cd092411SJiri Kosina 		if (unlikely(kthread_should_stop() || try_to_freeze()))
2614878aee7dSAndrea Arcangeli 			break;
2615878aee7dSAndrea Arcangeli 
2616ba76149fSAndrea Arcangeli 		spin_lock(&khugepaged_mm_lock);
2617ba76149fSAndrea Arcangeli 		if (!khugepaged_scan.mm_slot)
2618ba76149fSAndrea Arcangeli 			pass_through_head++;
2619ba76149fSAndrea Arcangeli 		if (khugepaged_has_work() &&
2620ba76149fSAndrea Arcangeli 		    pass_through_head < 2)
2621ba76149fSAndrea Arcangeli 			progress += khugepaged_scan_mm_slot(pages - progress,
2622d516904bSXiao Guangrong 							    &hpage);
2623ba76149fSAndrea Arcangeli 		else
2624ba76149fSAndrea Arcangeli 			progress = pages;
2625ba76149fSAndrea Arcangeli 		spin_unlock(&khugepaged_mm_lock);
2626ba76149fSAndrea Arcangeli 	}
2627ba76149fSAndrea Arcangeli 
2628d516904bSXiao Guangrong 	if (!IS_ERR_OR_NULL(hpage))
2629d516904bSXiao Guangrong 		put_page(hpage);
2630ba76149fSAndrea Arcangeli }
26310bbbc0b3SAndrea Arcangeli 
26322017c0bfSXiao Guangrong static void khugepaged_wait_work(void)
26332017c0bfSXiao Guangrong {
26342017c0bfSXiao Guangrong 	if (khugepaged_has_work()) {
26352017c0bfSXiao Guangrong 		if (!khugepaged_scan_sleep_millisecs)
26362017c0bfSXiao Guangrong 			return;
26372017c0bfSXiao Guangrong 
26382017c0bfSXiao Guangrong 		wait_event_freezable_timeout(khugepaged_wait,
26392017c0bfSXiao Guangrong 					     kthread_should_stop(),
26402017c0bfSXiao Guangrong 			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
26412017c0bfSXiao Guangrong 		return;
26422017c0bfSXiao Guangrong 	}
26432017c0bfSXiao Guangrong 
26442017c0bfSXiao Guangrong 	if (khugepaged_enabled())
26452017c0bfSXiao Guangrong 		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
26462017c0bfSXiao Guangrong }
26472017c0bfSXiao Guangrong 
2648ba76149fSAndrea Arcangeli static int khugepaged(void *none)
2649ba76149fSAndrea Arcangeli {
2650ba76149fSAndrea Arcangeli 	struct mm_slot *mm_slot;
2651ba76149fSAndrea Arcangeli 
2652878aee7dSAndrea Arcangeli 	set_freezable();
26538698a745SDongsheng Yang 	set_user_nice(current, MAX_NICE);
2654ba76149fSAndrea Arcangeli 
2655b7231789SXiao Guangrong 	while (!kthread_should_stop()) {
2656b7231789SXiao Guangrong 		khugepaged_do_scan();
2657b7231789SXiao Guangrong 		khugepaged_wait_work();
2658b7231789SXiao Guangrong 	}
2659ba76149fSAndrea Arcangeli 
2660ba76149fSAndrea Arcangeli 	spin_lock(&khugepaged_mm_lock);
2661ba76149fSAndrea Arcangeli 	mm_slot = khugepaged_scan.mm_slot;
2662ba76149fSAndrea Arcangeli 	khugepaged_scan.mm_slot = NULL;
2663ba76149fSAndrea Arcangeli 	if (mm_slot)
2664ba76149fSAndrea Arcangeli 		collect_mm_slot(mm_slot);
2665ba76149fSAndrea Arcangeli 	spin_unlock(&khugepaged_mm_lock);
2666ba76149fSAndrea Arcangeli 	return 0;
2667ba76149fSAndrea Arcangeli }
2668ba76149fSAndrea Arcangeli 
266978ddc534SKirill A. Shutemov static void split_huge_pmd_address(struct vm_area_struct *vma,
267094fcc585SAndrea Arcangeli 				    unsigned long address)
267194fcc585SAndrea Arcangeli {
2672f72e7dcdSHugh Dickins 	pgd_t *pgd;
2673f72e7dcdSHugh Dickins 	pud_t *pud;
267494fcc585SAndrea Arcangeli 	pmd_t *pmd;
267594fcc585SAndrea Arcangeli 
267694fcc585SAndrea Arcangeli 	VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
267794fcc585SAndrea Arcangeli 
267878ddc534SKirill A. Shutemov 	pgd = pgd_offset(vma->vm_mm, address);
2679f72e7dcdSHugh Dickins 	if (!pgd_present(*pgd))
2680f72e7dcdSHugh Dickins 		return;
2681f72e7dcdSHugh Dickins 
2682f72e7dcdSHugh Dickins 	pud = pud_offset(pgd, address);
2683f72e7dcdSHugh Dickins 	if (!pud_present(*pud))
2684f72e7dcdSHugh Dickins 		return;
2685f72e7dcdSHugh Dickins 
2686f72e7dcdSHugh Dickins 	pmd = pmd_offset(pud, address);
268778ddc534SKirill A. Shutemov 	if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
268894fcc585SAndrea Arcangeli 		return;
268994fcc585SAndrea Arcangeli 	/*
269094fcc585SAndrea Arcangeli 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
269194fcc585SAndrea Arcangeli 	 * materialize from under us.
269294fcc585SAndrea Arcangeli 	 */
2693ad0bed24SKirill A. Shutemov 	split_huge_pmd(vma, pmd, address);
269494fcc585SAndrea Arcangeli }
269594fcc585SAndrea Arcangeli 
2696e1b9996bSKirill A. Shutemov void vma_adjust_trans_huge(struct vm_area_struct *vma,
269794fcc585SAndrea Arcangeli 			     unsigned long start,
269894fcc585SAndrea Arcangeli 			     unsigned long end,
269994fcc585SAndrea Arcangeli 			     long adjust_next)
270094fcc585SAndrea Arcangeli {
270194fcc585SAndrea Arcangeli 	/*
270294fcc585SAndrea Arcangeli 	 * If the new start address isn't hpage aligned and it could
270394fcc585SAndrea Arcangeli 	 * previously contain an hugepage: check if we need to split
270494fcc585SAndrea Arcangeli 	 * an huge pmd.
270594fcc585SAndrea Arcangeli 	 */
270694fcc585SAndrea Arcangeli 	if (start & ~HPAGE_PMD_MASK &&
270794fcc585SAndrea Arcangeli 	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
270894fcc585SAndrea Arcangeli 	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
270978ddc534SKirill A. Shutemov 		split_huge_pmd_address(vma, start);
271094fcc585SAndrea Arcangeli 
271194fcc585SAndrea Arcangeli 	/*
271294fcc585SAndrea Arcangeli 	 * If the new end address isn't hpage aligned and it could
271394fcc585SAndrea Arcangeli 	 * previously contain an hugepage: check if we need to split
271494fcc585SAndrea Arcangeli 	 * an huge pmd.
271594fcc585SAndrea Arcangeli 	 */
271694fcc585SAndrea Arcangeli 	if (end & ~HPAGE_PMD_MASK &&
271794fcc585SAndrea Arcangeli 	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
271894fcc585SAndrea Arcangeli 	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
271978ddc534SKirill A. Shutemov 		split_huge_pmd_address(vma, end);
272094fcc585SAndrea Arcangeli 
272194fcc585SAndrea Arcangeli 	/*
272294fcc585SAndrea Arcangeli 	 * If we're also updating the vma->vm_next->vm_start, if the new
272394fcc585SAndrea Arcangeli 	 * vm_next->vm_start isn't page aligned and it could previously
272494fcc585SAndrea Arcangeli 	 * contain an hugepage: check if we need to split an huge pmd.
272594fcc585SAndrea Arcangeli 	 */
272694fcc585SAndrea Arcangeli 	if (adjust_next > 0) {
272794fcc585SAndrea Arcangeli 		struct vm_area_struct *next = vma->vm_next;
272894fcc585SAndrea Arcangeli 		unsigned long nstart = next->vm_start;
272994fcc585SAndrea Arcangeli 		nstart += adjust_next << PAGE_SHIFT;
273094fcc585SAndrea Arcangeli 		if (nstart & ~HPAGE_PMD_MASK &&
273194fcc585SAndrea Arcangeli 		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
273294fcc585SAndrea Arcangeli 		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
273378ddc534SKirill A. Shutemov 			split_huge_pmd_address(next, nstart);
273494fcc585SAndrea Arcangeli 	}
273594fcc585SAndrea Arcangeli }
2736