xref: /linux/mm/khugepaged.c (revision 06a5e1268a5fb9c2b346a3da6b97e85f2eba0f07)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
2b46e756fSKirill A. Shutemov #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3b46e756fSKirill A. Shutemov 
4b46e756fSKirill A. Shutemov #include <linux/mm.h>
5b46e756fSKirill A. Shutemov #include <linux/sched.h>
66e84f315SIngo Molnar #include <linux/sched/mm.h>
7f7ccbae4SIngo Molnar #include <linux/sched/coredump.h>
8b46e756fSKirill A. Shutemov #include <linux/mmu_notifier.h>
9b46e756fSKirill A. Shutemov #include <linux/rmap.h>
10b46e756fSKirill A. Shutemov #include <linux/swap.h>
11b46e756fSKirill A. Shutemov #include <linux/mm_inline.h>
12b46e756fSKirill A. Shutemov #include <linux/kthread.h>
13b46e756fSKirill A. Shutemov #include <linux/khugepaged.h>
14b46e756fSKirill A. Shutemov #include <linux/freezer.h>
15b46e756fSKirill A. Shutemov #include <linux/mman.h>
16b46e756fSKirill A. Shutemov #include <linux/hashtable.h>
17b46e756fSKirill A. Shutemov #include <linux/userfaultfd_k.h>
18b46e756fSKirill A. Shutemov #include <linux/page_idle.h>
19b46e756fSKirill A. Shutemov #include <linux/swapops.h>
20f3f0e1d2SKirill A. Shutemov #include <linux/shmem_fs.h>
21b46e756fSKirill A. Shutemov 
22b46e756fSKirill A. Shutemov #include <asm/tlb.h>
23b46e756fSKirill A. Shutemov #include <asm/pgalloc.h>
24b46e756fSKirill A. Shutemov #include "internal.h"
25b46e756fSKirill A. Shutemov 
26b46e756fSKirill A. Shutemov enum scan_result {
27b46e756fSKirill A. Shutemov 	SCAN_FAIL,
28b46e756fSKirill A. Shutemov 	SCAN_SUCCEED,
29b46e756fSKirill A. Shutemov 	SCAN_PMD_NULL,
30b46e756fSKirill A. Shutemov 	SCAN_EXCEED_NONE_PTE,
31b46e756fSKirill A. Shutemov 	SCAN_PTE_NON_PRESENT,
32b46e756fSKirill A. Shutemov 	SCAN_PAGE_RO,
330db501f7SEbru Akagunduz 	SCAN_LACK_REFERENCED_PAGE,
34b46e756fSKirill A. Shutemov 	SCAN_PAGE_NULL,
35b46e756fSKirill A. Shutemov 	SCAN_SCAN_ABORT,
36b46e756fSKirill A. Shutemov 	SCAN_PAGE_COUNT,
37b46e756fSKirill A. Shutemov 	SCAN_PAGE_LRU,
38b46e756fSKirill A. Shutemov 	SCAN_PAGE_LOCK,
39b46e756fSKirill A. Shutemov 	SCAN_PAGE_ANON,
40b46e756fSKirill A. Shutemov 	SCAN_PAGE_COMPOUND,
41b46e756fSKirill A. Shutemov 	SCAN_ANY_PROCESS,
42b46e756fSKirill A. Shutemov 	SCAN_VMA_NULL,
43b46e756fSKirill A. Shutemov 	SCAN_VMA_CHECK,
44b46e756fSKirill A. Shutemov 	SCAN_ADDRESS_RANGE,
45b46e756fSKirill A. Shutemov 	SCAN_SWAP_CACHE_PAGE,
46b46e756fSKirill A. Shutemov 	SCAN_DEL_PAGE_LRU,
47b46e756fSKirill A. Shutemov 	SCAN_ALLOC_HUGE_PAGE_FAIL,
48b46e756fSKirill A. Shutemov 	SCAN_CGROUP_CHARGE_FAIL,
49f3f0e1d2SKirill A. Shutemov 	SCAN_EXCEED_SWAP_PTE,
50f3f0e1d2SKirill A. Shutemov 	SCAN_TRUNCATED,
51b46e756fSKirill A. Shutemov };
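
/*
 * Note: these status codes are reported through the huge_memory
 * tracepoints pulled in below via CREATE_TRACE_POINTS, so a given
 * collapse attempt's outcome can be observed from userspace, e.g.
 * under /sys/kernel/debug/tracing/events/huge_memory/.
 */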
52b46e756fSKirill A. Shutemov 
53b46e756fSKirill A. Shutemov #define CREATE_TRACE_POINTS
54b46e756fSKirill A. Shutemov #include <trace/events/huge_memory.h>
55b46e756fSKirill A. Shutemov 
56b46e756fSKirill A. Shutemov /* default scan 8*512 ptes (or vmas) every 10 seconds */
57b46e756fSKirill A. Shutemov static unsigned int khugepaged_pages_to_scan __read_mostly;
58b46e756fSKirill A. Shutemov static unsigned int khugepaged_pages_collapsed;
59b46e756fSKirill A. Shutemov static unsigned int khugepaged_full_scans;
60b46e756fSKirill A. Shutemov static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
61b46e756fSKirill A. Shutemov /* during fragmentation poll the hugepage allocator once every minute */
62b46e756fSKirill A. Shutemov static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
63b46e756fSKirill A. Shutemov static unsigned long khugepaged_sleep_expire;
64b46e756fSKirill A. Shutemov static DEFINE_SPINLOCK(khugepaged_mm_lock);
65b46e756fSKirill A. Shutemov static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
66b46e756fSKirill A. Shutemov /*
67b46e756fSKirill A. Shutemov  * By default, collapse hugepages if there is at least one pte mapped,
68b46e756fSKirill A. Shutemov  * as it would have been mapped had the vma been large enough during
69b46e756fSKirill A. Shutemov  * the page fault.
70b46e756fSKirill A. Shutemov  */
71b46e756fSKirill A. Shutemov static unsigned int khugepaged_max_ptes_none __read_mostly;
72b46e756fSKirill A. Shutemov static unsigned int khugepaged_max_ptes_swap __read_mostly;
73b46e756fSKirill A. Shutemov 
74b46e756fSKirill A. Shutemov #define MM_SLOTS_HASH_BITS 10
75b46e756fSKirill A. Shutemov static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
76b46e756fSKirill A. Shutemov 
77b46e756fSKirill A. Shutemov static struct kmem_cache *mm_slot_cache __read_mostly;
78b46e756fSKirill A. Shutemov 
79b46e756fSKirill A. Shutemov /**
80b46e756fSKirill A. Shutemov  * struct mm_slot - hash lookup from mm to mm_slot
81b46e756fSKirill A. Shutemov  * @hash: hash collision list
82b46e756fSKirill A. Shutemov  * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
83b46e756fSKirill A. Shutemov  * @mm: the mm that this information is valid for
84b46e756fSKirill A. Shutemov  */
85b46e756fSKirill A. Shutemov struct mm_slot {
86b46e756fSKirill A. Shutemov 	struct hlist_node hash;
87b46e756fSKirill A. Shutemov 	struct list_head mm_node;
88b46e756fSKirill A. Shutemov 	struct mm_struct *mm;
89b46e756fSKirill A. Shutemov };
90b46e756fSKirill A. Shutemov 
91b46e756fSKirill A. Shutemov /**
92b46e756fSKirill A. Shutemov  * struct khugepaged_scan - cursor for scanning
93b46e756fSKirill A. Shutemov  * @mm_head: the head of the mm list to scan
94b46e756fSKirill A. Shutemov  * @mm_slot: the current mm_slot we are scanning
95b46e756fSKirill A. Shutemov  * @address: the next address inside that to be scanned
96b46e756fSKirill A. Shutemov  *
97b46e756fSKirill A. Shutemov  * There is only the one khugepaged_scan instance of this cursor structure.
98b46e756fSKirill A. Shutemov  */
99b46e756fSKirill A. Shutemov struct khugepaged_scan {
100b46e756fSKirill A. Shutemov 	struct list_head mm_head;
101b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
102b46e756fSKirill A. Shutemov 	unsigned long address;
103b46e756fSKirill A. Shutemov };
104b46e756fSKirill A. Shutemov 
105b46e756fSKirill A. Shutemov static struct khugepaged_scan khugepaged_scan = {
106b46e756fSKirill A. Shutemov 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
107b46e756fSKirill A. Shutemov };
108b46e756fSKirill A. Shutemov 
109e1465d12SJérémy Lefaure #ifdef CONFIG_SYSFS
110b46e756fSKirill A. Shutemov static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
111b46e756fSKirill A. Shutemov 					 struct kobj_attribute *attr,
112b46e756fSKirill A. Shutemov 					 char *buf)
113b46e756fSKirill A. Shutemov {
114b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
115b46e756fSKirill A. Shutemov }
116b46e756fSKirill A. Shutemov 
117b46e756fSKirill A. Shutemov static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
118b46e756fSKirill A. Shutemov 					  struct kobj_attribute *attr,
119b46e756fSKirill A. Shutemov 					  const char *buf, size_t count)
120b46e756fSKirill A. Shutemov {
121b46e756fSKirill A. Shutemov 	unsigned long msecs;
122b46e756fSKirill A. Shutemov 	int err;
123b46e756fSKirill A. Shutemov 
124b46e756fSKirill A. Shutemov 	err = kstrtoul(buf, 10, &msecs);
125b46e756fSKirill A. Shutemov 	if (err || msecs > UINT_MAX)
126b46e756fSKirill A. Shutemov 		return -EINVAL;
127b46e756fSKirill A. Shutemov 
128b46e756fSKirill A. Shutemov 	khugepaged_scan_sleep_millisecs = msecs;
129b46e756fSKirill A. Shutemov 	khugepaged_sleep_expire = 0;
130b46e756fSKirill A. Shutemov 	wake_up_interruptible(&khugepaged_wait);
131b46e756fSKirill A. Shutemov 
132b46e756fSKirill A. Shutemov 	return count;
133b46e756fSKirill A. Shutemov }
134b46e756fSKirill A. Shutemov static struct kobj_attribute scan_sleep_millisecs_attr =
135b46e756fSKirill A. Shutemov 	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
136b46e756fSKirill A. Shutemov 	       scan_sleep_millisecs_store);
137b46e756fSKirill A. Shutemov 
138b46e756fSKirill A. Shutemov static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
139b46e756fSKirill A. Shutemov 					  struct kobj_attribute *attr,
140b46e756fSKirill A. Shutemov 					  char *buf)
141b46e756fSKirill A. Shutemov {
142b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
143b46e756fSKirill A. Shutemov }
144b46e756fSKirill A. Shutemov 
145b46e756fSKirill A. Shutemov static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
146b46e756fSKirill A. Shutemov 					   struct kobj_attribute *attr,
147b46e756fSKirill A. Shutemov 					   const char *buf, size_t count)
148b46e756fSKirill A. Shutemov {
149b46e756fSKirill A. Shutemov 	unsigned long msecs;
150b46e756fSKirill A. Shutemov 	int err;
151b46e756fSKirill A. Shutemov 
152b46e756fSKirill A. Shutemov 	err = kstrtoul(buf, 10, &msecs);
153b46e756fSKirill A. Shutemov 	if (err || msecs > UINT_MAX)
154b46e756fSKirill A. Shutemov 		return -EINVAL;
155b46e756fSKirill A. Shutemov 
156b46e756fSKirill A. Shutemov 	khugepaged_alloc_sleep_millisecs = msecs;
157b46e756fSKirill A. Shutemov 	khugepaged_sleep_expire = 0;
158b46e756fSKirill A. Shutemov 	wake_up_interruptible(&khugepaged_wait);
159b46e756fSKirill A. Shutemov 
160b46e756fSKirill A. Shutemov 	return count;
161b46e756fSKirill A. Shutemov }
162b46e756fSKirill A. Shutemov static struct kobj_attribute alloc_sleep_millisecs_attr =
163b46e756fSKirill A. Shutemov 	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
164b46e756fSKirill A. Shutemov 	       alloc_sleep_millisecs_store);
165b46e756fSKirill A. Shutemov 
166b46e756fSKirill A. Shutemov static ssize_t pages_to_scan_show(struct kobject *kobj,
167b46e756fSKirill A. Shutemov 				  struct kobj_attribute *attr,
168b46e756fSKirill A. Shutemov 				  char *buf)
169b46e756fSKirill A. Shutemov {
170b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
171b46e756fSKirill A. Shutemov }
172b46e756fSKirill A. Shutemov static ssize_t pages_to_scan_store(struct kobject *kobj,
173b46e756fSKirill A. Shutemov 				   struct kobj_attribute *attr,
174b46e756fSKirill A. Shutemov 				   const char *buf, size_t count)
175b46e756fSKirill A. Shutemov {
176b46e756fSKirill A. Shutemov 	int err;
177b46e756fSKirill A. Shutemov 	unsigned long pages;
178b46e756fSKirill A. Shutemov 
179b46e756fSKirill A. Shutemov 	err = kstrtoul(buf, 10, &pages);
180b46e756fSKirill A. Shutemov 	if (err || !pages || pages > UINT_MAX)
181b46e756fSKirill A. Shutemov 		return -EINVAL;
182b46e756fSKirill A. Shutemov 
183b46e756fSKirill A. Shutemov 	khugepaged_pages_to_scan = pages;
184b46e756fSKirill A. Shutemov 
185b46e756fSKirill A. Shutemov 	return count;
186b46e756fSKirill A. Shutemov }
187b46e756fSKirill A. Shutemov static struct kobj_attribute pages_to_scan_attr =
188b46e756fSKirill A. Shutemov 	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
189b46e756fSKirill A. Shutemov 	       pages_to_scan_store);
190b46e756fSKirill A. Shutemov 
191b46e756fSKirill A. Shutemov static ssize_t pages_collapsed_show(struct kobject *kobj,
192b46e756fSKirill A. Shutemov 				    struct kobj_attribute *attr,
193b46e756fSKirill A. Shutemov 				    char *buf)
194b46e756fSKirill A. Shutemov {
195b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
196b46e756fSKirill A. Shutemov }
197b46e756fSKirill A. Shutemov static struct kobj_attribute pages_collapsed_attr =
198b46e756fSKirill A. Shutemov 	__ATTR_RO(pages_collapsed);
199b46e756fSKirill A. Shutemov 
200b46e756fSKirill A. Shutemov static ssize_t full_scans_show(struct kobject *kobj,
201b46e756fSKirill A. Shutemov 			       struct kobj_attribute *attr,
202b46e756fSKirill A. Shutemov 			       char *buf)
203b46e756fSKirill A. Shutemov {
204b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_full_scans);
205b46e756fSKirill A. Shutemov }
206b46e756fSKirill A. Shutemov static struct kobj_attribute full_scans_attr =
207b46e756fSKirill A. Shutemov 	__ATTR_RO(full_scans);
208b46e756fSKirill A. Shutemov 
209b46e756fSKirill A. Shutemov static ssize_t khugepaged_defrag_show(struct kobject *kobj,
210b46e756fSKirill A. Shutemov 				      struct kobj_attribute *attr, char *buf)
211b46e756fSKirill A. Shutemov {
212b46e756fSKirill A. Shutemov 	return single_hugepage_flag_show(kobj, attr, buf,
213b46e756fSKirill A. Shutemov 				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
214b46e756fSKirill A. Shutemov }
215b46e756fSKirill A. Shutemov static ssize_t khugepaged_defrag_store(struct kobject *kobj,
216b46e756fSKirill A. Shutemov 				       struct kobj_attribute *attr,
217b46e756fSKirill A. Shutemov 				       const char *buf, size_t count)
218b46e756fSKirill A. Shutemov {
219b46e756fSKirill A. Shutemov 	return single_hugepage_flag_store(kobj, attr, buf, count,
220b46e756fSKirill A. Shutemov 				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
221b46e756fSKirill A. Shutemov }
222b46e756fSKirill A. Shutemov static struct kobj_attribute khugepaged_defrag_attr =
223b46e756fSKirill A. Shutemov 	__ATTR(defrag, 0644, khugepaged_defrag_show,
224b46e756fSKirill A. Shutemov 	       khugepaged_defrag_store);
225b46e756fSKirill A. Shutemov 
226b46e756fSKirill A. Shutemov /*
227b46e756fSKirill A. Shutemov  * max_ptes_none controls whether khugepaged should collapse hugepages over
228b46e756fSKirill A. Shutemov  * any unmapped ptes in turn potentially increasing the memory
229b46e756fSKirill A. Shutemov  * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
230b46e756fSKirill A. Shutemov  * reduce the available free memory in the system as it
231b46e756fSKirill A. Shutemov  * runs. Increasing max_ptes_none will instead potentially reduce the
232b46e756fSKirill A. Shutemov  * free memory in the system during the khugepaged scan.
233b46e756fSKirill A. Shutemov  */
234b46e756fSKirill A. Shutemov static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
235b46e756fSKirill A. Shutemov 					     struct kobj_attribute *attr,
236b46e756fSKirill A. Shutemov 					     char *buf)
237b46e756fSKirill A. Shutemov {
238b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
239b46e756fSKirill A. Shutemov }
240b46e756fSKirill A. Shutemov static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
241b46e756fSKirill A. Shutemov 					      struct kobj_attribute *attr,
242b46e756fSKirill A. Shutemov 					      const char *buf, size_t count)
243b46e756fSKirill A. Shutemov {
244b46e756fSKirill A. Shutemov 	int err;
245b46e756fSKirill A. Shutemov 	unsigned long max_ptes_none;
246b46e756fSKirill A. Shutemov 
247b46e756fSKirill A. Shutemov 	err = kstrtoul(buf, 10, &max_ptes_none);
248b46e756fSKirill A. Shutemov 	if (err || max_ptes_none > HPAGE_PMD_NR-1)
249b46e756fSKirill A. Shutemov 		return -EINVAL;
250b46e756fSKirill A. Shutemov 
251b46e756fSKirill A. Shutemov 	khugepaged_max_ptes_none = max_ptes_none;
252b46e756fSKirill A. Shutemov 
253b46e756fSKirill A. Shutemov 	return count;
254b46e756fSKirill A. Shutemov }
255b46e756fSKirill A. Shutemov static struct kobj_attribute khugepaged_max_ptes_none_attr =
256b46e756fSKirill A. Shutemov 	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
257b46e756fSKirill A. Shutemov 	       khugepaged_max_ptes_none_store);
258b46e756fSKirill A. Shutemov 
259b46e756fSKirill A. Shutemov static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
260b46e756fSKirill A. Shutemov 					     struct kobj_attribute *attr,
261b46e756fSKirill A. Shutemov 					     char *buf)
262b46e756fSKirill A. Shutemov {
263b46e756fSKirill A. Shutemov 	return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
264b46e756fSKirill A. Shutemov }
265b46e756fSKirill A. Shutemov 
266b46e756fSKirill A. Shutemov static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
267b46e756fSKirill A. Shutemov 					      struct kobj_attribute *attr,
268b46e756fSKirill A. Shutemov 					      const char *buf, size_t count)
269b46e756fSKirill A. Shutemov {
270b46e756fSKirill A. Shutemov 	int err;
271b46e756fSKirill A. Shutemov 	unsigned long max_ptes_swap;
272b46e756fSKirill A. Shutemov 
273b46e756fSKirill A. Shutemov 	err  = kstrtoul(buf, 10, &max_ptes_swap);
274b46e756fSKirill A. Shutemov 	if (err || max_ptes_swap > HPAGE_PMD_NR-1)
275b46e756fSKirill A. Shutemov 		return -EINVAL;
276b46e756fSKirill A. Shutemov 
277b46e756fSKirill A. Shutemov 	khugepaged_max_ptes_swap = max_ptes_swap;
278b46e756fSKirill A. Shutemov 
279b46e756fSKirill A. Shutemov 	return count;
280b46e756fSKirill A. Shutemov }
281b46e756fSKirill A. Shutemov 
282b46e756fSKirill A. Shutemov static struct kobj_attribute khugepaged_max_ptes_swap_attr =
283b46e756fSKirill A. Shutemov 	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
284b46e756fSKirill A. Shutemov 	       khugepaged_max_ptes_swap_store);
285b46e756fSKirill A. Shutemov 
286b46e756fSKirill A. Shutemov static struct attribute *khugepaged_attr[] = {
287b46e756fSKirill A. Shutemov 	&khugepaged_defrag_attr.attr,
288b46e756fSKirill A. Shutemov 	&khugepaged_max_ptes_none_attr.attr,
289b46e756fSKirill A. Shutemov 	&pages_to_scan_attr.attr,
290b46e756fSKirill A. Shutemov 	&pages_collapsed_attr.attr,
291b46e756fSKirill A. Shutemov 	&full_scans_attr.attr,
292b46e756fSKirill A. Shutemov 	&scan_sleep_millisecs_attr.attr,
293b46e756fSKirill A. Shutemov 	&alloc_sleep_millisecs_attr.attr,
294b46e756fSKirill A. Shutemov 	&khugepaged_max_ptes_swap_attr.attr,
295b46e756fSKirill A. Shutemov 	NULL,
296b46e756fSKirill A. Shutemov };
297b46e756fSKirill A. Shutemov 
298b46e756fSKirill A. Shutemov struct attribute_group khugepaged_attr_group = {
299b46e756fSKirill A. Shutemov 	.attrs = khugepaged_attr,
300b46e756fSKirill A. Shutemov 	.name = "khugepaged",
301b46e756fSKirill A. Shutemov };
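
/*
 * With CONFIG_SYSFS, this group is registered (from huge_memory.c) under
 * the transparent_hugepage kobject, so the knobs above show up as
 * /sys/kernel/mm/transparent_hugepage/khugepaged/*, e.g.:
 *
 *	echo 511 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
 */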
302e1465d12SJérémy Lefaure #endif /* CONFIG_SYSFS */
303b46e756fSKirill A. Shutemov 
304f3f0e1d2SKirill A. Shutemov #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
305b46e756fSKirill A. Shutemov 
306b46e756fSKirill A. Shutemov int hugepage_madvise(struct vm_area_struct *vma,
307b46e756fSKirill A. Shutemov 		     unsigned long *vm_flags, int advice)
308b46e756fSKirill A. Shutemov {
309b46e756fSKirill A. Shutemov 	switch (advice) {
310b46e756fSKirill A. Shutemov 	case MADV_HUGEPAGE:
311b46e756fSKirill A. Shutemov #ifdef CONFIG_S390
312b46e756fSKirill A. Shutemov 		/*
313b46e756fSKirill A. Shutemov 		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
314b46e756fSKirill A. Shutemov 		 * can't handle this properly after s390_enable_sie, so we simply
315b46e756fSKirill A. Shutemov 		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
316b46e756fSKirill A. Shutemov 		 */
317b46e756fSKirill A. Shutemov 		if (mm_has_pgste(vma->vm_mm))
318b46e756fSKirill A. Shutemov 			return 0;
319b46e756fSKirill A. Shutemov #endif
320b46e756fSKirill A. Shutemov 		*vm_flags &= ~VM_NOHUGEPAGE;
321b46e756fSKirill A. Shutemov 		*vm_flags |= VM_HUGEPAGE;
322b46e756fSKirill A. Shutemov 		/*
323b46e756fSKirill A. Shutemov 		 * If the vma becomes eligible for khugepaged to scan,
324b46e756fSKirill A. Shutemov 		 * register it here without waiting for a page fault that
325b46e756fSKirill A. Shutemov 		 * may not happen any time soon.
326b46e756fSKirill A. Shutemov 		 */
327b46e756fSKirill A. Shutemov 		if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
328b46e756fSKirill A. Shutemov 				khugepaged_enter_vma_merge(vma, *vm_flags))
329b46e756fSKirill A. Shutemov 			return -ENOMEM;
330b46e756fSKirill A. Shutemov 		break;
331b46e756fSKirill A. Shutemov 	case MADV_NOHUGEPAGE:
332b46e756fSKirill A. Shutemov 		*vm_flags &= ~VM_HUGEPAGE;
333b46e756fSKirill A. Shutemov 		*vm_flags |= VM_NOHUGEPAGE;
334b46e756fSKirill A. Shutemov 		/*
335b46e756fSKirill A. Shutemov 		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
336b46e756fSKirill A. Shutemov 		 * this vma even if the mm remains registered in khugepaged
337b46e756fSKirill A. Shutemov 		 * (i.e. if it got registered before VM_NOHUGEPAGE was set).
338b46e756fSKirill A. Shutemov 		 */
339b46e756fSKirill A. Shutemov 		break;
340b46e756fSKirill A. Shutemov 	}
341b46e756fSKirill A. Shutemov 
342b46e756fSKirill A. Shutemov 	return 0;
343b46e756fSKirill A. Shutemov }
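
/*
 * For reference, the usual userspace trigger for the MADV_HUGEPAGE path
 * above is simply:
 *
 *	madvise(addr, length, MADV_HUGEPAGE);
 *
 * on an anonymous mapping, after which khugepaged may register the mm and
 * collapse the range into huge pages in the background.
 */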
344b46e756fSKirill A. Shutemov 
345b46e756fSKirill A. Shutemov int __init khugepaged_init(void)
346b46e756fSKirill A. Shutemov {
347b46e756fSKirill A. Shutemov 	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
348b46e756fSKirill A. Shutemov 					  sizeof(struct mm_slot),
349b46e756fSKirill A. Shutemov 					  __alignof__(struct mm_slot), 0, NULL);
350b46e756fSKirill A. Shutemov 	if (!mm_slot_cache)
351b46e756fSKirill A. Shutemov 		return -ENOMEM;
352b46e756fSKirill A. Shutemov 
353b46e756fSKirill A. Shutemov 	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
354b46e756fSKirill A. Shutemov 	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
355b46e756fSKirill A. Shutemov 	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
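	/*
	 * With 2MB PMD-sized huge pages (HPAGE_PMD_NR == 512, e.g. x86-64
	 * with 4K base pages) these defaults work out to scanning 4096
	 * ptes per pass, tolerating up to 511 empty ptes and up to 64
	 * swapped-out ptes per collapse candidate.
	 */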
356b46e756fSKirill A. Shutemov 
357b46e756fSKirill A. Shutemov 	return 0;
358b46e756fSKirill A. Shutemov }
359b46e756fSKirill A. Shutemov 
360b46e756fSKirill A. Shutemov void __init khugepaged_destroy(void)
361b46e756fSKirill A. Shutemov {
362b46e756fSKirill A. Shutemov 	kmem_cache_destroy(mm_slot_cache);
363b46e756fSKirill A. Shutemov }
364b46e756fSKirill A. Shutemov 
365b46e756fSKirill A. Shutemov static inline struct mm_slot *alloc_mm_slot(void)
366b46e756fSKirill A. Shutemov {
367b46e756fSKirill A. Shutemov 	if (!mm_slot_cache)	/* initialization failed */
368b46e756fSKirill A. Shutemov 		return NULL;
369b46e756fSKirill A. Shutemov 	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
370b46e756fSKirill A. Shutemov }
371b46e756fSKirill A. Shutemov 
372b46e756fSKirill A. Shutemov static inline void free_mm_slot(struct mm_slot *mm_slot)
373b46e756fSKirill A. Shutemov {
374b46e756fSKirill A. Shutemov 	kmem_cache_free(mm_slot_cache, mm_slot);
375b46e756fSKirill A. Shutemov }
376b46e756fSKirill A. Shutemov 
377b46e756fSKirill A. Shutemov static struct mm_slot *get_mm_slot(struct mm_struct *mm)
378b46e756fSKirill A. Shutemov {
379b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
380b46e756fSKirill A. Shutemov 
381b46e756fSKirill A. Shutemov 	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
382b46e756fSKirill A. Shutemov 		if (mm == mm_slot->mm)
383b46e756fSKirill A. Shutemov 			return mm_slot;
384b46e756fSKirill A. Shutemov 
385b46e756fSKirill A. Shutemov 	return NULL;
386b46e756fSKirill A. Shutemov }
387b46e756fSKirill A. Shutemov 
388b46e756fSKirill A. Shutemov static void insert_to_mm_slots_hash(struct mm_struct *mm,
389b46e756fSKirill A. Shutemov 				    struct mm_slot *mm_slot)
390b46e756fSKirill A. Shutemov {
391b46e756fSKirill A. Shutemov 	mm_slot->mm = mm;
392b46e756fSKirill A. Shutemov 	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
393b46e756fSKirill A. Shutemov }
394b46e756fSKirill A. Shutemov 
395b46e756fSKirill A. Shutemov static inline int khugepaged_test_exit(struct mm_struct *mm)
396b46e756fSKirill A. Shutemov {
397b46e756fSKirill A. Shutemov 	return atomic_read(&mm->mm_users) == 0;
398b46e756fSKirill A. Shutemov }
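
/*
 * mm_users reaching zero means the owning process is exiting; khugepaged
 * itself only holds an mm_count reference (taken with mmgrab() in
 * __khugepaged_enter()), so this test tells the scanner to back off and
 * let the exit path tear the mm down.
 */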
399b46e756fSKirill A. Shutemov 
40050f8b92fSSong Liu static bool hugepage_vma_check(struct vm_area_struct *vma,
40150f8b92fSSong Liu 			       unsigned long vm_flags)
402c2231020SYang Shi {
40350f8b92fSSong Liu 	if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
40450f8b92fSSong Liu 	    (vm_flags & VM_NOHUGEPAGE) ||
405c2231020SYang Shi 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
406c2231020SYang Shi 		return false;
407c2231020SYang Shi 	if (shmem_file(vma->vm_file)) {
408c2231020SYang Shi 		if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
409c2231020SYang Shi 			return false;
410c2231020SYang Shi 		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
411c2231020SYang Shi 				HPAGE_PMD_NR);
412c2231020SYang Shi 	}
413c2231020SYang Shi 	if (!vma->anon_vma || vma->vm_ops)
414c2231020SYang Shi 		return false;
415c2231020SYang Shi 	if (is_vma_temporary_stack(vma))
416c2231020SYang Shi 		return false;
41750f8b92fSSong Liu 	return !(vm_flags & VM_NO_KHUGEPAGED);
418c2231020SYang Shi }
419c2231020SYang Shi 
420b46e756fSKirill A. Shutemov int __khugepaged_enter(struct mm_struct *mm)
421b46e756fSKirill A. Shutemov {
422b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
423b46e756fSKirill A. Shutemov 	int wakeup;
424b46e756fSKirill A. Shutemov 
425b46e756fSKirill A. Shutemov 	mm_slot = alloc_mm_slot();
426b46e756fSKirill A. Shutemov 	if (!mm_slot)
427b46e756fSKirill A. Shutemov 		return -ENOMEM;
428b46e756fSKirill A. Shutemov 
429b46e756fSKirill A. Shutemov 	/* __khugepaged_exit() must not run from under us */
430b46e756fSKirill A. Shutemov 	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
431b46e756fSKirill A. Shutemov 	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
432b46e756fSKirill A. Shutemov 		free_mm_slot(mm_slot);
433b46e756fSKirill A. Shutemov 		return 0;
434b46e756fSKirill A. Shutemov 	}
435b46e756fSKirill A. Shutemov 
436b46e756fSKirill A. Shutemov 	spin_lock(&khugepaged_mm_lock);
437b46e756fSKirill A. Shutemov 	insert_to_mm_slots_hash(mm, mm_slot);
438b46e756fSKirill A. Shutemov 	/*
439b46e756fSKirill A. Shutemov 	 * Insert just behind the scanning cursor, to let the area settle
440b46e756fSKirill A. Shutemov 	 * down a little.
441b46e756fSKirill A. Shutemov 	 */
442b46e756fSKirill A. Shutemov 	wakeup = list_empty(&khugepaged_scan.mm_head);
443b46e756fSKirill A. Shutemov 	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
444b46e756fSKirill A. Shutemov 	spin_unlock(&khugepaged_mm_lock);
445b46e756fSKirill A. Shutemov 
446f1f10076SVegard Nossum 	mmgrab(mm);
447b46e756fSKirill A. Shutemov 	if (wakeup)
448b46e756fSKirill A. Shutemov 		wake_up_interruptible(&khugepaged_wait);
449b46e756fSKirill A. Shutemov 
450b46e756fSKirill A. Shutemov 	return 0;
451b46e756fSKirill A. Shutemov }
452b46e756fSKirill A. Shutemov 
453b46e756fSKirill A. Shutemov int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
454b46e756fSKirill A. Shutemov 			       unsigned long vm_flags)
455b46e756fSKirill A. Shutemov {
456b46e756fSKirill A. Shutemov 	unsigned long hstart, hend;
457c2231020SYang Shi 
458b46e756fSKirill A. Shutemov 	/*
459c2231020SYang Shi 	 * khugepaged does not yet work on non-shmem files or special
460c2231020SYang Shi 	 * mappings, and file-private shmem THP is not supported.
461b46e756fSKirill A. Shutemov 	 */
46250f8b92fSSong Liu 	if (!hugepage_vma_check(vma, vm_flags))
463b46e756fSKirill A. Shutemov 		return 0;
464c2231020SYang Shi 
465b46e756fSKirill A. Shutemov 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
466b46e756fSKirill A. Shutemov 	hend = vma->vm_end & HPAGE_PMD_MASK;
467b46e756fSKirill A. Shutemov 	if (hstart < hend)
468b46e756fSKirill A. Shutemov 		return khugepaged_enter(vma, vm_flags);
469b46e756fSKirill A. Shutemov 	return 0;
470b46e756fSKirill A. Shutemov }
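
/*
 * A worked example of the rounding above, assuming 2MB huge pages
 * (HPAGE_PMD_SIZE == 0x200000): for a vma covering [0x1ff000, 0x600000),
 * hstart rounds up to 0x200000 and hend rounds down to 0x600000, so the
 * vma spans at least one aligned PMD range and the mm gets registered;
 * a vma smaller than one aligned 2MB range ends up with hstart >= hend
 * and is skipped.
 */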
471b46e756fSKirill A. Shutemov 
472b46e756fSKirill A. Shutemov void __khugepaged_exit(struct mm_struct *mm)
473b46e756fSKirill A. Shutemov {
474b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
475b46e756fSKirill A. Shutemov 	int free = 0;
476b46e756fSKirill A. Shutemov 
477b46e756fSKirill A. Shutemov 	spin_lock(&khugepaged_mm_lock);
478b46e756fSKirill A. Shutemov 	mm_slot = get_mm_slot(mm);
479b46e756fSKirill A. Shutemov 	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
480b46e756fSKirill A. Shutemov 		hash_del(&mm_slot->hash);
481b46e756fSKirill A. Shutemov 		list_del(&mm_slot->mm_node);
482b46e756fSKirill A. Shutemov 		free = 1;
483b46e756fSKirill A. Shutemov 	}
484b46e756fSKirill A. Shutemov 	spin_unlock(&khugepaged_mm_lock);
485b46e756fSKirill A. Shutemov 
486b46e756fSKirill A. Shutemov 	if (free) {
487b46e756fSKirill A. Shutemov 		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
488b46e756fSKirill A. Shutemov 		free_mm_slot(mm_slot);
489b46e756fSKirill A. Shutemov 		mmdrop(mm);
490b46e756fSKirill A. Shutemov 	} else if (mm_slot) {
491b46e756fSKirill A. Shutemov 		/*
492b46e756fSKirill A. Shutemov 		 * This is required to serialize against
493b46e756fSKirill A. Shutemov 		 * khugepaged_test_exit() (which is guaranteed to run
494b46e756fSKirill A. Shutemov 		 * under mmap_sem read mode). Stop here (after we
495b46e756fSKirill A. Shutemov 		 * return, all pagetables will be destroyed) until
496b46e756fSKirill A. Shutemov 		 * khugepaged has finished working on the pagetables
497b46e756fSKirill A. Shutemov 		 * under the mmap_sem.
498b46e756fSKirill A. Shutemov 		 */
499b46e756fSKirill A. Shutemov 		down_write(&mm->mmap_sem);
500b46e756fSKirill A. Shutemov 		up_write(&mm->mmap_sem);
501b46e756fSKirill A. Shutemov 	}
502b46e756fSKirill A. Shutemov }
503b46e756fSKirill A. Shutemov 
504b46e756fSKirill A. Shutemov static void release_pte_page(struct page *page)
505b46e756fSKirill A. Shutemov {
506d44d363fSShaohua Li 	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
507b46e756fSKirill A. Shutemov 	unlock_page(page);
508b46e756fSKirill A. Shutemov 	putback_lru_page(page);
509b46e756fSKirill A. Shutemov }
510b46e756fSKirill A. Shutemov 
511b46e756fSKirill A. Shutemov static void release_pte_pages(pte_t *pte, pte_t *_pte)
512b46e756fSKirill A. Shutemov {
513b46e756fSKirill A. Shutemov 	while (--_pte >= pte) {
514b46e756fSKirill A. Shutemov 		pte_t pteval = *_pte;
515b46e756fSKirill A. Shutemov 		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
516b46e756fSKirill A. Shutemov 			release_pte_page(pte_page(pteval));
517b46e756fSKirill A. Shutemov 	}
518b46e756fSKirill A. Shutemov }
519b46e756fSKirill A. Shutemov 
520b46e756fSKirill A. Shutemov static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
521b46e756fSKirill A. Shutemov 					unsigned long address,
522b46e756fSKirill A. Shutemov 					pte_t *pte)
523b46e756fSKirill A. Shutemov {
524b46e756fSKirill A. Shutemov 	struct page *page = NULL;
525b46e756fSKirill A. Shutemov 	pte_t *_pte;
5260db501f7SEbru Akagunduz 	int none_or_zero = 0, result = 0, referenced = 0;
5270db501f7SEbru Akagunduz 	bool writable = false;
528b46e756fSKirill A. Shutemov 
529b46e756fSKirill A. Shutemov 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
530b46e756fSKirill A. Shutemov 	     _pte++, address += PAGE_SIZE) {
531b46e756fSKirill A. Shutemov 		pte_t pteval = *_pte;
532b46e756fSKirill A. Shutemov 		if (pte_none(pteval) || (pte_present(pteval) &&
533b46e756fSKirill A. Shutemov 				is_zero_pfn(pte_pfn(pteval)))) {
534b46e756fSKirill A. Shutemov 			if (!userfaultfd_armed(vma) &&
535b46e756fSKirill A. Shutemov 			    ++none_or_zero <= khugepaged_max_ptes_none) {
536b46e756fSKirill A. Shutemov 				continue;
537b46e756fSKirill A. Shutemov 			} else {
538b46e756fSKirill A. Shutemov 				result = SCAN_EXCEED_NONE_PTE;
539b46e756fSKirill A. Shutemov 				goto out;
540b46e756fSKirill A. Shutemov 			}
541b46e756fSKirill A. Shutemov 		}
542b46e756fSKirill A. Shutemov 		if (!pte_present(pteval)) {
543b46e756fSKirill A. Shutemov 			result = SCAN_PTE_NON_PRESENT;
544b46e756fSKirill A. Shutemov 			goto out;
545b46e756fSKirill A. Shutemov 		}
546b46e756fSKirill A. Shutemov 		page = vm_normal_page(vma, address, pteval);
547b46e756fSKirill A. Shutemov 		if (unlikely(!page)) {
548b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_NULL;
549b46e756fSKirill A. Shutemov 			goto out;
550b46e756fSKirill A. Shutemov 		}
551b46e756fSKirill A. Shutemov 
552fece2029SKirill A. Shutemov 		/* TODO: teach khugepaged to collapse THP mapped with pte */
553fece2029SKirill A. Shutemov 		if (PageCompound(page)) {
554fece2029SKirill A. Shutemov 			result = SCAN_PAGE_COMPOUND;
555fece2029SKirill A. Shutemov 			goto out;
556fece2029SKirill A. Shutemov 		}
557fece2029SKirill A. Shutemov 
558b46e756fSKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageAnon(page), page);
559b46e756fSKirill A. Shutemov 
560b46e756fSKirill A. Shutemov 		/*
561b46e756fSKirill A. Shutemov 		 * We can do it before isolate_lru_page because the
562b46e756fSKirill A. Shutemov 		 * page can't be freed from under us. NOTE: PG_lock
563b46e756fSKirill A. Shutemov 		 * is needed to serialize against split_huge_page
564b46e756fSKirill A. Shutemov 		 * when invoked from the VM.
565b46e756fSKirill A. Shutemov 		 */
566b46e756fSKirill A. Shutemov 		if (!trylock_page(page)) {
567b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_LOCK;
568b46e756fSKirill A. Shutemov 			goto out;
569b46e756fSKirill A. Shutemov 		}
570b46e756fSKirill A. Shutemov 
571b46e756fSKirill A. Shutemov 		/*
572b46e756fSKirill A. Shutemov 		 * cannot use mapcount: can't collapse if there's a gup pin.
573b46e756fSKirill A. Shutemov 		 * The page must only be referenced by the scanned process
574b46e756fSKirill A. Shutemov 		 * and page swap cache.
575b46e756fSKirill A. Shutemov 		 */
5762948be5aSMinchan Kim 		if (page_count(page) != 1 + PageSwapCache(page)) {
577b46e756fSKirill A. Shutemov 			unlock_page(page);
578b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_COUNT;
579b46e756fSKirill A. Shutemov 			goto out;
580b46e756fSKirill A. Shutemov 		}
581b46e756fSKirill A. Shutemov 		if (pte_write(pteval)) {
582b46e756fSKirill A. Shutemov 			writable = true;
583b46e756fSKirill A. Shutemov 		} else {
584b46e756fSKirill A. Shutemov 			if (PageSwapCache(page) &&
585b46e756fSKirill A. Shutemov 			    !reuse_swap_page(page, NULL)) {
586b46e756fSKirill A. Shutemov 				unlock_page(page);
587b46e756fSKirill A. Shutemov 				result = SCAN_SWAP_CACHE_PAGE;
588b46e756fSKirill A. Shutemov 				goto out;
589b46e756fSKirill A. Shutemov 			}
590b46e756fSKirill A. Shutemov 			/*
591b46e756fSKirill A. Shutemov 			 * Page is not in the swap cache. It can be collapsed
592b46e756fSKirill A. Shutemov 			 * into a THP.
593b46e756fSKirill A. Shutemov 			 */
594b46e756fSKirill A. Shutemov 		}
595b46e756fSKirill A. Shutemov 
596b46e756fSKirill A. Shutemov 		/*
597b46e756fSKirill A. Shutemov 		 * Isolate the page to avoid collapsing a hugepage
598b46e756fSKirill A. Shutemov 		 * currently in use by the VM.
599b46e756fSKirill A. Shutemov 		 */
600b46e756fSKirill A. Shutemov 		if (isolate_lru_page(page)) {
601b46e756fSKirill A. Shutemov 			unlock_page(page);
602b46e756fSKirill A. Shutemov 			result = SCAN_DEL_PAGE_LRU;
603b46e756fSKirill A. Shutemov 			goto out;
604b46e756fSKirill A. Shutemov 		}
605d44d363fSShaohua Li 		inc_node_page_state(page,
606d44d363fSShaohua Li 				NR_ISOLATED_ANON + page_is_file_cache(page));
607b46e756fSKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageLocked(page), page);
608b46e756fSKirill A. Shutemov 		VM_BUG_ON_PAGE(PageLRU(page), page);
609b46e756fSKirill A. Shutemov 
6100db501f7SEbru Akagunduz 		/* There should be enough young ptes to collapse the page */
611b46e756fSKirill A. Shutemov 		if (pte_young(pteval) ||
612b46e756fSKirill A. Shutemov 		    page_is_young(page) || PageReferenced(page) ||
613b46e756fSKirill A. Shutemov 		    mmu_notifier_test_young(vma->vm_mm, address))
6140db501f7SEbru Akagunduz 			referenced++;
615b46e756fSKirill A. Shutemov 	}
616b46e756fSKirill A. Shutemov 	if (likely(writable)) {
617b46e756fSKirill A. Shutemov 		if (likely(referenced)) {
618b46e756fSKirill A. Shutemov 			result = SCAN_SUCCEED;
619b46e756fSKirill A. Shutemov 			trace_mm_collapse_huge_page_isolate(page, none_or_zero,
620b46e756fSKirill A. Shutemov 							    referenced, writable, result);
621b46e756fSKirill A. Shutemov 			return 1;
622b46e756fSKirill A. Shutemov 		}
623b46e756fSKirill A. Shutemov 	} else {
624b46e756fSKirill A. Shutemov 		result = SCAN_PAGE_RO;
625b46e756fSKirill A. Shutemov 	}
626b46e756fSKirill A. Shutemov 
627b46e756fSKirill A. Shutemov out:
628b46e756fSKirill A. Shutemov 	release_pte_pages(pte, _pte);
629b46e756fSKirill A. Shutemov 	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
630b46e756fSKirill A. Shutemov 					    referenced, writable, result);
631b46e756fSKirill A. Shutemov 	return 0;
632b46e756fSKirill A. Shutemov }
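
/*
 * On success all HPAGE_PMD_NR pages behind the pmd are locked and off the
 * LRU, ready to be copied into the new huge page; on failure,
 * release_pte_pages() has already unlocked and put back everything that
 * was isolated so far.
 */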
633b46e756fSKirill A. Shutemov 
634b46e756fSKirill A. Shutemov static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
635b46e756fSKirill A. Shutemov 				      struct vm_area_struct *vma,
636b46e756fSKirill A. Shutemov 				      unsigned long address,
637b46e756fSKirill A. Shutemov 				      spinlock_t *ptl)
638b46e756fSKirill A. Shutemov {
639b46e756fSKirill A. Shutemov 	pte_t *_pte;
640338a16baSDavid Rientjes 	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
641338a16baSDavid Rientjes 				_pte++, page++, address += PAGE_SIZE) {
642b46e756fSKirill A. Shutemov 		pte_t pteval = *_pte;
643b46e756fSKirill A. Shutemov 		struct page *src_page;
644b46e756fSKirill A. Shutemov 
645b46e756fSKirill A. Shutemov 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
646b46e756fSKirill A. Shutemov 			clear_user_highpage(page, address);
647b46e756fSKirill A. Shutemov 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
648b46e756fSKirill A. Shutemov 			if (is_zero_pfn(pte_pfn(pteval))) {
649b46e756fSKirill A. Shutemov 				/*
650b46e756fSKirill A. Shutemov 				 * ptl mostly unnecessary.
651b46e756fSKirill A. Shutemov 				 */
652b46e756fSKirill A. Shutemov 				spin_lock(ptl);
653b46e756fSKirill A. Shutemov 				/*
654b46e756fSKirill A. Shutemov 				 * paravirt calls inside pte_clear here are
655b46e756fSKirill A. Shutemov 				 * superfluous.
656b46e756fSKirill A. Shutemov 				 */
657b46e756fSKirill A. Shutemov 				pte_clear(vma->vm_mm, address, _pte);
658b46e756fSKirill A. Shutemov 				spin_unlock(ptl);
659b46e756fSKirill A. Shutemov 			}
660b46e756fSKirill A. Shutemov 		} else {
661b46e756fSKirill A. Shutemov 			src_page = pte_page(pteval);
662b46e756fSKirill A. Shutemov 			copy_user_highpage(page, src_page, address, vma);
663b46e756fSKirill A. Shutemov 			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
664b46e756fSKirill A. Shutemov 			release_pte_page(src_page);
665b46e756fSKirill A. Shutemov 			/*
666b46e756fSKirill A. Shutemov 			 * ptl mostly unnecessary, but preempt has to
667b46e756fSKirill A. Shutemov 			 * be disabled to update the per-cpu stats
668b46e756fSKirill A. Shutemov 			 * inside page_remove_rmap().
669b46e756fSKirill A. Shutemov 			 */
670b46e756fSKirill A. Shutemov 			spin_lock(ptl);
671b46e756fSKirill A. Shutemov 			/*
672b46e756fSKirill A. Shutemov 			 * paravirt calls inside pte_clear here are
673b46e756fSKirill A. Shutemov 			 * superfluous.
674b46e756fSKirill A. Shutemov 			 */
675b46e756fSKirill A. Shutemov 			pte_clear(vma->vm_mm, address, _pte);
676b46e756fSKirill A. Shutemov 			page_remove_rmap(src_page, false);
677b46e756fSKirill A. Shutemov 			spin_unlock(ptl);
678b46e756fSKirill A. Shutemov 			free_page_and_swap_cache(src_page);
679b46e756fSKirill A. Shutemov 		}
680b46e756fSKirill A. Shutemov 	}
681b46e756fSKirill A. Shutemov }
682b46e756fSKirill A. Shutemov 
683b46e756fSKirill A. Shutemov static void khugepaged_alloc_sleep(void)
684b46e756fSKirill A. Shutemov {
685b46e756fSKirill A. Shutemov 	DEFINE_WAIT(wait);
686b46e756fSKirill A. Shutemov 
687b46e756fSKirill A. Shutemov 	add_wait_queue(&khugepaged_wait, &wait);
688b46e756fSKirill A. Shutemov 	freezable_schedule_timeout_interruptible(
689b46e756fSKirill A. Shutemov 		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
690b46e756fSKirill A. Shutemov 	remove_wait_queue(&khugepaged_wait, &wait);
691b46e756fSKirill A. Shutemov }
692b46e756fSKirill A. Shutemov 
693b46e756fSKirill A. Shutemov static int khugepaged_node_load[MAX_NUMNODES];
694b46e756fSKirill A. Shutemov 
695b46e756fSKirill A. Shutemov static bool khugepaged_scan_abort(int nid)
696b46e756fSKirill A. Shutemov {
697b46e756fSKirill A. Shutemov 	int i;
698b46e756fSKirill A. Shutemov 
699b46e756fSKirill A. Shutemov 	/*
700a5f5f91dSMel Gorman 	 * If node_reclaim_mode is disabled, then no extra effort is made to
701b46e756fSKirill A. Shutemov 	 * allocate memory locally.
702b46e756fSKirill A. Shutemov 	 */
703a5f5f91dSMel Gorman 	if (!node_reclaim_mode)
704b46e756fSKirill A. Shutemov 		return false;
705b46e756fSKirill A. Shutemov 
706b46e756fSKirill A. Shutemov 	/* If there is a count for this node already, it must be acceptable */
707b46e756fSKirill A. Shutemov 	if (khugepaged_node_load[nid])
708b46e756fSKirill A. Shutemov 		return false;
709b46e756fSKirill A. Shutemov 
710b46e756fSKirill A. Shutemov 	for (i = 0; i < MAX_NUMNODES; i++) {
711b46e756fSKirill A. Shutemov 		if (!khugepaged_node_load[i])
712b46e756fSKirill A. Shutemov 			continue;
713b46e756fSKirill A. Shutemov 		if (node_distance(nid, i) > RECLAIM_DISTANCE)
714b46e756fSKirill A. Shutemov 			return true;
715b46e756fSKirill A. Shutemov 	}
716b46e756fSKirill A. Shutemov 	return false;
717b46e756fSKirill A. Shutemov }
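
/*
 * Example: with node_reclaim_mode enabled, if the ptes scanned so far map
 * pages on node 0 and the next pte maps a page on a node whose
 * node_distance() from node 0 exceeds RECLAIM_DISTANCE, the scan is
 * aborted rather than collapsing into a hugepage that would mix memory
 * from distant nodes.
 */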
718b46e756fSKirill A. Shutemov 
719b46e756fSKirill A. Shutemov /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
720b46e756fSKirill A. Shutemov static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
721b46e756fSKirill A. Shutemov {
72225160354SVlastimil Babka 	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
723b46e756fSKirill A. Shutemov }
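
/*
 * GFP_TRANSHUGE differs from GFP_TRANSHUGE_LIGHT by allowing direct
 * reclaim/compaction, so the "defrag" knob above decides whether
 * khugepaged may stall in the allocator rather than just skipping the
 * collapse when no huge page is readily available.
 */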
724b46e756fSKirill A. Shutemov 
725b46e756fSKirill A. Shutemov #ifdef CONFIG_NUMA
726b46e756fSKirill A. Shutemov static int khugepaged_find_target_node(void)
727b46e756fSKirill A. Shutemov {
728b46e756fSKirill A. Shutemov 	static int last_khugepaged_target_node = NUMA_NO_NODE;
729b46e756fSKirill A. Shutemov 	int nid, target_node = 0, max_value = 0;
730b46e756fSKirill A. Shutemov 
731b46e756fSKirill A. Shutemov 	/* find first node with max normal pages hit */
732b46e756fSKirill A. Shutemov 	for (nid = 0; nid < MAX_NUMNODES; nid++)
733b46e756fSKirill A. Shutemov 		if (khugepaged_node_load[nid] > max_value) {
734b46e756fSKirill A. Shutemov 			max_value = khugepaged_node_load[nid];
735b46e756fSKirill A. Shutemov 			target_node = nid;
736b46e756fSKirill A. Shutemov 		}
737b46e756fSKirill A. Shutemov 
738b46e756fSKirill A. Shutemov 	/* do some balancing if several nodes have the same hit count */
739b46e756fSKirill A. Shutemov 	if (target_node <= last_khugepaged_target_node)
740b46e756fSKirill A. Shutemov 		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
741b46e756fSKirill A. Shutemov 				nid++)
742b46e756fSKirill A. Shutemov 			if (max_value == khugepaged_node_load[nid]) {
743b46e756fSKirill A. Shutemov 				target_node = nid;
744b46e756fSKirill A. Shutemov 				break;
745b46e756fSKirill A. Shutemov 			}
746b46e756fSKirill A. Shutemov 
747b46e756fSKirill A. Shutemov 	last_khugepaged_target_node = target_node;
748b46e756fSKirill A. Shutemov 	return target_node;
749b46e756fSKirill A. Shutemov }
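
/*
 * Example of the tie-breaking above: if nodes 1 and 3 both hold the
 * maximum number of pages for this pmd range and the previous collapse
 * targeted node 1, node 3 is picked this time, spreading huge page
 * allocations across equally loaded nodes.
 */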
750b46e756fSKirill A. Shutemov 
751b46e756fSKirill A. Shutemov static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
752b46e756fSKirill A. Shutemov {
753b46e756fSKirill A. Shutemov 	if (IS_ERR(*hpage)) {
754b46e756fSKirill A. Shutemov 		if (!*wait)
755b46e756fSKirill A. Shutemov 			return false;
756b46e756fSKirill A. Shutemov 
757b46e756fSKirill A. Shutemov 		*wait = false;
758b46e756fSKirill A. Shutemov 		*hpage = NULL;
759b46e756fSKirill A. Shutemov 		khugepaged_alloc_sleep();
760b46e756fSKirill A. Shutemov 	} else if (*hpage) {
761b46e756fSKirill A. Shutemov 		put_page(*hpage);
762b46e756fSKirill A. Shutemov 		*hpage = NULL;
763b46e756fSKirill A. Shutemov 	}
764b46e756fSKirill A. Shutemov 
765b46e756fSKirill A. Shutemov 	return true;
766b46e756fSKirill A. Shutemov }
767b46e756fSKirill A. Shutemov 
768b46e756fSKirill A. Shutemov static struct page *
769988ddb71SKirill A. Shutemov khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
770b46e756fSKirill A. Shutemov {
771b46e756fSKirill A. Shutemov 	VM_BUG_ON_PAGE(*hpage, *hpage);
772b46e756fSKirill A. Shutemov 
773b46e756fSKirill A. Shutemov 	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
774b46e756fSKirill A. Shutemov 	if (unlikely(!*hpage)) {
775b46e756fSKirill A. Shutemov 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
776b46e756fSKirill A. Shutemov 		*hpage = ERR_PTR(-ENOMEM);
777b46e756fSKirill A. Shutemov 		return NULL;
778b46e756fSKirill A. Shutemov 	}
779b46e756fSKirill A. Shutemov 
780b46e756fSKirill A. Shutemov 	prep_transhuge_page(*hpage);
781b46e756fSKirill A. Shutemov 	count_vm_event(THP_COLLAPSE_ALLOC);
782b46e756fSKirill A. Shutemov 	return *hpage;
783b46e756fSKirill A. Shutemov }
784b46e756fSKirill A. Shutemov #else
785b46e756fSKirill A. Shutemov static int khugepaged_find_target_node(void)
786b46e756fSKirill A. Shutemov {
787b46e756fSKirill A. Shutemov 	return 0;
788b46e756fSKirill A. Shutemov }
789b46e756fSKirill A. Shutemov 
790b46e756fSKirill A. Shutemov static inline struct page *alloc_khugepaged_hugepage(void)
791b46e756fSKirill A. Shutemov {
792b46e756fSKirill A. Shutemov 	struct page *page;
793b46e756fSKirill A. Shutemov 
794b46e756fSKirill A. Shutemov 	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
795b46e756fSKirill A. Shutemov 			   HPAGE_PMD_ORDER);
796b46e756fSKirill A. Shutemov 	if (page)
797b46e756fSKirill A. Shutemov 		prep_transhuge_page(page);
798b46e756fSKirill A. Shutemov 	return page;
799b46e756fSKirill A. Shutemov }
800b46e756fSKirill A. Shutemov 
801b46e756fSKirill A. Shutemov static struct page *khugepaged_alloc_hugepage(bool *wait)
802b46e756fSKirill A. Shutemov {
803b46e756fSKirill A. Shutemov 	struct page *hpage;
804b46e756fSKirill A. Shutemov 
805b46e756fSKirill A. Shutemov 	do {
806b46e756fSKirill A. Shutemov 		hpage = alloc_khugepaged_hugepage();
807b46e756fSKirill A. Shutemov 		if (!hpage) {
808b46e756fSKirill A. Shutemov 			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
809b46e756fSKirill A. Shutemov 			if (!*wait)
810b46e756fSKirill A. Shutemov 				return NULL;
811b46e756fSKirill A. Shutemov 
812b46e756fSKirill A. Shutemov 			*wait = false;
813b46e756fSKirill A. Shutemov 			khugepaged_alloc_sleep();
814b46e756fSKirill A. Shutemov 		} else
815b46e756fSKirill A. Shutemov 			count_vm_event(THP_COLLAPSE_ALLOC);
816b46e756fSKirill A. Shutemov 	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
817b46e756fSKirill A. Shutemov 
818b46e756fSKirill A. Shutemov 	return hpage;
819b46e756fSKirill A. Shutemov }
820b46e756fSKirill A. Shutemov 
821b46e756fSKirill A. Shutemov static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
822b46e756fSKirill A. Shutemov {
823b46e756fSKirill A. Shutemov 	if (!*hpage)
824b46e756fSKirill A. Shutemov 		*hpage = khugepaged_alloc_hugepage(wait);
825b46e756fSKirill A. Shutemov 
826b46e756fSKirill A. Shutemov 	if (unlikely(!*hpage))
827b46e756fSKirill A. Shutemov 		return false;
828b46e756fSKirill A. Shutemov 
829b46e756fSKirill A. Shutemov 	return true;
830b46e756fSKirill A. Shutemov }
831b46e756fSKirill A. Shutemov 
832b46e756fSKirill A. Shutemov static struct page *
833988ddb71SKirill A. Shutemov khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
834b46e756fSKirill A. Shutemov {
835b46e756fSKirill A. Shutemov 	VM_BUG_ON(!*hpage);
836b46e756fSKirill A. Shutemov 
837b46e756fSKirill A. Shutemov 	return  *hpage;
838b46e756fSKirill A. Shutemov }
839b46e756fSKirill A. Shutemov #endif
840b46e756fSKirill A. Shutemov 
841b46e756fSKirill A. Shutemov /*
842b46e756fSKirill A. Shutemov  * If the mmap_sem was temporarily dropped, revalidate the vma
843b46e756fSKirill A. Shutemov  * after re-acquiring the mmap_sem.
844b46e756fSKirill A. Shutemov  * Return 0 on success, otherwise return a non-zero
845b46e756fSKirill A. Shutemov  * value (a scan_result code).
846b46e756fSKirill A. Shutemov  */
847b46e756fSKirill A. Shutemov 
848c131f751SKirill A. Shutemov static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
849c131f751SKirill A. Shutemov 		struct vm_area_struct **vmap)
850b46e756fSKirill A. Shutemov {
851b46e756fSKirill A. Shutemov 	struct vm_area_struct *vma;
852b46e756fSKirill A. Shutemov 	unsigned long hstart, hend;
853b46e756fSKirill A. Shutemov 
854b46e756fSKirill A. Shutemov 	if (unlikely(khugepaged_test_exit(mm)))
855b46e756fSKirill A. Shutemov 		return SCAN_ANY_PROCESS;
856b46e756fSKirill A. Shutemov 
857c131f751SKirill A. Shutemov 	*vmap = vma = find_vma(mm, address);
858b46e756fSKirill A. Shutemov 	if (!vma)
859b46e756fSKirill A. Shutemov 		return SCAN_VMA_NULL;
860b46e756fSKirill A. Shutemov 
861b46e756fSKirill A. Shutemov 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
862b46e756fSKirill A. Shutemov 	hend = vma->vm_end & HPAGE_PMD_MASK;
863b46e756fSKirill A. Shutemov 	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
864b46e756fSKirill A. Shutemov 		return SCAN_ADDRESS_RANGE;
86550f8b92fSSong Liu 	if (!hugepage_vma_check(vma, vma->vm_flags))
866b46e756fSKirill A. Shutemov 		return SCAN_VMA_CHECK;
867b46e756fSKirill A. Shutemov 	return 0;
868b46e756fSKirill A. Shutemov }
869b46e756fSKirill A. Shutemov 
870b46e756fSKirill A. Shutemov /*
871b46e756fSKirill A. Shutemov  * Bring missing pages in from swap, to complete THP collapse.
872b46e756fSKirill A. Shutemov  * Only done if khugepaged_scan_pmd believes it is worthwhile.
873b46e756fSKirill A. Shutemov  *
874b46e756fSKirill A. Shutemov  * Called and returns without pte mapped or spinlocks held,
875b46e756fSKirill A. Shutemov  * but with mmap_sem held to protect against vma changes.
876b46e756fSKirill A. Shutemov  */
877b46e756fSKirill A. Shutemov 
878b46e756fSKirill A. Shutemov static bool __collapse_huge_page_swapin(struct mm_struct *mm,
879b46e756fSKirill A. Shutemov 					struct vm_area_struct *vma,
8800db501f7SEbru Akagunduz 					unsigned long address, pmd_t *pmd,
8810db501f7SEbru Akagunduz 					int referenced)
882b46e756fSKirill A. Shutemov {
8832b740303SSouptick Joarder 	int swapped_in = 0;
8842b740303SSouptick Joarder 	vm_fault_t ret = 0;
88582b0f8c3SJan Kara 	struct vm_fault vmf = {
886b46e756fSKirill A. Shutemov 		.vma = vma,
887b46e756fSKirill A. Shutemov 		.address = address,
888b46e756fSKirill A. Shutemov 		.flags = FAULT_FLAG_ALLOW_RETRY,
889b46e756fSKirill A. Shutemov 		.pmd = pmd,
8900721ec8bSJan Kara 		.pgoff = linear_page_index(vma, address),
891b46e756fSKirill A. Shutemov 	};
892b46e756fSKirill A. Shutemov 
893982785c6SEbru Akagunduz 	/* we only decide to swap in if there are enough young ptes */
894982785c6SEbru Akagunduz 	if (referenced < HPAGE_PMD_NR/2) {
895982785c6SEbru Akagunduz 		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
896982785c6SEbru Akagunduz 		return false;
897982785c6SEbru Akagunduz 	}
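	/*
	 * With HPAGE_PMD_NR == 512 the check above means at least 256 of
	 * the scanned ptes must have been referenced before we pay the
	 * cost of swapping the missing pages back in.
	 */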
89882b0f8c3SJan Kara 	vmf.pte = pte_offset_map(pmd, address);
89982b0f8c3SJan Kara 	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
90082b0f8c3SJan Kara 			vmf.pte++, vmf.address += PAGE_SIZE) {
9012994302bSJan Kara 		vmf.orig_pte = *vmf.pte;
9022994302bSJan Kara 		if (!is_swap_pte(vmf.orig_pte))
903b46e756fSKirill A. Shutemov 			continue;
904b46e756fSKirill A. Shutemov 		swapped_in++;
9052994302bSJan Kara 		ret = do_swap_page(&vmf);
9060db501f7SEbru Akagunduz 
907b46e756fSKirill A. Shutemov 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
908b46e756fSKirill A. Shutemov 		if (ret & VM_FAULT_RETRY) {
909b46e756fSKirill A. Shutemov 			down_read(&mm->mmap_sem);
91082b0f8c3SJan Kara 			if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
911b46e756fSKirill A. Shutemov 				/* vma is no longer available, don't continue to swapin */
9120db501f7SEbru Akagunduz 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
913b46e756fSKirill A. Shutemov 				return false;
91447f863eaSEbru Akagunduz 			}
915b46e756fSKirill A. Shutemov 			/* check if the pmd is still valid */
916835152a2SSeongJae Park 			if (mm_find_pmd(mm, address) != pmd) {
917835152a2SSeongJae Park 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
918b46e756fSKirill A. Shutemov 				return false;
919b46e756fSKirill A. Shutemov 			}
920835152a2SSeongJae Park 		}
921b46e756fSKirill A. Shutemov 		if (ret & VM_FAULT_ERROR) {
9220db501f7SEbru Akagunduz 			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
923b46e756fSKirill A. Shutemov 			return false;
924b46e756fSKirill A. Shutemov 		}
925b46e756fSKirill A. Shutemov 		/* pte is unmapped now, we need to map it */
92682b0f8c3SJan Kara 		vmf.pte = pte_offset_map(pmd, vmf.address);
927b46e756fSKirill A. Shutemov 	}
92882b0f8c3SJan Kara 	vmf.pte--;
92982b0f8c3SJan Kara 	pte_unmap(vmf.pte);
9300db501f7SEbru Akagunduz 	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
931b46e756fSKirill A. Shutemov 	return true;
932b46e756fSKirill A. Shutemov }
933b46e756fSKirill A. Shutemov 
934b46e756fSKirill A. Shutemov static void collapse_huge_page(struct mm_struct *mm,
935b46e756fSKirill A. Shutemov 				   unsigned long address,
936b46e756fSKirill A. Shutemov 				   struct page **hpage,
9370db501f7SEbru Akagunduz 				   int node, int referenced)
938b46e756fSKirill A. Shutemov {
939b46e756fSKirill A. Shutemov 	pmd_t *pmd, _pmd;
940b46e756fSKirill A. Shutemov 	pte_t *pte;
941b46e756fSKirill A. Shutemov 	pgtable_t pgtable;
942b46e756fSKirill A. Shutemov 	struct page *new_page;
943b46e756fSKirill A. Shutemov 	spinlock_t *pmd_ptl, *pte_ptl;
944b46e756fSKirill A. Shutemov 	int isolated = 0, result = 0;
945b46e756fSKirill A. Shutemov 	struct mem_cgroup *memcg;
946c131f751SKirill A. Shutemov 	struct vm_area_struct *vma;
947b46e756fSKirill A. Shutemov 	unsigned long mmun_start;	/* For mmu_notifiers */
948b46e756fSKirill A. Shutemov 	unsigned long mmun_end;		/* For mmu_notifiers */
949b46e756fSKirill A. Shutemov 	gfp_t gfp;
950b46e756fSKirill A. Shutemov 
951b46e756fSKirill A. Shutemov 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
952b46e756fSKirill A. Shutemov 
953b46e756fSKirill A. Shutemov 	/* Only allocate from the target node */
95441b6167eSMichal Hocko 	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
955b46e756fSKirill A. Shutemov 
956988ddb71SKirill A. Shutemov 	/*
957988ddb71SKirill A. Shutemov 	 * Before allocating the hugepage, release the mmap_sem read lock.
958988ddb71SKirill A. Shutemov 	 * The allocation can take potentially a long time if it involves
959988ddb71SKirill A. Shutemov 	 * sync compaction, and we do not need to hold the mmap_sem during
960988ddb71SKirill A. Shutemov 	 * that. We will recheck the vma after taking it again in write mode.
961988ddb71SKirill A. Shutemov 	 */
962988ddb71SKirill A. Shutemov 	up_read(&mm->mmap_sem);
963988ddb71SKirill A. Shutemov 	new_page = khugepaged_alloc_page(hpage, gfp, node);
964b46e756fSKirill A. Shutemov 	if (!new_page) {
965b46e756fSKirill A. Shutemov 		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
966b46e756fSKirill A. Shutemov 		goto out_nolock;
967b46e756fSKirill A. Shutemov 	}
968b46e756fSKirill A. Shutemov 
9692a70f6a7SMichal Hocko 	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
970b46e756fSKirill A. Shutemov 		result = SCAN_CGROUP_CHARGE_FAIL;
971b46e756fSKirill A. Shutemov 		goto out_nolock;
972b46e756fSKirill A. Shutemov 	}
973b46e756fSKirill A. Shutemov 
974b46e756fSKirill A. Shutemov 	down_read(&mm->mmap_sem);
975c131f751SKirill A. Shutemov 	result = hugepage_vma_revalidate(mm, address, &vma);
976b46e756fSKirill A. Shutemov 	if (result) {
977b46e756fSKirill A. Shutemov 		mem_cgroup_cancel_charge(new_page, memcg, true);
978b46e756fSKirill A. Shutemov 		up_read(&mm->mmap_sem);
979b46e756fSKirill A. Shutemov 		goto out_nolock;
980b46e756fSKirill A. Shutemov 	}
981b46e756fSKirill A. Shutemov 
982b46e756fSKirill A. Shutemov 	pmd = mm_find_pmd(mm, address);
983b46e756fSKirill A. Shutemov 	if (!pmd) {
984b46e756fSKirill A. Shutemov 		result = SCAN_PMD_NULL;
985b46e756fSKirill A. Shutemov 		mem_cgroup_cancel_charge(new_page, memcg, true);
986b46e756fSKirill A. Shutemov 		up_read(&mm->mmap_sem);
987b46e756fSKirill A. Shutemov 		goto out_nolock;
988b46e756fSKirill A. Shutemov 	}
989b46e756fSKirill A. Shutemov 
990b46e756fSKirill A. Shutemov 	/*
991b46e756fSKirill A. Shutemov 	 * __collapse_huge_page_swapin always returns with mmap_sem locked.
99247f863eaSEbru Akagunduz 	 * If it fails, we release mmap_sem and jump to out_nolock.
993b46e756fSKirill A. Shutemov 	 * Continuing to collapse causes inconsistency.
994b46e756fSKirill A. Shutemov 	 */
9950db501f7SEbru Akagunduz 	if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
996b46e756fSKirill A. Shutemov 		mem_cgroup_cancel_charge(new_page, memcg, true);
997b46e756fSKirill A. Shutemov 		up_read(&mm->mmap_sem);
998b46e756fSKirill A. Shutemov 		goto out_nolock;
999b46e756fSKirill A. Shutemov 	}
1000b46e756fSKirill A. Shutemov 
1001b46e756fSKirill A. Shutemov 	up_read(&mm->mmap_sem);
1002b46e756fSKirill A. Shutemov 	/*
1003b46e756fSKirill A. Shutemov 	 * Prevent all access to the pagetables, with the exception of
1004b46e756fSKirill A. Shutemov 	 * gup_fast (handled later by the ptep_clear_flush) and the VM
1005b46e756fSKirill A. Shutemov 	 * (handled by the anon_vma lock + PG_lock).
1006b46e756fSKirill A. Shutemov 	 */
1007b46e756fSKirill A. Shutemov 	down_write(&mm->mmap_sem);
1008c131f751SKirill A. Shutemov 	result = hugepage_vma_revalidate(mm, address, &vma);
1009b46e756fSKirill A. Shutemov 	if (result)
1010b46e756fSKirill A. Shutemov 		goto out;
1011b46e756fSKirill A. Shutemov 	/* check if the pmd is still valid */
1012b46e756fSKirill A. Shutemov 	if (mm_find_pmd(mm, address) != pmd)
1013b46e756fSKirill A. Shutemov 		goto out;
1014b46e756fSKirill A. Shutemov 
1015b46e756fSKirill A. Shutemov 	anon_vma_lock_write(vma->anon_vma);
1016b46e756fSKirill A. Shutemov 
1017b46e756fSKirill A. Shutemov 	pte = pte_offset_map(pmd, address);
1018b46e756fSKirill A. Shutemov 	pte_ptl = pte_lockptr(mm, pmd);
1019b46e756fSKirill A. Shutemov 
1020b46e756fSKirill A. Shutemov 	mmun_start = address;
1021b46e756fSKirill A. Shutemov 	mmun_end   = address + HPAGE_PMD_SIZE;
1022b46e756fSKirill A. Shutemov 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1023b46e756fSKirill A. Shutemov 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1024b46e756fSKirill A. Shutemov 	/*
1025b46e756fSKirill A. Shutemov 	 * After this, gup_fast can't run anymore. This also removes
1026b46e756fSKirill A. Shutemov 	 * any huge TLB entry from the CPU, so we won't allow huge and
1027b46e756fSKirill A. Shutemov 	 * small TLB entries for the same virtual address, to avoid the
1028b46e756fSKirill A. Shutemov 	 * risk of CPU bugs in that area.
1029b46e756fSKirill A. Shutemov 	 */
1030b46e756fSKirill A. Shutemov 	_pmd = pmdp_collapse_flush(vma, address, pmd);
1031b46e756fSKirill A. Shutemov 	spin_unlock(pmd_ptl);
1032b46e756fSKirill A. Shutemov 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1033b46e756fSKirill A. Shutemov 
1034b46e756fSKirill A. Shutemov 	spin_lock(pte_ptl);
1035b46e756fSKirill A. Shutemov 	isolated = __collapse_huge_page_isolate(vma, address, pte);
1036b46e756fSKirill A. Shutemov 	spin_unlock(pte_ptl);
1037b46e756fSKirill A. Shutemov 
1038b46e756fSKirill A. Shutemov 	if (unlikely(!isolated)) {
1039b46e756fSKirill A. Shutemov 		pte_unmap(pte);
1040b46e756fSKirill A. Shutemov 		spin_lock(pmd_ptl);
1041b46e756fSKirill A. Shutemov 		BUG_ON(!pmd_none(*pmd));
1042b46e756fSKirill A. Shutemov 		/*
1043b46e756fSKirill A. Shutemov 		 * We can only use set_pmd_at when establishing
1044b46e756fSKirill A. Shutemov 		 * hugepmds and never for establishing regular pmds that
1045b46e756fSKirill A. Shutemov 		 * point to regular pagetables. Use pmd_populate for that.
1046b46e756fSKirill A. Shutemov 		 */
1047b46e756fSKirill A. Shutemov 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1048b46e756fSKirill A. Shutemov 		spin_unlock(pmd_ptl);
1049b46e756fSKirill A. Shutemov 		anon_vma_unlock_write(vma->anon_vma);
1050b46e756fSKirill A. Shutemov 		result = SCAN_FAIL;
1051b46e756fSKirill A. Shutemov 		goto out;
1052b46e756fSKirill A. Shutemov 	}
1053b46e756fSKirill A. Shutemov 
1054b46e756fSKirill A. Shutemov 	/*
1055b46e756fSKirill A. Shutemov 	 * All pages are isolated and locked so anon_vma rmap
1056b46e756fSKirill A. Shutemov 	 * can't run anymore.
1057b46e756fSKirill A. Shutemov 	 */
1058b46e756fSKirill A. Shutemov 	anon_vma_unlock_write(vma->anon_vma);
1059b46e756fSKirill A. Shutemov 
1060b46e756fSKirill A. Shutemov 	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
1061b46e756fSKirill A. Shutemov 	pte_unmap(pte);
1062b46e756fSKirill A. Shutemov 	__SetPageUptodate(new_page);
1063b46e756fSKirill A. Shutemov 	pgtable = pmd_pgtable(_pmd);
1064b46e756fSKirill A. Shutemov 
1065b46e756fSKirill A. Shutemov 	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
1066f55e1014SLinus Torvalds 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1067b46e756fSKirill A. Shutemov 
1068b46e756fSKirill A. Shutemov 	/*
1069b46e756fSKirill A. Shutemov 	 * spin_lock() below is not the equivalent of smp_wmb(), so
1070b46e756fSKirill A. Shutemov 	 * this is needed to prevent the copy_huge_page writes from
1071b46e756fSKirill A. Shutemov 	 * becoming visible after the set_pmd_at() write.
1072b46e756fSKirill A. Shutemov 	 */
1073b46e756fSKirill A. Shutemov 	smp_wmb();
1074b46e756fSKirill A. Shutemov 
1075b46e756fSKirill A. Shutemov 	spin_lock(pmd_ptl);
1076b46e756fSKirill A. Shutemov 	BUG_ON(!pmd_none(*pmd));
1077b46e756fSKirill A. Shutemov 	page_add_new_anon_rmap(new_page, vma, address, true);
1078b46e756fSKirill A. Shutemov 	mem_cgroup_commit_charge(new_page, memcg, false, true);
1079b46e756fSKirill A. Shutemov 	lru_cache_add_active_or_unevictable(new_page, vma);
1080b46e756fSKirill A. Shutemov 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
1081b46e756fSKirill A. Shutemov 	set_pmd_at(mm, address, pmd, _pmd);
1082b46e756fSKirill A. Shutemov 	update_mmu_cache_pmd(vma, address, pmd);
1083b46e756fSKirill A. Shutemov 	spin_unlock(pmd_ptl);
1084b46e756fSKirill A. Shutemov 
1085b46e756fSKirill A. Shutemov 	*hpage = NULL;
1086b46e756fSKirill A. Shutemov 
1087b46e756fSKirill A. Shutemov 	khugepaged_pages_collapsed++;
1088b46e756fSKirill A. Shutemov 	result = SCAN_SUCCEED;
1089b46e756fSKirill A. Shutemov out_up_write:
1090b46e756fSKirill A. Shutemov 	up_write(&mm->mmap_sem);
1091b46e756fSKirill A. Shutemov out_nolock:
1092b46e756fSKirill A. Shutemov 	trace_mm_collapse_huge_page(mm, isolated, result);
1093b46e756fSKirill A. Shutemov 	return;
1094b46e756fSKirill A. Shutemov out:
1095b46e756fSKirill A. Shutemov 	mem_cgroup_cancel_charge(new_page, memcg, true);
1096b46e756fSKirill A. Shutemov 	goto out_up_write;
1097b46e756fSKirill A. Shutemov }
1098b46e756fSKirill A. Shutemov 
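/*
 * Scan one PMD-sized, PMD-aligned range of an anonymous VMA and decide
 * whether it is worth collapsing into a huge page.  None/zero and
 * swapped-out PTEs are counted against khugepaged_max_ptes_none and
 * khugepaged_max_ptes_swap, and the scan gives up on non-anonymous,
 * locked, compound, off-LRU or extra-pinned pages.  Returns 1 after
 * calling collapse_huge_page() (which releases mmap_sem) when a
 * collapse was attempted, 0 otherwise.
 */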
1099b46e756fSKirill A. Shutemov static int khugepaged_scan_pmd(struct mm_struct *mm,
1100b46e756fSKirill A. Shutemov 			       struct vm_area_struct *vma,
1101b46e756fSKirill A. Shutemov 			       unsigned long address,
1102b46e756fSKirill A. Shutemov 			       struct page **hpage)
1103b46e756fSKirill A. Shutemov {
1104b46e756fSKirill A. Shutemov 	pmd_t *pmd;
1105b46e756fSKirill A. Shutemov 	pte_t *pte, *_pte;
11060db501f7SEbru Akagunduz 	int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
1107b46e756fSKirill A. Shutemov 	struct page *page = NULL;
1108b46e756fSKirill A. Shutemov 	unsigned long _address;
1109b46e756fSKirill A. Shutemov 	spinlock_t *ptl;
1110b46e756fSKirill A. Shutemov 	int node = NUMA_NO_NODE, unmapped = 0;
11110db501f7SEbru Akagunduz 	bool writable = false;
1112b46e756fSKirill A. Shutemov 
1113b46e756fSKirill A. Shutemov 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1114b46e756fSKirill A. Shutemov 
1115b46e756fSKirill A. Shutemov 	pmd = mm_find_pmd(mm, address);
1116b46e756fSKirill A. Shutemov 	if (!pmd) {
1117b46e756fSKirill A. Shutemov 		result = SCAN_PMD_NULL;
1118b46e756fSKirill A. Shutemov 		goto out;
1119b46e756fSKirill A. Shutemov 	}
1120b46e756fSKirill A. Shutemov 
1121b46e756fSKirill A. Shutemov 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1122b46e756fSKirill A. Shutemov 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1123b46e756fSKirill A. Shutemov 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1124b46e756fSKirill A. Shutemov 	     _pte++, _address += PAGE_SIZE) {
1125b46e756fSKirill A. Shutemov 		pte_t pteval = *_pte;
1126b46e756fSKirill A. Shutemov 		if (is_swap_pte(pteval)) {
1127b46e756fSKirill A. Shutemov 			if (++unmapped <= khugepaged_max_ptes_swap) {
1128b46e756fSKirill A. Shutemov 				continue;
1129b46e756fSKirill A. Shutemov 			} else {
1130b46e756fSKirill A. Shutemov 				result = SCAN_EXCEED_SWAP_PTE;
1131b46e756fSKirill A. Shutemov 				goto out_unmap;
1132b46e756fSKirill A. Shutemov 			}
1133b46e756fSKirill A. Shutemov 		}
1134b46e756fSKirill A. Shutemov 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1135b46e756fSKirill A. Shutemov 			if (!userfaultfd_armed(vma) &&
1136b46e756fSKirill A. Shutemov 			    ++none_or_zero <= khugepaged_max_ptes_none) {
1137b46e756fSKirill A. Shutemov 				continue;
1138b46e756fSKirill A. Shutemov 			} else {
1139b46e756fSKirill A. Shutemov 				result = SCAN_EXCEED_NONE_PTE;
1140b46e756fSKirill A. Shutemov 				goto out_unmap;
1141b46e756fSKirill A. Shutemov 			}
1142b46e756fSKirill A. Shutemov 		}
1143b46e756fSKirill A. Shutemov 		if (!pte_present(pteval)) {
1144b46e756fSKirill A. Shutemov 			result = SCAN_PTE_NON_PRESENT;
1145b46e756fSKirill A. Shutemov 			goto out_unmap;
1146b46e756fSKirill A. Shutemov 		}
1147b46e756fSKirill A. Shutemov 		if (pte_write(pteval))
1148b46e756fSKirill A. Shutemov 			writable = true;
1149b46e756fSKirill A. Shutemov 
1150b46e756fSKirill A. Shutemov 		page = vm_normal_page(vma, _address, pteval);
1151b46e756fSKirill A. Shutemov 		if (unlikely(!page)) {
1152b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_NULL;
1153b46e756fSKirill A. Shutemov 			goto out_unmap;
1154b46e756fSKirill A. Shutemov 		}
1155b46e756fSKirill A. Shutemov 
1156b46e756fSKirill A. Shutemov 		/* TODO: teach khugepaged to collapse THP mapped with pte */
1157b46e756fSKirill A. Shutemov 		if (PageCompound(page)) {
1158b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_COMPOUND;
1159b46e756fSKirill A. Shutemov 			goto out_unmap;
1160b46e756fSKirill A. Shutemov 		}
1161b46e756fSKirill A. Shutemov 
1162b46e756fSKirill A. Shutemov 		/*
1163b46e756fSKirill A. Shutemov 		 * Record which node the original page is from and save this
1164b46e756fSKirill A. Shutemov 		 * information to khugepaged_node_load[].
1165b46e756fSKirill A. Shutemov 		 * Khugepaged will allocate the hugepage from the node that
1166b46e756fSKirill A. Shutemov 		 * has the maximum hit count.
1167b46e756fSKirill A. Shutemov 		 */
1168b46e756fSKirill A. Shutemov 		node = page_to_nid(page);
1169b46e756fSKirill A. Shutemov 		if (khugepaged_scan_abort(node)) {
1170b46e756fSKirill A. Shutemov 			result = SCAN_SCAN_ABORT;
1171b46e756fSKirill A. Shutemov 			goto out_unmap;
1172b46e756fSKirill A. Shutemov 		}
1173b46e756fSKirill A. Shutemov 		khugepaged_node_load[node]++;
1174b46e756fSKirill A. Shutemov 		if (!PageLRU(page)) {
1175b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_LRU;
1176b46e756fSKirill A. Shutemov 			goto out_unmap;
1177b46e756fSKirill A. Shutemov 		}
1178b46e756fSKirill A. Shutemov 		if (PageLocked(page)) {
1179b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_LOCK;
1180b46e756fSKirill A. Shutemov 			goto out_unmap;
1181b46e756fSKirill A. Shutemov 		}
1182b46e756fSKirill A. Shutemov 		if (!PageAnon(page)) {
1183b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_ANON;
1184b46e756fSKirill A. Shutemov 			goto out_unmap;
1185b46e756fSKirill A. Shutemov 		}
1186b46e756fSKirill A. Shutemov 
1187b46e756fSKirill A. Shutemov 		/*
1188b46e756fSKirill A. Shutemov 		 * cannot use mapcount: can't collapse if there's a gup pin.
1189b46e756fSKirill A. Shutemov 		 * The page must only be referenced by the scanned process
1190b46e756fSKirill A. Shutemov 		 * and page swap cache.
1191b46e756fSKirill A. Shutemov 		 */
11922948be5aSMinchan Kim 		if (page_count(page) != 1 + PageSwapCache(page)) {
1193b46e756fSKirill A. Shutemov 			result = SCAN_PAGE_COUNT;
1194b46e756fSKirill A. Shutemov 			goto out_unmap;
1195b46e756fSKirill A. Shutemov 		}
1196b46e756fSKirill A. Shutemov 		if (pte_young(pteval) ||
1197b46e756fSKirill A. Shutemov 		    page_is_young(page) || PageReferenced(page) ||
1198b46e756fSKirill A. Shutemov 		    mmu_notifier_test_young(vma->vm_mm, address))
11990db501f7SEbru Akagunduz 			referenced++;
1200b46e756fSKirill A. Shutemov 	}
1201b46e756fSKirill A. Shutemov 	if (writable) {
1202b46e756fSKirill A. Shutemov 		if (referenced) {
1203b46e756fSKirill A. Shutemov 			result = SCAN_SUCCEED;
1204b46e756fSKirill A. Shutemov 			ret = 1;
1205b46e756fSKirill A. Shutemov 		} else {
12060db501f7SEbru Akagunduz 			result = SCAN_LACK_REFERENCED_PAGE;
1207b46e756fSKirill A. Shutemov 		}
1208b46e756fSKirill A. Shutemov 	} else {
1209b46e756fSKirill A. Shutemov 		result = SCAN_PAGE_RO;
1210b46e756fSKirill A. Shutemov 	}
1211b46e756fSKirill A. Shutemov out_unmap:
1212b46e756fSKirill A. Shutemov 	pte_unmap_unlock(pte, ptl);
1213b46e756fSKirill A. Shutemov 	if (ret) {
1214b46e756fSKirill A. Shutemov 		node = khugepaged_find_target_node();
1215b46e756fSKirill A. Shutemov 		/* collapse_huge_page will return with the mmap_sem released */
1216c131f751SKirill A. Shutemov 		collapse_huge_page(mm, address, hpage, node, referenced);
1217b46e756fSKirill A. Shutemov 	}
1218b46e756fSKirill A. Shutemov out:
1219b46e756fSKirill A. Shutemov 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
1220b46e756fSKirill A. Shutemov 				     none_or_zero, result, unmapped);
1221b46e756fSKirill A. Shutemov 	return ret;
1222b46e756fSKirill A. Shutemov }
1223b46e756fSKirill A. Shutemov 
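/*
 * If the mm behind @mm_slot has already exited, unhash the slot, remove
 * it from the scan list, free it and drop the mm reference that
 * khugepaged was holding.
 */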
1224b46e756fSKirill A. Shutemov static void collect_mm_slot(struct mm_slot *mm_slot)
1225b46e756fSKirill A. Shutemov {
1226b46e756fSKirill A. Shutemov 	struct mm_struct *mm = mm_slot->mm;
1227b46e756fSKirill A. Shutemov 
1228b46e756fSKirill A. Shutemov 	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
1229b46e756fSKirill A. Shutemov 
1230b46e756fSKirill A. Shutemov 	if (khugepaged_test_exit(mm)) {
1231b46e756fSKirill A. Shutemov 		/* free mm_slot */
1232b46e756fSKirill A. Shutemov 		hash_del(&mm_slot->hash);
1233b46e756fSKirill A. Shutemov 		list_del(&mm_slot->mm_node);
1234b46e756fSKirill A. Shutemov 
1235b46e756fSKirill A. Shutemov 		/*
1236b46e756fSKirill A. Shutemov 		 * Not strictly needed because the mm exited already.
1237b46e756fSKirill A. Shutemov 		 *
1238b46e756fSKirill A. Shutemov 		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1239b46e756fSKirill A. Shutemov 		 */
1240b46e756fSKirill A. Shutemov 
1241b46e756fSKirill A. Shutemov 		/* khugepaged_mm_lock actually not necessary for the below */
1242b46e756fSKirill A. Shutemov 		free_mm_slot(mm_slot);
1243b46e756fSKirill A. Shutemov 		mmdrop(mm);
1244b46e756fSKirill A. Shutemov 	}
1245b46e756fSKirill A. Shutemov }
1246b46e756fSKirill A. Shutemov 
1247e496cf3dSKirill A. Shutemov #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
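/*
 * For every suitably aligned, non-anonymous VMA that maps @pgoff of
 * @mapping, drop the PTE page table covering the huge page range so a
 * later fault can install a PMD mapping of the new huge page.  mmap_sem
 * is only trylocked; if that fails the VMA is skipped and may be left
 * with a pte-mapped THP after re-fault.
 */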
1248f3f0e1d2SKirill A. Shutemov static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1249f3f0e1d2SKirill A. Shutemov {
1250f3f0e1d2SKirill A. Shutemov 	struct vm_area_struct *vma;
1251f3f0e1d2SKirill A. Shutemov 	unsigned long addr;
1252f3f0e1d2SKirill A. Shutemov 	pmd_t *pmd, _pmd;
1253f3f0e1d2SKirill A. Shutemov 
1254f3f0e1d2SKirill A. Shutemov 	i_mmap_lock_write(mapping);
1255f3f0e1d2SKirill A. Shutemov 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1256f3f0e1d2SKirill A. Shutemov 		/* probably overkill */
1257f3f0e1d2SKirill A. Shutemov 		if (vma->anon_vma)
1258f3f0e1d2SKirill A. Shutemov 			continue;
1259f3f0e1d2SKirill A. Shutemov 		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1260f3f0e1d2SKirill A. Shutemov 		if (addr & ~HPAGE_PMD_MASK)
1261f3f0e1d2SKirill A. Shutemov 			continue;
1262f3f0e1d2SKirill A. Shutemov 		if (vma->vm_end < addr + HPAGE_PMD_SIZE)
1263f3f0e1d2SKirill A. Shutemov 			continue;
1264f3f0e1d2SKirill A. Shutemov 		pmd = mm_find_pmd(vma->vm_mm, addr);
1265f3f0e1d2SKirill A. Shutemov 		if (!pmd)
1266f3f0e1d2SKirill A. Shutemov 			continue;
1267f3f0e1d2SKirill A. Shutemov 		/*
1268f3f0e1d2SKirill A. Shutemov 		 * We need exclusive mmap_sem to retract the page table.
1269f3f0e1d2SKirill A. Shutemov 		 * If trylock fails we would end up with pte-mapped THP after
1270f3f0e1d2SKirill A. Shutemov 		 * re-fault. Not ideal, but it's more important to not disturb
1271f3f0e1d2SKirill A. Shutemov 		 * the system too much.
1272f3f0e1d2SKirill A. Shutemov 		 */
1273f3f0e1d2SKirill A. Shutemov 		if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
1274f3f0e1d2SKirill A. Shutemov 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
1275f3f0e1d2SKirill A. Shutemov 			/* assume page table is clear */
1276f3f0e1d2SKirill A. Shutemov 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
1277f3f0e1d2SKirill A. Shutemov 			spin_unlock(ptl);
1278f3f0e1d2SKirill A. Shutemov 			up_write(&vma->vm_mm->mmap_sem);
1279c4812909SKirill A. Shutemov 			mm_dec_nr_ptes(vma->vm_mm);
1280f3f0e1d2SKirill A. Shutemov 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
1281f3f0e1d2SKirill A. Shutemov 		}
1282f3f0e1d2SKirill A. Shutemov 	}
1283f3f0e1d2SKirill A. Shutemov 	i_mmap_unlock_write(mapping);
1284f3f0e1d2SKirill A. Shutemov }
1285f3f0e1d2SKirill A. Shutemov 
1286f3f0e1d2SKirill A. Shutemov /**
1287f3f0e1d2SKirill A. Shutemov  * collapse_shmem - collapse small tmpfs/shmem pages into a huge one.
1288f3f0e1d2SKirill A. Shutemov  *
1289f3f0e1d2SKirill A. Shutemov  * The basic scheme is simple, but the details are more complex:
129087c460a0SHugh Dickins  *  - allocate and lock a new huge page;
129177da9389SMatthew Wilcox  *  - scan the page cache, replacing old pages with the new one:
1292f3f0e1d2SKirill A. Shutemov  *    + swap in pages if necessary;
1293f3f0e1d2SKirill A. Shutemov  *    + fill in gaps;
129477da9389SMatthew Wilcox  *    + keep old pages around in case rollback is required;
129577da9389SMatthew Wilcox  *  - if replacing succeeds:
1296f3f0e1d2SKirill A. Shutemov  *    + copy data over;
1297f3f0e1d2SKirill A. Shutemov  *    + free old pages;
129887c460a0SHugh Dickins  *    + unlock huge page;
1299f3f0e1d2SKirill A. Shutemov  *  - if replacing fails:
1300f3f0e1d2SKirill A. Shutemov  *    + put all pages back and unfreeze them;
130177da9389SMatthew Wilcox  *    + restore gaps in the page cache;
130287c460a0SHugh Dickins  *    + unlock and free huge page;
1303f3f0e1d2SKirill A. Shutemov  */
1304f3f0e1d2SKirill A. Shutemov static void collapse_shmem(struct mm_struct *mm,
1305f3f0e1d2SKirill A. Shutemov 		struct address_space *mapping, pgoff_t start,
1306f3f0e1d2SKirill A. Shutemov 		struct page **hpage, int node)
1307f3f0e1d2SKirill A. Shutemov {
1308f3f0e1d2SKirill A. Shutemov 	gfp_t gfp;
130977da9389SMatthew Wilcox 	struct page *new_page;
1310f3f0e1d2SKirill A. Shutemov 	struct mem_cgroup *memcg;
1311f3f0e1d2SKirill A. Shutemov 	pgoff_t index, end = start + HPAGE_PMD_NR;
1312f3f0e1d2SKirill A. Shutemov 	LIST_HEAD(pagelist);
131377da9389SMatthew Wilcox 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1314f3f0e1d2SKirill A. Shutemov 	int nr_none = 0, result = SCAN_SUCCEED;
1315f3f0e1d2SKirill A. Shutemov 
1316f3f0e1d2SKirill A. Shutemov 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1317f3f0e1d2SKirill A. Shutemov 
1318f3f0e1d2SKirill A. Shutemov 	/* Only allocate from the target node */
131941b6167eSMichal Hocko 	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
1320f3f0e1d2SKirill A. Shutemov 
1321f3f0e1d2SKirill A. Shutemov 	new_page = khugepaged_alloc_page(hpage, gfp, node);
1322f3f0e1d2SKirill A. Shutemov 	if (!new_page) {
1323f3f0e1d2SKirill A. Shutemov 		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
1324f3f0e1d2SKirill A. Shutemov 		goto out;
1325f3f0e1d2SKirill A. Shutemov 	}
1326f3f0e1d2SKirill A. Shutemov 
13272a70f6a7SMichal Hocko 	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
1328f3f0e1d2SKirill A. Shutemov 		result = SCAN_CGROUP_CHARGE_FAIL;
1329f3f0e1d2SKirill A. Shutemov 		goto out;
1330f3f0e1d2SKirill A. Shutemov 	}
1331f3f0e1d2SKirill A. Shutemov 
1332042a3082SHugh Dickins 	__SetPageLocked(new_page);
1333042a3082SHugh Dickins 	__SetPageSwapBacked(new_page);
1334f3f0e1d2SKirill A. Shutemov 	new_page->index = start;
1335f3f0e1d2SKirill A. Shutemov 	new_page->mapping = mapping;
1336f3f0e1d2SKirill A. Shutemov 
1337f3f0e1d2SKirill A. Shutemov 	/*
133887c460a0SHugh Dickins 	 * At this point the new_page is locked and not up-to-date.
133987c460a0SHugh Dickins 	 * It's safe to insert it into the page cache, because nobody would
134087c460a0SHugh Dickins 	 * be able to map it or use it in another way until we unlock it.
1341f3f0e1d2SKirill A. Shutemov 	 */
1342f3f0e1d2SKirill A. Shutemov 
134377da9389SMatthew Wilcox 	/* This will be less messy when we use multi-index entries */
134477da9389SMatthew Wilcox 	do {
134577da9389SMatthew Wilcox 		xas_lock_irq(&xas);
134677da9389SMatthew Wilcox 		xas_create_range(&xas);
134777da9389SMatthew Wilcox 		if (!xas_error(&xas))
134877da9389SMatthew Wilcox 			break;
134977da9389SMatthew Wilcox 		xas_unlock_irq(&xas);
135077da9389SMatthew Wilcox 		if (!xas_nomem(&xas, GFP_KERNEL))
135177da9389SMatthew Wilcox 			goto out;
135277da9389SMatthew Wilcox 	} while (1);
1353f3f0e1d2SKirill A. Shutemov 
135477da9389SMatthew Wilcox 	xas_set(&xas, start);
135577da9389SMatthew Wilcox 	for (index = start; index < end; index++) {
135677da9389SMatthew Wilcox 		struct page *page = xas_next(&xas);
135777da9389SMatthew Wilcox 
135877da9389SMatthew Wilcox 		VM_BUG_ON(index != xas.xa_index);
135977da9389SMatthew Wilcox 		if (!page) {
1360701270faSHugh Dickins 			/*
1361701270faSHugh Dickins 			 * Stop if extent has been truncated or hole-punched,
1362701270faSHugh Dickins 			 * and is now completely empty.
1363701270faSHugh Dickins 			 */
1364701270faSHugh Dickins 			if (index == start) {
1365701270faSHugh Dickins 				if (!xas_next_entry(&xas, end - 1)) {
1366701270faSHugh Dickins 					result = SCAN_TRUNCATED;
1367042a3082SHugh Dickins 					goto xa_locked;
1368701270faSHugh Dickins 				}
1369701270faSHugh Dickins 				xas_set(&xas, index);
1370701270faSHugh Dickins 			}
137177da9389SMatthew Wilcox 			if (!shmem_charge(mapping->host, 1)) {
1372f3f0e1d2SKirill A. Shutemov 				result = SCAN_FAIL;
1373042a3082SHugh Dickins 				goto xa_locked;
1374f3f0e1d2SKirill A. Shutemov 			}
137577da9389SMatthew Wilcox 			xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
137677da9389SMatthew Wilcox 			nr_none++;
137777da9389SMatthew Wilcox 			continue;
1378f3f0e1d2SKirill A. Shutemov 		}
1379f3f0e1d2SKirill A. Shutemov 
13803159f943SMatthew Wilcox 		if (xa_is_value(page) || !PageUptodate(page)) {
138177da9389SMatthew Wilcox 			xas_unlock_irq(&xas);
1382f3f0e1d2SKirill A. Shutemov 			/* swap in or instantiate fallocated page */
1383f3f0e1d2SKirill A. Shutemov 			if (shmem_getpage(mapping->host, index, &page,
1384f3f0e1d2SKirill A. Shutemov 						SGP_NOHUGE)) {
1385f3f0e1d2SKirill A. Shutemov 				result = SCAN_FAIL;
138677da9389SMatthew Wilcox 				goto xa_unlocked;
1387f3f0e1d2SKirill A. Shutemov 			}
1388f3f0e1d2SKirill A. Shutemov 		} else if (trylock_page(page)) {
1389f3f0e1d2SKirill A. Shutemov 			get_page(page);
1390042a3082SHugh Dickins 			xas_unlock_irq(&xas);
1391f3f0e1d2SKirill A. Shutemov 		} else {
1392f3f0e1d2SKirill A. Shutemov 			result = SCAN_PAGE_LOCK;
1393042a3082SHugh Dickins 			goto xa_locked;
1394f3f0e1d2SKirill A. Shutemov 		}
1395f3f0e1d2SKirill A. Shutemov 
1396f3f0e1d2SKirill A. Shutemov 		/*
1397b93b0163SMatthew Wilcox 		 * The page must be locked, so we can drop the i_pages lock
1398f3f0e1d2SKirill A. Shutemov 		 * without racing with truncate.
1399f3f0e1d2SKirill A. Shutemov 		 */
1400f3f0e1d2SKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageLocked(page), page);
1401f3f0e1d2SKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageUptodate(page), page);
1402*06a5e126SHugh Dickins 
1403*06a5e126SHugh Dickins 		/*
1404*06a5e126SHugh Dickins 		 * If the file was truncated then extended, or hole-punched, before
1405*06a5e126SHugh Dickins 		 * we locked the first page, then a THP might be there already.
1406*06a5e126SHugh Dickins 		 */
1407*06a5e126SHugh Dickins 		if (PageTransCompound(page)) {
1408*06a5e126SHugh Dickins 			result = SCAN_PAGE_COMPOUND;
1409*06a5e126SHugh Dickins 			goto out_unlock;
1410*06a5e126SHugh Dickins 		}
1411f3f0e1d2SKirill A. Shutemov 
1412f3f0e1d2SKirill A. Shutemov 		if (page_mapping(page) != mapping) {
1413f3f0e1d2SKirill A. Shutemov 			result = SCAN_TRUNCATED;
1414f3f0e1d2SKirill A. Shutemov 			goto out_unlock;
1415f3f0e1d2SKirill A. Shutemov 		}
1416f3f0e1d2SKirill A. Shutemov 
1417f3f0e1d2SKirill A. Shutemov 		if (isolate_lru_page(page)) {
1418f3f0e1d2SKirill A. Shutemov 			result = SCAN_DEL_PAGE_LRU;
1419042a3082SHugh Dickins 			goto out_unlock;
1420f3f0e1d2SKirill A. Shutemov 		}
1421f3f0e1d2SKirill A. Shutemov 
1422f3f0e1d2SKirill A. Shutemov 		if (page_mapped(page))
1423977fbdcdSMatthew Wilcox 			unmap_mapping_pages(mapping, index, 1, false);
1424f3f0e1d2SKirill A. Shutemov 
142577da9389SMatthew Wilcox 		xas_lock_irq(&xas);
142677da9389SMatthew Wilcox 		xas_set(&xas, index);
1427f3f0e1d2SKirill A. Shutemov 
142877da9389SMatthew Wilcox 		VM_BUG_ON_PAGE(page != xas_load(&xas), page);
1429f3f0e1d2SKirill A. Shutemov 		VM_BUG_ON_PAGE(page_mapped(page), page);
1430f3f0e1d2SKirill A. Shutemov 
1431f3f0e1d2SKirill A. Shutemov 		/*
1432f3f0e1d2SKirill A. Shutemov 		 * The page is expected to have page_count() == 3:
1433f3f0e1d2SKirill A. Shutemov 		 *  - we hold a pin on it;
143477da9389SMatthew Wilcox 		 *  - one reference from page cache;
1435f3f0e1d2SKirill A. Shutemov 		 *  - one from isolate_lru_page;
1436f3f0e1d2SKirill A. Shutemov 		 */
1437f3f0e1d2SKirill A. Shutemov 		if (!page_ref_freeze(page, 3)) {
1438f3f0e1d2SKirill A. Shutemov 			result = SCAN_PAGE_COUNT;
1439042a3082SHugh Dickins 			xas_unlock_irq(&xas);
1440042a3082SHugh Dickins 			putback_lru_page(page);
1441042a3082SHugh Dickins 			goto out_unlock;
1442f3f0e1d2SKirill A. Shutemov 		}
1443f3f0e1d2SKirill A. Shutemov 
1444f3f0e1d2SKirill A. Shutemov 		/*
1445f3f0e1d2SKirill A. Shutemov 		 * Add the page to the list to be able to undo the collapse if
1446f3f0e1d2SKirill A. Shutemov 		 * something goes wrong.
1447f3f0e1d2SKirill A. Shutemov 		 */
1448f3f0e1d2SKirill A. Shutemov 		list_add_tail(&page->lru, &pagelist);
1449f3f0e1d2SKirill A. Shutemov 
1450f3f0e1d2SKirill A. Shutemov 		/* Finally, replace with the new page. */
145177da9389SMatthew Wilcox 		xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
1452f3f0e1d2SKirill A. Shutemov 		continue;
1453f3f0e1d2SKirill A. Shutemov out_unlock:
1454f3f0e1d2SKirill A. Shutemov 		unlock_page(page);
1455f3f0e1d2SKirill A. Shutemov 		put_page(page);
1456042a3082SHugh Dickins 		goto xa_unlocked;
1457f3f0e1d2SKirill A. Shutemov 	}
1458f3f0e1d2SKirill A. Shutemov 
1459042a3082SHugh Dickins 	__inc_node_page_state(new_page, NR_SHMEM_THPS);
1460042a3082SHugh Dickins 	if (nr_none) {
1461042a3082SHugh Dickins 		struct zone *zone = page_zone(new_page);
1462042a3082SHugh Dickins 
1463042a3082SHugh Dickins 		__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
1464042a3082SHugh Dickins 		__mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
1465042a3082SHugh Dickins 	}
1466042a3082SHugh Dickins 
1467042a3082SHugh Dickins xa_locked:
1468042a3082SHugh Dickins 	xas_unlock_irq(&xas);
146977da9389SMatthew Wilcox xa_unlocked:
1470042a3082SHugh Dickins 
1471f3f0e1d2SKirill A. Shutemov 	if (result == SCAN_SUCCEED) {
147277da9389SMatthew Wilcox 		struct page *page, *tmp;
1473f3f0e1d2SKirill A. Shutemov 
1474f3f0e1d2SKirill A. Shutemov 		/*
147577da9389SMatthew Wilcox 		 * Replacing the old pages with the new one has succeeded;
147677da9389SMatthew Wilcox 		 * now we need to copy the content and free the old pages.
1477f3f0e1d2SKirill A. Shutemov 		 */
14782af8ff29SHugh Dickins 		index = start;
1479f3f0e1d2SKirill A. Shutemov 		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
14802af8ff29SHugh Dickins 			while (index < page->index) {
14812af8ff29SHugh Dickins 				clear_highpage(new_page + (index % HPAGE_PMD_NR));
14822af8ff29SHugh Dickins 				index++;
14832af8ff29SHugh Dickins 			}
1484f3f0e1d2SKirill A. Shutemov 			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
1485f3f0e1d2SKirill A. Shutemov 					page);
1486f3f0e1d2SKirill A. Shutemov 			list_del(&page->lru);
1487f3f0e1d2SKirill A. Shutemov 			page->mapping = NULL;
1488042a3082SHugh Dickins 			page_ref_unfreeze(page, 1);
1489f3f0e1d2SKirill A. Shutemov 			ClearPageActive(page);
1490f3f0e1d2SKirill A. Shutemov 			ClearPageUnevictable(page);
1491042a3082SHugh Dickins 			unlock_page(page);
1492f3f0e1d2SKirill A. Shutemov 			put_page(page);
14932af8ff29SHugh Dickins 			index++;
14942af8ff29SHugh Dickins 		}
14952af8ff29SHugh Dickins 		while (index < end) {
14962af8ff29SHugh Dickins 			clear_highpage(new_page + (index % HPAGE_PMD_NR));
14972af8ff29SHugh Dickins 			index++;
1498f3f0e1d2SKirill A. Shutemov 		}
1499f3f0e1d2SKirill A. Shutemov 
1500f3f0e1d2SKirill A. Shutemov 		SetPageUptodate(new_page);
150187c460a0SHugh Dickins 		page_ref_add(new_page, HPAGE_PMD_NR - 1);
1502042a3082SHugh Dickins 		set_page_dirty(new_page);
1503f3f0e1d2SKirill A. Shutemov 		mem_cgroup_commit_charge(new_page, memcg, false, true);
1504f3f0e1d2SKirill A. Shutemov 		lru_cache_add_anon(new_page);
1505f3f0e1d2SKirill A. Shutemov 
1506042a3082SHugh Dickins 		/*
1507042a3082SHugh Dickins 		 * Remove pte page tables, so we can re-fault the page as huge.
1508042a3082SHugh Dickins 		 */
1509042a3082SHugh Dickins 		retract_page_tables(mapping, start);
1510f3f0e1d2SKirill A. Shutemov 		*hpage = NULL;
151187aa7529SYang Shi 
151287aa7529SYang Shi 		khugepaged_pages_collapsed++;
1513f3f0e1d2SKirill A. Shutemov 	} else {
151477da9389SMatthew Wilcox 		struct page *page;
1515aaa52e34SHugh Dickins 
151677da9389SMatthew Wilcox 		/* Something went wrong: roll back page cache changes */
151777da9389SMatthew Wilcox 		xas_lock_irq(&xas);
1518aaa52e34SHugh Dickins 		mapping->nrpages -= nr_none;
1519aaa52e34SHugh Dickins 		shmem_uncharge(mapping->host, nr_none);
1520aaa52e34SHugh Dickins 
152177da9389SMatthew Wilcox 		xas_set(&xas, start);
152277da9389SMatthew Wilcox 		xas_for_each(&xas, page, end - 1) {
1523f3f0e1d2SKirill A. Shutemov 			page = list_first_entry_or_null(&pagelist,
1524f3f0e1d2SKirill A. Shutemov 					struct page, lru);
152577da9389SMatthew Wilcox 			if (!page || xas.xa_index < page->index) {
1526f3f0e1d2SKirill A. Shutemov 				if (!nr_none)
1527f3f0e1d2SKirill A. Shutemov 					break;
1528f3f0e1d2SKirill A. Shutemov 				nr_none--;
152959749e6cSJohannes Weiner 				/* Put holes back where they were */
153077da9389SMatthew Wilcox 				xas_store(&xas, NULL);
1531f3f0e1d2SKirill A. Shutemov 				continue;
1532f3f0e1d2SKirill A. Shutemov 			}
1533f3f0e1d2SKirill A. Shutemov 
153477da9389SMatthew Wilcox 			VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
1535f3f0e1d2SKirill A. Shutemov 
1536f3f0e1d2SKirill A. Shutemov 			/* Unfreeze the page. */
1537f3f0e1d2SKirill A. Shutemov 			list_del(&page->lru);
1538f3f0e1d2SKirill A. Shutemov 			page_ref_unfreeze(page, 2);
153977da9389SMatthew Wilcox 			xas_store(&xas, page);
154077da9389SMatthew Wilcox 			xas_pause(&xas);
154177da9389SMatthew Wilcox 			xas_unlock_irq(&xas);
1542f3f0e1d2SKirill A. Shutemov 			unlock_page(page);
1543042a3082SHugh Dickins 			putback_lru_page(page);
154477da9389SMatthew Wilcox 			xas_lock_irq(&xas);
1545f3f0e1d2SKirill A. Shutemov 		}
1546f3f0e1d2SKirill A. Shutemov 		VM_BUG_ON(nr_none);
154777da9389SMatthew Wilcox 		xas_unlock_irq(&xas);
1548f3f0e1d2SKirill A. Shutemov 
1549f3f0e1d2SKirill A. Shutemov 		mem_cgroup_cancel_charge(new_page, memcg, true);
1550f3f0e1d2SKirill A. Shutemov 		new_page->mapping = NULL;
1551f3f0e1d2SKirill A. Shutemov 	}
1552042a3082SHugh Dickins 
1553042a3082SHugh Dickins 	unlock_page(new_page);
1554f3f0e1d2SKirill A. Shutemov out:
1555f3f0e1d2SKirill A. Shutemov 	VM_BUG_ON(!list_empty(&pagelist));
1556f3f0e1d2SKirill A. Shutemov 	/* TODO: tracepoints */
1557f3f0e1d2SKirill A. Shutemov }
1558f3f0e1d2SKirill A. Shutemov 
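/*
 * Scan the HPAGE_PMD_NR page cache slots starting at @start under RCU,
 * counting swap entries and present pages against the collapse limits.
 * When enough pages are present and nothing disqualifying (compound
 * page, extra pin, page off the LRU) is found, call collapse_shmem()
 * on the range.
 */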
1559f3f0e1d2SKirill A. Shutemov static void khugepaged_scan_shmem(struct mm_struct *mm,
1560f3f0e1d2SKirill A. Shutemov 		struct address_space *mapping,
1561f3f0e1d2SKirill A. Shutemov 		pgoff_t start, struct page **hpage)
1562f3f0e1d2SKirill A. Shutemov {
1563f3f0e1d2SKirill A. Shutemov 	struct page *page = NULL;
156485b392dbSMatthew Wilcox 	XA_STATE(xas, &mapping->i_pages, start);
1565f3f0e1d2SKirill A. Shutemov 	int present, swap;
1566f3f0e1d2SKirill A. Shutemov 	int node = NUMA_NO_NODE;
1567f3f0e1d2SKirill A. Shutemov 	int result = SCAN_SUCCEED;
1568f3f0e1d2SKirill A. Shutemov 
1569f3f0e1d2SKirill A. Shutemov 	present = 0;
1570f3f0e1d2SKirill A. Shutemov 	swap = 0;
1571f3f0e1d2SKirill A. Shutemov 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1572f3f0e1d2SKirill A. Shutemov 	rcu_read_lock();
157385b392dbSMatthew Wilcox 	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
157485b392dbSMatthew Wilcox 		if (xas_retry(&xas, page))
1575f3f0e1d2SKirill A. Shutemov 			continue;
1576f3f0e1d2SKirill A. Shutemov 
157785b392dbSMatthew Wilcox 		if (xa_is_value(page)) {
1578f3f0e1d2SKirill A. Shutemov 			if (++swap > khugepaged_max_ptes_swap) {
1579f3f0e1d2SKirill A. Shutemov 				result = SCAN_EXCEED_SWAP_PTE;
1580f3f0e1d2SKirill A. Shutemov 				break;
1581f3f0e1d2SKirill A. Shutemov 			}
1582f3f0e1d2SKirill A. Shutemov 			continue;
1583f3f0e1d2SKirill A. Shutemov 		}
1584f3f0e1d2SKirill A. Shutemov 
1585f3f0e1d2SKirill A. Shutemov 		if (PageTransCompound(page)) {
1586f3f0e1d2SKirill A. Shutemov 			result = SCAN_PAGE_COMPOUND;
1587f3f0e1d2SKirill A. Shutemov 			break;
1588f3f0e1d2SKirill A. Shutemov 		}
1589f3f0e1d2SKirill A. Shutemov 
1590f3f0e1d2SKirill A. Shutemov 		node = page_to_nid(page);
1591f3f0e1d2SKirill A. Shutemov 		if (khugepaged_scan_abort(node)) {
1592f3f0e1d2SKirill A. Shutemov 			result = SCAN_SCAN_ABORT;
1593f3f0e1d2SKirill A. Shutemov 			break;
1594f3f0e1d2SKirill A. Shutemov 		}
1595f3f0e1d2SKirill A. Shutemov 		khugepaged_node_load[node]++;
1596f3f0e1d2SKirill A. Shutemov 
1597f3f0e1d2SKirill A. Shutemov 		if (!PageLRU(page)) {
1598f3f0e1d2SKirill A. Shutemov 			result = SCAN_PAGE_LRU;
1599f3f0e1d2SKirill A. Shutemov 			break;
1600f3f0e1d2SKirill A. Shutemov 		}
1601f3f0e1d2SKirill A. Shutemov 
1602f3f0e1d2SKirill A. Shutemov 		if (page_count(page) != 1 + page_mapcount(page)) {
1603f3f0e1d2SKirill A. Shutemov 			result = SCAN_PAGE_COUNT;
1604f3f0e1d2SKirill A. Shutemov 			break;
1605f3f0e1d2SKirill A. Shutemov 		}
1606f3f0e1d2SKirill A. Shutemov 
1607f3f0e1d2SKirill A. Shutemov 		/*
1608f3f0e1d2SKirill A. Shutemov 		 * We probably should check if the page is referenced here, but
1609f3f0e1d2SKirill A. Shutemov 		 * nobody would transfer pte_young() to PageReferenced() for us.
1610f3f0e1d2SKirill A. Shutemov 		 * And rmap walk here is just too costly...
1611f3f0e1d2SKirill A. Shutemov 		 */
1612f3f0e1d2SKirill A. Shutemov 
1613f3f0e1d2SKirill A. Shutemov 		present++;
1614f3f0e1d2SKirill A. Shutemov 
1615f3f0e1d2SKirill A. Shutemov 		if (need_resched()) {
161685b392dbSMatthew Wilcox 			xas_pause(&xas);
1617f3f0e1d2SKirill A. Shutemov 			cond_resched_rcu();
1618f3f0e1d2SKirill A. Shutemov 		}
1619f3f0e1d2SKirill A. Shutemov 	}
1620f3f0e1d2SKirill A. Shutemov 	rcu_read_unlock();
1621f3f0e1d2SKirill A. Shutemov 
1622f3f0e1d2SKirill A. Shutemov 	if (result == SCAN_SUCCEED) {
1623f3f0e1d2SKirill A. Shutemov 		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
1624f3f0e1d2SKirill A. Shutemov 			result = SCAN_EXCEED_NONE_PTE;
1625f3f0e1d2SKirill A. Shutemov 		} else {
1626f3f0e1d2SKirill A. Shutemov 			node = khugepaged_find_target_node();
1627f3f0e1d2SKirill A. Shutemov 			collapse_shmem(mm, mapping, start, hpage, node);
1628f3f0e1d2SKirill A. Shutemov 		}
1629f3f0e1d2SKirill A. Shutemov 	}
1630f3f0e1d2SKirill A. Shutemov 
1631f3f0e1d2SKirill A. Shutemov 	/* TODO: tracepoints */
1632f3f0e1d2SKirill A. Shutemov }
1633f3f0e1d2SKirill A. Shutemov #else
1634f3f0e1d2SKirill A. Shutemov static void khugepaged_scan_shmem(struct mm_struct *mm,
1635f3f0e1d2SKirill A. Shutemov 		struct address_space *mapping,
1636f3f0e1d2SKirill A. Shutemov 		pgoff_t start, struct page **hpage)
1637f3f0e1d2SKirill A. Shutemov {
1638f3f0e1d2SKirill A. Shutemov 	BUILD_BUG();
1639f3f0e1d2SKirill A. Shutemov }
1640f3f0e1d2SKirill A. Shutemov #endif
1641f3f0e1d2SKirill A. Shutemov 
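/*
 * Resume scanning at khugepaged_scan.{mm_slot,address}: walk the VMAs
 * of that mm, scanning shmem-backed ranges through the page cache and
 * anonymous ranges through their page tables, until roughly @pages
 * pages have been covered.  khugepaged_mm_lock is dropped while
 * scanning and re-taken before returning the progress made.
 */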
1642b46e756fSKirill A. Shutemov static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1643b46e756fSKirill A. Shutemov 					    struct page **hpage)
1644b46e756fSKirill A. Shutemov 	__releases(&khugepaged_mm_lock)
1645b46e756fSKirill A. Shutemov 	__acquires(&khugepaged_mm_lock)
1646b46e756fSKirill A. Shutemov {
1647b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
1648b46e756fSKirill A. Shutemov 	struct mm_struct *mm;
1649b46e756fSKirill A. Shutemov 	struct vm_area_struct *vma;
1650b46e756fSKirill A. Shutemov 	int progress = 0;
1651b46e756fSKirill A. Shutemov 
1652b46e756fSKirill A. Shutemov 	VM_BUG_ON(!pages);
1653b46e756fSKirill A. Shutemov 	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
1654b46e756fSKirill A. Shutemov 
1655b46e756fSKirill A. Shutemov 	if (khugepaged_scan.mm_slot)
1656b46e756fSKirill A. Shutemov 		mm_slot = khugepaged_scan.mm_slot;
1657b46e756fSKirill A. Shutemov 	else {
1658b46e756fSKirill A. Shutemov 		mm_slot = list_entry(khugepaged_scan.mm_head.next,
1659b46e756fSKirill A. Shutemov 				     struct mm_slot, mm_node);
1660b46e756fSKirill A. Shutemov 		khugepaged_scan.address = 0;
1661b46e756fSKirill A. Shutemov 		khugepaged_scan.mm_slot = mm_slot;
1662b46e756fSKirill A. Shutemov 	}
1663b46e756fSKirill A. Shutemov 	spin_unlock(&khugepaged_mm_lock);
1664b46e756fSKirill A. Shutemov 
1665b46e756fSKirill A. Shutemov 	mm = mm_slot->mm;
16663b454ad3SYang Shi 	/*
16673b454ad3SYang Shi 	 * Don't wait for semaphore (to avoid long wait times).  Just move to
16683b454ad3SYang Shi 	 * the next mm on the list.
16693b454ad3SYang Shi 	 */
1670b46e756fSKirill A. Shutemov 	vma = NULL;
16713b454ad3SYang Shi 	if (unlikely(!down_read_trylock(&mm->mmap_sem)))
16723b454ad3SYang Shi 		goto breakouterloop_mmap_sem;
16733b454ad3SYang Shi 	if (likely(!khugepaged_test_exit(mm)))
1674b46e756fSKirill A. Shutemov 		vma = find_vma(mm, khugepaged_scan.address);
1675b46e756fSKirill A. Shutemov 
1676b46e756fSKirill A. Shutemov 	progress++;
1677b46e756fSKirill A. Shutemov 	for (; vma; vma = vma->vm_next) {
1678b46e756fSKirill A. Shutemov 		unsigned long hstart, hend;
1679b46e756fSKirill A. Shutemov 
1680b46e756fSKirill A. Shutemov 		cond_resched();
1681b46e756fSKirill A. Shutemov 		if (unlikely(khugepaged_test_exit(mm))) {
1682b46e756fSKirill A. Shutemov 			progress++;
1683b46e756fSKirill A. Shutemov 			break;
1684b46e756fSKirill A. Shutemov 		}
168550f8b92fSSong Liu 		if (!hugepage_vma_check(vma, vma->vm_flags)) {
1686b46e756fSKirill A. Shutemov skip:
1687b46e756fSKirill A. Shutemov 			progress++;
1688b46e756fSKirill A. Shutemov 			continue;
1689b46e756fSKirill A. Shutemov 		}
1690b46e756fSKirill A. Shutemov 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1691b46e756fSKirill A. Shutemov 		hend = vma->vm_end & HPAGE_PMD_MASK;
1692b46e756fSKirill A. Shutemov 		if (hstart >= hend)
1693b46e756fSKirill A. Shutemov 			goto skip;
1694b46e756fSKirill A. Shutemov 		if (khugepaged_scan.address > hend)
1695b46e756fSKirill A. Shutemov 			goto skip;
1696b46e756fSKirill A. Shutemov 		if (khugepaged_scan.address < hstart)
1697b46e756fSKirill A. Shutemov 			khugepaged_scan.address = hstart;
1698b46e756fSKirill A. Shutemov 		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
1699b46e756fSKirill A. Shutemov 
1700b46e756fSKirill A. Shutemov 		while (khugepaged_scan.address < hend) {
1701b46e756fSKirill A. Shutemov 			int ret;
1702b46e756fSKirill A. Shutemov 			cond_resched();
1703b46e756fSKirill A. Shutemov 			if (unlikely(khugepaged_test_exit(mm)))
1704b46e756fSKirill A. Shutemov 				goto breakouterloop;
1705b46e756fSKirill A. Shutemov 
1706b46e756fSKirill A. Shutemov 			VM_BUG_ON(khugepaged_scan.address < hstart ||
1707b46e756fSKirill A. Shutemov 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
1708b46e756fSKirill A. Shutemov 				  hend);
1709f3f0e1d2SKirill A. Shutemov 			if (shmem_file(vma->vm_file)) {
1710e496cf3dSKirill A. Shutemov 				struct file *file;
1711f3f0e1d2SKirill A. Shutemov 				pgoff_t pgoff = linear_page_index(vma,
1712f3f0e1d2SKirill A. Shutemov 						khugepaged_scan.address);
1713e496cf3dSKirill A. Shutemov 				if (!shmem_huge_enabled(vma))
1714e496cf3dSKirill A. Shutemov 					goto skip;
1715e496cf3dSKirill A. Shutemov 				file = get_file(vma->vm_file);
1716f3f0e1d2SKirill A. Shutemov 				up_read(&mm->mmap_sem);
1717f3f0e1d2SKirill A. Shutemov 				ret = 1;
1718f3f0e1d2SKirill A. Shutemov 				khugepaged_scan_shmem(mm, file->f_mapping,
1719f3f0e1d2SKirill A. Shutemov 						pgoff, hpage);
1720f3f0e1d2SKirill A. Shutemov 				fput(file);
1721f3f0e1d2SKirill A. Shutemov 			} else {
1722b46e756fSKirill A. Shutemov 				ret = khugepaged_scan_pmd(mm, vma,
1723b46e756fSKirill A. Shutemov 						khugepaged_scan.address,
1724b46e756fSKirill A. Shutemov 						hpage);
1725f3f0e1d2SKirill A. Shutemov 			}
1726b46e756fSKirill A. Shutemov 			/* move to next address */
1727b46e756fSKirill A. Shutemov 			khugepaged_scan.address += HPAGE_PMD_SIZE;
1728b46e756fSKirill A. Shutemov 			progress += HPAGE_PMD_NR;
1729b46e756fSKirill A. Shutemov 			if (ret)
1730b46e756fSKirill A. Shutemov 				/* we released mmap_sem so break loop */
1731b46e756fSKirill A. Shutemov 				goto breakouterloop_mmap_sem;
1732b46e756fSKirill A. Shutemov 			if (progress >= pages)
1733b46e756fSKirill A. Shutemov 				goto breakouterloop;
1734b46e756fSKirill A. Shutemov 		}
1735b46e756fSKirill A. Shutemov 	}
1736b46e756fSKirill A. Shutemov breakouterloop:
1737b46e756fSKirill A. Shutemov 	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
1738b46e756fSKirill A. Shutemov breakouterloop_mmap_sem:
1739b46e756fSKirill A. Shutemov 
1740b46e756fSKirill A. Shutemov 	spin_lock(&khugepaged_mm_lock);
1741b46e756fSKirill A. Shutemov 	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
1742b46e756fSKirill A. Shutemov 	/*
1743b46e756fSKirill A. Shutemov 	 * Release the current mm_slot if this mm is about to die, or
1744b46e756fSKirill A. Shutemov 	 * if we scanned all vmas of this mm.
1745b46e756fSKirill A. Shutemov 	 */
1746b46e756fSKirill A. Shutemov 	if (khugepaged_test_exit(mm) || !vma) {
1747b46e756fSKirill A. Shutemov 		/*
1748b46e756fSKirill A. Shutemov 		 * Make sure that if mm_users is reaching zero while
1749b46e756fSKirill A. Shutemov 		 * khugepaged runs here, khugepaged_exit will find
1750b46e756fSKirill A. Shutemov 		 * mm_slot not pointing to the exiting mm.
1751b46e756fSKirill A. Shutemov 		 */
1752b46e756fSKirill A. Shutemov 		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
1753b46e756fSKirill A. Shutemov 			khugepaged_scan.mm_slot = list_entry(
1754b46e756fSKirill A. Shutemov 				mm_slot->mm_node.next,
1755b46e756fSKirill A. Shutemov 				struct mm_slot, mm_node);
1756b46e756fSKirill A. Shutemov 			khugepaged_scan.address = 0;
1757b46e756fSKirill A. Shutemov 		} else {
1758b46e756fSKirill A. Shutemov 			khugepaged_scan.mm_slot = NULL;
1759b46e756fSKirill A. Shutemov 			khugepaged_full_scans++;
1760b46e756fSKirill A. Shutemov 		}
1761b46e756fSKirill A. Shutemov 
1762b46e756fSKirill A. Shutemov 		collect_mm_slot(mm_slot);
1763b46e756fSKirill A. Shutemov 	}
1764b46e756fSKirill A. Shutemov 
1765b46e756fSKirill A. Shutemov 	return progress;
1766b46e756fSKirill A. Shutemov }
1767b46e756fSKirill A. Shutemov 
1768b46e756fSKirill A. Shutemov static int khugepaged_has_work(void)
1769b46e756fSKirill A. Shutemov {
1770b46e756fSKirill A. Shutemov 	return !list_empty(&khugepaged_scan.mm_head) &&
1771b46e756fSKirill A. Shutemov 		khugepaged_enabled();
1772b46e756fSKirill A. Shutemov }
1773b46e756fSKirill A. Shutemov 
1774b46e756fSKirill A. Shutemov static int khugepaged_wait_event(void)
1775b46e756fSKirill A. Shutemov {
1776b46e756fSKirill A. Shutemov 	return !list_empty(&khugepaged_scan.mm_head) ||
1777b46e756fSKirill A. Shutemov 		kthread_should_stop();
1778b46e756fSKirill A. Shutemov }
1779b46e756fSKirill A. Shutemov 
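/*
 * One scan pass of the khugepaged thread: keep scanning mm_slots until
 * about khugepaged_pages_to_scan pages have been covered, stopping
 * early if huge page preallocation fails or the thread is asked to
 * stop or freeze.
 */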
1780b46e756fSKirill A. Shutemov static void khugepaged_do_scan(void)
1781b46e756fSKirill A. Shutemov {
1782b46e756fSKirill A. Shutemov 	struct page *hpage = NULL;
1783b46e756fSKirill A. Shutemov 	unsigned int progress = 0, pass_through_head = 0;
1784b46e756fSKirill A. Shutemov 	unsigned int pages = khugepaged_pages_to_scan;
1785b46e756fSKirill A. Shutemov 	bool wait = true;
1786b46e756fSKirill A. Shutemov 
1787b46e756fSKirill A. Shutemov 	barrier(); /* write khugepaged_pages_to_scan to local stack */
1788b46e756fSKirill A. Shutemov 
1789b46e756fSKirill A. Shutemov 	while (progress < pages) {
1790b46e756fSKirill A. Shutemov 		if (!khugepaged_prealloc_page(&hpage, &wait))
1791b46e756fSKirill A. Shutemov 			break;
1792b46e756fSKirill A. Shutemov 
1793b46e756fSKirill A. Shutemov 		cond_resched();
1794b46e756fSKirill A. Shutemov 
1795b46e756fSKirill A. Shutemov 		if (unlikely(kthread_should_stop() || try_to_freeze()))
1796b46e756fSKirill A. Shutemov 			break;
1797b46e756fSKirill A. Shutemov 
1798b46e756fSKirill A. Shutemov 		spin_lock(&khugepaged_mm_lock);
1799b46e756fSKirill A. Shutemov 		if (!khugepaged_scan.mm_slot)
1800b46e756fSKirill A. Shutemov 			pass_through_head++;
1801b46e756fSKirill A. Shutemov 		if (khugepaged_has_work() &&
1802b46e756fSKirill A. Shutemov 		    pass_through_head < 2)
1803b46e756fSKirill A. Shutemov 			progress += khugepaged_scan_mm_slot(pages - progress,
1804b46e756fSKirill A. Shutemov 							    &hpage);
1805b46e756fSKirill A. Shutemov 		else
1806b46e756fSKirill A. Shutemov 			progress = pages;
1807b46e756fSKirill A. Shutemov 		spin_unlock(&khugepaged_mm_lock);
1808b46e756fSKirill A. Shutemov 	}
1809b46e756fSKirill A. Shutemov 
1810b46e756fSKirill A. Shutemov 	if (!IS_ERR_OR_NULL(hpage))
1811b46e756fSKirill A. Shutemov 		put_page(hpage);
1812b46e756fSKirill A. Shutemov }
1813b46e756fSKirill A. Shutemov 
1814b46e756fSKirill A. Shutemov static bool khugepaged_should_wakeup(void)
1815b46e756fSKirill A. Shutemov {
1816b46e756fSKirill A. Shutemov 	return kthread_should_stop() ||
1817b46e756fSKirill A. Shutemov 	       time_after_eq(jiffies, khugepaged_sleep_expire);
1818b46e756fSKirill A. Shutemov }
1819b46e756fSKirill A. Shutemov 
1820b46e756fSKirill A. Shutemov static void khugepaged_wait_work(void)
1821b46e756fSKirill A. Shutemov {
1822b46e756fSKirill A. Shutemov 	if (khugepaged_has_work()) {
1823b46e756fSKirill A. Shutemov 		const unsigned long scan_sleep_jiffies =
1824b46e756fSKirill A. Shutemov 			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
1825b46e756fSKirill A. Shutemov 
1826b46e756fSKirill A. Shutemov 		if (!scan_sleep_jiffies)
1827b46e756fSKirill A. Shutemov 			return;
1828b46e756fSKirill A. Shutemov 
1829b46e756fSKirill A. Shutemov 		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
1830b46e756fSKirill A. Shutemov 		wait_event_freezable_timeout(khugepaged_wait,
1831b46e756fSKirill A. Shutemov 					     khugepaged_should_wakeup(),
1832b46e756fSKirill A. Shutemov 					     scan_sleep_jiffies);
1833b46e756fSKirill A. Shutemov 		return;
1834b46e756fSKirill A. Shutemov 	}
1835b46e756fSKirill A. Shutemov 
1836b46e756fSKirill A. Shutemov 	if (khugepaged_enabled())
1837b46e756fSKirill A. Shutemov 		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
1838b46e756fSKirill A. Shutemov }
1839b46e756fSKirill A. Shutemov 
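/*
 * Main loop of the khugepaged kernel thread: alternate scan passes with
 * freezable sleeps until asked to stop, then detach and release the
 * current mm_slot.
 */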
1840b46e756fSKirill A. Shutemov static int khugepaged(void *none)
1841b46e756fSKirill A. Shutemov {
1842b46e756fSKirill A. Shutemov 	struct mm_slot *mm_slot;
1843b46e756fSKirill A. Shutemov 
1844b46e756fSKirill A. Shutemov 	set_freezable();
1845b46e756fSKirill A. Shutemov 	set_user_nice(current, MAX_NICE);
1846b46e756fSKirill A. Shutemov 
1847b46e756fSKirill A. Shutemov 	while (!kthread_should_stop()) {
1848b46e756fSKirill A. Shutemov 		khugepaged_do_scan();
1849b46e756fSKirill A. Shutemov 		khugepaged_wait_work();
1850b46e756fSKirill A. Shutemov 	}
1851b46e756fSKirill A. Shutemov 
1852b46e756fSKirill A. Shutemov 	spin_lock(&khugepaged_mm_lock);
1853b46e756fSKirill A. Shutemov 	mm_slot = khugepaged_scan.mm_slot;
1854b46e756fSKirill A. Shutemov 	khugepaged_scan.mm_slot = NULL;
1855b46e756fSKirill A. Shutemov 	if (mm_slot)
1856b46e756fSKirill A. Shutemov 		collect_mm_slot(mm_slot);
1857b46e756fSKirill A. Shutemov 	spin_unlock(&khugepaged_mm_lock);
1858b46e756fSKirill A. Shutemov 	return 0;
1859b46e756fSKirill A. Shutemov }
1860b46e756fSKirill A. Shutemov 
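/*
 * Raise min_free_kbytes so that, in each zone usable for unmovable
 * allocations, roughly two pageblocks plus some slack for migratetype
 * fallbacks stay free; this helps compaction form the contiguous blocks
 * khugepaged needs.  At most 5% of lowmem is ever reserved this way.
 */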
1861b46e756fSKirill A. Shutemov static void set_recommended_min_free_kbytes(void)
1862b46e756fSKirill A. Shutemov {
1863b46e756fSKirill A. Shutemov 	struct zone *zone;
1864b46e756fSKirill A. Shutemov 	int nr_zones = 0;
1865b46e756fSKirill A. Shutemov 	unsigned long recommended_min;
1866b46e756fSKirill A. Shutemov 
1867b7d349c7SJoonsoo Kim 	for_each_populated_zone(zone) {
1868b7d349c7SJoonsoo Kim 		/*
1869b7d349c7SJoonsoo Kim 		 * We don't need to worry about fragmentation of
1870b7d349c7SJoonsoo Kim 		 * ZONE_MOVABLE since it only has movable pages.
1871b7d349c7SJoonsoo Kim 		 */
1872b7d349c7SJoonsoo Kim 		if (zone_idx(zone) > gfp_zone(GFP_USER))
1873b7d349c7SJoonsoo Kim 			continue;
1874b7d349c7SJoonsoo Kim 
1875b46e756fSKirill A. Shutemov 		nr_zones++;
1876b7d349c7SJoonsoo Kim 	}
1877b46e756fSKirill A. Shutemov 
1878b46e756fSKirill A. Shutemov 	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
1879b46e756fSKirill A. Shutemov 	recommended_min = pageblock_nr_pages * nr_zones * 2;
1880b46e756fSKirill A. Shutemov 
1881b46e756fSKirill A. Shutemov 	/*
1882b46e756fSKirill A. Shutemov 	 * Make sure that on average at least two pageblocks are almost free
1883b46e756fSKirill A. Shutemov 	 * of another type, one for a migratetype to fall back to and a
1884b46e756fSKirill A. Shutemov 	 * second to avoid subsequent fallbacks of other types. There are 3
1885b46e756fSKirill A. Shutemov 	 * MIGRATE_TYPES we care about.
1886b46e756fSKirill A. Shutemov 	 */
1887b46e756fSKirill A. Shutemov 	recommended_min += pageblock_nr_pages * nr_zones *
1888b46e756fSKirill A. Shutemov 			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
1889b46e756fSKirill A. Shutemov 
1890b46e756fSKirill A. Shutemov 	/* never allow reserving more than 5% of the lowmem */
1891b46e756fSKirill A. Shutemov 	recommended_min = min(recommended_min,
1892b46e756fSKirill A. Shutemov 			      (unsigned long) nr_free_buffer_pages() / 20);
1893b46e756fSKirill A. Shutemov 	recommended_min <<= (PAGE_SHIFT-10);
1894b46e756fSKirill A. Shutemov 
1895b46e756fSKirill A. Shutemov 	if (recommended_min > min_free_kbytes) {
1896b46e756fSKirill A. Shutemov 		if (user_min_free_kbytes >= 0)
1897b46e756fSKirill A. Shutemov 			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
1898b46e756fSKirill A. Shutemov 				min_free_kbytes, recommended_min);
1899b46e756fSKirill A. Shutemov 
1900b46e756fSKirill A. Shutemov 		min_free_kbytes = recommended_min;
1901b46e756fSKirill A. Shutemov 	}
1902b46e756fSKirill A. Shutemov 	setup_per_zone_wmarks();
1903b46e756fSKirill A. Shutemov }
1904b46e756fSKirill A. Shutemov 
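/*
 * Start the khugepaged kernel thread if khugepaged is enabled (also
 * updating min_free_kbytes via set_recommended_min_free_kbytes()), or
 * stop it if khugepaged has been disabled.
 */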
1905b46e756fSKirill A. Shutemov int start_stop_khugepaged(void)
1906b46e756fSKirill A. Shutemov {
1907b46e756fSKirill A. Shutemov 	static struct task_struct *khugepaged_thread __read_mostly;
1908b46e756fSKirill A. Shutemov 	static DEFINE_MUTEX(khugepaged_mutex);
1909b46e756fSKirill A. Shutemov 	int err = 0;
1910b46e756fSKirill A. Shutemov 
1911b46e756fSKirill A. Shutemov 	mutex_lock(&khugepaged_mutex);
1912b46e756fSKirill A. Shutemov 	if (khugepaged_enabled()) {
1913b46e756fSKirill A. Shutemov 		if (!khugepaged_thread)
1914b46e756fSKirill A. Shutemov 			khugepaged_thread = kthread_run(khugepaged, NULL,
1915b46e756fSKirill A. Shutemov 							"khugepaged");
1916b46e756fSKirill A. Shutemov 		if (IS_ERR(khugepaged_thread)) {
1917b46e756fSKirill A. Shutemov 			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
1918b46e756fSKirill A. Shutemov 			err = PTR_ERR(khugepaged_thread);
1919b46e756fSKirill A. Shutemov 			khugepaged_thread = NULL;
1920b46e756fSKirill A. Shutemov 			goto fail;
1921b46e756fSKirill A. Shutemov 		}
1922b46e756fSKirill A. Shutemov 
1923b46e756fSKirill A. Shutemov 		if (!list_empty(&khugepaged_scan.mm_head))
1924b46e756fSKirill A. Shutemov 			wake_up_interruptible(&khugepaged_wait);
1925b46e756fSKirill A. Shutemov 
1926b46e756fSKirill A. Shutemov 		set_recommended_min_free_kbytes();
1927b46e756fSKirill A. Shutemov 	} else if (khugepaged_thread) {
1928b46e756fSKirill A. Shutemov 		kthread_stop(khugepaged_thread);
1929b46e756fSKirill A. Shutemov 		khugepaged_thread = NULL;
1930b46e756fSKirill A. Shutemov 	}
1931b46e756fSKirill A. Shutemov fail:
1932b46e756fSKirill A. Shutemov 	mutex_unlock(&khugepaged_mutex);
1933b46e756fSKirill A. Shutemov 	return err;
1934b46e756fSKirill A. Shutemov }
1935