1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4 #include <linux/mm.h>
5 #include <linux/sched.h>
6 #include <linux/sched/mm.h>
7 #include <linux/mmu_notifier.h>
8 #include <linux/rmap.h>
9 #include <linux/swap.h>
10 #include <linux/mm_inline.h>
11 #include <linux/kthread.h>
12 #include <linux/khugepaged.h>
13 #include <linux/freezer.h>
14 #include <linux/mman.h>
15 #include <linux/hashtable.h>
16 #include <linux/userfaultfd_k.h>
17 #include <linux/page_idle.h>
18 #include <linux/page_table_check.h>
19 #include <linux/rcupdate_wait.h>
20 #include <linux/swapops.h>
21 #include <linux/shmem_fs.h>
22 #include <linux/dax.h>
23 #include <linux/ksm.h>
24
25 #include <asm/tlb.h>
26 #include <asm/pgalloc.h>
27 #include "internal.h"
28 #include "mm_slot.h"
29
30 enum scan_result {
31 SCAN_FAIL,
32 SCAN_SUCCEED,
33 SCAN_PMD_NULL,
34 SCAN_PMD_NONE,
35 SCAN_PMD_MAPPED,
36 SCAN_EXCEED_NONE_PTE,
37 SCAN_EXCEED_SWAP_PTE,
38 SCAN_EXCEED_SHARED_PTE,
39 SCAN_PTE_NON_PRESENT,
40 SCAN_PTE_UFFD_WP,
41 SCAN_PTE_MAPPED_HUGEPAGE,
42 SCAN_LACK_REFERENCED_PAGE,
43 SCAN_PAGE_NULL,
44 SCAN_SCAN_ABORT,
45 SCAN_PAGE_COUNT,
46 SCAN_PAGE_LRU,
47 SCAN_PAGE_LOCK,
48 SCAN_PAGE_ANON,
49 SCAN_PAGE_COMPOUND,
50 SCAN_ANY_PROCESS,
51 SCAN_VMA_NULL,
52 SCAN_VMA_CHECK,
53 SCAN_ADDRESS_RANGE,
54 SCAN_DEL_PAGE_LRU,
55 SCAN_ALLOC_HUGE_PAGE_FAIL,
56 SCAN_CGROUP_CHARGE_FAIL,
57 SCAN_TRUNCATED,
58 SCAN_PAGE_HAS_PRIVATE,
59 SCAN_STORE_FAILED,
60 SCAN_COPY_MC,
61 SCAN_PAGE_FILLED,
62 };
63
64 #define CREATE_TRACE_POINTS
65 #include <trace/events/huge_memory.h>
66
67 static struct task_struct *khugepaged_thread __read_mostly;
68 static DEFINE_MUTEX(khugepaged_mutex);
69
70 /* by default, scan 8*512 ptes (or vmas) every 30 seconds */
71 static unsigned int khugepaged_pages_to_scan __read_mostly;
72 static unsigned int khugepaged_pages_collapsed;
73 static unsigned int khugepaged_full_scans;
74 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
75 /* during fragmentation poll the hugepage allocator once every minute */
76 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
77 static unsigned long khugepaged_sleep_expire;
78 static DEFINE_SPINLOCK(khugepaged_mm_lock);
79 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
80 /*
81  * By default, collapse hugepages if there is at least one pte mapped as
82  * it would have been mapped had the vma been large enough at page-fault
83  * time.
84 *
85 * Note that these are only respected if collapse was initiated by khugepaged.
86 */
87 unsigned int khugepaged_max_ptes_none __read_mostly;
88 static unsigned int khugepaged_max_ptes_swap __read_mostly;
89 static unsigned int khugepaged_max_ptes_shared __read_mostly;
90
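/* mm_struct -> mm_slot hash; insertions and removals are protected by khugepaged_mm_lock. */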
91 #define MM_SLOTS_HASH_BITS 10
92 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
93
94 static struct kmem_cache *mm_slot_cache __ro_after_init;
95
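/* Per-collapse scan state; is_khugepaged distinguishes the khugepaged daemon from forced collapses. */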
96 struct collapse_control {
97 bool is_khugepaged;
98
99 /* Num pages scanned per node */
100 u32 node_load[MAX_NUMNODES];
101
102 /* nodemask for allocation fallback */
103 nodemask_t alloc_nmask;
104 };
105
106 /**
107 * struct khugepaged_scan - cursor for scanning
108 * @mm_head: the head of the mm list to scan
109 * @mm_slot: the current mm_slot we are scanning
110  * @address: the next address inside that mm to be scanned
111 *
112 * There is only the one khugepaged_scan instance of this cursor structure.
113 */
114 struct khugepaged_scan {
115 struct list_head mm_head;
116 struct mm_slot *mm_slot;
117 unsigned long address;
118 };
119
120 static struct khugepaged_scan khugepaged_scan = {
121 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
122 };
123
124 #ifdef CONFIG_SYSFS
125 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
126 struct kobj_attribute *attr,
127 char *buf)
128 {
129 return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
130 }
131
132 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
133 struct kobj_attribute *attr,
134 const char *buf, size_t count)
135 {
136 unsigned int msecs;
137 int err;
138
139 err = kstrtouint(buf, 10, &msecs);
140 if (err)
141 return -EINVAL;
142
143 khugepaged_scan_sleep_millisecs = msecs;
144 khugepaged_sleep_expire = 0;
145 wake_up_interruptible(&khugepaged_wait);
146
147 return count;
148 }
149 static struct kobj_attribute scan_sleep_millisecs_attr =
150 __ATTR_RW(scan_sleep_millisecs);
151
152 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
153 struct kobj_attribute *attr,
154 char *buf)
155 {
156 return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
157 }
158
159 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
160 struct kobj_attribute *attr,
161 const char *buf, size_t count)
162 {
163 unsigned int msecs;
164 int err;
165
166 err = kstrtouint(buf, 10, &msecs);
167 if (err)
168 return -EINVAL;
169
170 khugepaged_alloc_sleep_millisecs = msecs;
171 khugepaged_sleep_expire = 0;
172 wake_up_interruptible(&khugepaged_wait);
173
174 return count;
175 }
176 static struct kobj_attribute alloc_sleep_millisecs_attr =
177 __ATTR_RW(alloc_sleep_millisecs);
178
179 static ssize_t pages_to_scan_show(struct kobject *kobj,
180 struct kobj_attribute *attr,
181 char *buf)
182 {
183 return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
184 }
185 static ssize_t pages_to_scan_store(struct kobject *kobj,
186 struct kobj_attribute *attr,
187 const char *buf, size_t count)
188 {
189 unsigned int pages;
190 int err;
191
192 err = kstrtouint(buf, 10, &pages);
193 if (err || !pages)
194 return -EINVAL;
195
196 khugepaged_pages_to_scan = pages;
197
198 return count;
199 }
200 static struct kobj_attribute pages_to_scan_attr =
201 __ATTR_RW(pages_to_scan);
202
203 static ssize_t pages_collapsed_show(struct kobject *kobj,
204 struct kobj_attribute *attr,
205 char *buf)
206 {
207 return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
208 }
209 static struct kobj_attribute pages_collapsed_attr =
210 __ATTR_RO(pages_collapsed);
211
212 static ssize_t full_scans_show(struct kobject *kobj,
213 struct kobj_attribute *attr,
214 char *buf)
215 {
216 return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
217 }
218 static struct kobj_attribute full_scans_attr =
219 __ATTR_RO(full_scans);
220
221 static ssize_t defrag_show(struct kobject *kobj,
222 struct kobj_attribute *attr, char *buf)
223 {
224 return single_hugepage_flag_show(kobj, attr, buf,
225 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
226 }
227 static ssize_t defrag_store(struct kobject *kobj,
228 struct kobj_attribute *attr,
229 const char *buf, size_t count)
230 {
231 return single_hugepage_flag_store(kobj, attr, buf, count,
232 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
233 }
234 static struct kobj_attribute khugepaged_defrag_attr =
235 __ATTR_RW(defrag);
236
237 /*
238  * max_ptes_none controls whether khugepaged should collapse hugepages
239  * over unmapped ptes, potentially increasing the memory footprint of
240  * the vmas. When max_ptes_none is 0, khugepaged will not reduce the
241  * available free memory in the system as it runs. Increasing
242  * max_ptes_none instead potentially reduces the free memory in the
243  * system during the khugepaged scan.
244 */
245 static ssize_t max_ptes_none_show(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 char *buf)
248 {
249 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
250 }
251 static ssize_t max_ptes_none_store(struct kobject *kobj,
252 struct kobj_attribute *attr,
253 const char *buf, size_t count)
254 {
255 int err;
256 unsigned long max_ptes_none;
257
258 err = kstrtoul(buf, 10, &max_ptes_none);
259 if (err || max_ptes_none > HPAGE_PMD_NR - 1)
260 return -EINVAL;
261
262 khugepaged_max_ptes_none = max_ptes_none;
263
264 return count;
265 }
266 static struct kobj_attribute khugepaged_max_ptes_none_attr =
267 __ATTR_RW(max_ptes_none);
268
269 static ssize_t max_ptes_swap_show(struct kobject *kobj,
270 struct kobj_attribute *attr,
271 char *buf)
272 {
273 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
274 }
275
276 static ssize_t max_ptes_swap_store(struct kobject *kobj,
277 struct kobj_attribute *attr,
278 const char *buf, size_t count)
279 {
280 int err;
281 unsigned long max_ptes_swap;
282
283 err = kstrtoul(buf, 10, &max_ptes_swap);
284 if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
285 return -EINVAL;
286
287 khugepaged_max_ptes_swap = max_ptes_swap;
288
289 return count;
290 }
291
292 static struct kobj_attribute khugepaged_max_ptes_swap_attr =
293 __ATTR_RW(max_ptes_swap);
294
295 static ssize_t max_ptes_shared_show(struct kobject *kobj,
296 struct kobj_attribute *attr,
297 char *buf)
298 {
299 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
300 }
301
302 static ssize_t max_ptes_shared_store(struct kobject *kobj,
303 struct kobj_attribute *attr,
304 const char *buf, size_t count)
305 {
306 int err;
307 unsigned long max_ptes_shared;
308
309 err = kstrtoul(buf, 10, &max_ptes_shared);
310 if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
311 return -EINVAL;
312
313 khugepaged_max_ptes_shared = max_ptes_shared;
314
315 return count;
316 }
317
318 static struct kobj_attribute khugepaged_max_ptes_shared_attr =
319 __ATTR_RW(max_ptes_shared);
320
321 static struct attribute *khugepaged_attr[] = {
322 &khugepaged_defrag_attr.attr,
323 &khugepaged_max_ptes_none_attr.attr,
324 &khugepaged_max_ptes_swap_attr.attr,
325 &khugepaged_max_ptes_shared_attr.attr,
326 &pages_to_scan_attr.attr,
327 &pages_collapsed_attr.attr,
328 &full_scans_attr.attr,
329 &scan_sleep_millisecs_attr.attr,
330 &alloc_sleep_millisecs_attr.attr,
331 NULL,
332 };
333
334 struct attribute_group khugepaged_attr_group = {
335 .attrs = khugepaged_attr,
336 .name = "khugepaged",
337 };
338 #endif /* CONFIG_SYSFS */
339
340 int hugepage_madvise(struct vm_area_struct *vma,
341 vm_flags_t *vm_flags, int advice)
342 {
343 switch (advice) {
344 case MADV_HUGEPAGE:
345 #ifdef CONFIG_S390
346 /*
347 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
348 * can't handle this properly after s390_enable_sie, so we simply
349 * ignore the madvise to prevent qemu from causing a SIGSEGV.
350 */
351 if (mm_has_pgste(vma->vm_mm))
352 return 0;
353 #endif
354 *vm_flags &= ~VM_NOHUGEPAGE;
355 *vm_flags |= VM_HUGEPAGE;
356 /*
357  * If the vma becomes suitable for khugepaged to scan,
358  * register it here without waiting for a page fault that
359  * may not happen any time soon.
360 */
361 khugepaged_enter_vma(vma, *vm_flags);
362 break;
363 case MADV_NOHUGEPAGE:
364 *vm_flags &= ~VM_HUGEPAGE;
365 *vm_flags |= VM_NOHUGEPAGE;
366 /*
367 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
368 * this vma even if we leave the mm registered in khugepaged if
369 * it got registered before VM_NOHUGEPAGE was set.
370 */
371 break;
372 }
373
374 return 0;
375 }
376
377 int __init khugepaged_init(void)
378 {
379 mm_slot_cache = KMEM_CACHE(mm_slot, 0);
380 if (!mm_slot_cache)
381 return -ENOMEM;
382
383 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
384 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
385 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
386 khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
387
388 return 0;
389 }
390
391 void __init khugepaged_destroy(void)
392 {
393 kmem_cache_destroy(mm_slot_cache);
394 }
395
396 static inline int hpage_collapse_test_exit(struct mm_struct *mm)
397 {
398 return atomic_read(&mm->mm_users) == 0;
399 }
400
401 static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
402 {
403 return hpage_collapse_test_exit(mm) ||
404 mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
405 }
406
407 static bool hugepage_pmd_enabled(void)
408 {
409 /*
410 * We cover the anon, shmem and the file-backed case here; file-backed
411 * hugepages, when configured in, are determined by the global control.
412 * Anon pmd-sized hugepages are determined by the pmd-size control.
413 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
414 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
415 */
416 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
417 hugepage_global_enabled())
418 return true;
419 if (test_bit(PMD_ORDER, &huge_anon_orders_always))
420 return true;
421 if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
422 return true;
423 if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
424 hugepage_global_enabled())
425 return true;
426 if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
427 return true;
428 return false;
429 }
430
431 void __khugepaged_enter(struct mm_struct *mm)
432 {
433 struct mm_slot *slot;
434 int wakeup;
435
436 /* __khugepaged_exit() must not run from under us */
437 VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
438 if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
439 return;
440
441 slot = mm_slot_alloc(mm_slot_cache);
442 if (!slot)
443 return;
444
445 spin_lock(&khugepaged_mm_lock);
446 mm_slot_insert(mm_slots_hash, mm, slot);
447 /*
448 * Insert just behind the scanning cursor, to let the area settle
449 * down a little.
450 */
451 wakeup = list_empty(&khugepaged_scan.mm_head);
452 list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
453 spin_unlock(&khugepaged_mm_lock);
454
455 mmgrab(mm);
456 if (wakeup)
457 wake_up_interruptible(&khugepaged_wait);
458 }
459
460 void khugepaged_enter_vma(struct vm_area_struct *vma,
461 vm_flags_t vm_flags)
462 {
463 if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
464 hugepage_pmd_enabled()) {
465 if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
466 __khugepaged_enter(vma->vm_mm);
467 }
468 }
469
470 void __khugepaged_exit(struct mm_struct *mm)
471 {
472 struct mm_slot *slot;
473 int free = 0;
474
475 spin_lock(&khugepaged_mm_lock);
476 slot = mm_slot_lookup(mm_slots_hash, mm);
477 if (slot && khugepaged_scan.mm_slot != slot) {
478 hash_del(&slot->hash);
479 list_del(&slot->mm_node);
480 free = 1;
481 }
482 spin_unlock(&khugepaged_mm_lock);
483
484 if (free) {
485 mm_flags_clear(MMF_VM_HUGEPAGE, mm);
486 mm_slot_free(mm_slot_cache, slot);
487 mmdrop(mm);
488 } else if (slot) {
489 /*
490 * This is required to serialize against
491 * hpage_collapse_test_exit() (which is guaranteed to run
492  * under mmap sem read mode). Stop here (after we return, all
493 * pagetables will be destroyed) until khugepaged has finished
494 * working on the pagetables under the mmap_lock.
495 */
496 mmap_write_lock(mm);
497 mmap_write_unlock(mm);
498 }
499 }
500
501 static void release_pte_folio(struct folio *folio)
502 {
503 node_stat_mod_folio(folio,
504 NR_ISOLATED_ANON + folio_is_file_lru(folio),
505 -folio_nr_pages(folio));
506 folio_unlock(folio);
507 folio_putback_lru(folio);
508 }
509
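/*
 * Undo the isolation done by __collapse_huge_page_isolate(): release the
 * small folios mapped by PTEs in [pte, _pte), then any large folios queued
 * on compound_pagelist.
 */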
510 static void release_pte_pages(pte_t *pte, pte_t *_pte,
511 struct list_head *compound_pagelist)
512 {
513 struct folio *folio, *tmp;
514
515 while (--_pte >= pte) {
516 pte_t pteval = ptep_get(_pte);
517 unsigned long pfn;
518
519 if (pte_none(pteval))
520 continue;
521 pfn = pte_pfn(pteval);
522 if (is_zero_pfn(pfn))
523 continue;
524 folio = pfn_folio(pfn);
525 if (folio_test_large(folio))
526 continue;
527 release_pte_folio(folio);
528 }
529
530 list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
531 list_del(&folio->lru);
532 release_pte_folio(folio);
533 }
534 }
535
536 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
537 unsigned long start_addr,
538 pte_t *pte,
539 struct collapse_control *cc,
540 struct list_head *compound_pagelist)
541 {
542 struct page *page = NULL;
543 struct folio *folio = NULL;
544 unsigned long addr = start_addr;
545 pte_t *_pte;
546 int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
547
548 for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
549 _pte++, addr += PAGE_SIZE) {
550 pte_t pteval = ptep_get(_pte);
551 if (pte_none(pteval) || (pte_present(pteval) &&
552 is_zero_pfn(pte_pfn(pteval)))) {
553 ++none_or_zero;
554 if (!userfaultfd_armed(vma) &&
555 (!cc->is_khugepaged ||
556 none_or_zero <= khugepaged_max_ptes_none)) {
557 continue;
558 } else {
559 result = SCAN_EXCEED_NONE_PTE;
560 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
561 goto out;
562 }
563 }
564 if (!pte_present(pteval)) {
565 result = SCAN_PTE_NON_PRESENT;
566 goto out;
567 }
568 if (pte_uffd_wp(pteval)) {
569 result = SCAN_PTE_UFFD_WP;
570 goto out;
571 }
572 page = vm_normal_page(vma, addr, pteval);
573 if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
574 result = SCAN_PAGE_NULL;
575 goto out;
576 }
577
578 folio = page_folio(page);
579 VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
580
581 /* See hpage_collapse_scan_pmd(). */
582 if (folio_maybe_mapped_shared(folio)) {
583 ++shared;
584 if (cc->is_khugepaged &&
585 shared > khugepaged_max_ptes_shared) {
586 result = SCAN_EXCEED_SHARED_PTE;
587 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
588 goto out;
589 }
590 }
591
592 if (folio_test_large(folio)) {
593 struct folio *f;
594
595 /*
596 * Check if we have dealt with the compound page
597 * already
598 */
599 list_for_each_entry(f, compound_pagelist, lru) {
600 if (folio == f)
601 goto next;
602 }
603 }
604
605 /*
606 * We can do it before folio_isolate_lru because the
607 * folio can't be freed from under us. NOTE: PG_lock
608 * is needed to serialize against split_huge_page
609 * when invoked from the VM.
610 */
611 if (!folio_trylock(folio)) {
612 result = SCAN_PAGE_LOCK;
613 goto out;
614 }
615
616 /*
617 * Check if the page has any GUP (or other external) pins.
618 *
619 * The page table that maps the page has been already unlinked
620 * from the page table tree and this process cannot get
621 * an additional pin on the page.
622 *
623 * New pins can come later if the page is shared across fork,
624 * but not from this process. The other process cannot write to
625 * the page, only trigger CoW.
626 */
627 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
628 folio_unlock(folio);
629 result = SCAN_PAGE_COUNT;
630 goto out;
631 }
632
633 /*
634  * Isolate the page to avoid collapsing a hugepage
635 * currently in use by the VM.
636 */
637 if (!folio_isolate_lru(folio)) {
638 folio_unlock(folio);
639 result = SCAN_DEL_PAGE_LRU;
640 goto out;
641 }
642 node_stat_mod_folio(folio,
643 NR_ISOLATED_ANON + folio_is_file_lru(folio),
644 folio_nr_pages(folio));
645 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
646 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
647
648 if (folio_test_large(folio))
649 list_add_tail(&folio->lru, compound_pagelist);
650 next:
651 /*
652  * If collapse was initiated by khugepaged, check that there are
653  * enough young ptes to justify collapsing the page.
654 */
655 if (cc->is_khugepaged &&
656 (pte_young(pteval) || folio_test_young(folio) ||
657 folio_test_referenced(folio) ||
658 mmu_notifier_test_young(vma->vm_mm, addr)))
659 referenced++;
660 }
661
662 if (unlikely(cc->is_khugepaged && !referenced)) {
663 result = SCAN_LACK_REFERENCED_PAGE;
664 } else {
665 result = SCAN_SUCCEED;
666 trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
667 referenced, result);
668 return result;
669 }
670 out:
671 release_pte_pages(pte, _pte, compound_pagelist);
672 trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
673 referenced, result);
674 return result;
675 }
676
677 static void __collapse_huge_page_copy_succeeded(pte_t *pte,
678 struct vm_area_struct *vma,
679 unsigned long address,
680 spinlock_t *ptl,
681 struct list_head *compound_pagelist)
682 {
683 unsigned long end = address + HPAGE_PMD_SIZE;
684 struct folio *src, *tmp;
685 pte_t pteval;
686 pte_t *_pte;
687 unsigned int nr_ptes;
688
689 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
690 address += nr_ptes * PAGE_SIZE) {
691 nr_ptes = 1;
692 pteval = ptep_get(_pte);
693 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
694 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
695 if (is_zero_pfn(pte_pfn(pteval))) {
696 /*
697 * ptl mostly unnecessary.
698 */
699 spin_lock(ptl);
700 ptep_clear(vma->vm_mm, address, _pte);
701 spin_unlock(ptl);
702 ksm_might_unmap_zero_page(vma->vm_mm, pteval);
703 }
704 } else {
705 struct page *src_page = pte_page(pteval);
706
707 src = page_folio(src_page);
708
709 if (folio_test_large(src)) {
710 unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
711
712 nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
713 } else {
714 release_pte_folio(src);
715 }
716
717 /*
718 * ptl mostly unnecessary, but preempt has to
719 * be disabled to update the per-cpu stats
720  * inside folio_remove_rmap_ptes().
721 */
722 spin_lock(ptl);
723 clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
724 folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
725 spin_unlock(ptl);
726 free_swap_cache(src);
727 folio_put_refs(src, nr_ptes);
728 }
729 }
730
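/*
 * Large folios on compound_pagelist stayed isolated and locked while their
 * PTEs were cleared above; unisolate, unlock and put them back on the LRU.
 */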
731 list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
732 list_del(&src->lru);
733 node_stat_sub_folio(src, NR_ISOLATED_ANON +
734 folio_is_file_lru(src));
735 folio_unlock(src);
736 free_swap_cache(src);
737 folio_putback_lru(src);
738 }
739 }
740
741 static void __collapse_huge_page_copy_failed(pte_t *pte,
742 pmd_t *pmd,
743 pmd_t orig_pmd,
744 struct vm_area_struct *vma,
745 struct list_head *compound_pagelist)
746 {
747 spinlock_t *pmd_ptl;
748
749 /*
750 * Re-establish the PMD to point to the original page table
751 * entry. Restoring PMD needs to be done prior to releasing
752 * pages. Since pages are still isolated and locked here,
753 * acquiring anon_vma_lock_write is unnecessary.
754 */
755 pmd_ptl = pmd_lock(vma->vm_mm, pmd);
756 pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
757 spin_unlock(pmd_ptl);
758 /*
759 * Release both raw and compound pages isolated
760 * in __collapse_huge_page_isolate.
761 */
762 release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
763 }
764
765 /*
766 * __collapse_huge_page_copy - attempts to copy memory contents from raw
767 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
768 * otherwise restores the original page table and releases isolated raw pages.
769 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
770 *
771 * @pte: starting of the PTEs to copy from
772 * @folio: the new hugepage to copy contents to
773 * @pmd: pointer to the new hugepage's PMD
774 * @orig_pmd: the original raw pages' PMD
775 * @vma: the original raw pages' virtual memory area
776 * @address: starting address to copy
777 * @ptl: lock on raw pages' PTEs
778 * @compound_pagelist: list that stores compound pages
779 */
780 static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
781 pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
782 unsigned long address, spinlock_t *ptl,
783 struct list_head *compound_pagelist)
784 {
785 unsigned int i;
786 int result = SCAN_SUCCEED;
787
788 /*
789 * Copying pages' contents is subject to memory poison at any iteration.
790 */
791 for (i = 0; i < HPAGE_PMD_NR; i++) {
792 pte_t pteval = ptep_get(pte + i);
793 struct page *page = folio_page(folio, i);
794 unsigned long src_addr = address + i * PAGE_SIZE;
795 struct page *src_page;
796
797 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
798 clear_user_highpage(page, src_addr);
799 continue;
800 }
801 src_page = pte_page(pteval);
802 if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
803 result = SCAN_COPY_MC;
804 break;
805 }
806 }
807
808 if (likely(result == SCAN_SUCCEED))
809 __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
810 compound_pagelist);
811 else
812 __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
813 compound_pagelist);
814
815 return result;
816 }
817
818 static void khugepaged_alloc_sleep(void)
819 {
820 DEFINE_WAIT(wait);
821
822 add_wait_queue(&khugepaged_wait, &wait);
823 __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
824 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
825 remove_wait_queue(&khugepaged_wait, &wait);
826 }
827
828 struct collapse_control khugepaged_collapse_control = {
829 .is_khugepaged = true,
830 };
831
832 static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
833 {
834 int i;
835
836 /*
837 * If node_reclaim_mode is disabled, then no extra effort is made to
838 * allocate memory locally.
839 */
840 if (!node_reclaim_enabled())
841 return false;
842
843 /* If there is a count for this node already, it must be acceptable */
844 if (cc->node_load[nid])
845 return false;
846
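/*
 * Abort the scan if any node already holding scanned pages is further
 * than node_reclaim_distance from the new node.
 */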
847 for (i = 0; i < MAX_NUMNODES; i++) {
848 if (!cc->node_load[i])
849 continue;
850 if (node_distance(nid, i) > node_reclaim_distance)
851 return true;
852 }
853 return false;
854 }
855
856 #define khugepaged_defrag() \
857 (transparent_hugepage_flags & \
858 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
859
860 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
861 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
862 {
863 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
864 }
865
866 #ifdef CONFIG_NUMA
867 static int hpage_collapse_find_target_node(struct collapse_control *cc)
868 {
869 int nid, target_node = 0, max_value = 0;
870
871 /* find first node with max normal pages hit */
872 for (nid = 0; nid < MAX_NUMNODES; nid++)
873 if (cc->node_load[nid] > max_value) {
874 max_value = cc->node_load[nid];
875 target_node = nid;
876 }
877
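/* Allow allocation fallback to every online node that ties for the maximum. */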
878 for_each_online_node(nid) {
879 if (max_value == cc->node_load[nid])
880 node_set(nid, cc->alloc_nmask);
881 }
882
883 return target_node;
884 }
885 #else
886 static int hpage_collapse_find_target_node(struct collapse_control *cc)
887 {
888 return 0;
889 }
890 #endif
891
892 /*
893  * If the mmap_lock was temporarily dropped, revalidate the vma
894  * after re-acquiring the mmap_lock.
895  * Returns an enum scan_result value.
896 */
897
898 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
899 bool expect_anon,
900 struct vm_area_struct **vmap,
901 struct collapse_control *cc)
902 {
903 struct vm_area_struct *vma;
904 enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
905 TVA_FORCED_COLLAPSE;
906
907 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
908 return SCAN_ANY_PROCESS;
909
910 *vmap = vma = find_vma(mm, address);
911 if (!vma)
912 return SCAN_VMA_NULL;
913
914 if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
915 return SCAN_ADDRESS_RANGE;
916 if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
917 return SCAN_VMA_CHECK;
918 /*
919  * An anon VMA is expected: the address may have been unmapped and then
920  * remapped to a file after khugepaged re-acquired the mmap_lock.
921 *
922 * thp_vma_allowable_order may return true for qualified file
923 * vmas.
924 */
925 if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
926 return SCAN_PAGE_ANON;
927 return SCAN_SUCCEED;
928 }
929
930 static inline int check_pmd_state(pmd_t *pmd)
931 {
932 pmd_t pmde = pmdp_get_lockless(pmd);
933
934 if (pmd_none(pmde))
935 return SCAN_PMD_NONE;
936
937 /*
938 * The folio may be under migration when khugepaged is trying to
939 * collapse it. Migration success or failure will eventually end
940 * up with a present PMD mapping a folio again.
941 */
942 if (is_pmd_migration_entry(pmde))
943 return SCAN_PMD_MAPPED;
944 if (!pmd_present(pmde))
945 return SCAN_PMD_NULL;
946 if (pmd_trans_huge(pmde))
947 return SCAN_PMD_MAPPED;
948 if (pmd_bad(pmde))
949 return SCAN_PMD_NULL;
950 return SCAN_SUCCEED;
951 }
952
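/*
 * Look up the PMD for @address and classify it: SCAN_SUCCEED means *pmd
 * points to a regular page table, SCAN_PMD_MAPPED means a huge page is
 * already installed there.
 */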
953 static int find_pmd_or_thp_or_none(struct mm_struct *mm,
954 unsigned long address,
955 pmd_t **pmd)
956 {
957 *pmd = mm_find_pmd(mm, address);
958 if (!*pmd)
959 return SCAN_PMD_NULL;
960
961 return check_pmd_state(*pmd);
962 }
963
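/*
 * Re-check that @pmd is still the PMD mapping a page table at @address,
 * e.g. after the mmap_lock was dropped and re-taken.
 */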
964 static int check_pmd_still_valid(struct mm_struct *mm,
965 unsigned long address,
966 pmd_t *pmd)
967 {
968 pmd_t *new_pmd;
969 int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
970
971 if (result != SCAN_SUCCEED)
972 return result;
973 if (new_pmd != pmd)
974 return SCAN_FAIL;
975 return SCAN_SUCCEED;
976 }
977
978 /*
979 * Bring missing pages in from swap, to complete THP collapse.
980 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
981 *
982 * Called and returns without pte mapped or spinlocks held.
983 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
984 */
985 static int __collapse_huge_page_swapin(struct mm_struct *mm,
986 struct vm_area_struct *vma,
987 unsigned long start_addr, pmd_t *pmd,
988 int referenced)
989 {
990 int swapped_in = 0;
991 vm_fault_t ret = 0;
992 unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
993 int result;
994 pte_t *pte = NULL;
995 spinlock_t *ptl;
996
997 for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
998 struct vm_fault vmf = {
999 .vma = vma,
1000 .address = addr,
1001 .pgoff = linear_page_index(vma, addr),
1002 .flags = FAULT_FLAG_ALLOW_RETRY,
1003 .pmd = pmd,
1004 };
1005
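/*
 * Map the page table lazily: pte is NULL on the first iteration and after
 * do_swap_page() has unmapped it; otherwise pte++ advances to this
 * address's entry.
 */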
1006 if (!pte++) {
1007 /*
1008 * Here the ptl is only used to check pte_same() in
1009  * do_swap_page(), so the read-only version is enough.
1010 */
1011 pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
1012 if (!pte) {
1013 mmap_read_unlock(mm);
1014 result = SCAN_PMD_NULL;
1015 goto out;
1016 }
1017 }
1018
1019 vmf.orig_pte = ptep_get_lockless(pte);
1020 if (!is_swap_pte(vmf.orig_pte))
1021 continue;
1022
1023 vmf.pte = pte;
1024 vmf.ptl = ptl;
1025 ret = do_swap_page(&vmf);
1026 /* Which unmaps pte (after perhaps re-checking the entry) */
1027 pte = NULL;
1028
1029 /*
1030 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
1031 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
1032  * we do not retry here, and the swap entry will remain in the
1033  * pagetable, resulting in a later failure.
1034 */
1035 if (ret & VM_FAULT_RETRY) {
1036 /* Likely, but not guaranteed, that page lock failed */
1037 result = SCAN_PAGE_LOCK;
1038 goto out;
1039 }
1040 if (ret & VM_FAULT_ERROR) {
1041 mmap_read_unlock(mm);
1042 result = SCAN_FAIL;
1043 goto out;
1044 }
1045 swapped_in++;
1046 }
1047
1048 if (pte)
1049 pte_unmap(pte);
1050
1051 /* Drain LRU cache to remove extra pin on the swapped in pages */
1052 if (swapped_in)
1053 lru_add_drain();
1054
1055 result = SCAN_SUCCEED;
1056 out:
1057 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
1058 return result;
1059 }
1060
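/*
 * Allocate a PMD-sized folio on the preferred node and charge it to the
 * mm's memcg; returns SCAN_SUCCEED with *foliop set, or a SCAN_* error.
 */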
1061 static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
1062 struct collapse_control *cc)
1063 {
1064 gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
1065 GFP_TRANSHUGE);
1066 int node = hpage_collapse_find_target_node(cc);
1067 struct folio *folio;
1068
1069 folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
1070 if (!folio) {
1071 *foliop = NULL;
1072 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1073 return SCAN_ALLOC_HUGE_PAGE_FAIL;
1074 }
1075
1076 count_vm_event(THP_COLLAPSE_ALLOC);
1077 if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
1078 folio_put(folio);
1079 *foliop = NULL;
1080 return SCAN_CGROUP_CHARGE_FAIL;
1081 }
1082
1083 count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
1084
1085 *foliop = folio;
1086 return SCAN_SUCCEED;
1087 }
1088
1089 static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
1090 int referenced, int unmapped,
1091 struct collapse_control *cc)
1092 {
1093 LIST_HEAD(compound_pagelist);
1094 pmd_t *pmd, _pmd;
1095 pte_t *pte;
1096 pgtable_t pgtable;
1097 struct folio *folio;
1098 spinlock_t *pmd_ptl, *pte_ptl;
1099 int result = SCAN_FAIL;
1100 struct vm_area_struct *vma;
1101 struct mmu_notifier_range range;
1102
1103 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1104
1105 /*
1106 * Before allocating the hugepage, release the mmap_lock read lock.
1107 * The allocation can take potentially a long time if it involves
1108 * sync compaction, and we do not need to hold the mmap_lock during
1109 * that. We will recheck the vma after taking it again in write mode.
1110 */
1111 mmap_read_unlock(mm);
1112
1113 result = alloc_charge_folio(&folio, mm, cc);
1114 if (result != SCAN_SUCCEED)
1115 goto out_nolock;
1116
1117 mmap_read_lock(mm);
1118 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
1119 if (result != SCAN_SUCCEED) {
1120 mmap_read_unlock(mm);
1121 goto out_nolock;
1122 }
1123
1124 result = find_pmd_or_thp_or_none(mm, address, &pmd);
1125 if (result != SCAN_SUCCEED) {
1126 mmap_read_unlock(mm);
1127 goto out_nolock;
1128 }
1129
1130 if (unmapped) {
1131 /*
1132 * __collapse_huge_page_swapin will return with mmap_lock
1133  * released when it fails. So we jump to out_nolock directly in
1134  * that case. Continuing to collapse would cause inconsistency.
1135 */
1136 result = __collapse_huge_page_swapin(mm, vma, address, pmd,
1137 referenced);
1138 if (result != SCAN_SUCCEED)
1139 goto out_nolock;
1140 }
1141
1142 mmap_read_unlock(mm);
1143 /*
1144 * Prevent all access to pagetables with the exception of
1145 * gup_fast later handled by the ptep_clear_flush and the VM
1146 * handled by the anon_vma lock + PG_lock.
1147 *
1148 * UFFDIO_MOVE is prevented to race as well thanks to the
1149 * mmap_lock.
1150 */
1151 mmap_write_lock(mm);
1152 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
1153 if (result != SCAN_SUCCEED)
1154 goto out_up_write;
1155 /* check if the pmd is still valid */
1156 vma_start_write(vma);
1157 result = check_pmd_still_valid(mm, address, pmd);
1158 if (result != SCAN_SUCCEED)
1159 goto out_up_write;
1160
1161 anon_vma_lock_write(vma->anon_vma);
1162
1163 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
1164 address + HPAGE_PMD_SIZE);
1165 mmu_notifier_invalidate_range_start(&range);
1166
1167 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1168 /*
1169 * This removes any huge TLB entry from the CPU so we won't allow
1170  * huge and small TLB entries for the same virtual address, to
1171  * avoid the risk of CPU bugs in that area.
1172 *
1173 * Parallel GUP-fast is fine since GUP-fast will back off when
1174  * it detects that the PMD has changed.
1175 */
1176 _pmd = pmdp_collapse_flush(vma, address, pmd);
1177 spin_unlock(pmd_ptl);
1178 mmu_notifier_invalidate_range_end(&range);
1179 tlb_remove_table_sync_one();
1180
1181 pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
1182 if (pte) {
1183 result = __collapse_huge_page_isolate(vma, address, pte, cc,
1184 &compound_pagelist);
1185 spin_unlock(pte_ptl);
1186 } else {
1187 result = SCAN_PMD_NULL;
1188 }
1189
1190 if (unlikely(result != SCAN_SUCCEED)) {
1191 if (pte)
1192 pte_unmap(pte);
1193 spin_lock(pmd_ptl);
1194 BUG_ON(!pmd_none(*pmd));
1195 /*
1196 * We can only use set_pmd_at when establishing
1197  * hugepmds and never for establishing regular pmds that
1198  * point to regular pagetables. Use pmd_populate for that.
1199 */
1200 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1201 spin_unlock(pmd_ptl);
1202 anon_vma_unlock_write(vma->anon_vma);
1203 goto out_up_write;
1204 }
1205
1206 /*
1207 * All pages are isolated and locked so anon_vma rmap
1208 * can't run anymore.
1209 */
1210 anon_vma_unlock_write(vma->anon_vma);
1211
1212 result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
1213 vma, address, pte_ptl,
1214 &compound_pagelist);
1215 pte_unmap(pte);
1216 if (unlikely(result != SCAN_SUCCEED))
1217 goto out_up_write;
1218
1219 /*
1220 * The smp_wmb() inside __folio_mark_uptodate() ensures the
1221 * copy_huge_page writes become visible before the set_pmd_at()
1222 * write.
1223 */
1224 __folio_mark_uptodate(folio);
1225 pgtable = pmd_pgtable(_pmd);
1226
1227 _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
1228 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1229
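/*
 * Map the new huge folio: add the anon rmap and LRU entries, deposit the
 * old page table for a possible later split, then install the huge PMD.
 */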
1230 spin_lock(pmd_ptl);
1231 BUG_ON(!pmd_none(*pmd));
1232 folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
1233 folio_add_lru_vma(folio, vma);
1234 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1235 set_pmd_at(mm, address, pmd, _pmd);
1236 update_mmu_cache_pmd(vma, address, pmd);
1237 deferred_split_folio(folio, false);
1238 spin_unlock(pmd_ptl);
1239
1240 folio = NULL;
1241
1242 result = SCAN_SUCCEED;
1243 out_up_write:
1244 mmap_write_unlock(mm);
1245 out_nolock:
1246 if (folio)
1247 folio_put(folio);
1248 trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
1249 return result;
1250 }
1251
1252 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
1253 struct vm_area_struct *vma,
1254 unsigned long start_addr, bool *mmap_locked,
1255 struct collapse_control *cc)
1256 {
1257 pmd_t *pmd;
1258 pte_t *pte, *_pte;
1259 int result = SCAN_FAIL, referenced = 0;
1260 int none_or_zero = 0, shared = 0;
1261 struct page *page = NULL;
1262 struct folio *folio = NULL;
1263 unsigned long addr;
1264 spinlock_t *ptl;
1265 int node = NUMA_NO_NODE, unmapped = 0;
1266
1267 VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
1268
1269 result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
1270 if (result != SCAN_SUCCEED)
1271 goto out;
1272
1273 memset(cc->node_load, 0, sizeof(cc->node_load));
1274 nodes_clear(cc->alloc_nmask);
1275 pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
1276 if (!pte) {
1277 result = SCAN_PMD_NULL;
1278 goto out;
1279 }
1280
1281 for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
1282 _pte++, addr += PAGE_SIZE) {
1283 pte_t pteval = ptep_get(_pte);
1284 if (is_swap_pte(pteval)) {
1285 ++unmapped;
1286 if (!cc->is_khugepaged ||
1287 unmapped <= khugepaged_max_ptes_swap) {
1288 /*
1289 * Always be strict with uffd-wp
1290 * enabled swap entries. Please see
1291 * comment below for pte_uffd_wp().
1292 */
1293 if (pte_swp_uffd_wp_any(pteval)) {
1294 result = SCAN_PTE_UFFD_WP;
1295 goto out_unmap;
1296 }
1297 continue;
1298 } else {
1299 result = SCAN_EXCEED_SWAP_PTE;
1300 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
1301 goto out_unmap;
1302 }
1303 }
1304 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1305 ++none_or_zero;
1306 if (!userfaultfd_armed(vma) &&
1307 (!cc->is_khugepaged ||
1308 none_or_zero <= khugepaged_max_ptes_none)) {
1309 continue;
1310 } else {
1311 result = SCAN_EXCEED_NONE_PTE;
1312 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
1313 goto out_unmap;
1314 }
1315 }
1316 if (pte_uffd_wp(pteval)) {
1317 /*
1318 * Don't collapse the page if any of the small
1319 * PTEs are armed with uffd write protection.
1320 * Here we can also mark the new huge pmd as
1321 * write protected if any of the small ones is
1322  * marked, but that could bring unknown
1323  * userfault messages that fall outside of
1324  * the registered range. So, just keep it simple.
1325 */
1326 result = SCAN_PTE_UFFD_WP;
1327 goto out_unmap;
1328 }
1329
1330 page = vm_normal_page(vma, addr, pteval);
1331 if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
1332 result = SCAN_PAGE_NULL;
1333 goto out_unmap;
1334 }
1335 folio = page_folio(page);
1336
1337 if (!folio_test_anon(folio)) {
1338 result = SCAN_PAGE_ANON;
1339 goto out_unmap;
1340 }
1341
1342 /*
1343 * We treat a single page as shared if any part of the THP
1344 * is shared.
1345 */
1346 if (folio_maybe_mapped_shared(folio)) {
1347 ++shared;
1348 if (cc->is_khugepaged &&
1349 shared > khugepaged_max_ptes_shared) {
1350 result = SCAN_EXCEED_SHARED_PTE;
1351 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
1352 goto out_unmap;
1353 }
1354 }
1355
1356 /*
1357 * Record which node the original page is from and save this
1358 * information to cc->node_load[].
1359  * Khugepaged will allocate the hugepage from the node that has
1360  * the max hit record.
1361 */
1362 node = folio_nid(folio);
1363 if (hpage_collapse_scan_abort(node, cc)) {
1364 result = SCAN_SCAN_ABORT;
1365 goto out_unmap;
1366 }
1367 cc->node_load[node]++;
1368 if (!folio_test_lru(folio)) {
1369 result = SCAN_PAGE_LRU;
1370 goto out_unmap;
1371 }
1372 if (folio_test_locked(folio)) {
1373 result = SCAN_PAGE_LOCK;
1374 goto out_unmap;
1375 }
1376
1377 /*
1378 * Check if the page has any GUP (or other external) pins.
1379 *
1380 * Here the check may be racy:
1381 * it may see folio_mapcount() > folio_ref_count().
1382  * But such a case is ephemeral, so we could always retry collapse
1383  * later. However, it may report a false positive if the page
1384  * has excessive GUP pins (i.e. 512). Anyway, the same check
1385  * will be done again later, so the risk seems low.
1386 */
1387 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
1388 result = SCAN_PAGE_COUNT;
1389 goto out_unmap;
1390 }
1391
1392 /*
1393  * If collapse was initiated by khugepaged, check that there are
1394  * enough young ptes to justify collapsing the page.
1395 */
1396 if (cc->is_khugepaged &&
1397 (pte_young(pteval) || folio_test_young(folio) ||
1398 folio_test_referenced(folio) ||
1399 mmu_notifier_test_young(vma->vm_mm, addr)))
1400 referenced++;
1401 }
1402 if (cc->is_khugepaged &&
1403 (!referenced ||
1404 (unmapped && referenced < HPAGE_PMD_NR / 2))) {
1405 result = SCAN_LACK_REFERENCED_PAGE;
1406 } else {
1407 result = SCAN_SUCCEED;
1408 }
1409 out_unmap:
1410 pte_unmap_unlock(pte, ptl);
1411 if (result == SCAN_SUCCEED) {
1412 result = collapse_huge_page(mm, start_addr, referenced,
1413 unmapped, cc);
1414 /* collapse_huge_page will return with the mmap_lock released */
1415 *mmap_locked = false;
1416 }
1417 out:
1418 trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
1419 none_or_zero, result, unmapped);
1420 return result;
1421 }
1422
1423 static void collect_mm_slot(struct mm_slot *slot)
1424 {
1425 struct mm_struct *mm = slot->mm;
1426
1427 lockdep_assert_held(&khugepaged_mm_lock);
1428
1429 if (hpage_collapse_test_exit(mm)) {
1430 /* free mm_slot */
1431 hash_del(&slot->hash);
1432 list_del(&slot->mm_node);
1433
1434 /*
1435 * Not strictly needed because the mm exited already.
1436 *
1437 * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
1438 */
1439
1440 /* khugepaged_mm_lock actually not necessary for the below */
1441 mm_slot_free(mm_slot_cache, slot);
1442 mmdrop(mm);
1443 }
1444 }
1445
1446 /* folio must be locked, and mmap_lock must be held */
1447 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
1448 pmd_t *pmdp, struct folio *folio, struct page *page)
1449 {
1450 struct mm_struct *mm = vma->vm_mm;
1451 struct vm_fault vmf = {
1452 .vma = vma,
1453 .address = addr,
1454 .flags = 0,
1455 };
1456 pgd_t *pgdp;
1457 p4d_t *p4dp;
1458 pud_t *pudp;
1459
1460 mmap_assert_locked(vma->vm_mm);
1461
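/*
 * If the caller has no PMD (intermediate page-table levels may be
 * missing), allocate down to the PMD level so a huge entry can be
 * installed.
 */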
1462 if (!pmdp) {
1463 pgdp = pgd_offset(mm, addr);
1464 p4dp = p4d_alloc(mm, pgdp, addr);
1465 if (!p4dp)
1466 return SCAN_FAIL;
1467 pudp = pud_alloc(mm, p4dp, addr);
1468 if (!pudp)
1469 return SCAN_FAIL;
1470 pmdp = pmd_alloc(mm, pudp, addr);
1471 if (!pmdp)
1472 return SCAN_FAIL;
1473 }
1474
1475 vmf.pmd = pmdp;
1476 if (do_set_pmd(&vmf, folio, page))
1477 return SCAN_FAIL;
1478
1479 folio_get(folio);
1480 return SCAN_SUCCEED;
1481 }
1482
1483 /**
1484  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1485  * address @addr.
1486 *
1487 * @mm: process address space where collapse happens
1488 * @addr: THP collapse address
1489 * @install_pmd: If a huge PMD should be installed
1490 *
1491 * This function checks whether all the PTEs in the PMD are pointing to the
1492  * right THP. If so, retract the page table so the THP can refault in
1493  * as pmd-mapped. Possibly install a huge PMD mapping the THP.
1494 */
1495 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
1496 bool install_pmd)
1497 {
1498 int nr_mapped_ptes = 0, result = SCAN_FAIL;
1499 unsigned int nr_batch_ptes;
1500 struct mmu_notifier_range range;
1501 bool notified = false;
1502 unsigned long haddr = addr & HPAGE_PMD_MASK;
1503 unsigned long end = haddr + HPAGE_PMD_SIZE;
1504 struct vm_area_struct *vma = vma_lookup(mm, haddr);
1505 struct folio *folio;
1506 pte_t *start_pte, *pte;
1507 pmd_t *pmd, pgt_pmd;
1508 spinlock_t *pml = NULL, *ptl;
1509 int i;
1510
1511 mmap_assert_locked(mm);
1512
1513 /* First check VMA found, in case page tables are being torn down */
1514 if (!vma || !vma->vm_file ||
1515 !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
1516 return SCAN_VMA_CHECK;
1517
1518 /* Fast check before locking page if already PMD-mapped */
1519 result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
1520 if (result == SCAN_PMD_MAPPED)
1521 return result;
1522
1523 /*
1524 * If we are here, we've succeeded in replacing all the native pages
1525 * in the page cache with a single hugepage. If a mm were to fault-in
1526 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
1527 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
1528 * analogously elide sysfs THP settings here and force collapse.
1529 */
1530 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
1531 return SCAN_VMA_CHECK;
1532
1533 /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
1534 if (userfaultfd_wp(vma))
1535 return SCAN_PTE_UFFD_WP;
1536
1537 folio = filemap_lock_folio(vma->vm_file->f_mapping,
1538 linear_page_index(vma, haddr));
1539 if (IS_ERR(folio))
1540 return SCAN_PAGE_NULL;
1541
1542 if (folio_order(folio) != HPAGE_PMD_ORDER) {
1543 result = SCAN_PAGE_COMPOUND;
1544 goto drop_folio;
1545 }
1546
1547 result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
1548 switch (result) {
1549 case SCAN_SUCCEED:
1550 break;
1551 case SCAN_PMD_NULL:
1552 case SCAN_PMD_NONE:
1553 /*
1554 * All pte entries have been removed and pmd cleared.
1555 * Skip all the pte checks and just update the pmd mapping.
1556 */
1557 goto maybe_install_pmd;
1558 default:
1559 goto drop_folio;
1560 }
1561
1562 result = SCAN_FAIL;
1563 start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
1564 if (!start_pte) /* mmap_lock + page lock should prevent this */
1565 goto drop_folio;
1566
1567 /* step 1: check all mapped PTEs are to the right huge page */
1568 for (i = 0, addr = haddr, pte = start_pte;
1569 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1570 struct page *page;
1571 pte_t ptent = ptep_get(pte);
1572
1573 /* empty pte, skip */
1574 if (pte_none(ptent))
1575 continue;
1576
1577 /* page swapped out, abort */
1578 if (!pte_present(ptent)) {
1579 result = SCAN_PTE_NON_PRESENT;
1580 goto abort;
1581 }
1582
1583 page = vm_normal_page(vma, addr, ptent);
1584 if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1585 page = NULL;
1586 /*
1587 * Note that uprobe, debugger, or MAP_PRIVATE may change the
1588  * page table, but the new page will not be a subpage of the folio.
1589 */
1590 if (folio_page(folio, i) != page)
1591 goto abort;
1592 }
1593
1594 pte_unmap_unlock(start_pte, ptl);
1595 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1596 haddr, haddr + HPAGE_PMD_SIZE);
1597 mmu_notifier_invalidate_range_start(&range);
1598 notified = true;
1599
1600 /*
1601 * pmd_lock covers a wider range than ptl, and (if split from mm's
1602 * page_table_lock) ptl nests inside pml. The less time we hold pml,
1603 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
1604 * inserts a valid as-if-COWed PTE without even looking up page cache.
1605 * So page lock of folio does not protect from it, so we must not drop
1606 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
1607 */
1608 if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
1609 pml = pmd_lock(mm, pmd);
1610
1611 start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
1612 if (!start_pte) /* mmap_lock + page lock should prevent this */
1613 goto abort;
1614 if (!pml)
1615 spin_lock(ptl);
1616 else if (ptl != pml)
1617 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1618
1619 if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
1620 goto abort;
1621
1622 /* step 2: clear page table and adjust rmap */
1623 for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
1624 i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
1625 pte += nr_batch_ptes) {
1626 unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
1627 struct page *page;
1628 pte_t ptent = ptep_get(pte);
1629
1630 nr_batch_ptes = 1;
1631
1632 if (pte_none(ptent))
1633 continue;
1634 /*
1635 * We dropped ptl after the first scan, to do the mmu_notifier:
1636 * page lock stops more PTEs of the folio being faulted in, but
1637 * does not stop write faults COWing anon copies from existing
1638 * PTEs; and does not stop those being swapped out or migrated.
1639 */
1640 if (!pte_present(ptent)) {
1641 result = SCAN_PTE_NON_PRESENT;
1642 goto abort;
1643 }
1644 page = vm_normal_page(vma, addr, ptent);
1645
1646 if (folio_page(folio, i) != page)
1647 goto abort;
1648
1649 nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
1650
1651 /*
1652 * Must clear entry, or a racing truncate may re-remove it.
1653 * TLB flush can be left until pmdp_collapse_flush() does it.
1654 * PTE dirty? Shmem page is already dirty; file is read-only.
1655 */
1656 clear_ptes(mm, addr, pte, nr_batch_ptes);
1657 folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
1658 nr_mapped_ptes += nr_batch_ptes;
1659 }
1660
1661 if (!pml)
1662 spin_unlock(ptl);
1663
1664 /* step 3: set proper refcount and mm_counters. */
1665 if (nr_mapped_ptes) {
1666 folio_ref_sub(folio, nr_mapped_ptes);
1667 add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
1668 }
1669
1670 /* step 4: remove empty page table */
1671 if (!pml) {
1672 pml = pmd_lock(mm, pmd);
1673 if (ptl != pml) {
1674 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1675 if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
1676 flush_tlb_mm(mm);
1677 goto unlock;
1678 }
1679 }
1680 }
1681 pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
1682 pmdp_get_lockless_sync();
1683 pte_unmap_unlock(start_pte, ptl);
1684 if (ptl != pml)
1685 spin_unlock(pml);
1686
1687 mmu_notifier_invalidate_range_end(&range);
1688
1689 mm_dec_nr_ptes(mm);
1690 page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
1691 pte_free_defer(mm, pmd_pgtable(pgt_pmd));
1692
1693 maybe_install_pmd:
1694 /* step 5: install pmd entry */
1695 result = install_pmd
1696 ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
1697 : SCAN_SUCCEED;
1698 goto drop_folio;
1699 abort:
1700 if (nr_mapped_ptes) {
1701 flush_tlb_mm(mm);
1702 folio_ref_sub(folio, nr_mapped_ptes);
1703 add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
1704 }
1705 unlock:
1706 if (start_pte)
1707 pte_unmap_unlock(start_pte, ptl);
1708 if (pml && pml != ptl)
1709 spin_unlock(pml);
1710 if (notified)
1711 mmu_notifier_invalidate_range_end(&range);
1712 drop_folio:
1713 folio_unlock(folio);
1714 folio_put(folio);
1715 return result;
1716 }
1717
1718 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1719 {
1720 struct vm_area_struct *vma;
1721
1722 i_mmap_lock_read(mapping);
1723 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1724 struct mmu_notifier_range range;
1725 struct mm_struct *mm;
1726 unsigned long addr;
1727 pmd_t *pmd, pgt_pmd;
1728 spinlock_t *pml;
1729 spinlock_t *ptl;
1730 bool success = false;
1731
1732 /*
1733 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1734 * got written to. These VMAs are likely not worth removing
1735 * page tables from, as PMD-mapping is likely to be split later.
1736 */
1737 if (READ_ONCE(vma->anon_vma))
1738 continue;
1739
1740 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1741 if (addr & ~HPAGE_PMD_MASK ||
1742 vma->vm_end < addr + HPAGE_PMD_SIZE)
1743 continue;
1744
1745 mm = vma->vm_mm;
1746 if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
1747 continue;
1748
1749 if (hpage_collapse_test_exit(mm))
1750 continue;
1751 /*
1752 * When a vma is registered with uffd-wp, we cannot recycle
1753 * the page table because there may be pte markers installed.
1754 * Other vmas can still have the same file mapped hugely, but
1755 * skip this one: it will always be mapped in small page size
1756 * for uffd-wp registered ranges.
1757 */
1758 if (userfaultfd_wp(vma))
1759 continue;
1760
1761 /* PTEs were notified when unmapped; but now for the PMD? */
1762 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1763 addr, addr + HPAGE_PMD_SIZE);
1764 mmu_notifier_invalidate_range_start(&range);
1765
1766 pml = pmd_lock(mm, pmd);
1767 /*
1768  * The lock of new_folio is still held, so we will be blocked in
1769 * the page fault path, which prevents the pte entries from
1770 * being set again. So even though the old empty PTE page may be
1771 * concurrently freed and a new PTE page is filled into the pmd
1772 * entry, it is still empty and can be removed.
1773 *
1774 * So here we only need to recheck if the state of pmd entry
1775 * still meets our requirements, rather than checking pmd_same()
1776 * like elsewhere.
1777 */
1778 if (check_pmd_state(pmd) != SCAN_SUCCEED)
1779 goto drop_pml;
1780 ptl = pte_lockptr(mm, pmd);
1781 if (ptl != pml)
1782 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1783
1784 /*
1785 * Huge page lock is still held, so normally the page table
1786 * must remain empty; and we have already skipped anon_vma
1787 * and userfaultfd_wp() vmas. But since the mmap_lock is not
1788 * held, it is still possible for a racing userfaultfd_ioctl()
1789 * to have inserted ptes or markers. Now that we hold ptlock,
1790 * repeating the anon_vma check protects from one category,
1791 * and repeating the userfaultfd_wp() check from another.
1792 */
1793 if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
1794 pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
1795 pmdp_get_lockless_sync();
1796 success = true;
1797 }
1798
1799 if (ptl != pml)
1800 spin_unlock(ptl);
1801 drop_pml:
1802 spin_unlock(pml);
1803
1804 mmu_notifier_invalidate_range_end(&range);
1805
1806 if (success) {
1807 mm_dec_nr_ptes(mm);
1808 page_table_check_pte_clear_range(mm, addr, pgt_pmd);
1809 pte_free_defer(mm, pmd_pgtable(pgt_pmd));
1810 }
1811 }
1812 i_mmap_unlock_read(mapping);
1813 }
1814
1815 /**
1816 * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
1817 *
1818 * @mm: process address space where the collapse happens
1819 * @addr: virtual collapse start address
1820 * @file: file that the collapse operates on
1821 * @start: page index where the collapse starts
1822 * @cc: collapse context and scratchpad
1823 *
1824 * The basic scheme is simple; the details are more complex:
1825 * - allocate and lock a new huge page;
1826 * - scan the page cache, locking old pages
1827 *   + swap/gup in pages if necessary;
1828 * - copy data to the new page;
1829 * - handle shmem holes
1830 *   + re-validate that holes weren't filled by someone else
1831 *   + check for userfaultfd;
1832 * - finalize updates to the page cache;
1833 * - if replacing succeeds:
1834 *   + unlock the huge page;
1835 *   + free the old pages;
1836 * - if replacing fails:
1837 *   + unlock the old pages;
1838 *   + unlock and free the huge page.
1839 */
1840 static int collapse_file(struct mm_struct *mm, unsigned long addr,
1841 struct file *file, pgoff_t start,
1842 struct collapse_control *cc)
1843 {
1844 struct address_space *mapping = file->f_mapping;
1845 struct page *dst;
1846 struct folio *folio, *tmp, *new_folio;
1847 pgoff_t index = 0, end = start + HPAGE_PMD_NR;
1848 LIST_HEAD(pagelist);
1849 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1850 int nr_none = 0, result = SCAN_SUCCEED;
1851 bool is_shmem = shmem_file(file);
1852
1853 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1854 VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1855
1856 result = alloc_charge_folio(&new_folio, mm, cc);
1857 if (result != SCAN_SUCCEED)
1858 goto out;
1859
1860 mapping_set_update(&xas, mapping);
1861
1862 __folio_set_locked(new_folio);
1863 if (is_shmem)
1864 __folio_set_swapbacked(new_folio);
1865 new_folio->index = start;
1866 new_folio->mapping = mapping;
1867
1868 /*
1869 * Ensure we have slots for all the pages in the range. This is
1870 * almost certainly a no-op, because most of the pages must already be present.
1871 */
1872 do {
1873 xas_lock_irq(&xas);
1874 xas_create_range(&xas);
1875 if (!xas_error(&xas))
1876 break;
1877 xas_unlock_irq(&xas);
1878 if (!xas_nomem(&xas, GFP_KERNEL)) {
1879 result = SCAN_FAIL;
1880 goto rollback;
1881 }
1882 } while (1);
1883
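/*
 * Walk the whole range: lock and isolate every folio that is
 * already present (swapping in or reading ahead as needed) and
 * accumulate them on pagelist; for shmem, count holes in nr_none.
 */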
1884 for (index = start; index < end;) {
1885 xas_set(&xas, index);
1886 folio = xas_load(&xas);
1887
1888 VM_BUG_ON(index != xas.xa_index);
1889 if (is_shmem) {
1890 if (!folio) {
1891 /*
1892 * Stop if extent has been truncated or
1893 * hole-punched, and is now completely
1894 * empty.
1895 */
1896 if (index == start) {
1897 if (!xas_next_entry(&xas, end - 1)) {
1898 result = SCAN_TRUNCATED;
1899 goto xa_locked;
1900 }
1901 }
1902 nr_none++;
1903 index++;
1904 continue;
1905 }
1906
1907 if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
1908 xas_unlock_irq(&xas);
1909 /* swap in or instantiate fallocated page */
1910 if (shmem_get_folio(mapping->host, index, 0,
1911 &folio, SGP_NOALLOC)) {
1912 result = SCAN_FAIL;
1913 goto xa_unlocked;
1914 }
1915 /* drain lru cache to help folio_isolate_lru() */
1916 lru_add_drain();
1917 } else if (folio_trylock(folio)) {
1918 folio_get(folio);
1919 xas_unlock_irq(&xas);
1920 } else {
1921 result = SCAN_PAGE_LOCK;
1922 goto xa_locked;
1923 }
1924 } else { /* !is_shmem */
1925 if (!folio || xa_is_value(folio)) {
1926 xas_unlock_irq(&xas);
1927 page_cache_sync_readahead(mapping, &file->f_ra,
1928 file, index,
1929 end - index);
1930 /* drain lru cache to help folio_isolate_lru() */
1931 lru_add_drain();
1932 folio = filemap_lock_folio(mapping, index);
1933 if (IS_ERR(folio)) {
1934 result = SCAN_FAIL;
1935 goto xa_unlocked;
1936 }
1937 } else if (folio_test_dirty(folio)) {
1938 /*
1939 * khugepaged only works on read-only fds,
1940 * so this page is dirty because it hasn't
1941 * been flushed since the first write. There
1942 * won't be new dirty pages.
1943 *
1944 * Trigger an async flush here and hope the
1945 * writeback is done when khugepaged
1946 * revisits this page.
1947 *
1948 * This is a one-off situation. We are not
1949 * forcing writeback in a loop.
1950 */
1951 xas_unlock_irq(&xas);
1952 filemap_flush(mapping);
1953 result = SCAN_FAIL;
1954 goto xa_unlocked;
1955 } else if (folio_test_writeback(folio)) {
1956 xas_unlock_irq(&xas);
1957 result = SCAN_FAIL;
1958 goto xa_unlocked;
1959 } else if (folio_trylock(folio)) {
1960 folio_get(folio);
1961 xas_unlock_irq(&xas);
1962 } else {
1963 result = SCAN_PAGE_LOCK;
1964 goto xa_locked;
1965 }
1966 }
1967
1968 /*
1969 * The folio must be locked, so we can drop the i_pages lock
1970 * without racing with truncate.
1971 */
1972 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1973
1974 /* make sure the folio is up to date */
1975 if (unlikely(!folio_test_uptodate(folio))) {
1976 result = SCAN_FAIL;
1977 goto out_unlock;
1978 }
1979
1980 /*
1981 * If file was truncated then extended, or hole-punched, before
1982 * we locked the first folio, then a THP might be there already.
1983 * This will be discovered on the first iteration.
1984 */
1985 if (folio_order(folio) == HPAGE_PMD_ORDER &&
1986 folio->index == start) {
1987 /* Maybe PMD-mapped */
1988 result = SCAN_PTE_MAPPED_HUGEPAGE;
1989 goto out_unlock;
1990 }
1991
1992 if (folio_mapping(folio) != mapping) {
1993 result = SCAN_TRUNCATED;
1994 goto out_unlock;
1995 }
1996
1997 if (!is_shmem && (folio_test_dirty(folio) ||
1998 folio_test_writeback(folio))) {
1999 /*
2000 * khugepaged only works on read-only fds, so this
2001 * folio is dirty because it hasn't been flushed
2002 * since the first write.
2003 */
2004 result = SCAN_FAIL;
2005 goto out_unlock;
2006 }
2007
2008 if (!folio_isolate_lru(folio)) {
2009 result = SCAN_DEL_PAGE_LRU;
2010 goto out_unlock;
2011 }
2012
2013 if (!filemap_release_folio(folio, GFP_KERNEL)) {
2014 result = SCAN_PAGE_HAS_PRIVATE;
2015 folio_putback_lru(folio);
2016 goto out_unlock;
2017 }
2018
2019 if (folio_mapped(folio))
2020 try_to_unmap(folio,
2021 TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
2022
2023 xas_lock_irq(&xas);
2024
2025 VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);
2026
2027 /*
2028 * We control 2 + nr_pages references to the folio:
2029 * - we hold a pin on it;
2030 * - nr_pages references from the page cache;
2031 * - one from folio_isolate_lru();
2032 * If those are the only references, then any new usage
2033 * of the folio will have to fetch it from the page
2034 * cache. That requires locking the folio to handle
2035 * truncate, so any new usage will be blocked until we
2036 * unlock folio after collapse/during rollback.
2037 */
2038 if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
2039 result = SCAN_PAGE_COUNT;
2040 xas_unlock_irq(&xas);
2041 folio_putback_lru(folio);
2042 goto out_unlock;
2043 }
2044
2045 /*
2046 * Accumulate the folios that are being collapsed.
2047 */
2048 list_add_tail(&folio->lru, &pagelist);
2049 index += folio_nr_pages(folio);
2050 continue;
2051 out_unlock:
2052 folio_unlock(folio);
2053 folio_put(folio);
2054 goto xa_unlocked;
2055 }
2056
2057 if (!is_shmem) {
2058 filemap_nr_thps_inc(mapping);
2059 /*
2060 * Paired with the fence in do_dentry_open() -> get_write_access()
2061 * to ensure i_writecount is up to date and the update to nr_thps
2062 * is visible. Ensures the page cache will be truncated if the
2063 * file is opened writable.
2064 */
2065 smp_mb();
2066 if (inode_is_open_for_write(mapping->host)) {
2067 result = SCAN_FAIL;
2068 filemap_nr_thps_dec(mapping);
2069 }
2070 }
2071
2072 xa_locked:
2073 xas_unlock_irq(&xas);
2074 xa_unlocked:
2075
2076 /*
2077 * If collapse is successful, flush must be done now before copying.
2078 * If collapse is unsuccessful, does flush actually need to be done?
2079 * Do it anyway, to clear the state.
2080 */
2081 try_to_unmap_flush();
2082
2083 if (result == SCAN_SUCCEED && nr_none &&
2084 !shmem_charge(mapping->host, nr_none))
2085 result = SCAN_FAIL;
2086 if (result != SCAN_SUCCEED) {
2087 nr_none = 0;
2088 goto rollback;
2089 }
2090
2091 /*
2092 * The old folios are locked, so they won't change anymore.
2093 */
2094 index = start;
2095 dst = folio_page(new_folio, 0);
2096 list_for_each_entry(folio, &pagelist, lru) {
2097 int i, nr_pages = folio_nr_pages(folio);
2098
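/* Zero-fill destination pages for any holes preceding this folio. */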
2099 while (index < folio->index) {
2100 clear_highpage(dst);
2101 index++;
2102 dst++;
2103 }
2104
2105 for (i = 0; i < nr_pages; i++) {
2106 if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
2107 result = SCAN_COPY_MC;
2108 goto rollback;
2109 }
2110 index++;
2111 dst++;
2112 }
2113 }
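/* Zero-fill destination pages for any holes after the last folio. */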
2114 while (index < end) {
2115 clear_highpage(dst);
2116 index++;
2117 dst++;
2118 }
2119
2120 if (nr_none) {
2121 struct vm_area_struct *vma;
2122 int nr_none_check = 0;
2123
2124 i_mmap_lock_read(mapping);
2125 xas_lock_irq(&xas);
2126
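/*
 * Re-count the holes and replace each one with a retry entry, so
 * that from this point on nobody can observe a missing page in
 * the range while we finish the collapse.
 */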
2127 xas_set(&xas, start);
2128 for (index = start; index < end; index++) {
2129 if (!xas_next(&xas)) {
2130 xas_store(&xas, XA_RETRY_ENTRY);
2131 if (xas_error(&xas)) {
2132 result = SCAN_STORE_FAILED;
2133 goto immap_locked;
2134 }
2135 nr_none_check++;
2136 }
2137 }
2138
2139 if (nr_none != nr_none_check) {
2140 result = SCAN_PAGE_FILLED;
2141 goto immap_locked;
2142 }
2143
2144 /*
2145 * If userspace observed a missing page in a VMA with
2146 * a MODE_MISSING userfaultfd, then it might expect a
2147 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
2148 * roll back to avoid suppressing such an event. Wp/minor
2149 * userfaultfds don't give userspace any guarantee that
2150 * the kernel won't fill a missing page with a zero page,
2151 * so they don't matter here.
2152 *
2153 * Any userfaultfds registered after this point will
2154 * not be able to observe any missing pages due to the
2155 * previously inserted retry entries.
2156 */
2157 vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
2158 if (userfaultfd_missing(vma)) {
2159 result = SCAN_EXCEED_NONE_PTE;
2160 goto immap_locked;
2161 }
2162 }
2163
2164 immap_locked:
2165 i_mmap_unlock_read(mapping);
2166 if (result != SCAN_SUCCEED) {
2167 xas_set(&xas, start);
2168 for (index = start; index < end; index++) {
2169 if (xas_next(&xas) == XA_RETRY_ENTRY)
2170 xas_store(&xas, NULL);
2171 }
2172
2173 xas_unlock_irq(&xas);
2174 goto rollback;
2175 }
2176 } else {
2177 xas_lock_irq(&xas);
2178 }
2179
2180 if (is_shmem)
2181 __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
2182 else
2183 __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
2184
2185 if (nr_none) {
2186 __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
2187 /* nr_none is always 0 for non-shmem. */
2188 __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
2189 }
2190
2191 /*
2192 * Mark new_folio as uptodate before inserting it into the
2193 * page cache so that it isn't mistaken for a fallocated but
2194 * unwritten page.
2195 */
2196 folio_mark_uptodate(new_folio);
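/*
 * The page cache holds one reference per subpage of the
 * multi-index entry; add HPAGE_PMD_NR - 1 on top of the
 * reference we got at allocation time.
 */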
2197 folio_ref_add(new_folio, HPAGE_PMD_NR - 1);
2198
2199 if (is_shmem)
2200 folio_mark_dirty(new_folio);
2201 folio_add_lru(new_folio);
2202
2203 /* Join all the small entries into a single multi-index entry. */
2204 xas_set_order(&xas, start, HPAGE_PMD_ORDER);
2205 xas_store(&xas, new_folio);
2206 WARN_ON_ONCE(xas_error(&xas));
2207 xas_unlock_irq(&xas);
2208
2209 /*
2210 * Remove pte page tables, so we can re-fault the page as huge.
2211 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
2212 */
2213 retract_page_tables(mapping, start);
2214 if (cc && !cc->is_khugepaged)
2215 result = SCAN_PTE_MAPPED_HUGEPAGE;
2216 folio_unlock(new_folio);
2217
2218 /*
2219 * The collapse has succeeded, so free the old folios.
2220 */
2221 list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
2222 list_del(&folio->lru);
2223 folio->mapping = NULL;
2224 folio_clear_active(folio);
2225 folio_clear_unevictable(folio);
2226 folio_unlock(folio);
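/*
 * Drop the 2 + nr_pages references we accounted for earlier:
 * our pin, the LRU isolation reference, and the page cache
 * references that the replaced entries used to hold.
 */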
2227 folio_put_refs(folio, 2 + folio_nr_pages(folio));
2228 }
2229
2230 goto out;
2231
2232 rollback:
2233 /* Something went wrong: roll back page cache changes */
2234 if (nr_none) {
2235 xas_lock_irq(&xas);
2236 mapping->nrpages -= nr_none;
2237 xas_unlock_irq(&xas);
2238 shmem_uncharge(mapping->host, nr_none);
2239 }
2240
2241 list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
2242 list_del(&folio->lru);
2243 folio_unlock(folio);
2244 folio_putback_lru(folio);
2245 folio_put(folio);
2246 }
2247 /*
2248 * Undo the filemap_nr_thps_inc() update for non-SHMEM
2249 * files only. This undo is not needed unless the failure is
2250 * due to SCAN_COPY_MC.
2251 */
2252 if (!is_shmem && result == SCAN_COPY_MC) {
2253 filemap_nr_thps_dec(mapping);
2254 /*
2255 * Paired with the fence in do_dentry_open() -> get_write_access()
2256 * to ensure the update to nr_thps is visible.
2257 */
2258 smp_mb();
2259 }
2260
2261 new_folio->mapping = NULL;
2262
2263 folio_unlock(new_folio);
2264 folio_put(new_folio);
2265 out:
2266 VM_BUG_ON(!list_empty(&pagelist));
2267 trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
2268 return result;
2269 }
2270
2271 static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
2272 struct file *file, pgoff_t start,
2273 struct collapse_control *cc)
2274 {
2275 struct folio *folio = NULL;
2276 struct address_space *mapping = file->f_mapping;
2277 XA_STATE(xas, &mapping->i_pages, start);
2278 int present, swap;
2279 int node = NUMA_NO_NODE;
2280 int result = SCAN_SUCCEED;
2281
2282 present = 0;
2283 swap = 0;
2284 memset(cc->node_load, 0, sizeof(cc->node_load));
2285 nodes_clear(cc->alloc_nmask);
2286 rcu_read_lock();
2287 xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
2288 if (xas_retry(&xas, folio))
2289 continue;
2290
2291 if (xa_is_value(folio)) {
2292 swap += 1 << xas_get_order(&xas);
2293 if (cc->is_khugepaged &&
2294 swap > khugepaged_max_ptes_swap) {
2295 result = SCAN_EXCEED_SWAP_PTE;
2296 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
2297 break;
2298 }
2299 continue;
2300 }
2301
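/* The folio may be freed under us; if we cannot take a reference, restart the walk at this index. */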
2302 if (!folio_try_get(folio)) {
2303 xas_reset(&xas);
2304 continue;
2305 }
2306
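/* Re-check that the entry did not change while we took the reference. */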
2307 if (unlikely(folio != xas_reload(&xas))) {
2308 folio_put(folio);
2309 xas_reset(&xas);
2310 continue;
2311 }
2312
2313 if (folio_order(folio) == HPAGE_PMD_ORDER &&
2314 folio->index == start) {
2315 /* Maybe PMD-mapped */
2316 result = SCAN_PTE_MAPPED_HUGEPAGE;
2317 /*
2318 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
2319 * by the caller won't touch the page cache, and so
2320 * it's safe to skip LRU and refcount checks before
2321 * returning.
2322 */
2323 folio_put(folio);
2324 break;
2325 }
2326
2327 node = folio_nid(folio);
2328 if (hpage_collapse_scan_abort(node, cc)) {
2329 result = SCAN_SCAN_ABORT;
2330 folio_put(folio);
2331 break;
2332 }
2333 cc->node_load[node]++;
2334
2335 if (!folio_test_lru(folio)) {
2336 result = SCAN_PAGE_LRU;
2337 folio_put(folio);
2338 break;
2339 }
2340
2341 if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
2342 result = SCAN_PAGE_COUNT;
2343 folio_put(folio);
2344 break;
2345 }
2346
2347 /*
2348 * We probably should check if the folio is referenced
2349 * here, but nobody would transfer pte_young() to
2350 * folio_test_referenced() for us. And rmap walk here
2351 * is just too costly...
2352 */
2353
2354 present += folio_nr_pages(folio);
2355 folio_put(folio);
2356
2357 if (need_resched()) {
2358 xas_pause(&xas);
2359 cond_resched_rcu();
2360 }
2361 }
2362 rcu_read_unlock();
2363
2364 if (result == SCAN_SUCCEED) {
2365 if (cc->is_khugepaged &&
2366 present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
2367 result = SCAN_EXCEED_NONE_PTE;
2368 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
2369 } else {
2370 result = collapse_file(mm, addr, file, start, cc);
2371 }
2372 }
2373
2374 trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
2375 return result;
2376 }
2377
2378 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2379 struct collapse_control *cc)
2380 __releases(&khugepaged_mm_lock)
2381 __acquires(&khugepaged_mm_lock)
2382 {
2383 struct vma_iterator vmi;
2384 struct mm_slot *slot;
2385 struct mm_struct *mm;
2386 struct vm_area_struct *vma;
2387 int progress = 0;
2388
2389 VM_BUG_ON(!pages);
2390 lockdep_assert_held(&khugepaged_mm_lock);
2391 *result = SCAN_FAIL;
2392
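/* Resume from the saved scan cursor, or start from the head of the mm list. */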
2393 if (khugepaged_scan.mm_slot) {
2394 slot = khugepaged_scan.mm_slot;
2395 } else {
2396 slot = list_first_entry(&khugepaged_scan.mm_head,
2397 struct mm_slot, mm_node);
2398 khugepaged_scan.address = 0;
2399 khugepaged_scan.mm_slot = slot;
2400 }
2401 spin_unlock(&khugepaged_mm_lock);
2402
2403 mm = slot->mm;
2404 /*
2405 * Don't wait for semaphore (to avoid long wait times). Just move to
2406 * the next mm on the list.
2407 */
2408 vma = NULL;
2409 if (unlikely(!mmap_read_trylock(mm)))
2410 goto breakouterloop_mmap_lock;
2411
2412 progress++;
2413 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2414 goto breakouterloop;
2415
2416 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2417 for_each_vma(vmi, vma) {
2418 unsigned long hstart, hend;
2419
2420 cond_resched();
2421 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
2422 progress++;
2423 break;
2424 }
2425 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
2426 skip:
2427 progress++;
2428 continue;
2429 }
2430 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2431 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2432 if (khugepaged_scan.address > hend)
2433 goto skip;
2434 if (khugepaged_scan.address < hstart)
2435 khugepaged_scan.address = hstart;
2436 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2437
2438 while (khugepaged_scan.address < hend) {
2439 bool mmap_locked = true;
2440
2441 cond_resched();
2442 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2443 goto breakouterloop;
2444
2445 VM_BUG_ON(khugepaged_scan.address < hstart ||
2446 khugepaged_scan.address + HPAGE_PMD_SIZE >
2447 hend);
2448 if (!vma_is_anonymous(vma)) {
2449 struct file *file = get_file(vma->vm_file);
2450 pgoff_t pgoff = linear_page_index(vma,
2451 khugepaged_scan.address);
2452
2453 mmap_read_unlock(mm);
2454 mmap_locked = false;
2455 *result = hpage_collapse_scan_file(mm,
2456 khugepaged_scan.address, file, pgoff, cc);
2457 fput(file);
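/*
 * The page cache now holds a PMD-sized folio for this range:
 * retake mmap_lock and try to map it with a PMD in this mm.
 */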
2458 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2459 mmap_read_lock(mm);
2460 if (hpage_collapse_test_exit_or_disable(mm))
2461 goto breakouterloop;
2462 *result = collapse_pte_mapped_thp(mm,
2463 khugepaged_scan.address, false);
2464 if (*result == SCAN_PMD_MAPPED)
2465 *result = SCAN_SUCCEED;
2466 mmap_read_unlock(mm);
2467 }
2468 } else {
2469 *result = hpage_collapse_scan_pmd(mm, vma,
2470 khugepaged_scan.address, &mmap_locked, cc);
2471 }
2472
2473 if (*result == SCAN_SUCCEED)
2474 ++khugepaged_pages_collapsed;
2475
2476 /* move to next address */
2477 khugepaged_scan.address += HPAGE_PMD_SIZE;
2478 progress += HPAGE_PMD_NR;
2479 if (!mmap_locked)
2480 /*
2481 * We released mmap_lock, so break the loop. Note
2482 * that we drop mmap_lock before all hugepage
2483 * allocations, so if an allocation fails, we are
2484 * guaranteed to break here and report the
2485 * correct result back to the caller.
2486 */
2487 goto breakouterloop_mmap_lock;
2488 if (progress >= pages)
2489 goto breakouterloop;
2490 }
2491 }
2492 breakouterloop:
2493 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2494 breakouterloop_mmap_lock:
2495
2496 spin_lock(&khugepaged_mm_lock);
2497 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
2498 /*
2499 * Release the current mm_slot if this mm is about to die, or
2500 * if we scanned all vmas of this mm.
2501 */
2502 if (hpage_collapse_test_exit(mm) || !vma) {
2503 /*
2504 * Make sure that if mm_users is reaching zero while
2505 * khugepaged runs here, khugepaged_exit will find
2506 * mm_slot not pointing to the exiting mm.
2507 */
2508 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
2509 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
2510 khugepaged_scan.address = 0;
2511 } else {
2512 khugepaged_scan.mm_slot = NULL;
2513 khugepaged_full_scans++;
2514 }
2515
2516 collect_mm_slot(slot);
2517 }
2518
2519 return progress;
2520 }
2521
2522 static int khugepaged_has_work(void)
2523 {
2524 return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
2525 }
2526
2527 static int khugepaged_wait_event(void)
2528 {
2529 return !list_empty(&khugepaged_scan.mm_head) ||
2530 kthread_should_stop();
2531 }
2532
2533 static void khugepaged_do_scan(struct collapse_control *cc)
2534 {
2535 unsigned int progress = 0, pass_through_head = 0;
2536 unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
2537 bool wait = true;
2538 int result = SCAN_SUCCEED;
2539
2540 lru_add_drain_all();
2541
2542 while (true) {
2543 cond_resched();
2544
2545 if (unlikely(kthread_should_stop()))
2546 break;
2547
2548 spin_lock(&khugepaged_mm_lock);
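/*
 * A NULL mm_slot means the scan cursor is back at the head of
 * the list; allow at most two such passes per call.
 */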
2549 if (!khugepaged_scan.mm_slot)
2550 pass_through_head++;
2551 if (khugepaged_has_work() &&
2552 pass_through_head < 2)
2553 progress += khugepaged_scan_mm_slot(pages - progress,
2554 &result, cc);
2555 else
2556 progress = pages;
2557 spin_unlock(&khugepaged_mm_lock);
2558
2559 if (progress >= pages)
2560 break;
2561
2562 if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
2563 /*
2564 * If the allocation fails the first time, sleep for a
2565 * while. If it fails again, cancel the scan.
2566 */
2567 if (!wait)
2568 break;
2569 wait = false;
2570 khugepaged_alloc_sleep();
2571 }
2572 }
2573 }
2574
2575 static bool khugepaged_should_wakeup(void)
2576 {
2577 return kthread_should_stop() ||
2578 time_after_eq(jiffies, khugepaged_sleep_expire);
2579 }
2580
2581 static void khugepaged_wait_work(void)
2582 {
2583 if (khugepaged_has_work()) {
2584 const unsigned long scan_sleep_jiffies =
2585 msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
2586
2587 if (!scan_sleep_jiffies)
2588 return;
2589
2590 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2591 wait_event_freezable_timeout(khugepaged_wait,
2592 khugepaged_should_wakeup(),
2593 scan_sleep_jiffies);
2594 return;
2595 }
2596
2597 if (hugepage_pmd_enabled())
2598 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2599 }
2600
2601 static int khugepaged(void *none)
2602 {
2603 struct mm_slot *slot;
2604
2605 set_freezable();
2606 set_user_nice(current, MAX_NICE);
2607
2608 while (!kthread_should_stop()) {
2609 khugepaged_do_scan(&khugepaged_collapse_control);
2610 khugepaged_wait_work();
2611 }
2612
2613 spin_lock(&khugepaged_mm_lock);
2614 slot = khugepaged_scan.mm_slot;
2615 khugepaged_scan.mm_slot = NULL;
2616 if (slot)
2617 collect_mm_slot(slot);
2618 spin_unlock(&khugepaged_mm_lock);
2619 return 0;
2620 }
2621
2622 static void set_recommended_min_free_kbytes(void)
2623 {
2624 struct zone *zone;
2625 int nr_zones = 0;
2626 unsigned long recommended_min;
2627
2628 if (!hugepage_pmd_enabled()) {
2629 calculate_min_free_kbytes();
2630 goto update_wmarks;
2631 }
2632
2633 for_each_populated_zone(zone) {
2634 /*
2635 * We don't need to worry about fragmentation of
2636 * ZONE_MOVABLE since it only has movable pages.
2637 */
2638 if (zone_idx(zone) > gfp_zone(GFP_USER))
2639 continue;
2640
2641 nr_zones++;
2642 }
2643
2644 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
2645 recommended_min = pageblock_nr_pages * nr_zones * 2;
2646
2647 /*
2648 * Make sure that on average at least two pageblocks are almost free
2649 * of another type, one for a migratetype to fall back to and a
2650 * second to avoid subsequent fallbacks of other types. There are 3
2651 * MIGRATE_TYPES we care about.
2652 */
2653 recommended_min += pageblock_nr_pages * nr_zones *
2654 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
2655
2656 /* never allow reserving more than 5% of lowmem */
2657 recommended_min = min(recommended_min,
2658 (unsigned long) nr_free_buffer_pages() / 20);
2659 recommended_min <<= (PAGE_SHIFT-10);
2660
2661 if (recommended_min > min_free_kbytes) {
2662 if (user_min_free_kbytes >= 0)
2663 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2664 min_free_kbytes, recommended_min);
2665
2666 min_free_kbytes = recommended_min;
2667 }
2668
2669 update_wmarks:
2670 setup_per_zone_wmarks();
2671 }
2672
2673 int start_stop_khugepaged(void)
2674 {
2675 int err = 0;
2676
2677 mutex_lock(&khugepaged_mutex);
2678 if (hugepage_pmd_enabled()) {
2679 if (!khugepaged_thread)
2680 khugepaged_thread = kthread_run(khugepaged, NULL,
2681 "khugepaged");
2682 if (IS_ERR(khugepaged_thread)) {
2683 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2684 err = PTR_ERR(khugepaged_thread);
2685 khugepaged_thread = NULL;
2686 goto fail;
2687 }
2688
2689 if (!list_empty(&khugepaged_scan.mm_head))
2690 wake_up_interruptible(&khugepaged_wait);
2691 } else if (khugepaged_thread) {
2692 kthread_stop(khugepaged_thread);
2693 khugepaged_thread = NULL;
2694 }
2695 set_recommended_min_free_kbytes();
2696 fail:
2697 mutex_unlock(&khugepaged_mutex);
2698 return err;
2699 }
2700
2701 void khugepaged_min_free_kbytes_update(void)
2702 {
2703 mutex_lock(&khugepaged_mutex);
2704 if (hugepage_pmd_enabled() && khugepaged_thread)
2705 set_recommended_min_free_kbytes();
2706 mutex_unlock(&khugepaged_mutex);
2707 }
2708
2709 bool current_is_khugepaged(void)
2710 {
2711 return kthread_func(current) == khugepaged;
2712 }
2713
2714 static int madvise_collapse_errno(enum scan_result r)
2715 {
2716 /*
2717 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
2718 * actionable feedback to the caller, so that it may take an
2719 * appropriate fallback measure depending on the nature of the failure.
2720 */
2721 switch (r) {
2722 case SCAN_ALLOC_HUGE_PAGE_FAIL:
2723 return -ENOMEM;
2724 case SCAN_CGROUP_CHARGE_FAIL:
2725 case SCAN_EXCEED_NONE_PTE:
2726 return -EBUSY;
2727 /* Resource temporarily unavailable - trying again might succeed */
2728 case SCAN_PAGE_COUNT:
2729 case SCAN_PAGE_LOCK:
2730 case SCAN_PAGE_LRU:
2731 case SCAN_DEL_PAGE_LRU:
2732 case SCAN_PAGE_FILLED:
2733 return -EAGAIN;
2734 /*
2735 * Other: Trying again likely not to succeed / error intrinsic to
2736 * specified memory range. khugepaged likely won't be able to collapse
2737 * either.
2738 */
2739 default:
2740 return -EINVAL;
2741 }
2742 }
2743
2744 int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
2745 unsigned long end, bool *lock_dropped)
2746 {
2747 struct collapse_control *cc;
2748 struct mm_struct *mm = vma->vm_mm;
2749 unsigned long hstart, hend, addr;
2750 int thps = 0, last_fail = SCAN_FAIL;
2751 bool mmap_locked = true;
2752
2753 BUG_ON(vma->vm_start > start);
2754 BUG_ON(vma->vm_end < end);
2755
2756 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
2757 return -EINVAL;
2758
2759 cc = kmalloc(sizeof(*cc), GFP_KERNEL);
2760 if (!cc)
2761 return -ENOMEM;
2762 cc->is_khugepaged = false;
2763
2764 mmgrab(mm);
2765 lru_add_drain_all();
2766
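/* Only the PMD-aligned portion of [start, end) is scanned; partial PMD ranges at either end are skipped. */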
2767 hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2768 hend = end & HPAGE_PMD_MASK;
2769
2770 for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
2771 int result = SCAN_FAIL;
2772
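/*
 * If the previous iteration dropped mmap_lock, retake it and
 * revalidate the VMA before continuing.
 */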
2773 if (!mmap_locked) {
2774 cond_resched();
2775 mmap_read_lock(mm);
2776 mmap_locked = true;
2777 result = hugepage_vma_revalidate(mm, addr, false, &vma,
2778 cc);
2779 if (result != SCAN_SUCCEED) {
2780 last_fail = result;
2781 goto out_nolock;
2782 }
2783
2784 hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
2785 }
2786 mmap_assert_locked(mm);
2787 memset(cc->node_load, 0, sizeof(cc->node_load));
2788 nodes_clear(cc->alloc_nmask);
2789 if (!vma_is_anonymous(vma)) {
2790 struct file *file = get_file(vma->vm_file);
2791 pgoff_t pgoff = linear_page_index(vma, addr);
2792
2793 mmap_read_unlock(mm);
2794 mmap_locked = false;
2795 result = hpage_collapse_scan_file(mm, addr, file, pgoff,
2796 cc);
2797 fput(file);
2798 } else {
2799 result = hpage_collapse_scan_pmd(mm, vma, addr,
2800 &mmap_locked, cc);
2801 }
2802 if (!mmap_locked)
2803 *lock_dropped = true;
2804
2805 handle_result:
2806 switch (result) {
2807 case SCAN_SUCCEED:
2808 case SCAN_PMD_MAPPED:
2809 ++thps;
2810 break;
2811 case SCAN_PTE_MAPPED_HUGEPAGE:
2812 BUG_ON(mmap_locked);
2813 mmap_read_lock(mm);
2814 result = collapse_pte_mapped_thp(mm, addr, true);
2815 mmap_read_unlock(mm);
2816 goto handle_result;
2817 /* Whitelisted set of results where continuing is OK */
2818 case SCAN_PMD_NULL:
2819 case SCAN_PTE_NON_PRESENT:
2820 case SCAN_PTE_UFFD_WP:
2821 case SCAN_LACK_REFERENCED_PAGE:
2822 case SCAN_PAGE_NULL:
2823 case SCAN_PAGE_COUNT:
2824 case SCAN_PAGE_LOCK:
2825 case SCAN_PAGE_COMPOUND:
2826 case SCAN_PAGE_LRU:
2827 case SCAN_DEL_PAGE_LRU:
2828 last_fail = result;
2829 break;
2830 default:
2831 last_fail = result;
2832 /* Other error, exit */
2833 goto out_maybelock;
2834 }
2835 }
2836
2837 out_maybelock:
2838 /* Caller expects us to hold mmap_lock on return */
2839 if (!mmap_locked)
2840 mmap_read_lock(mm);
2841 out_nolock:
2842 mmap_assert_locked(mm);
2843 mmdrop(mm);
2844 kfree(cc);
2845
2846 return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
2847 : madvise_collapse_errno(last_fail);
2848 }
2849