1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2009 Red Hat, Inc.
4 */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/mm.h>
9 #include <linux/sched.h>
10 #include <linux/sched/mm.h>
11 #include <linux/sched/numa_balancing.h>
12 #include <linux/highmem.h>
13 #include <linux/hugetlb.h>
14 #include <linux/mmu_notifier.h>
15 #include <linux/rmap.h>
16 #include <linux/swap.h>
17 #include <linux/shrinker.h>
18 #include <linux/mm_inline.h>
19 #include <linux/swapops.h>
20 #include <linux/backing-dev.h>
21 #include <linux/dax.h>
22 #include <linux/mm_types.h>
23 #include <linux/khugepaged.h>
24 #include <linux/freezer.h>
25 #include <linux/mman.h>
26 #include <linux/memremap.h>
27 #include <linux/pagemap.h>
28 #include <linux/debugfs.h>
29 #include <linux/migrate.h>
30 #include <linux/hashtable.h>
31 #include <linux/userfaultfd_k.h>
32 #include <linux/page_idle.h>
33 #include <linux/shmem_fs.h>
34 #include <linux/oom.h>
35 #include <linux/numa.h>
36 #include <linux/page_owner.h>
37 #include <linux/sched/sysctl.h>
38 #include <linux/memory-tiers.h>
39 #include <linux/compat.h>
40 #include <linux/pgalloc_tag.h>
41 #include <linux/pagewalk.h>
42
43 #include <asm/tlb.h>
44 #include <asm/pgalloc.h>
45 #include "internal.h"
46 #include "swap.h"
47
48 #define CREATE_TRACE_POINTS
49 #include <trace/events/thp.h>
50
51 /*
52 * By default, transparent hugepage support is disabled in order to avoid
53 * risking an increased memory footprint for applications that are not
54 * guaranteed to benefit from it. When transparent hugepage support is
55 * enabled, it is for all mappings, and khugepaged scans all mappings.
56 * Defrag is invoked by khugepaged hugepage allocations and by page faults
57 * for all hugepage allocations.
58 */
59 unsigned long transparent_hugepage_flags __read_mostly =
60 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
61 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
62 #endif
63 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
64 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
65 #endif
66 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
67 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
68 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
69
70 static struct shrinker *deferred_split_shrinker;
71 static unsigned long deferred_split_count(struct shrinker *shrink,
72 struct shrink_control *sc);
73 static unsigned long deferred_split_scan(struct shrinker *shrink,
74 struct shrink_control *sc);
75 static bool split_underused_thp = true;
76
77 static atomic_t huge_zero_refcount;
78 struct folio *huge_zero_folio __read_mostly;
79 unsigned long huge_zero_pfn __read_mostly = ~0UL;
80 unsigned long huge_anon_orders_always __read_mostly;
81 unsigned long huge_anon_orders_madvise __read_mostly;
82 unsigned long huge_anon_orders_inherit __read_mostly;
83 static bool anon_orders_configured __initdata;
84
85 static inline bool file_thp_enabled(struct vm_area_struct *vma)
86 {
87 struct inode *inode;
88
89 if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
90 return false;
91
92 if (!vma->vm_file)
93 return false;
94
95 inode = file_inode(vma->vm_file);
96
97 return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
98 }
99
100 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
101 vm_flags_t vm_flags,
102 enum tva_type type,
103 unsigned long orders)
104 {
105 const bool smaps = type == TVA_SMAPS;
106 const bool in_pf = type == TVA_PAGEFAULT;
107 const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
108 unsigned long supported_orders;
109
110 /* Check the intersection of requested and supported orders. */
111 if (vma_is_anonymous(vma))
112 supported_orders = THP_ORDERS_ALL_ANON;
113 else if (vma_is_special_huge(vma))
114 supported_orders = THP_ORDERS_ALL_SPECIAL;
115 else
116 supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
117
118 orders &= supported_orders;
119 if (!orders)
120 return 0;
121
122 if (!vma->vm_mm) /* vdso */
123 return 0;
124
125 if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
126 return 0;
127
128 /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
129 if (vma_is_dax(vma))
130 return in_pf ? orders : 0;
131
132 /*
133 * khugepaged special VMA and hugetlb VMA.
134 * Must be checked after dax since some dax mappings may have
135 * VM_MIXEDMAP set.
136 */
137 if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
138 return 0;
139
140 /*
141 * Check alignment for file vma and size for both file and anon vma by
142 * filtering out the unsuitable orders.
143 *
144 * Skip the check for page fault. Huge fault does the check in fault
145 * handlers.
146 */
147 if (!in_pf) {
148 int order = highest_order(orders);
149 unsigned long addr;
150
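/*
 * Walk from the highest requested order downwards; the first order that
 * fits the VMA is kept, along with all lower orders still set in the mask.
 */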
151 while (orders) {
152 addr = vma->vm_end - (PAGE_SIZE << order);
153 if (thp_vma_suitable_order(vma, addr, order))
154 break;
155 order = next_order(&orders, order);
156 }
157
158 if (!orders)
159 return 0;
160 }
161
162 /*
163 * Enabled via shmem mount options or sysfs settings.
164 * Must be done before hugepage flags check since shmem has its
165 * own flags.
166 */
167 if (!in_pf && shmem_file(vma->vm_file))
168 return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
169 vma, vma->vm_pgoff, 0,
170 forced_collapse);
171
172 if (!vma_is_anonymous(vma)) {
173 /*
174 * Enforce THP collapse requirements as necessary. Anonymous vmas
175 * were already handled in thp_vma_allowable_orders().
176 */
177 if (!forced_collapse &&
178 (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
179 !hugepage_global_always())))
180 return 0;
181
182 /*
183 * Trust that ->huge_fault() handlers know what they are doing
184 * in fault path.
185 */
186 if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
187 return orders;
188 /* Only regular file is valid in collapse path */
189 if (((!in_pf || smaps)) && file_thp_enabled(vma))
190 return orders;
191 return 0;
192 }
193
194 if (vma_is_temporary_stack(vma))
195 return 0;
196
197 /*
198 * THPeligible bit of smaps should show 1 for proper VMAs even
199 * though anon_vma is not initialized yet.
200 *
201 * Allow page fault since anon_vma may be not initialized until
202 * the first page fault.
203 */
204 if (!vma->anon_vma)
205 return (smaps || in_pf) ? orders : 0;
206
207 return orders;
208 }
209
210 static bool get_huge_zero_folio(void)
211 {
212 struct folio *zero_folio;
213 retry:
214 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
215 return true;
216
217 zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
218 HPAGE_PMD_ORDER);
219 if (!zero_folio) {
220 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
221 return false;
222 }
223 /* Ensure zero folio won't have large_rmappable flag set. */
224 folio_clear_large_rmappable(zero_folio);
225 preempt_disable();
226 if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
227 preempt_enable();
228 folio_put(zero_folio);
229 goto retry;
230 }
231 WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
232
233 /* We take an additional reference here. It will be put back by the shrinker. */
234 atomic_set(&huge_zero_refcount, 2);
235 preempt_enable();
236 count_vm_event(THP_ZERO_PAGE_ALLOC);
237 return true;
238 }
239
240 static void put_huge_zero_folio(void)
241 {
242 /*
243 * Counter should never go to zero here. Only shrinker can put
244 * last reference.
245 */
246 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
247 }
248
249 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
250 {
251 if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
252 return huge_zero_folio;
253
254 if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
255 return READ_ONCE(huge_zero_folio);
256
257 if (!get_huge_zero_folio())
258 return NULL;
259
260 if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
261 put_huge_zero_folio();
262
263 return READ_ONCE(huge_zero_folio);
264 }
265
266 void mm_put_huge_zero_folio(struct mm_struct *mm)
267 {
268 if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
269 return;
270
271 if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
272 put_huge_zero_folio();
273 }
274
275 static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
276 struct shrink_control *sc)
277 {
278 /* we can free zero page only if last reference remains */
279 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
280 }
281
282 static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
283 struct shrink_control *sc)
284 {
285 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
286 struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
287 BUG_ON(zero_folio == NULL);
288 WRITE_ONCE(huge_zero_pfn, ~0UL);
289 folio_put(zero_folio);
290 return HPAGE_PMD_NR;
291 }
292
293 return 0;
294 }
295
296 static struct shrinker *huge_zero_folio_shrinker;
297
298 #ifdef CONFIG_SYSFS
299 static ssize_t enabled_show(struct kobject *kobj,
300 struct kobj_attribute *attr, char *buf)
301 {
302 const char *output;
303
304 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
305 output = "[always] madvise never";
306 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
307 &transparent_hugepage_flags))
308 output = "always [madvise] never";
309 else
310 output = "always madvise [never]";
311
312 return sysfs_emit(buf, "%s\n", output);
313 }
314
315 static ssize_t enabled_store(struct kobject *kobj,
316 struct kobj_attribute *attr,
317 const char *buf, size_t count)
318 {
319 ssize_t ret = count;
320
321 if (sysfs_streq(buf, "always")) {
322 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
323 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
324 } else if (sysfs_streq(buf, "madvise")) {
325 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
326 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
327 } else if (sysfs_streq(buf, "never")) {
328 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
329 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
330 } else
331 ret = -EINVAL;
332
333 if (ret > 0) {
334 int err = start_stop_khugepaged();
335 if (err)
336 ret = err;
337 }
338 return ret;
339 }
340
341 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
342
343 ssize_t single_hugepage_flag_show(struct kobject *kobj,
344 struct kobj_attribute *attr, char *buf,
345 enum transparent_hugepage_flag flag)
346 {
347 return sysfs_emit(buf, "%d\n",
348 !!test_bit(flag, &transparent_hugepage_flags));
349 }
350
351 ssize_t single_hugepage_flag_store(struct kobject *kobj,
352 struct kobj_attribute *attr,
353 const char *buf, size_t count,
354 enum transparent_hugepage_flag flag)
355 {
356 unsigned long value;
357 int ret;
358
359 ret = kstrtoul(buf, 10, &value);
360 if (ret < 0)
361 return ret;
362 if (value > 1)
363 return -EINVAL;
364
365 if (value)
366 set_bit(flag, &transparent_hugepage_flags);
367 else
368 clear_bit(flag, &transparent_hugepage_flags);
369
370 return count;
371 }
372
373 static ssize_t defrag_show(struct kobject *kobj,
374 struct kobj_attribute *attr, char *buf)
375 {
376 const char *output;
377
378 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
379 &transparent_hugepage_flags))
380 output = "[always] defer defer+madvise madvise never";
381 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
382 &transparent_hugepage_flags))
383 output = "always [defer] defer+madvise madvise never";
384 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
385 &transparent_hugepage_flags))
386 output = "always defer [defer+madvise] madvise never";
387 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
388 &transparent_hugepage_flags))
389 output = "always defer defer+madvise [madvise] never";
390 else
391 output = "always defer defer+madvise madvise [never]";
392
393 return sysfs_emit(buf, "%s\n", output);
394 }
395
396 static ssize_t defrag_store(struct kobject *kobj,
397 struct kobj_attribute *attr,
398 const char *buf, size_t count)
399 {
400 if (sysfs_streq(buf, "always")) {
401 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
402 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
403 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
404 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
405 } else if (sysfs_streq(buf, "defer+madvise")) {
406 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
407 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
408 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
409 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
410 } else if (sysfs_streq(buf, "defer")) {
411 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
412 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
413 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
414 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
415 } else if (sysfs_streq(buf, "madvise")) {
416 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
417 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
418 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
419 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
420 } else if (sysfs_streq(buf, "never")) {
421 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
422 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
423 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
424 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
425 } else
426 return -EINVAL;
427
428 return count;
429 }
430 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
431
432 static ssize_t use_zero_page_show(struct kobject *kobj,
433 struct kobj_attribute *attr, char *buf)
434 {
435 return single_hugepage_flag_show(kobj, attr, buf,
436 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
437 }
438 static ssize_t use_zero_page_store(struct kobject *kobj,
439 struct kobj_attribute *attr, const char *buf, size_t count)
440 {
441 return single_hugepage_flag_store(kobj, attr, buf, count,
442 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
443 }
444 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
445
446 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
447 struct kobj_attribute *attr, char *buf)
448 {
449 return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
450 }
451 static struct kobj_attribute hpage_pmd_size_attr =
452 __ATTR_RO(hpage_pmd_size);
453
454 static ssize_t split_underused_thp_show(struct kobject *kobj,
455 struct kobj_attribute *attr, char *buf)
456 {
457 return sysfs_emit(buf, "%d\n", split_underused_thp);
458 }
459
460 static ssize_t split_underused_thp_store(struct kobject *kobj,
461 struct kobj_attribute *attr,
462 const char *buf, size_t count)
463 {
464 int err = kstrtobool(buf, &split_underused_thp);
465
466 if (err < 0)
467 return err;
468
469 return count;
470 }
471
472 static struct kobj_attribute split_underused_thp_attr = __ATTR(
473 shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
474
475 static struct attribute *hugepage_attr[] = {
476 &enabled_attr.attr,
477 &defrag_attr.attr,
478 &use_zero_page_attr.attr,
479 &hpage_pmd_size_attr.attr,
480 #ifdef CONFIG_SHMEM
481 &shmem_enabled_attr.attr,
482 #endif
483 &split_underused_thp_attr.attr,
484 NULL,
485 };
486
487 static const struct attribute_group hugepage_attr_group = {
488 .attrs = hugepage_attr,
489 };
490
491 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
492 static void thpsize_release(struct kobject *kobj);
493 static DEFINE_SPINLOCK(huge_anon_orders_lock);
494 static LIST_HEAD(thpsize_list);
495
496 static ssize_t anon_enabled_show(struct kobject *kobj,
497 struct kobj_attribute *attr, char *buf)
498 {
499 int order = to_thpsize(kobj)->order;
500 const char *output;
501
502 if (test_bit(order, &huge_anon_orders_always))
503 output = "[always] inherit madvise never";
504 else if (test_bit(order, &huge_anon_orders_inherit))
505 output = "always [inherit] madvise never";
506 else if (test_bit(order, &huge_anon_orders_madvise))
507 output = "always inherit [madvise] never";
508 else
509 output = "always inherit madvise [never]";
510
511 return sysfs_emit(buf, "%s\n", output);
512 }
513
514 static ssize_t anon_enabled_store(struct kobject *kobj,
515 struct kobj_attribute *attr,
516 const char *buf, size_t count)
517 {
518 int order = to_thpsize(kobj)->order;
519 ssize_t ret = count;
520
521 if (sysfs_streq(buf, "always")) {
522 spin_lock(&huge_anon_orders_lock);
523 clear_bit(order, &huge_anon_orders_inherit);
524 clear_bit(order, &huge_anon_orders_madvise);
525 set_bit(order, &huge_anon_orders_always);
526 spin_unlock(&huge_anon_orders_lock);
527 } else if (sysfs_streq(buf, "inherit")) {
528 spin_lock(&huge_anon_orders_lock);
529 clear_bit(order, &huge_anon_orders_always);
530 clear_bit(order, &huge_anon_orders_madvise);
531 set_bit(order, &huge_anon_orders_inherit);
532 spin_unlock(&huge_anon_orders_lock);
533 } else if (sysfs_streq(buf, "madvise")) {
534 spin_lock(&huge_anon_orders_lock);
535 clear_bit(order, &huge_anon_orders_always);
536 clear_bit(order, &huge_anon_orders_inherit);
537 set_bit(order, &huge_anon_orders_madvise);
538 spin_unlock(&huge_anon_orders_lock);
539 } else if (sysfs_streq(buf, "never")) {
540 spin_lock(&huge_anon_orders_lock);
541 clear_bit(order, &huge_anon_orders_always);
542 clear_bit(order, &huge_anon_orders_inherit);
543 clear_bit(order, &huge_anon_orders_madvise);
544 spin_unlock(&huge_anon_orders_lock);
545 } else
546 ret = -EINVAL;
547
548 if (ret > 0) {
549 int err;
550
551 err = start_stop_khugepaged();
552 if (err)
553 ret = err;
554 }
555 return ret;
556 }
557
558 static struct kobj_attribute anon_enabled_attr =
559 __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
560
561 static struct attribute *anon_ctrl_attrs[] = {
562 &anon_enabled_attr.attr,
563 NULL,
564 };
565
566 static const struct attribute_group anon_ctrl_attr_grp = {
567 .attrs = anon_ctrl_attrs,
568 };
569
570 static struct attribute *file_ctrl_attrs[] = {
571 #ifdef CONFIG_SHMEM
572 &thpsize_shmem_enabled_attr.attr,
573 #endif
574 NULL,
575 };
576
577 static const struct attribute_group file_ctrl_attr_grp = {
578 .attrs = file_ctrl_attrs,
579 };
580
581 static struct attribute *any_ctrl_attrs[] = {
582 NULL,
583 };
584
585 static const struct attribute_group any_ctrl_attr_grp = {
586 .attrs = any_ctrl_attrs,
587 };
588
589 static const struct kobj_type thpsize_ktype = {
590 .release = &thpsize_release,
591 .sysfs_ops = &kobj_sysfs_ops,
592 };
593
594 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
595
596 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
597 {
598 unsigned long sum = 0;
599 int cpu;
600
601 for_each_possible_cpu(cpu) {
602 struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
603
604 sum += this->stats[order][item];
605 }
606
607 return sum;
608 }
609
610 #define DEFINE_MTHP_STAT_ATTR(_name, _index) \
611 static ssize_t _name##_show(struct kobject *kobj, \
612 struct kobj_attribute *attr, char *buf) \
613 { \
614 int order = to_thpsize(kobj)->order; \
615 \
616 return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
617 } \
618 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
619
620 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
621 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
622 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
623 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
624 DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
625 DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
626 DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
627 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
628 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
629 #ifdef CONFIG_SHMEM
630 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
631 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
632 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
633 #endif
634 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
635 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
636 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
637 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
638 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
639
640 static struct attribute *anon_stats_attrs[] = {
641 &anon_fault_alloc_attr.attr,
642 &anon_fault_fallback_attr.attr,
643 &anon_fault_fallback_charge_attr.attr,
644 #ifndef CONFIG_SHMEM
645 &zswpout_attr.attr,
646 &swpin_attr.attr,
647 &swpin_fallback_attr.attr,
648 &swpin_fallback_charge_attr.attr,
649 &swpout_attr.attr,
650 &swpout_fallback_attr.attr,
651 #endif
652 &split_deferred_attr.attr,
653 &nr_anon_attr.attr,
654 &nr_anon_partially_mapped_attr.attr,
655 NULL,
656 };
657
658 static struct attribute_group anon_stats_attr_grp = {
659 .name = "stats",
660 .attrs = anon_stats_attrs,
661 };
662
663 static struct attribute *file_stats_attrs[] = {
664 #ifdef CONFIG_SHMEM
665 &shmem_alloc_attr.attr,
666 &shmem_fallback_attr.attr,
667 &shmem_fallback_charge_attr.attr,
668 #endif
669 NULL,
670 };
671
672 static struct attribute_group file_stats_attr_grp = {
673 .name = "stats",
674 .attrs = file_stats_attrs,
675 };
676
677 static struct attribute *any_stats_attrs[] = {
678 #ifdef CONFIG_SHMEM
679 &zswpout_attr.attr,
680 &swpin_attr.attr,
681 &swpin_fallback_attr.attr,
682 &swpin_fallback_charge_attr.attr,
683 &swpout_attr.attr,
684 &swpout_fallback_attr.attr,
685 #endif
686 &split_attr.attr,
687 &split_failed_attr.attr,
688 NULL,
689 };
690
691 static struct attribute_group any_stats_attr_grp = {
692 .name = "stats",
693 .attrs = any_stats_attrs,
694 };
695
696 static int sysfs_add_group(struct kobject *kobj,
697 const struct attribute_group *grp)
698 {
699 int ret = -ENOENT;
700
701 /*
702 * If the group is named, try to merge first, assuming the subdirectory
703 * was already created. This avoids the warning emitted by
704 * sysfs_create_group() if the directory already exists.
705 */
706 if (grp->name)
707 ret = sysfs_merge_group(kobj, grp);
708 if (ret)
709 ret = sysfs_create_group(kobj, grp);
710
711 return ret;
712 }
713
714 static struct thpsize *thpsize_create(int order, struct kobject *parent)
715 {
716 unsigned long size = (PAGE_SIZE << order) / SZ_1K;
717 struct thpsize *thpsize;
718 int ret = -ENOMEM;
719
720 thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
721 if (!thpsize)
722 goto err;
723
724 thpsize->order = order;
725
726 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
727 "hugepages-%lukB", size);
728 if (ret) {
729 kfree(thpsize);
730 goto err;
731 }
732
733
734 ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
735 if (ret)
736 goto err_put;
737
738 ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
739 if (ret)
740 goto err_put;
741
742 if (BIT(order) & THP_ORDERS_ALL_ANON) {
743 ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
744 if (ret)
745 goto err_put;
746
747 ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
748 if (ret)
749 goto err_put;
750 }
751
752 if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
753 ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
754 if (ret)
755 goto err_put;
756
757 ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
758 if (ret)
759 goto err_put;
760 }
761
762 return thpsize;
763 err_put:
764 kobject_put(&thpsize->kobj);
765 err:
766 return ERR_PTR(ret);
767 }
768
769 static void thpsize_release(struct kobject *kobj)
770 {
771 kfree(to_thpsize(kobj));
772 }
773
774 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
775 {
776 int err;
777 struct thpsize *thpsize;
778 unsigned long orders;
779 int order;
780
781 /*
782 * Default to setting PMD-sized THP to inherit the global setting and
783 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
784 * constant so we have to do this here.
785 */
786 if (!anon_orders_configured)
787 huge_anon_orders_inherit = BIT(PMD_ORDER);
788
789 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
790 if (unlikely(!*hugepage_kobj)) {
791 pr_err("failed to create transparent hugepage kobject\n");
792 return -ENOMEM;
793 }
794
795 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
796 if (err) {
797 pr_err("failed to register transparent hugepage group\n");
798 goto delete_obj;
799 }
800
801 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
802 if (err) {
803 pr_err("failed to register transparent hugepage group\n");
804 goto remove_hp_group;
805 }
806
807 orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
808 order = highest_order(orders);
809 while (orders) {
810 thpsize = thpsize_create(order, *hugepage_kobj);
811 if (IS_ERR(thpsize)) {
812 pr_err("failed to create thpsize for order %d\n", order);
813 err = PTR_ERR(thpsize);
814 goto remove_all;
815 }
816 list_add(&thpsize->node, &thpsize_list);
817 order = next_order(&orders, order);
818 }
819
820 return 0;
821
822 remove_all:
823 hugepage_exit_sysfs(*hugepage_kobj);
824 return err;
825 remove_hp_group:
826 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
827 delete_obj:
828 kobject_put(*hugepage_kobj);
829 return err;
830 }
831
832 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
833 {
834 struct thpsize *thpsize, *tmp;
835
836 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
837 list_del(&thpsize->node);
838 kobject_put(&thpsize->kobj);
839 }
840
841 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
842 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
843 kobject_put(hugepage_kobj);
844 }
845 #else
846 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
847 {
848 return 0;
849 }
850
851 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
852 {
853 }
854 #endif /* CONFIG_SYSFS */
855
856 static int __init thp_shrinker_init(void)
857 {
858 deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
859 SHRINKER_MEMCG_AWARE |
860 SHRINKER_NONSLAB,
861 "thp-deferred_split");
862 if (!deferred_split_shrinker)
863 return -ENOMEM;
864
865 deferred_split_shrinker->count_objects = deferred_split_count;
866 deferred_split_shrinker->scan_objects = deferred_split_scan;
867 shrinker_register(deferred_split_shrinker);
868
869 if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
870 /*
871 * Bump the reference of the huge_zero_folio and do not
872 * initialize the shrinker.
873 *
874 * huge_zero_folio will always be NULL on failure. We assume
875 * that get_huge_zero_folio() will most likely not fail as
876 * thp_shrinker_init() is invoked early on during boot.
877 */
878 if (!get_huge_zero_folio())
879 pr_warn("Allocating persistent huge zero folio failed\n");
880 return 0;
881 }
882
883 huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
884 if (!huge_zero_folio_shrinker) {
885 shrinker_free(deferred_split_shrinker);
886 return -ENOMEM;
887 }
888
889 huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
890 huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
891 shrinker_register(huge_zero_folio_shrinker);
892
893 return 0;
894 }
895
896 static void __init thp_shrinker_exit(void)
897 {
898 shrinker_free(huge_zero_folio_shrinker);
899 shrinker_free(deferred_split_shrinker);
900 }
901
902 static int __init hugepage_init(void)
903 {
904 int err;
905 struct kobject *hugepage_kobj;
906
907 if (!has_transparent_hugepage()) {
908 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
909 return -EINVAL;
910 }
911
912 /*
913 * hugepages can't be allocated by the buddy allocator
914 */
915 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
916
917 err = hugepage_init_sysfs(&hugepage_kobj);
918 if (err)
919 goto err_sysfs;
920
921 err = khugepaged_init();
922 if (err)
923 goto err_slab;
924
925 err = thp_shrinker_init();
926 if (err)
927 goto err_shrinker;
928
929 /*
930 * By default disable transparent hugepages on smaller systems,
931 * where the extra memory used could hurt more than TLB overhead
932 * is likely to save. The admin can still enable it through /sys.
933 */
934 if (totalram_pages() < MB_TO_PAGES(512)) {
935 transparent_hugepage_flags = 0;
936 return 0;
937 }
938
939 err = start_stop_khugepaged();
940 if (err)
941 goto err_khugepaged;
942
943 return 0;
944 err_khugepaged:
945 thp_shrinker_exit();
946 err_shrinker:
947 khugepaged_destroy();
948 err_slab:
949 hugepage_exit_sysfs(hugepage_kobj);
950 err_sysfs:
951 return err;
952 }
953 subsys_initcall(hugepage_init);
954
955 static int __init setup_transparent_hugepage(char *str)
956 {
957 int ret = 0;
958 if (!str)
959 goto out;
960 if (!strcmp(str, "always")) {
961 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
962 &transparent_hugepage_flags);
963 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
964 &transparent_hugepage_flags);
965 ret = 1;
966 } else if (!strcmp(str, "madvise")) {
967 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
968 &transparent_hugepage_flags);
969 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
970 &transparent_hugepage_flags);
971 ret = 1;
972 } else if (!strcmp(str, "never")) {
973 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
974 &transparent_hugepage_flags);
975 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
976 &transparent_hugepage_flags);
977 ret = 1;
978 }
979 out:
980 if (!ret)
981 pr_warn("transparent_hugepage= cannot parse, ignored\n");
982 return ret;
983 }
984 __setup("transparent_hugepage=", setup_transparent_hugepage);
985
986 static char str_dup[PAGE_SIZE] __initdata;
987 static int __init setup_thp_anon(char *str)
988 {
989 char *token, *range, *policy, *subtoken;
990 unsigned long always, inherit, madvise;
991 char *start_size, *end_size;
992 int start, end, nr;
993 char *p;
994
995 if (!str || strlen(str) + 1 > PAGE_SIZE)
996 goto err;
997 strscpy(str_dup, str);
998
999 always = huge_anon_orders_always;
1000 madvise = huge_anon_orders_madvise;
1001 inherit = huge_anon_orders_inherit;
1002 p = str_dup;
1003 while ((token = strsep(&p, ";")) != NULL) {
1004 range = strsep(&token, ":");
1005 policy = token;
1006
1007 if (!policy)
1008 goto err;
1009
1010 while ((subtoken = strsep(&range, ",")) != NULL) {
1011 if (strchr(subtoken, '-')) {
1012 start_size = strsep(&subtoken, "-");
1013 end_size = subtoken;
1014
1015 start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
1016 end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
1017 } else {
1018 start_size = end_size = subtoken;
1019 start = end = get_order_from_str(subtoken,
1020 THP_ORDERS_ALL_ANON);
1021 }
1022
1023 if (start == -EINVAL) {
1024 pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
1025 goto err;
1026 }
1027
1028 if (end == -EINVAL) {
1029 pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
1030 goto err;
1031 }
1032
1033 if (start < 0 || end < 0 || start > end)
1034 goto err;
1035
1036 nr = end - start + 1;
1037 if (!strcmp(policy, "always")) {
1038 bitmap_set(&always, start, nr);
1039 bitmap_clear(&inherit, start, nr);
1040 bitmap_clear(&madvise, start, nr);
1041 } else if (!strcmp(policy, "madvise")) {
1042 bitmap_set(&madvise, start, nr);
1043 bitmap_clear(&inherit, start, nr);
1044 bitmap_clear(&always, start, nr);
1045 } else if (!strcmp(policy, "inherit")) {
1046 bitmap_set(&inherit, start, nr);
1047 bitmap_clear(&madvise, start, nr);
1048 bitmap_clear(&always, start, nr);
1049 } else if (!strcmp(policy, "never")) {
1050 bitmap_clear(&inherit, start, nr);
1051 bitmap_clear(&madvise, start, nr);
1052 bitmap_clear(&always, start, nr);
1053 } else {
1054 pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
1055 goto err;
1056 }
1057 }
1058 }
1059
1060 huge_anon_orders_always = always;
1061 huge_anon_orders_madvise = madvise;
1062 huge_anon_orders_inherit = inherit;
1063 anon_orders_configured = true;
1064 return 1;
1065
1066 err:
1067 pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
1068 return 0;
1069 }
1070 __setup("thp_anon=", setup_thp_anon);
1071
1072 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
1073 {
1074 if (likely(vma->vm_flags & VM_WRITE))
1075 pmd = pmd_mkwrite(pmd, vma);
1076 return pmd;
1077 }
1078
1079 #ifdef CONFIG_MEMCG
1080 static inline
1081 struct deferred_split *get_deferred_split_queue(struct folio *folio)
1082 {
1083 struct mem_cgroup *memcg = folio_memcg(folio);
1084 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
1085
1086 if (memcg)
1087 return &memcg->deferred_split_queue;
1088 else
1089 return &pgdat->deferred_split_queue;
1090 }
1091 #else
1092 static inline
1093 struct deferred_split *get_deferred_split_queue(struct folio *folio)
1094 {
1095 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
1096
1097 return &pgdat->deferred_split_queue;
1098 }
1099 #endif
1100
1101 static inline bool is_transparent_hugepage(const struct folio *folio)
1102 {
1103 if (!folio_test_large(folio))
1104 return false;
1105
1106 return is_huge_zero_folio(folio) ||
1107 folio_test_large_rmappable(folio);
1108 }
1109
1110 static unsigned long __thp_get_unmapped_area(struct file *filp,
1111 unsigned long addr, unsigned long len,
1112 loff_t off, unsigned long flags, unsigned long size,
1113 vm_flags_t vm_flags)
1114 {
1115 loff_t off_end = off + len;
1116 loff_t off_align = round_up(off, size);
1117 unsigned long len_pad, ret, off_sub;
1118
1119 if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
1120 return 0;
1121
1122 if (off_end <= off_align || (off_end - off_align) < size)
1123 return 0;
1124
1125 len_pad = len + size;
1126 if (len_pad < len || (off + len_pad) < off)
1127 return 0;
1128
1129 ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
1130 off >> PAGE_SHIFT, flags, vm_flags);
1131
1132 /*
1133 * The failure might be due to length padding. The caller will retry
1134 * without the padding.
1135 */
1136 if (IS_ERR_VALUE(ret))
1137 return 0;
1138
1139 /*
1140 * Do not try to align to THP boundary if allocation at the address
1141 * hint succeeds.
1142 */
1143 if (ret == addr)
1144 return addr;
1145
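/*
 * Shift the start so the returned virtual address shares the PMD-size
 * alignment of the file offset, allowing huge mappings of the page cache.
 */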
1146 off_sub = (off - ret) & (size - 1);
1147
1148 if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
1149 return ret + size;
1150
1151 ret += off_sub;
1152 return ret;
1153 }
1154
1155 unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
1156 unsigned long len, unsigned long pgoff, unsigned long flags,
1157 vm_flags_t vm_flags)
1158 {
1159 unsigned long ret;
1160 loff_t off = (loff_t)pgoff << PAGE_SHIFT;
1161
1162 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
1163 if (ret)
1164 return ret;
1165
1166 return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
1167 vm_flags);
1168 }
1169
1170 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
1171 unsigned long len, unsigned long pgoff, unsigned long flags)
1172 {
1173 return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
1174 }
1175 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
1176
1177 static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
1178 unsigned long addr)
1179 {
1180 gfp_t gfp = vma_thp_gfp_mask(vma);
1181 const int order = HPAGE_PMD_ORDER;
1182 struct folio *folio;
1183
1184 folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
1185
1186 if (unlikely(!folio)) {
1187 count_vm_event(THP_FAULT_FALLBACK);
1188 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
1189 return NULL;
1190 }
1191
1192 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
1193 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
1194 folio_put(folio);
1195 count_vm_event(THP_FAULT_FALLBACK);
1196 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
1197 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
1198 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
1199 return NULL;
1200 }
1201 folio_throttle_swaprate(folio, gfp);
1202
1203 /*
1204 * When a folio is not zeroed during allocation (__GFP_ZERO not used)
1205 * or user folios require special handling, folio_zero_user() is used to
1206 * make sure that the page corresponding to the faulting address will be
1207 * hot in the cache after zeroing.
1208 */
1209 if (user_alloc_needs_zeroing())
1210 folio_zero_user(folio, addr);
1211 /*
1212 * The memory barrier inside __folio_mark_uptodate makes sure that
1213 * folio_zero_user writes become visible before the set_pmd_at()
1214 * write.
1215 */
1216 __folio_mark_uptodate(folio);
1217 return folio;
1218 }
1219
1220 static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
1221 struct vm_area_struct *vma, unsigned long haddr)
1222 {
1223 pmd_t entry;
1224
1225 entry = folio_mk_pmd(folio, vma->vm_page_prot);
1226 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1227 folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
1228 folio_add_lru_vma(folio, vma);
1229 set_pmd_at(vma->vm_mm, haddr, pmd, entry);
1230 update_mmu_cache_pmd(vma, haddr, pmd);
1231 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1232 count_vm_event(THP_FAULT_ALLOC);
1233 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
1234 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
1235 }
1236
1237 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1238 {
1239 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1240 struct vm_area_struct *vma = vmf->vma;
1241 struct folio *folio;
1242 pgtable_t pgtable;
1243 vm_fault_t ret = 0;
1244
1245 folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
1246 if (unlikely(!folio))
1247 return VM_FAULT_FALLBACK;
1248
1249 pgtable = pte_alloc_one(vma->vm_mm);
1250 if (unlikely(!pgtable)) {
1251 ret = VM_FAULT_OOM;
1252 goto release;
1253 }
1254
1255 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1256 if (unlikely(!pmd_none(*vmf->pmd))) {
1257 goto unlock_release;
1258 } else {
1259 ret = check_stable_address_space(vma->vm_mm);
1260 if (ret)
1261 goto unlock_release;
1262
1263 /* Deliver the page fault to userland */
1264 if (userfaultfd_missing(vma)) {
1265 spin_unlock(vmf->ptl);
1266 folio_put(folio);
1267 pte_free(vma->vm_mm, pgtable);
1268 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1269 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1270 return ret;
1271 }
1272 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1273 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
1274 mm_inc_nr_ptes(vma->vm_mm);
1275 deferred_split_folio(folio, false);
1276 spin_unlock(vmf->ptl);
1277 }
1278
1279 return 0;
1280 unlock_release:
1281 spin_unlock(vmf->ptl);
1282 release:
1283 if (pgtable)
1284 pte_free(vma->vm_mm, pgtable);
1285 folio_put(folio);
1286 return ret;
1287
1288 }
1289
1290 /*
1291 * always: directly stall for all thp allocations
1292 * defer: wake kswapd and fail if not immediately available
1293 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1294 * fail if not immediately available
1295 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1296 * available
1297 * never: never stall for any thp allocation
1298 */
1299 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
1300 {
1301 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
1302
1303 /* Always do synchronous compaction */
1304 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1305 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1306
1307 /* Kick kcompactd and fail quickly */
1308 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1309 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1310
1311 /* Synchronous compaction if madvised, otherwise kick kcompactd */
1312 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1313 return GFP_TRANSHUGE_LIGHT |
1314 (vma_madvised ? __GFP_DIRECT_RECLAIM :
1315 __GFP_KSWAPD_RECLAIM);
1316
1317 /* Only do synchronous compaction if madvised */
1318 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1319 return GFP_TRANSHUGE_LIGHT |
1320 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1321
1322 return GFP_TRANSHUGE_LIGHT;
1323 }
1324
1325 /* Caller must hold page table lock. */
1326 static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
1327 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1328 struct folio *zero_folio)
1329 {
1330 pmd_t entry;
1331 entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
1332 entry = pmd_mkspecial(entry);
1333 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1334 set_pmd_at(mm, haddr, pmd, entry);
1335 mm_inc_nr_ptes(mm);
1336 }
1337
1338 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1339 {
1340 struct vm_area_struct *vma = vmf->vma;
1341 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1342 vm_fault_t ret;
1343
1344 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1345 return VM_FAULT_FALLBACK;
1346 ret = vmf_anon_prepare(vmf);
1347 if (ret)
1348 return ret;
1349 khugepaged_enter_vma(vma, vma->vm_flags);
1350
1351 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1352 !mm_forbids_zeropage(vma->vm_mm) &&
1353 transparent_hugepage_use_zero_page()) {
1354 pgtable_t pgtable;
1355 struct folio *zero_folio;
1356 vm_fault_t ret;
1357
1358 pgtable = pte_alloc_one(vma->vm_mm);
1359 if (unlikely(!pgtable))
1360 return VM_FAULT_OOM;
1361 zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
1362 if (unlikely(!zero_folio)) {
1363 pte_free(vma->vm_mm, pgtable);
1364 count_vm_event(THP_FAULT_FALLBACK);
1365 return VM_FAULT_FALLBACK;
1366 }
1367 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1368 ret = 0;
1369 if (pmd_none(*vmf->pmd)) {
1370 ret = check_stable_address_space(vma->vm_mm);
1371 if (ret) {
1372 spin_unlock(vmf->ptl);
1373 pte_free(vma->vm_mm, pgtable);
1374 } else if (userfaultfd_missing(vma)) {
1375 spin_unlock(vmf->ptl);
1376 pte_free(vma->vm_mm, pgtable);
1377 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1378 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1379 } else {
1380 set_huge_zero_folio(pgtable, vma->vm_mm, vma,
1381 haddr, vmf->pmd, zero_folio);
1382 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1383 spin_unlock(vmf->ptl);
1384 }
1385 } else {
1386 spin_unlock(vmf->ptl);
1387 pte_free(vma->vm_mm, pgtable);
1388 }
1389 return ret;
1390 }
1391
1392 return __do_huge_pmd_anonymous_page(vmf);
1393 }
1394
1395 struct folio_or_pfn {
1396 union {
1397 struct folio *folio;
1398 unsigned long pfn;
1399 };
1400 bool is_folio;
1401 };
1402
1403 static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
1404 pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
1405 bool write)
1406 {
1407 struct mm_struct *mm = vma->vm_mm;
1408 pgtable_t pgtable = NULL;
1409 spinlock_t *ptl;
1410 pmd_t entry;
1411
1412 if (addr < vma->vm_start || addr >= vma->vm_end)
1413 return VM_FAULT_SIGBUS;
1414
1415 if (arch_needs_pgtable_deposit()) {
1416 pgtable = pte_alloc_one(vma->vm_mm);
1417 if (!pgtable)
1418 return VM_FAULT_OOM;
1419 }
1420
1421 ptl = pmd_lock(mm, pmd);
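/*
 * An entry is already installed: on a write fault for the same pfn just
 * refresh the young/dirty/write bits; a differing pfn is only expected
 * when the huge zero page was mapped here.
 */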
1422 if (!pmd_none(*pmd)) {
1423 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
1424 fop.pfn;
1425
1426 if (write) {
1427 if (pmd_pfn(*pmd) != pfn) {
1428 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1429 goto out_unlock;
1430 }
1431 entry = pmd_mkyoung(*pmd);
1432 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1433 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1434 update_mmu_cache_pmd(vma, addr, pmd);
1435 }
1436 goto out_unlock;
1437 }
1438
1439 if (fop.is_folio) {
1440 entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
1441
1442 if (is_huge_zero_folio(fop.folio)) {
1443 entry = pmd_mkspecial(entry);
1444 } else {
1445 folio_get(fop.folio);
1446 folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
1447 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
1448 }
1449 } else {
1450 entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
1451 entry = pmd_mkspecial(entry);
1452 }
1453 if (write) {
1454 entry = pmd_mkyoung(pmd_mkdirty(entry));
1455 entry = maybe_pmd_mkwrite(entry, vma);
1456 }
1457
1458 if (pgtable) {
1459 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1460 mm_inc_nr_ptes(mm);
1461 pgtable = NULL;
1462 }
1463
1464 set_pmd_at(mm, addr, pmd, entry);
1465 update_mmu_cache_pmd(vma, addr, pmd);
1466
1467 out_unlock:
1468 spin_unlock(ptl);
1469 if (pgtable)
1470 pte_free(mm, pgtable);
1471 return VM_FAULT_NOPAGE;
1472 }
1473
1474 /**
1475 * vmf_insert_pfn_pmd - insert a pmd size pfn
1476 * @vmf: Structure describing the fault
1477 * @pfn: pfn to insert
1478 * @write: whether it's a write fault
1479 *
1480 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
1481 *
1482 * Return: vm_fault_t value.
1483 */
1484 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
1485 bool write)
1486 {
1487 unsigned long addr = vmf->address & PMD_MASK;
1488 struct vm_area_struct *vma = vmf->vma;
1489 pgprot_t pgprot = vma->vm_page_prot;
1490 struct folio_or_pfn fop = {
1491 .pfn = pfn,
1492 };
1493
1494 /*
1495 * If we had pmd_special, we could avoid all these restrictions,
1496 * but we need to be consistent with PTEs and architectures that
1497 * can't support a 'special' bit.
1498 */
1499 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1500 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1501 (VM_PFNMAP|VM_MIXEDMAP));
1502 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1503
1504 pfnmap_setup_cachemode_pfn(pfn, &pgprot);
1505
1506 return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
1507 }
1508 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
1509
1510 vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
1511 bool write)
1512 {
1513 struct vm_area_struct *vma = vmf->vma;
1514 unsigned long addr = vmf->address & PMD_MASK;
1515 struct folio_or_pfn fop = {
1516 .folio = folio,
1517 .is_folio = true,
1518 };
1519
1520 if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
1521 return VM_FAULT_SIGBUS;
1522
1523 return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
1524 }
1525 EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
1526
1527 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1528 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1529 {
1530 if (likely(vma->vm_flags & VM_WRITE))
1531 pud = pud_mkwrite(pud);
1532 return pud;
1533 }
1534
1535 static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
1536 pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
1537 {
1538 struct mm_struct *mm = vma->vm_mm;
1539 spinlock_t *ptl;
1540 pud_t entry;
1541
1542 if (addr < vma->vm_start || addr >= vma->vm_end)
1543 return VM_FAULT_SIGBUS;
1544
1545 ptl = pud_lock(mm, pud);
1546 if (!pud_none(*pud)) {
1547 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
1548 fop.pfn;
1549
1550 if (write) {
1551 if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
1552 goto out_unlock;
1553 entry = pud_mkyoung(*pud);
1554 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1555 if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1556 update_mmu_cache_pud(vma, addr, pud);
1557 }
1558 goto out_unlock;
1559 }
1560
1561 if (fop.is_folio) {
1562 entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
1563
1564 folio_get(fop.folio);
1565 folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
1566 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
1567 } else {
1568 entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
1569 entry = pud_mkspecial(entry);
1570 }
1571 if (write) {
1572 entry = pud_mkyoung(pud_mkdirty(entry));
1573 entry = maybe_pud_mkwrite(entry, vma);
1574 }
1575 set_pud_at(mm, addr, pud, entry);
1576 update_mmu_cache_pud(vma, addr, pud);
1577 out_unlock:
1578 spin_unlock(ptl);
1579 return VM_FAULT_NOPAGE;
1580 }
1581
1582 /**
1583 * vmf_insert_pfn_pud - insert a pud size pfn
1584 * @vmf: Structure describing the fault
1585 * @pfn: pfn to insert
1586 * @write: whether it's a write fault
1587 *
1588 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1589 *
1590 * Return: vm_fault_t value.
1591 */
1592 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
1593 bool write)
1594 {
1595 unsigned long addr = vmf->address & PUD_MASK;
1596 struct vm_area_struct *vma = vmf->vma;
1597 pgprot_t pgprot = vma->vm_page_prot;
1598 struct folio_or_pfn fop = {
1599 .pfn = pfn,
1600 };
1601
1602 /*
1603 * If we had pud_special, we could avoid all these restrictions,
1604 * but we need to be consistent with PTEs and architectures that
1605 * can't support a 'special' bit.
1606 */
1607 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1608 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1609 (VM_PFNMAP|VM_MIXEDMAP));
1610 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1611
1612 pfnmap_setup_cachemode_pfn(pfn, &pgprot);
1613
1614 return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
1615 }
1616 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1617
1618 /**
1619 * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
1620 * @vmf: Structure describing the fault
1621 * @folio: folio to insert
1622 * @write: whether it's a write fault
1623 *
1624 * Return: vm_fault_t value.
1625 */
1626 vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
1627 bool write)
1628 {
1629 struct vm_area_struct *vma = vmf->vma;
1630 unsigned long addr = vmf->address & PUD_MASK;
1631 struct folio_or_pfn fop = {
1632 .folio = folio,
1633 .is_folio = true,
1634 };
1635
1636 if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
1637 return VM_FAULT_SIGBUS;
1638
1639 return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
1640 }
1641 EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
1642 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1643
1644 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1645 pmd_t *pmd, bool write)
1646 {
1647 pmd_t _pmd;
1648
1649 _pmd = pmd_mkyoung(*pmd);
1650 if (write)
1651 _pmd = pmd_mkdirty(_pmd);
1652 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1653 pmd, _pmd, write))
1654 update_mmu_cache_pmd(vma, addr, pmd);
1655 }
1656
1657 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1658 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1659 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1660 {
1661 spinlock_t *dst_ptl, *src_ptl;
1662 struct page *src_page;
1663 struct folio *src_folio;
1664 pmd_t pmd;
1665 pgtable_t pgtable = NULL;
1666 int ret = -ENOMEM;
1667
1668 pmd = pmdp_get_lockless(src_pmd);
1669 if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
1670 !is_huge_zero_pmd(pmd))) {
1671 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1672 src_ptl = pmd_lockptr(src_mm, src_pmd);
1673 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1674 /*
1675 * No need to recheck the pmd, it can't change with write
1676 * mmap lock held here.
1677 *
1678 * Meanwhile, making sure it's not a CoW VMA with writable
1679 * mapping, otherwise it means either the anon page wrongly
1680 * applied special bit, or we made the PRIVATE mapping be
1681 * able to wrongly write to the backend MMIO.
1682 */
1683 VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
1684 goto set_pmd;
1685 }
1686
1687 /* Skip if it can be re-filled on fault */
1688 if (!vma_is_anonymous(dst_vma))
1689 return 0;
1690
1691 pgtable = pte_alloc_one(dst_mm);
1692 if (unlikely(!pgtable))
1693 goto out;
1694
1695 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1696 src_ptl = pmd_lockptr(src_mm, src_pmd);
1697 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1698
1699 ret = -EAGAIN;
1700 pmd = *src_pmd;
1701
1702 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1703 if (unlikely(is_swap_pmd(pmd))) {
1704 swp_entry_t entry = pmd_to_swp_entry(pmd);
1705
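/*
 * Forking over a PMD migration entry: downgrade a writable or exclusive
 * entry to a readable one in the source so that parent and child both
 * end up with a shared, non-writable entry.
 */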
1706 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1707 if (!is_readable_migration_entry(entry)) {
1708 entry = make_readable_migration_entry(
1709 swp_offset(entry));
1710 pmd = swp_entry_to_pmd(entry);
1711 if (pmd_swp_soft_dirty(*src_pmd))
1712 pmd = pmd_swp_mksoft_dirty(pmd);
1713 if (pmd_swp_uffd_wp(*src_pmd))
1714 pmd = pmd_swp_mkuffd_wp(pmd);
1715 set_pmd_at(src_mm, addr, src_pmd, pmd);
1716 }
1717 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1718 mm_inc_nr_ptes(dst_mm);
1719 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1720 if (!userfaultfd_wp(dst_vma))
1721 pmd = pmd_swp_clear_uffd_wp(pmd);
1722 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1723 ret = 0;
1724 goto out_unlock;
1725 }
1726 #endif
1727
1728 if (unlikely(!pmd_trans_huge(pmd))) {
1729 pte_free(dst_mm, pgtable);
1730 goto out_unlock;
1731 }
1732 /*
	 * While the page table lock is held, the huge zero pmd should not be
	 * under splitting, since we never split the page itself, only the pmd
	 * into a page table.
1736 */
1737 if (is_huge_zero_pmd(pmd)) {
1738 /*
1739 * mm_get_huge_zero_folio() will never allocate a new
1740 * folio here, since we already have a zero page to
1741 * copy. It just takes a reference.
1742 */
1743 mm_get_huge_zero_folio(dst_mm);
1744 goto out_zero_page;
1745 }
1746
1747 src_page = pmd_page(pmd);
1748 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1749 src_folio = page_folio(src_page);
1750
1751 folio_get(src_folio);
1752 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
		/* Page may be pinned: split and retry the fault on PTEs. */
1754 folio_put(src_folio);
1755 pte_free(dst_mm, pgtable);
1756 spin_unlock(src_ptl);
1757 spin_unlock(dst_ptl);
1758 __split_huge_pmd(src_vma, src_pmd, addr, false);
1759 return -EAGAIN;
1760 }
1761 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1762 out_zero_page:
1763 mm_inc_nr_ptes(dst_mm);
1764 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1765 pmdp_set_wrprotect(src_mm, addr, src_pmd);
1766 if (!userfaultfd_wp(dst_vma))
1767 pmd = pmd_clear_uffd_wp(pmd);
1768 pmd = pmd_wrprotect(pmd);
1769 set_pmd:
1770 pmd = pmd_mkold(pmd);
1771 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1772
1773 ret = 0;
1774 out_unlock:
1775 spin_unlock(src_ptl);
1776 spin_unlock(dst_ptl);
1777 out:
1778 return ret;
1779 }
1780
1781 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1783 pud_t *pud, bool write)
1784 {
1785 pud_t _pud;
1786
1787 _pud = pud_mkyoung(*pud);
1788 if (write)
1789 _pud = pud_mkdirty(_pud);
1790 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1791 pud, _pud, write))
1792 update_mmu_cache_pud(vma, addr, pud);
1793 }
1794
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1796 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1797 struct vm_area_struct *vma)
1798 {
1799 spinlock_t *dst_ptl, *src_ptl;
1800 pud_t pud;
1801 int ret;
1802
1803 dst_ptl = pud_lock(dst_mm, dst_pud);
1804 src_ptl = pud_lockptr(src_mm, src_pud);
1805 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1806
1807 ret = -EAGAIN;
1808 pud = *src_pud;
1809 if (unlikely(!pud_trans_huge(pud)))
1810 goto out_unlock;
1811
1812 /*
1813 * TODO: once we support anonymous pages, use
1814 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
1815 */
1816 if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
1817 pudp_set_wrprotect(src_mm, addr, src_pud);
1818 pud = pud_wrprotect(pud);
1819 }
1820 pud = pud_mkold(pud);
1821 set_pud_at(dst_mm, addr, dst_pud, pud);
1822
1823 ret = 0;
1824 out_unlock:
1825 spin_unlock(src_ptl);
1826 spin_unlock(dst_ptl);
1827 return ret;
1828 }
1829
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1831 {
1832 bool write = vmf->flags & FAULT_FLAG_WRITE;
1833
1834 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1835 if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1836 goto unlock;
1837
1838 touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1839 unlock:
1840 spin_unlock(vmf->ptl);
1841 }
1842 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1843
void huge_pmd_set_accessed(struct vm_fault *vmf)
1845 {
1846 bool write = vmf->flags & FAULT_FLAG_WRITE;
1847
1848 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1849 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1850 goto unlock;
1851
1852 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1853
1854 unlock:
1855 spin_unlock(vmf->ptl);
1856 }
1857
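/*
 * Handle a write fault against the huge zero page: allocate a fresh
 * PMD-sized anonymous folio and map it in place of the zero page, or
 * return VM_FAULT_FALLBACK so the caller can split the PMD if the
 * allocation fails.
 */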
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
1859 {
1860 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1861 struct vm_area_struct *vma = vmf->vma;
1862 struct mmu_notifier_range range;
1863 struct folio *folio;
1864 vm_fault_t ret = 0;
1865
1866 folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
1867 if (unlikely(!folio))
1868 return VM_FAULT_FALLBACK;
1869
1870 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
1871 haddr + HPAGE_PMD_SIZE);
1872 mmu_notifier_invalidate_range_start(&range);
1873 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1874 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
1875 goto release;
1876 ret = check_stable_address_space(vma->vm_mm);
1877 if (ret)
1878 goto release;
1879 (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
1880 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
1881 goto unlock;
1882 release:
1883 folio_put(folio);
1884 unlock:
1885 spin_unlock(vmf->ptl);
1886 mmu_notifier_invalidate_range_end(&range);
1887 return ret;
1888 }
1889
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1891 {
1892 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1893 struct vm_area_struct *vma = vmf->vma;
1894 struct folio *folio;
1895 struct page *page;
1896 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1897 pmd_t orig_pmd = vmf->orig_pmd;
1898
1899 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1900 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1901
1902 if (is_huge_zero_pmd(orig_pmd)) {
1903 vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
1904
1905 if (!(ret & VM_FAULT_FALLBACK))
1906 return ret;
1907
1908 /* Fallback to splitting PMD if THP cannot be allocated */
1909 goto fallback;
1910 }
1911
1912 spin_lock(vmf->ptl);
1913
1914 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1915 spin_unlock(vmf->ptl);
1916 return 0;
1917 }
1918
1919 page = pmd_page(orig_pmd);
1920 folio = page_folio(page);
1921 VM_BUG_ON_PAGE(!PageHead(page), page);
1922
1923 /* Early check when only holding the PT lock. */
1924 if (PageAnonExclusive(page))
1925 goto reuse;
1926
1927 if (!folio_trylock(folio)) {
1928 folio_get(folio);
1929 spin_unlock(vmf->ptl);
1930 folio_lock(folio);
1931 spin_lock(vmf->ptl);
1932 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1933 spin_unlock(vmf->ptl);
1934 folio_unlock(folio);
1935 folio_put(folio);
1936 return 0;
1937 }
1938 folio_put(folio);
1939 }
1940
1941 /* Recheck after temporarily dropping the PT lock. */
1942 if (PageAnonExclusive(page)) {
1943 folio_unlock(folio);
1944 goto reuse;
1945 }
1946
1947 /*
1948 * See do_wp_page(): we can only reuse the folio exclusively if
1949 * there are no additional references. Note that we always drain
1950 * the LRU cache immediately after adding a THP.
1951 */
1952 if (folio_ref_count(folio) >
1953 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
1954 goto unlock_fallback;
1955 if (folio_test_swapcache(folio))
1956 folio_free_swap(folio);
1957 if (folio_ref_count(folio) == 1) {
1958 pmd_t entry;
1959
1960 folio_move_anon_rmap(folio, vma);
1961 SetPageAnonExclusive(page);
1962 folio_unlock(folio);
1963 reuse:
1964 if (unlikely(unshare)) {
1965 spin_unlock(vmf->ptl);
1966 return 0;
1967 }
1968 entry = pmd_mkyoung(orig_pmd);
1969 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1970 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1971 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1972 spin_unlock(vmf->ptl);
1973 return 0;
1974 }
1975
1976 unlock_fallback:
1977 folio_unlock(folio);
1978 spin_unlock(vmf->ptl);
1979 fallback:
1980 __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
1981 return VM_FAULT_FALLBACK;
1982 }
1983
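/*
 * PMD-level counterpart of can_change_pte_writable(): decide whether the
 * PMD may be made writable right away during a protection change, or
 * whether a write fault is still required (NUMA hinting, softdirty or
 * uffd-wp tracking, non-exclusive anonymous pages).
 */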
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1985 unsigned long addr, pmd_t pmd)
1986 {
1987 struct page *page;
1988
1989 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1990 return false;
1991
1992 /* Don't touch entries that are not even readable (NUMA hinting). */
1993 if (pmd_protnone(pmd))
1994 return false;
1995
1996 /* Do we need write faults for softdirty tracking? */
1997 if (pmd_needs_soft_dirty_wp(vma, pmd))
1998 return false;
1999
2000 /* Do we need write faults for uffd-wp tracking? */
2001 if (userfaultfd_huge_pmd_wp(vma, pmd))
2002 return false;
2003
2004 if (!(vma->vm_flags & VM_SHARED)) {
2005 /* See can_change_pte_writable(). */
2006 page = vm_normal_page_pmd(vma, addr, pmd);
2007 return page && PageAnon(page) && PageAnonExclusive(page);
2008 }
2009
2010 /* See can_change_pte_writable(). */
2011 return pmd_dirty(pmd);
2012 }
2013
2014 /* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
2016 {
2017 struct vm_area_struct *vma = vmf->vma;
2018 struct folio *folio;
2019 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2020 int nid = NUMA_NO_NODE;
2021 int target_nid, last_cpupid;
2022 pmd_t pmd, old_pmd;
2023 bool writable = false;
2024 int flags = 0;
2025
2026 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2027 old_pmd = pmdp_get(vmf->pmd);
2028
2029 if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
2030 spin_unlock(vmf->ptl);
2031 return 0;
2032 }
2033
2034 pmd = pmd_modify(old_pmd, vma->vm_page_prot);
2035
2036 /*
2037 * Detect now whether the PMD could be writable; this information
2038 * is only valid while holding the PT lock.
2039 */
2040 writable = pmd_write(pmd);
2041 if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
2042 can_change_pmd_writable(vma, vmf->address, pmd))
2043 writable = true;
2044
2045 folio = vm_normal_folio_pmd(vma, haddr, pmd);
2046 if (!folio)
2047 goto out_map;
2048
2049 nid = folio_nid(folio);
2050
2051 target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
2052 &last_cpupid);
2053 if (target_nid == NUMA_NO_NODE)
2054 goto out_map;
2055 if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
2056 flags |= TNF_MIGRATE_FAIL;
2057 goto out_map;
2058 }
2059 /* The folio is isolated and isolation code holds a folio reference. */
2060 spin_unlock(vmf->ptl);
2061 writable = false;
2062
2063 if (!migrate_misplaced_folio(folio, target_nid)) {
2064 flags |= TNF_MIGRATED;
2065 nid = target_nid;
2066 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
2067 return 0;
2068 }
2069
2070 flags |= TNF_MIGRATE_FAIL;
2071 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2072 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
2073 spin_unlock(vmf->ptl);
2074 return 0;
2075 }
2076 out_map:
2077 /* Restore the PMD */
2078 pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
2079 pmd = pmd_mkyoung(pmd);
2080 if (writable)
2081 pmd = pmd_mkwrite(pmd, vma);
2082 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
2083 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
2084 spin_unlock(vmf->ptl);
2085
2086 if (nid != NUMA_NO_NODE)
2087 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
2088 return 0;
2089 }
2090
2091 /*
 * Return true if MADV_FREE was applied successfully to the entire pmd page.
 * Otherwise, return false.
2094 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2096 pmd_t *pmd, unsigned long addr, unsigned long next)
2097 {
2098 spinlock_t *ptl;
2099 pmd_t orig_pmd;
2100 struct folio *folio;
2101 struct mm_struct *mm = tlb->mm;
2102 bool ret = false;
2103
2104 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2105
2106 ptl = pmd_trans_huge_lock(pmd, vma);
2107 if (!ptl)
2108 goto out_unlocked;
2109
2110 orig_pmd = *pmd;
2111 if (is_huge_zero_pmd(orig_pmd))
2112 goto out;
2113
2114 if (unlikely(!pmd_present(orig_pmd))) {
2115 VM_BUG_ON(thp_migration_supported() &&
2116 !is_pmd_migration_entry(orig_pmd));
2117 goto out;
2118 }
2119
2120 folio = pmd_folio(orig_pmd);
2121 /*
	 * If other processes are mapping this folio, we cannot discard it
	 * unless they all do MADV_FREE, so just skip the folio.
2124 */
2125 if (folio_maybe_mapped_shared(folio))
2126 goto out;
2127
2128 if (!folio_trylock(folio))
2129 goto out;
2130
2131 /*
	 * If the user wants to discard only part of the THP, split it so
	 * that MADV_FREE will deactivate only those pages.
2134 */
2135 if (next - addr != HPAGE_PMD_SIZE) {
2136 folio_get(folio);
2137 spin_unlock(ptl);
2138 split_folio(folio);
2139 folio_unlock(folio);
2140 folio_put(folio);
2141 goto out_unlocked;
2142 }
2143
2144 if (folio_test_dirty(folio))
2145 folio_clear_dirty(folio);
2146 folio_unlock(folio);
2147
2148 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
2149 pmdp_invalidate(vma, addr, pmd);
2150 orig_pmd = pmd_mkold(orig_pmd);
2151 orig_pmd = pmd_mkclean(orig_pmd);
2152
2153 set_pmd_at(mm, addr, pmd, orig_pmd);
2154 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2155 }
2156
2157 folio_mark_lazyfree(folio);
2158 ret = true;
2159 out:
2160 spin_unlock(ptl);
2161 out_unlocked:
2162 return ret;
2163 }
2164
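/* Withdraw the page table deposited for this huge PMD and free it. */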
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
2166 {
2167 pgtable_t pgtable;
2168
2169 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2170 pte_free(mm, pgtable);
2171 mm_dec_nr_ptes(mm);
2172 }
2173
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2175 pmd_t *pmd, unsigned long addr)
2176 {
2177 pmd_t orig_pmd;
2178 spinlock_t *ptl;
2179
2180 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2181
2182 ptl = __pmd_trans_huge_lock(pmd, vma);
2183 if (!ptl)
2184 return 0;
2185 /*
2186 * For architectures like ppc64 we look at deposited pgtable
2187 * when calling pmdp_huge_get_and_clear. So do the
2188 * pgtable_trans_huge_withdraw after finishing pmdp related
2189 * operations.
2190 */
2191 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
2192 tlb->fullmm);
2193 arch_check_zapped_pmd(vma, orig_pmd);
2194 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2195 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2196 if (arch_needs_pgtable_deposit())
2197 zap_deposited_table(tlb->mm, pmd);
2198 spin_unlock(ptl);
2199 } else if (is_huge_zero_pmd(orig_pmd)) {
2200 if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
2201 zap_deposited_table(tlb->mm, pmd);
2202 spin_unlock(ptl);
2203 } else {
2204 struct folio *folio = NULL;
2205 int flush_needed = 1;
2206
2207 if (pmd_present(orig_pmd)) {
2208 struct page *page = pmd_page(orig_pmd);
2209
2210 folio = page_folio(page);
2211 folio_remove_rmap_pmd(folio, page, vma);
2212 WARN_ON_ONCE(folio_mapcount(folio) < 0);
2213 VM_BUG_ON_PAGE(!PageHead(page), page);
2214 } else if (thp_migration_supported()) {
2215 swp_entry_t entry;
2216
2217 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
2218 entry = pmd_to_swp_entry(orig_pmd);
2219 folio = pfn_swap_entry_folio(entry);
2220 flush_needed = 0;
2221 } else
2222 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
2223
2224 if (folio_test_anon(folio)) {
2225 zap_deposited_table(tlb->mm, pmd);
2226 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2227 } else {
2228 if (arch_needs_pgtable_deposit())
2229 zap_deposited_table(tlb->mm, pmd);
2230 add_mm_counter(tlb->mm, mm_counter_file(folio),
2231 -HPAGE_PMD_NR);
2232
2233 /*
2234 * Use flush_needed to indicate whether the PMD entry
2235 * is present, instead of checking pmd_present() again.
2236 */
2237 if (flush_needed && pmd_young(orig_pmd) &&
2238 likely(vma_has_recency(vma)))
2239 folio_mark_accessed(folio);
2240 }
2241
2242 spin_unlock(ptl);
2243 if (flush_needed)
2244 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
2245 }
2246 return 1;
2247 }
2248
2249 #ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2251 spinlock_t *old_pmd_ptl,
2252 struct vm_area_struct *vma)
2253 {
2254 /*
2255 * With split pmd lock we also need to move preallocated
2256 * PTE page table if new_pmd is on different PMD page table.
2257 *
2258 * We also don't deposit and withdraw tables for file pages.
2259 */
2260 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2261 }
2262 #endif
2263
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2265 {
2266 #ifdef CONFIG_MEM_SOFT_DIRTY
2267 if (unlikely(is_pmd_migration_entry(pmd)))
2268 pmd = pmd_swp_mksoft_dirty(pmd);
2269 else if (pmd_present(pmd))
2270 pmd = pmd_mksoft_dirty(pmd);
2271 #endif
2272 return pmd;
2273 }
2274
static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
2276 {
2277 if (pmd_present(pmd))
2278 pmd = pmd_clear_uffd_wp(pmd);
2279 else if (is_swap_pmd(pmd))
2280 pmd = pmd_swp_clear_uffd_wp(pmd);
2281
2282 return pmd;
2283 }
2284
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
2286 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
2287 {
2288 spinlock_t *old_ptl, *new_ptl;
2289 pmd_t pmd;
2290 struct mm_struct *mm = vma->vm_mm;
2291 bool force_flush = false;
2292
2293 /*
2294 * The destination pmd shouldn't be established, free_pgtables()
2295 * should have released it; but move_page_tables() might have already
2296 * inserted a page table, if racing against shmem/file collapse.
2297 */
2298 if (!pmd_none(*new_pmd)) {
2299 VM_BUG_ON(pmd_trans_huge(*new_pmd));
2300 return false;
2301 }
2302
2303 /*
2304 * We don't have to worry about the ordering of src and dst
2305 * ptlocks because exclusive mmap_lock prevents deadlock.
2306 */
2307 old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2308 if (old_ptl) {
2309 new_ptl = pmd_lockptr(mm, new_pmd);
2310 if (new_ptl != old_ptl)
2311 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2312 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2313 if (pmd_present(pmd))
2314 force_flush = true;
2315 VM_BUG_ON(!pmd_none(*new_pmd));
2316
2317 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2318 pgtable_t pgtable;
2319 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2320 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
2321 }
2322 pmd = move_soft_dirty_pmd(pmd);
2323 if (vma_has_uffd_without_event_remap(vma))
2324 pmd = clear_uffd_wp_pmd(pmd);
2325 set_pmd_at(mm, new_addr, new_pmd, pmd);
2326 if (force_flush)
2327 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2328 if (new_ptl != old_ptl)
2329 spin_unlock(new_ptl);
2330 spin_unlock(old_ptl);
2331 return true;
2332 }
2333 return false;
2334 }
2335
2336 /*
2337 * Returns
2338 * - 0 if PMD could not be locked
2339 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2340 * or if prot_numa but THP migration is not supported
2341 * - HPAGE_PMD_NR if protections changed and TLB flush necessary
2342 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2344 pmd_t *pmd, unsigned long addr, pgprot_t newprot,
2345 unsigned long cp_flags)
2346 {
2347 struct mm_struct *mm = vma->vm_mm;
2348 spinlock_t *ptl;
2349 pmd_t oldpmd, entry;
2350 bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2351 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2352 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
2353 int ret = 1;
2354
2355 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2356
2357 if (prot_numa && !thp_migration_supported())
2358 return 1;
2359
2360 ptl = __pmd_trans_huge_lock(pmd, vma);
2361 if (!ptl)
2362 return 0;
2363
2364 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2365 if (is_swap_pmd(*pmd)) {
2366 swp_entry_t entry = pmd_to_swp_entry(*pmd);
2367 struct folio *folio = pfn_swap_entry_folio(entry);
2368 pmd_t newpmd;
2369
2370 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
2371 if (is_writable_migration_entry(entry)) {
2372 /*
2373 * A protection check is difficult so
2374 * just be safe and disable write
2375 */
2376 if (folio_test_anon(folio))
2377 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
2378 else
2379 entry = make_readable_migration_entry(swp_offset(entry));
2380 newpmd = swp_entry_to_pmd(entry);
2381 if (pmd_swp_soft_dirty(*pmd))
2382 newpmd = pmd_swp_mksoft_dirty(newpmd);
2383 } else {
2384 newpmd = *pmd;
2385 }
2386
2387 if (uffd_wp)
2388 newpmd = pmd_swp_mkuffd_wp(newpmd);
2389 else if (uffd_wp_resolve)
2390 newpmd = pmd_swp_clear_uffd_wp(newpmd);
2391 if (!pmd_same(*pmd, newpmd))
2392 set_pmd_at(mm, addr, pmd, newpmd);
2393 goto unlock;
2394 }
2395 #endif
2396
2397 if (prot_numa) {
2398 struct folio *folio;
2399 bool toptier;
2400 /*
2401 * Avoid trapping faults against the zero page. The read-only
2402 * data is likely to be read-cached on the local CPU and
2403 * local/remote hits to the zero page are not interesting.
2404 */
2405 if (is_huge_zero_pmd(*pmd))
2406 goto unlock;
2407
2408 if (pmd_protnone(*pmd))
2409 goto unlock;
2410
2411 folio = pmd_folio(*pmd);
2412 toptier = node_is_toptier(folio_nid(folio));
2413 /*
2414 * Skip scanning top tier node if normal numa
2415 * balancing is disabled
2416 */
2417 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
2418 toptier)
2419 goto unlock;
2420
2421 if (folio_use_access_time(folio))
2422 folio_xchg_access_time(folio,
2423 jiffies_to_msecs(jiffies));
2424 }
2425 /*
2426 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
2427 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
2428 * which is also under mmap_read_lock(mm):
2429 *
2430 * CPU0: CPU1:
2431 * change_huge_pmd(prot_numa=1)
2432 * pmdp_huge_get_and_clear_notify()
2433 * madvise_dontneed()
2434 * zap_pmd_range()
2435 * pmd_trans_huge(*pmd) == 0 (without ptl)
2436 * // skip the pmd
2437 * set_pmd_at();
2438 * // pmd is re-established
2439 *
	 * The race would make MADV_DONTNEED miss the huge pmd and fail to
	 * clear it, which may break userspace.
2442 *
2443 * pmdp_invalidate_ad() is required to make sure we don't miss
2444 * dirty/young flags set by hardware.
2445 */
2446 oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2447
2448 entry = pmd_modify(oldpmd, newprot);
2449 if (uffd_wp)
2450 entry = pmd_mkuffd_wp(entry);
2451 else if (uffd_wp_resolve)
2452 /*
		 * Leave the write bit to be handled by the page
		 * fault handler, so that things like COW can be
		 * handled properly.
2456 */
2457 entry = pmd_clear_uffd_wp(entry);
2458
2459 /* See change_pte_range(). */
2460 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2461 can_change_pmd_writable(vma, addr, entry))
2462 entry = pmd_mkwrite(entry, vma);
2463
2464 ret = HPAGE_PMD_NR;
2465 set_pmd_at(mm, addr, pmd, entry);
2466
2467 if (huge_pmd_needs_flush(oldpmd, entry))
2468 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
2469 unlock:
2470 spin_unlock(ptl);
2471 return ret;
2472 }
2473
2474 /*
2475 * Returns:
2476 *
2477 * - 0: if pud leaf changed from under us
2478 * - 1: if pud can be skipped
2479 * - HPAGE_PUD_NR: if pud was successfully processed
2480 */
2481 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2483 pud_t *pudp, unsigned long addr, pgprot_t newprot,
2484 unsigned long cp_flags)
2485 {
2486 struct mm_struct *mm = vma->vm_mm;
2487 pud_t oldpud, entry;
2488 spinlock_t *ptl;
2489
2490 tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
2491
2492 /* NUMA balancing doesn't apply to dax */
2493 if (cp_flags & MM_CP_PROT_NUMA)
2494 return 1;
2495
2496 /*
	 * Huge entries on userfault-wp only work with anonymous memory, and
	 * we don't have anonymous PUDs yet.
2499 */
2500 if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
2501 return 1;
2502
2503 ptl = __pud_trans_huge_lock(pudp, vma);
2504 if (!ptl)
2505 return 0;
2506
2507 /*
2508 * Can't clear PUD or it can race with concurrent zapping. See
2509 * change_huge_pmd().
2510 */
2511 oldpud = pudp_invalidate(vma, addr, pudp);
2512 entry = pud_modify(oldpud, newprot);
2513 set_pud_at(mm, addr, pudp, entry);
2514 tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);
2515
2516 spin_unlock(ptl);
2517 return HPAGE_PUD_NR;
2518 }
2519 #endif
2520
2521 #ifdef CONFIG_USERFAULTFD
2522 /*
 * The PT lock for src_pmd, and dst_vma/src_vma (for reading), are taken by
 * the caller, but this function must release the page table lock before
 * returning.
2525 * Just move the page from src_pmd to dst_pmd if possible.
2526 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2527 * repeated by the caller, or other errors in case of failure.
2528 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2530 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2531 unsigned long dst_addr, unsigned long src_addr)
2532 {
2533 pmd_t _dst_pmd, src_pmdval;
2534 struct page *src_page;
2535 struct folio *src_folio;
2536 struct anon_vma *src_anon_vma;
2537 spinlock_t *src_ptl, *dst_ptl;
2538 pgtable_t src_pgtable;
2539 struct mmu_notifier_range range;
2540 int err = 0;
2541
2542 src_pmdval = *src_pmd;
2543 src_ptl = pmd_lockptr(mm, src_pmd);
2544
2545 lockdep_assert_held(src_ptl);
2546 vma_assert_locked(src_vma);
2547 vma_assert_locked(dst_vma);
2548
2549 /* Sanity checks before the operation */
2550 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2551 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2552 spin_unlock(src_ptl);
2553 return -EINVAL;
2554 }
2555
2556 if (!pmd_trans_huge(src_pmdval)) {
2557 spin_unlock(src_ptl);
2558 if (is_pmd_migration_entry(src_pmdval)) {
2559 pmd_migration_entry_wait(mm, &src_pmdval);
2560 return -EAGAIN;
2561 }
2562 return -ENOENT;
2563 }
2564
2565 src_page = pmd_page(src_pmdval);
2566
2567 if (!is_huge_zero_pmd(src_pmdval)) {
2568 if (unlikely(!PageAnonExclusive(src_page))) {
2569 spin_unlock(src_ptl);
2570 return -EBUSY;
2571 }
2572
2573 src_folio = page_folio(src_page);
2574 folio_get(src_folio);
2575 } else
2576 src_folio = NULL;
2577
2578 spin_unlock(src_ptl);
2579
2580 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2581 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2582 src_addr + HPAGE_PMD_SIZE);
2583 mmu_notifier_invalidate_range_start(&range);
2584
2585 if (src_folio) {
2586 folio_lock(src_folio);
2587
2588 /*
		 * split_huge_page walks the anon_vma chain without the page
		 * lock. Serialize against it with the anon_vma lock; the page
		 * lock is not enough.
2592 */
2593 src_anon_vma = folio_get_anon_vma(src_folio);
2594 if (!src_anon_vma) {
2595 err = -EAGAIN;
2596 goto unlock_folio;
2597 }
2598 anon_vma_lock_write(src_anon_vma);
2599 } else
2600 src_anon_vma = NULL;
2601
2602 dst_ptl = pmd_lockptr(mm, dst_pmd);
2603 double_pt_lock(src_ptl, dst_ptl);
2604 if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2605 !pmd_same(*dst_pmd, dst_pmdval))) {
2606 err = -EAGAIN;
2607 goto unlock_ptls;
2608 }
2609 if (src_folio) {
2610 if (folio_maybe_dma_pinned(src_folio) ||
2611 !PageAnonExclusive(&src_folio->page)) {
2612 err = -EBUSY;
2613 goto unlock_ptls;
2614 }
2615
2616 if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2617 WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2618 err = -EBUSY;
2619 goto unlock_ptls;
2620 }
2621
2622 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2623 /* Folio got pinned from under us. Put it back and fail the move. */
2624 if (folio_maybe_dma_pinned(src_folio)) {
2625 set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2626 err = -EBUSY;
2627 goto unlock_ptls;
2628 }
2629
2630 folio_move_anon_rmap(src_folio, dst_vma);
2631 src_folio->index = linear_page_index(dst_vma, dst_addr);
2632
2633 _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
2634 /* Follow mremap() behavior and treat the entry dirty after the move */
2635 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2636 } else {
2637 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2638 _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
2639 }
2640 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2641
2642 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2643 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2644 unlock_ptls:
2645 double_pt_unlock(src_ptl, dst_ptl);
2646 if (src_anon_vma) {
2647 anon_vma_unlock_write(src_anon_vma);
2648 put_anon_vma(src_anon_vma);
2649 }
2650 unlock_folio:
2651 /* unblock rmap walks */
2652 if (src_folio)
2653 folio_unlock(src_folio);
2654 mmu_notifier_invalidate_range_end(&range);
2655 if (src_folio)
2656 folio_put(src_folio);
2657 return err;
2658 }
2659 #endif /* CONFIG_USERFAULTFD */
2660
2661 /*
2662 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2663 *
 * Note that if this routine returns the page table lock pointer, it returns
 * without unlocking the page table lock, so callers must unlock it.
2666 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2668 {
2669 spinlock_t *ptl;
2670 ptl = pmd_lock(vma->vm_mm, pmd);
2671 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
2672 return ptl;
2673 spin_unlock(ptl);
2674 return NULL;
2675 }
2676
2677 /*
2678 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2679 *
 * Note that if this routine returns the page table lock pointer, it returns
 * without unlocking the page table lock, so callers must unlock it.
2682 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2684 {
2685 spinlock_t *ptl;
2686
2687 ptl = pud_lock(vma->vm_mm, pud);
2688 if (likely(pud_trans_huge(*pud)))
2689 return ptl;
2690 spin_unlock(ptl);
2691 return NULL;
2692 }
2693
2694 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2696 pud_t *pud, unsigned long addr)
2697 {
2698 spinlock_t *ptl;
2699 pud_t orig_pud;
2700
2701 ptl = __pud_trans_huge_lock(pud, vma);
2702 if (!ptl)
2703 return 0;
2704
2705 orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2706 arch_check_zapped_pud(vma, orig_pud);
2707 tlb_remove_pud_tlb_entry(tlb, pud, addr);
2708 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2709 spin_unlock(ptl);
2710 /* No zero page support yet */
2711 } else {
2712 struct page *page = NULL;
2713 struct folio *folio;
2714
2715 /* No support for anonymous PUD pages or migration yet */
2716 VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
2717 !pud_present(orig_pud));
2718
2719 page = pud_page(orig_pud);
2720 folio = page_folio(page);
2721 folio_remove_rmap_pud(folio, page, vma);
2722 add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
2723
2724 spin_unlock(ptl);
2725 tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
2726 }
2727 return 1;
2728 }
2729
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2731 unsigned long haddr)
2732 {
2733 struct folio *folio;
2734 struct page *page;
2735 pud_t old_pud;
2736
2737 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2738 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2739 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2740 VM_BUG_ON(!pud_trans_huge(*pud));
2741
2742 count_vm_event(THP_SPLIT_PUD);
2743
2744 old_pud = pudp_huge_clear_flush(vma, haddr, pud);
2745
2746 if (!vma_is_dax(vma))
2747 return;
2748
2749 page = pud_page(old_pud);
2750 folio = page_folio(page);
2751
2752 if (!folio_test_dirty(folio) && pud_dirty(old_pud))
2753 folio_mark_dirty(folio);
2754 if (!folio_test_referenced(folio) && pud_young(old_pud))
2755 folio_set_referenced(folio);
2756 folio_remove_rmap_pud(folio, page, vma);
2757 folio_put(folio);
2758 add_mm_counter(vma->vm_mm, mm_counter_file(folio),
2759 -HPAGE_PUD_NR);
2760 }
2761
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2763 unsigned long address)
2764 {
2765 spinlock_t *ptl;
2766 struct mmu_notifier_range range;
2767
2768 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2769 address & HPAGE_PUD_MASK,
2770 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2771 mmu_notifier_invalidate_range_start(&range);
2772 ptl = pud_lock(vma->vm_mm, pud);
2773 if (unlikely(!pud_trans_huge(*pud)))
2774 goto out;
2775 __split_huge_pud_locked(vma, pud, range.start);
2776
2777 out:
2778 spin_unlock(ptl);
2779 mmu_notifier_invalidate_range_end(&range);
2780 }
2781 #else
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2783 unsigned long address)
2784 {
2785 }
2786 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2787
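/*
 * Split a PMD mapping of the huge zero page into a page table filled with
 * (equally write-protected) zero-page PTEs.
 */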
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2789 unsigned long haddr, pmd_t *pmd)
2790 {
2791 struct mm_struct *mm = vma->vm_mm;
2792 pgtable_t pgtable;
2793 pmd_t _pmd, old_pmd;
2794 unsigned long addr;
2795 pte_t *pte;
2796 int i;
2797
2798 /*
	 * Leave the pmd empty until the ptes are filled. Note that it is fine
	 * to delay the notification until mmu_notifier_invalidate_range_end(),
	 * as we are replacing a write-protected zero-page pmd with
	 * write-protected zero-page ptes.
2803 *
2804 * See Documentation/mm/mmu_notifier.rst
2805 */
2806 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2807
2808 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2809 pmd_populate(mm, &_pmd, pgtable);
2810
2811 pte = pte_offset_map(&_pmd, haddr);
2812 VM_BUG_ON(!pte);
2813 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2814 pte_t entry;
2815
2816 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2817 entry = pte_mkspecial(entry);
2818 if (pmd_uffd_wp(old_pmd))
2819 entry = pte_mkuffd_wp(entry);
2820 VM_BUG_ON(!pte_none(ptep_get(pte)));
2821 set_pte_at(mm, addr, pte, entry);
2822 pte++;
2823 }
2824 pte_unmap(pte - 1);
2825 smp_wmb(); /* make pte visible before pmd */
2826 pmd_populate(mm, pmd, pgtable);
2827 }
2828
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2830 unsigned long haddr, bool freeze)
2831 {
2832 struct mm_struct *mm = vma->vm_mm;
2833 struct folio *folio;
2834 struct page *page;
2835 pgtable_t pgtable;
2836 pmd_t old_pmd, _pmd;
2837 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2838 bool anon_exclusive = false, dirty = false;
2839 unsigned long addr;
2840 pte_t *pte;
2841 int i;
2842
2843 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2844 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2845 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2846 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
2847
2848 count_vm_event(THP_SPLIT_PMD);
2849
2850 if (!vma_is_anonymous(vma)) {
2851 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2852 /*
2853 * We are going to unmap this huge page. So
2854 * just go ahead and zap it
2855 */
2856 if (arch_needs_pgtable_deposit())
2857 zap_deposited_table(mm, pmd);
2858 if (!vma_is_dax(vma) && vma_is_special_huge(vma))
2859 return;
2860 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2861 swp_entry_t entry;
2862
2863 entry = pmd_to_swp_entry(old_pmd);
2864 folio = pfn_swap_entry_folio(entry);
2865 } else if (is_huge_zero_pmd(old_pmd)) {
2866 return;
2867 } else {
2868 page = pmd_page(old_pmd);
2869 folio = page_folio(page);
2870 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2871 folio_mark_dirty(folio);
2872 if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2873 folio_set_referenced(folio);
2874 folio_remove_rmap_pmd(folio, page, vma);
2875 folio_put(folio);
2876 }
2877 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2878 return;
2879 }
2880
2881 if (is_huge_zero_pmd(*pmd)) {
2882 /*
2883 * FIXME: Do we want to invalidate secondary mmu by calling
2884 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2885 * inside __split_huge_pmd() ?
2886 *
		 * We are going from a write-protected huge zero page to
		 * write-protected small zero pages, so it does not seem
		 * useful to invalidate the secondary mmu at this time.
2890 */
2891 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2892 }
2893
2894 pmd_migration = is_pmd_migration_entry(*pmd);
2895 if (unlikely(pmd_migration)) {
2896 swp_entry_t entry;
2897
2898 old_pmd = *pmd;
2899 entry = pmd_to_swp_entry(old_pmd);
2900 page = pfn_swap_entry_to_page(entry);
2901 write = is_writable_migration_entry(entry);
2902 if (PageAnon(page))
2903 anon_exclusive = is_readable_exclusive_migration_entry(entry);
2904 young = is_migration_entry_young(entry);
2905 dirty = is_migration_entry_dirty(entry);
2906 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2907 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2908 } else {
2909 /*
2910 * Up to this point the pmd is present and huge and userland has
		 * the whole access to the hugepage during the split (which
		 * happens in place). If we overwrite the pmd with the not-huge
		 * version pointing to the pte here (which of course we could if
		 * all CPUs were bug free), userland could trigger a small page
		 * size TLB miss on the small sized TLB while the hugepage TLB
		 * entry is still established in the huge TLB. Some CPUs don't
		 * like that. See
		 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
		 * 383 on page 105. Intel should be safe but also warns that
		 * it's only safe if the permission and cache attributes of the
		 * two entries loaded in the two TLBs are identical (which should
		 * be the case here). But it is generally safer to never allow
		 * small and huge TLB entries for the same virtual address to be
		 * loaded simultaneously. So instead of doing "pmd_populate();
		 * flush_pmd_tlb_range();" we first mark the current pmd
		 * notpresent (atomically because here the pmd_trans_huge must
		 * remain set at all times on the pmd until the split is
		 * complete for this pmd), then we flush the SMP TLB and finally
		 * we write the non-huge version of the pmd entry with
		 * pmd_populate.
2931 */
2932 old_pmd = pmdp_invalidate(vma, haddr, pmd);
2933 page = pmd_page(old_pmd);
2934 folio = page_folio(page);
2935 if (pmd_dirty(old_pmd)) {
2936 dirty = true;
2937 folio_set_dirty(folio);
2938 }
2939 write = pmd_write(old_pmd);
2940 young = pmd_young(old_pmd);
2941 soft_dirty = pmd_soft_dirty(old_pmd);
2942 uffd_wp = pmd_uffd_wp(old_pmd);
2943
2944 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2945 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2946
2947 /*
2948 * Without "freeze", we'll simply split the PMD, propagating the
2949 * PageAnonExclusive() flag for each PTE by setting it for
2950 * each subpage -- no need to (temporarily) clear.
2951 *
2952 * With "freeze" we want to replace mapped pages by
2953 * migration entries right away. This is only possible if we
2954 * managed to clear PageAnonExclusive() -- see
2955 * set_pmd_migration_entry().
2956 *
2957 * In case we cannot clear PageAnonExclusive(), split the PMD
2958 * only and let try_to_migrate_one() fail later.
2959 *
2960 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
2961 */
2962 anon_exclusive = PageAnonExclusive(page);
2963 if (freeze && anon_exclusive &&
2964 folio_try_share_anon_rmap_pmd(folio, page))
2965 freeze = false;
2966 if (!freeze) {
2967 rmap_t rmap_flags = RMAP_NONE;
2968
2969 folio_ref_add(folio, HPAGE_PMD_NR - 1);
2970 if (anon_exclusive)
2971 rmap_flags |= RMAP_EXCLUSIVE;
2972 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2973 vma, haddr, rmap_flags);
2974 }
2975 }
2976
2977 /*
2978 * Withdraw the table only after we mark the pmd entry invalid.
	 * This is critical for some architectures (Power).
2980 */
2981 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2982 pmd_populate(mm, &_pmd, pgtable);
2983
2984 pte = pte_offset_map(&_pmd, haddr);
2985 VM_BUG_ON(!pte);
2986
2987 /*
2988 * Note that NUMA hinting access restrictions are not transferred to
2989 * avoid any possibility of altering permissions across VMAs.
2990 */
2991 if (freeze || pmd_migration) {
2992 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2993 pte_t entry;
2994 swp_entry_t swp_entry;
2995
2996 if (write)
2997 swp_entry = make_writable_migration_entry(
2998 page_to_pfn(page + i));
2999 else if (anon_exclusive)
3000 swp_entry = make_readable_exclusive_migration_entry(
3001 page_to_pfn(page + i));
3002 else
3003 swp_entry = make_readable_migration_entry(
3004 page_to_pfn(page + i));
3005 if (young)
3006 swp_entry = make_migration_entry_young(swp_entry);
3007 if (dirty)
3008 swp_entry = make_migration_entry_dirty(swp_entry);
3009 entry = swp_entry_to_pte(swp_entry);
3010 if (soft_dirty)
3011 entry = pte_swp_mksoft_dirty(entry);
3012 if (uffd_wp)
3013 entry = pte_swp_mkuffd_wp(entry);
3014
3015 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3016 set_pte_at(mm, addr, pte + i, entry);
3017 }
3018 } else {
3019 pte_t entry;
3020
3021 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
3022 if (write)
3023 entry = pte_mkwrite(entry, vma);
3024 if (!young)
3025 entry = pte_mkold(entry);
3026 /* NOTE: this may set soft-dirty too on some archs */
3027 if (dirty)
3028 entry = pte_mkdirty(entry);
3029 if (soft_dirty)
3030 entry = pte_mksoft_dirty(entry);
3031 if (uffd_wp)
3032 entry = pte_mkuffd_wp(entry);
3033
3034 for (i = 0; i < HPAGE_PMD_NR; i++)
3035 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3036
3037 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
3038 }
3039 pte_unmap(pte);
3040
3041 if (!pmd_migration)
3042 folio_remove_rmap_pmd(folio, page, vma);
3043 if (freeze)
3044 put_page(page);
3045
3046 smp_wmb(); /* make pte visible before pmd */
3047 pmd_populate(mm, pmd, pgtable);
3048 }
3049
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
3051 pmd_t *pmd, bool freeze)
3052 {
3053 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
3054 if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
3055 __split_huge_pmd_locked(vma, pmd, address, freeze);
3056 }
3057
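/*
 * Split the huge PMD mapping @address: take the PMD lock and wrap the
 * actual split in an MMU notifier invalidation of the covered range.
 */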
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
3059 unsigned long address, bool freeze)
3060 {
3061 spinlock_t *ptl;
3062 struct mmu_notifier_range range;
3063
3064 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
3065 address & HPAGE_PMD_MASK,
3066 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
3067 mmu_notifier_invalidate_range_start(&range);
3068 ptl = pmd_lock(vma->vm_mm, pmd);
3069 split_huge_pmd_locked(vma, range.start, pmd, freeze);
3070 spin_unlock(ptl);
3071 mmu_notifier_invalidate_range_end(&range);
3072 }
3073
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
3075 bool freeze)
3076 {
3077 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
3078
3079 if (!pmd)
3080 return;
3081
3082 __split_huge_pmd(vma, pmd, address, freeze);
3083 }
3084
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
3086 {
3087 /*
3088 * If the new address isn't hpage aligned and it could previously
	 * contain a hugepage: check if we need to split a huge pmd.
3090 */
3091 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
3092 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
3093 ALIGN(address, HPAGE_PMD_SIZE)))
3094 split_huge_pmd_address(vma, address, false);
3095 }
3096
void vma_adjust_trans_huge(struct vm_area_struct *vma,
3098 unsigned long start,
3099 unsigned long end,
3100 struct vm_area_struct *next)
3101 {
3102 /* Check if we need to split start first. */
3103 split_huge_pmd_if_needed(vma, start);
3104
3105 /* Check if we need to split end next. */
3106 split_huge_pmd_if_needed(vma, end);
3107
3108 /* If we're incrementing next->vm_start, we might need to split it. */
3109 if (next)
3110 split_huge_pmd_if_needed(next, end);
3111 }
3112
static void unmap_folio(struct folio *folio)
3114 {
3115 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
3116 TTU_BATCH_FLUSH;
3117
3118 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3119
3120 if (folio_test_pmd_mappable(folio))
3121 ttu_flags |= TTU_SPLIT_HUGE_PMD;
3122
3123 /*
3124 * Anon pages need migration entries to preserve them, but file
3125 * pages can simply be left unmapped, then faulted back on demand.
3126 * If that is ever changed (perhaps for mlock), update remap_page().
3127 */
3128 if (folio_test_anon(folio))
3129 try_to_migrate(folio, ttu_flags);
3130 else
3131 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
3132
3133 try_to_unmap_flush();
3134 }
3135
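/*
 * Try to discard a clean anonymous PMD-mapped folio directly instead of
 * remapping it: only possible while the folio stays clean and the only
 * references left are the isolation reference plus its mappings.
 */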
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
3137 unsigned long addr, pmd_t *pmdp,
3138 struct folio *folio)
3139 {
3140 struct mm_struct *mm = vma->vm_mm;
3141 int ref_count, map_count;
3142 pmd_t orig_pmd = *pmdp;
3143
3144 if (pmd_dirty(orig_pmd))
3145 folio_set_dirty(folio);
3146 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3147 folio_set_swapbacked(folio);
3148 return false;
3149 }
3150
3151 orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
3152
3153 /*
3154 * Syncing against concurrent GUP-fast:
3155 * - clear PMD; barrier; read refcount
3156 * - inc refcount; barrier; read PMD
3157 */
3158 smp_mb();
3159
3160 ref_count = folio_ref_count(folio);
3161 map_count = folio_mapcount(folio);
3162
3163 /*
3164 * Order reads for folio refcount and dirty flag
3165 * (see comments in __remove_mapping()).
3166 */
3167 smp_rmb();
3168
3169 /*
3170 * If the folio or its PMD is redirtied at this point, or if there
3171 * are unexpected references, we will give up to discard this folio
3172 * and remap it.
3173 *
3174 * The only folio refs must be one from isolation plus the rmap(s).
3175 */
3176 if (pmd_dirty(orig_pmd))
3177 folio_set_dirty(folio);
3178 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3179 folio_set_swapbacked(folio);
3180 set_pmd_at(mm, addr, pmdp, orig_pmd);
3181 return false;
3182 }
3183
3184 if (ref_count != map_count + 1) {
3185 set_pmd_at(mm, addr, pmdp, orig_pmd);
3186 return false;
3187 }
3188
3189 folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
3190 zap_deposited_table(mm, pmdp);
3191 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
3192 if (vma->vm_flags & VM_LOCKED)
3193 mlock_drain_local();
3194 folio_put(folio);
3195
3196 return true;
3197 }
3198
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
3200 pmd_t *pmdp, struct folio *folio)
3201 {
3202 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
3203 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
3204 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
3205 VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
3206 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
3207
3208 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
3209 }
3210
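/* Restore migration entries across @nr pages of after-split (anonymous) folios. */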
static void remap_page(struct folio *folio, unsigned long nr, int flags)
3212 {
3213 int i = 0;
3214
3215 /* If unmap_folio() uses try_to_migrate() on file, remove this check */
3216 if (!folio_test_anon(folio))
3217 return;
3218 for (;;) {
3219 remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
3220 i += folio_nr_pages(folio);
3221 if (i >= nr)
3222 break;
3223 folio = folio_next(folio);
3224 }
3225 }
3226
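/*
 * Place a freshly created after-split folio on the LRU next to the original
 * folio, or on the caller's reclaim list when splitting during page reclaim.
 */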
static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
3228 struct lruvec *lruvec, struct list_head *list)
3229 {
3230 VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
3231 lockdep_assert_held(&lruvec->lru_lock);
3232
3233 if (list) {
3234 /* page reclaim is reclaiming a huge page */
3235 VM_WARN_ON(folio_test_lru(folio));
3236 folio_get(new_folio);
3237 list_add_tail(&new_folio->lru, list);
3238 } else {
3239 /* head is still on lru (and we have it frozen) */
3240 VM_WARN_ON(!folio_test_lru(folio));
3241 if (folio_test_unevictable(folio))
3242 new_folio->mlock_count = 0;
3243 else
3244 list_add_tail(&new_folio->lru, &folio->lru);
3245 folio_set_lru(new_folio);
3246 }
3247 }
3248
3249 /* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
3251 {
3252 int extra_pins;
3253
3254 /* Additional pins from page cache */
3255 if (folio_test_anon(folio))
3256 extra_pins = folio_test_swapcache(folio) ?
3257 folio_nr_pages(folio) : 0;
3258 else
3259 extra_pins = folio_nr_pages(folio);
3260 if (pextra_pins)
3261 *pextra_pins = extra_pins;
3262 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
3263 caller_pins;
3264 }
3265
3266 /*
3267 * It splits @folio into @new_order folios and copies the @folio metadata to
3268 * all the resulting folios.
3269 */
static void __split_folio_to_order(struct folio *folio, int old_order,
3271 int new_order)
3272 {
3273 long new_nr_pages = 1 << new_order;
3274 long nr_pages = 1 << old_order;
3275 long i;
3276
3277 /*
	 * Skip the first new_nr_pages, since the new folio made from them
	 * keeps all the flags of the original folio.
3280 */
3281 for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
3282 struct page *new_head = &folio->page + i;
3283
3284 /*
3285 * Careful: new_folio is not a "real" folio before we cleared PageTail.
3286 * Don't pass it around before clear_compound_head().
3287 */
3288 struct folio *new_folio = (struct folio *)new_head;
3289
3290 VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
3291
3292 /*
3293 * Clone page flags before unfreezing refcount.
3294 *
3295 * After successful get_page_unless_zero() might follow flags change,
3296 * for example lock_page() which set PG_waiters.
3297 *
3298 * Note that for mapped sub-pages of an anonymous THP,
3299 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
3300 * the migration entry instead from where remap_page() will restore it.
3301 * We can still have PG_anon_exclusive set on effectively unmapped and
3302 * unreferenced sub-pages of an anonymous THP: we can simply drop
3303 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
3304 */
3305 new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
3306 new_folio->flags.f |= (folio->flags.f &
3307 ((1L << PG_referenced) |
3308 (1L << PG_swapbacked) |
3309 (1L << PG_swapcache) |
3310 (1L << PG_mlocked) |
3311 (1L << PG_uptodate) |
3312 (1L << PG_active) |
3313 (1L << PG_workingset) |
3314 (1L << PG_locked) |
3315 (1L << PG_unevictable) |
3316 #ifdef CONFIG_ARCH_USES_PG_ARCH_2
3317 (1L << PG_arch_2) |
3318 #endif
3319 #ifdef CONFIG_ARCH_USES_PG_ARCH_3
3320 (1L << PG_arch_3) |
3321 #endif
3322 (1L << PG_dirty) |
3323 LRU_GEN_MASK | LRU_REFS_MASK));
3324
3325 new_folio->mapping = folio->mapping;
3326 new_folio->index = folio->index + i;
3327
3328 /*
3329 * page->private should not be set in tail pages. Fix up and warn once
3330 * if private is unexpectedly set.
3331 */
3332 if (unlikely(new_folio->private)) {
3333 VM_WARN_ON_ONCE_PAGE(true, new_head);
3334 new_folio->private = NULL;
3335 }
3336
3337 if (folio_test_swapcache(folio))
3338 new_folio->swap.val = folio->swap.val + i;
3339
3340 /* Page flags must be visible before we make the page non-compound. */
3341 smp_wmb();
3342
3343 /*
3344 * Clear PageTail before unfreezing page refcount.
3345 *
3346 * After successful get_page_unless_zero() might follow put_page()
3347 * which needs correct compound_head().
3348 */
3349 clear_compound_head(new_head);
3350 if (new_order) {
3351 prep_compound_page(new_head, new_order);
3352 folio_set_large_rmappable(new_folio);
3353 }
3354
3355 if (folio_test_young(folio))
3356 folio_set_young(new_folio);
3357 if (folio_test_idle(folio))
3358 folio_set_idle(new_folio);
3359 #ifdef CONFIG_MEMCG
3360 new_folio->memcg_data = folio->memcg_data;
3361 #endif
3362
3363 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
3364 }
3365
3366 if (new_order)
3367 folio_set_order(folio, new_order);
3368 else
3369 ClearPageCompound(&folio->page);
3370 }
3371
3372 /*
 * It splits an unmapped @folio into lower-order smaller folios in two ways.
3374 * @folio: the to-be-split folio
3375 * @new_order: the smallest order of the after split folios (since buddy
3376 * allocator like split generates folios with orders from @folio's
3377 * order - 1 to new_order).
3378 * @split_at: in buddy allocator like split, the folio containing @split_at
3379 * will be split until its order becomes @new_order.
3380 * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
3381 * @mapping: @folio->mapping
3382 * @uniform_split: if the split is uniform or not (buddy allocator like split)
 *
 * 1. uniform split: the given @folio is split into multiple @new_order small
 * folios, where all small folios have the same order. This is done when
 * uniform_split is true.
3388 * 2. buddy allocator like (non-uniform) split: the given @folio is split into
3389 * half and one of the half (containing the given page) is split into half
3390 * until the given @page's order becomes @new_order. This is done when
3391 * uniform_split is false.
3392 *
3393 * The high level flow for these two methods are:
3394 * 1. uniform split: a single __split_folio_to_order() is called to split the
3395 * @folio into @new_order, then we traverse all the resulting folios one by
3396 * one in PFN ascending order and perform stats, unfreeze, adding to list,
3397 * and file mapping index operations.
3398 * 2. non-uniform split: in general, folio_order - @new_order calls to
3399 * __split_folio_to_order() are made in a for loop to split the @folio
3400 * to one lower order at a time. The resulting small folios are processed
3401 * like what is done during the traversal in 1, except the one containing
3402 * @page, which is split in next for loop.
3403 *
3404 * After splitting, the caller's folio reference will be transferred to the
3405 * folio containing @page. The caller needs to unlock and/or free after-split
3406 * folios if necessary.
3407 *
3408 * For !uniform_split, when -ENOMEM is returned, the original folio might be
3409 * split. The caller needs to check the input folio.
3410 */
static int __split_unmapped_folio(struct folio *folio, int new_order,
3412 struct page *split_at, struct xa_state *xas,
3413 struct address_space *mapping, bool uniform_split)
3414 {
3415 int order = folio_order(folio);
3416 int start_order = uniform_split ? new_order : order - 1;
3417 bool stop_split = false;
3418 struct folio *next;
3419 int split_order;
3420 int ret = 0;
3421
3422 if (folio_test_anon(folio))
3423 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
3424
3425 folio_clear_has_hwpoisoned(folio);
3426
3427 /*
3428 * split to new_order one order at a time. For uniform split,
3429 * folio is split to new_order directly.
3430 */
3431 for (split_order = start_order;
3432 split_order >= new_order && !stop_split;
3433 split_order--) {
3434 struct folio *end_folio = folio_next(folio);
3435 int old_order = folio_order(folio);
3436 struct folio *new_folio;
3437
3438 /* order-1 anonymous folio is not supported */
3439 if (folio_test_anon(folio) && split_order == 1)
3440 continue;
3441 if (uniform_split && split_order != new_order)
3442 continue;
3443
3444 if (mapping) {
3445 /*
3446 * uniform split has xas_split_alloc() called before
3447 * irq is disabled to allocate enough memory, whereas
3448 * non-uniform split can handle ENOMEM.
3449 */
3450 if (uniform_split)
3451 xas_split(xas, folio, old_order);
3452 else {
3453 xas_set_order(xas, folio->index, split_order);
3454 xas_try_split(xas, folio, old_order);
3455 if (xas_error(xas)) {
3456 ret = xas_error(xas);
3457 stop_split = true;
3458 }
3459 }
3460 }
3461
3462 if (!stop_split) {
3463 folio_split_memcg_refs(folio, old_order, split_order);
3464 split_page_owner(&folio->page, old_order, split_order);
3465 pgalloc_tag_split(folio, old_order, split_order);
3466
3467 __split_folio_to_order(folio, old_order, split_order);
3468 }
3469
3470 /*
3471 * Iterate through after-split folios and update folio stats.
3472 * But in buddy allocator like split, the folio
3473 * containing the specified page is skipped until its order
3474 * is @new_order, since the folio will be worked on in the next
3475 * iteration.
3476 */
3477 for (new_folio = folio; new_folio != end_folio; new_folio = next) {
3478 next = folio_next(new_folio);
3479 /*
3480 * for buddy allocator like split, new_folio containing
3481 * @split_at page could be split again, thus do not
3482 * change stats yet. Wait until new_folio's order is
3483 * @new_order or stop_split is set to true by the above
3484 * xas_try_split() failure.
3485 */
3486 if (new_folio == page_folio(split_at)) {
3487 folio = new_folio;
3488 if (split_order != new_order && !stop_split)
3489 continue;
3490 }
3491 if (folio_test_anon(new_folio))
3492 mod_mthp_stat(folio_order(new_folio),
3493 MTHP_STAT_NR_ANON, 1);
3494 }
3495 }
3496
3497 return ret;
3498 }
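
/*
 * Illustrative example (not kernel-doc): how the loop above progresses for a
 * non-uniform split of a file-backed order-4 folio down to order 0, with
 * @split_at inside the first subpage. Each pass calls __split_folio_to_order()
 * once and only the folio containing @split_at is split further ("*" marks
 * that folio):
 *
 *	split_order == 3:	[o3*, o3]
 *	split_order == 2:	[o2*, o2, o3]
 *	split_order == 1:	[o1*, o1, o2, o3]
 *	split_order == 0:	[o0*, o0, o1, o2, o3]
 *
 * For an anonymous folio the split_order == 1 pass is skipped, so the final
 * pass splits the remaining order-2 folio straight into order-0 folios. A
 * uniform split instead performs a single pass at split_order == new_order.
 */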
3499
3500 bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
3501 bool warns)
3502 {
3503 if (folio_test_anon(folio)) {
3504 /* order-1 is not supported for anonymous THP. */
3505 VM_WARN_ONCE(warns && new_order == 1,
3506 "Cannot split to order-1 folio");
3507 return new_order != 1;
3508 } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3509 !mapping_large_folio_support(folio->mapping)) {
3510 /*
3511 * No split if the file system does not support large folio.
3512 * Note that we might still have THPs in such mappings due to
3513 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
3514 * does not actually support large folios properly.
3515 */
3516 VM_WARN_ONCE(warns,
3517 "Cannot split file folio to non-0 order");
3518 return false;
3519 }
3520
3521 /* Only swapping a whole PMD-mapped folio is supported */
3522 if (folio_test_swapcache(folio)) {
3523 VM_WARN_ONCE(warns,
3524 "Cannot split swapcache folio to non-0 order");
3525 return false;
3526 }
3527
3528 return true;
3529 }
3530
3531 /* See comments in non_uniform_split_supported() */
3532 bool uniform_split_supported(struct folio *folio, unsigned int new_order,
3533 bool warns)
3534 {
3535 if (folio_test_anon(folio)) {
3536 VM_WARN_ONCE(warns && new_order == 1,
3537 "Cannot split to order-1 folio");
3538 return new_order != 1;
3539 } else if (new_order) {
3540 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3541 !mapping_large_folio_support(folio->mapping)) {
3542 VM_WARN_ONCE(warns,
3543 "Cannot split file folio to non-0 order");
3544 return false;
3545 }
3546 }
3547
3548 if (new_order && folio_test_swapcache(folio)) {
3549 VM_WARN_ONCE(warns,
3550 "Cannot split swapcache folio to non-0 order");
3551 return false;
3552 }
3553
3554 return true;
3555 }
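
/*
 * Note the asymmetry between the two helpers above:
 * non_uniform_split_supported() refuses swapcache folios and file folios
 * without large folio support regardless of @new_order, since a non-uniform
 * split leaves behind after-split folios of non-zero order even when
 * @new_order is 0, while uniform_split_supported() only refuses them when
 * @new_order is non-zero. A hypothetical caller-side check (illustrative
 * only) would look like:
 *
 *	supported = uniform_split ?
 *		uniform_split_supported(folio, new_order, false) :
 *		non_uniform_split_supported(folio, new_order, false);
 */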
3556
3557 /*
3558 * __folio_split: split a folio at @split_at to a @new_order folio
3559 * @folio: folio to split
3560 * @new_order: the order of the new folio
3561 * @split_at: a page within the new folio
3562 * @lock_at: a page within @folio to be left locked to caller
3563 * @list: after-split folios will be put on it if non-NULL
3564 * @uniform_split: perform uniform split or not (non-uniform split)
3565 *
3566 * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
3567 * It is in charge of checking whether the split is supported or not and
3568 * preparing @folio for __split_unmapped_folio().
3569 *
3570 * After splitting, the after-split folio containing @lock_at remains locked
3571 * and others are unlocked:
3572 * 1. for uniform split, @lock_at points to one of @folio's subpages;
3573 * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
3574 *
3575 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
3576 * split but not to @new_order, the caller needs to check)
3577 */
3578 static int __folio_split(struct folio *folio, unsigned int new_order,
3579 struct page *split_at, struct page *lock_at,
3580 struct list_head *list, bool uniform_split)
3581 {
3582 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3583 XA_STATE(xas, &folio->mapping->i_pages, folio->index);
3584 struct folio *end_folio = folio_next(folio);
3585 bool is_anon = folio_test_anon(folio);
3586 struct address_space *mapping = NULL;
3587 struct anon_vma *anon_vma = NULL;
3588 int order = folio_order(folio);
3589 struct folio *new_folio, *next;
3590 int nr_shmem_dropped = 0;
3591 int remap_flags = 0;
3592 int extra_pins, ret;
3593 pgoff_t end;
3594 bool is_hzp;
3595
3596 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
3597 VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
3598
3599 if (folio != page_folio(split_at) || folio != page_folio(lock_at))
3600 return -EINVAL;
3601
3602 if (new_order >= folio_order(folio))
3603 return -EINVAL;
3604
3605 if (uniform_split && !uniform_split_supported(folio, new_order, true))
3606 return -EINVAL;
3607
3608 if (!uniform_split &&
3609 !non_uniform_split_supported(folio, new_order, true))
3610 return -EINVAL;
3611
3612 is_hzp = is_huge_zero_folio(folio);
3613 if (is_hzp) {
3614 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3615 return -EBUSY;
3616 }
3617
3618 if (folio_test_writeback(folio))
3619 return -EBUSY;
3620
3621 if (is_anon) {
3622 /*
3623 * The caller does not necessarily hold an mmap_lock that would
3624 * prevent the anon_vma from disappearing, so we first take a
3625 * reference to it and then lock the anon_vma for write. This
3626 * is similar to folio_lock_anon_vma_read except the write lock
3627 * is taken to serialise against parallel split or collapse
3628 * operations.
3629 */
3630 anon_vma = folio_get_anon_vma(folio);
3631 if (!anon_vma) {
3632 ret = -EBUSY;
3633 goto out;
3634 }
3635 mapping = NULL;
3636 anon_vma_lock_write(anon_vma);
3637 } else {
3638 unsigned int min_order;
3639 gfp_t gfp;
3640
3641 mapping = folio->mapping;
3642
3643 /* Truncated ? */
3644 /*
3645 * TODO: add support for large shmem folio in swap cache.
3646 * When shmem is in swap cache, mapping is NULL and
3647 * folio_test_swapcache() is true.
3648 */
3649 if (!mapping) {
3650 ret = -EBUSY;
3651 goto out;
3652 }
3653
3654 min_order = mapping_min_folio_order(folio->mapping);
3655 if (new_order < min_order) {
3656 VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
3657 min_order);
3658 ret = -EINVAL;
3659 goto out;
3660 }
3661
3662 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3663 GFP_RECLAIM_MASK);
3664
3665 if (!filemap_release_folio(folio, gfp)) {
3666 ret = -EBUSY;
3667 goto out;
3668 }
3669
3670 if (uniform_split) {
3671 xas_set_order(&xas, folio->index, new_order);
3672 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
3673 if (xas_error(&xas)) {
3674 ret = xas_error(&xas);
3675 goto out;
3676 }
3677 }
3678
3679 anon_vma = NULL;
3680 i_mmap_lock_read(mapping);
3681
3682 /*
3683 * __split_unmapped_folio() may need to trim off pages beyond
3684 * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
3685 * seqlock, which cannot be nested inside the page tree lock.
3686 * So note end now: i_size itself may be changed at any moment,
3687 * but folio lock is good enough to serialize the trimming.
3688 */
3689 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3690 if (shmem_mapping(mapping))
3691 end = shmem_fallocend(mapping->host, end);
3692 }
3693
3694 /*
3695 * Racy check if we can split the page, before unmap_folio() will
3696 * split PMDs
3697 */
3698 if (!can_split_folio(folio, 1, &extra_pins)) {
3699 ret = -EAGAIN;
3700 goto out_unlock;
3701 }
3702
3703 unmap_folio(folio);
3704
3705 /* block interrupt reentry in xa_lock and spinlock */
3706 local_irq_disable();
3707 if (mapping) {
3708 /*
3709 * Check if the folio is present in page cache.
3710 * We assume all tail pages are present too, if the folio is there.
3711 */
3712 xas_lock(&xas);
3713 xas_reset(&xas);
3714 if (xas_load(&xas) != folio) {
3715 ret = -EAGAIN;
3716 goto fail;
3717 }
3718 }
3719
3720 /* Prevent deferred_split_scan() touching ->_refcount */
3721 spin_lock(&ds_queue->split_queue_lock);
3722 if (folio_ref_freeze(folio, 1 + extra_pins)) {
3723 struct swap_cluster_info *ci = NULL;
3724 struct lruvec *lruvec;
3725 int expected_refs;
3726
3727 if (folio_order(folio) > 1 &&
3728 !list_empty(&folio->_deferred_list)) {
3729 ds_queue->split_queue_len--;
3730 if (folio_test_partially_mapped(folio)) {
3731 folio_clear_partially_mapped(folio);
3732 mod_mthp_stat(folio_order(folio),
3733 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
3734 }
3735 /*
3736 * Reinitialize page_deferred_list after removing the
3737 * page from the split_queue, otherwise a subsequent
3738 * split will see list corruption when checking the
3739 * page_deferred_list.
3740 */
3741 list_del_init(&folio->_deferred_list);
3742 }
3743 spin_unlock(&ds_queue->split_queue_lock);
3744 if (mapping) {
3745 int nr = folio_nr_pages(folio);
3746
3747 if (folio_test_pmd_mappable(folio) &&
3748 new_order < HPAGE_PMD_ORDER) {
3749 if (folio_test_swapbacked(folio)) {
3750 __lruvec_stat_mod_folio(folio,
3751 NR_SHMEM_THPS, -nr);
3752 } else {
3753 __lruvec_stat_mod_folio(folio,
3754 NR_FILE_THPS, -nr);
3755 filemap_nr_thps_dec(mapping);
3756 }
3757 }
3758 }
3759
3760 if (folio_test_swapcache(folio)) {
3761 if (mapping) {
3762 VM_WARN_ON_ONCE_FOLIO(mapping, folio);
3763 ret = -EINVAL;
3764 goto fail;
3765 }
3766
3767 ci = swap_cluster_get_and_lock(folio);
3768 }
3769
3770 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
3771 lruvec = folio_lruvec_lock(folio);
3772
3773 ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
3774 mapping, uniform_split);
3775
3776 /*
3777 * Unfreeze after-split folios and put them back to the right
3778 * list. @folio should be kept frozen until page cache
3779 * entries are updated with all the other after-split folios
3780 * to prevent others seeing stale page cache entries.
3781 * As a result, new_folio starts from the next folio of
3782 * @folio.
3783 */
3784 for (new_folio = folio_next(folio); new_folio != end_folio;
3785 new_folio = next) {
3786 unsigned long nr_pages = folio_nr_pages(new_folio);
3787
3788 next = folio_next(new_folio);
3789
3790 expected_refs = folio_expected_ref_count(new_folio) + 1;
3791 folio_ref_unfreeze(new_folio, expected_refs);
3792
3793 lru_add_split_folio(folio, new_folio, lruvec, list);
3794
3795 /*
3796 * Anonymous folio with swap cache.
3797 * NOTE: shmem in swap cache is not supported yet.
3798 */
3799 if (ci) {
3800 __swap_cache_replace_folio(ci, folio, new_folio);
3801 continue;
3802 }
3803
3804 /* Anonymous folio without swap cache */
3805 if (!mapping)
3806 continue;
3807
3808 /* Add the new folio to the page cache. */
3809 if (new_folio->index < end) {
3810 __xa_store(&mapping->i_pages, new_folio->index,
3811 new_folio, 0);
3812 continue;
3813 }
3814
3815 /* Drop folio beyond EOF: ->index >= end */
3816 if (shmem_mapping(mapping))
3817 nr_shmem_dropped += nr_pages;
3818 else if (folio_test_clear_dirty(new_folio))
3819 folio_account_cleaned(
3820 new_folio, inode_to_wb(mapping->host));
3821 __filemap_remove_folio(new_folio, NULL);
3822 folio_put_refs(new_folio, nr_pages);
3823 }
3824 /*
3825 * Unfreeze @folio only after all page cache entries, which
3826 * used to point to it, have been updated with new folios.
3827 * Otherwise, a parallel folio_try_get() can grab @folio
3828 * and its caller can see stale page cache entries.
3829 */
3830 expected_refs = folio_expected_ref_count(folio) + 1;
3831 folio_ref_unfreeze(folio, expected_refs);
3832
3833 unlock_page_lruvec(lruvec);
3834
3835 if (ci)
3836 swap_cluster_unlock(ci);
3837 } else {
3838 spin_unlock(&ds_queue->split_queue_lock);
3839 ret = -EAGAIN;
3840 }
3841 fail:
3842 if (mapping)
3843 xas_unlock(&xas);
3844
3845 local_irq_enable();
3846
3847 if (nr_shmem_dropped)
3848 shmem_uncharge(mapping->host, nr_shmem_dropped);
3849
3850 if (!ret && is_anon)
3851 remap_flags = RMP_USE_SHARED_ZEROPAGE;
3852 remap_page(folio, 1 << order, remap_flags);
3853
3854 /*
3855 * Unlock all after-split folios except the one containing
3856 * @lock_at page. If @folio is not split, it will be kept locked.
3857 */
3858 for (new_folio = folio; new_folio != end_folio; new_folio = next) {
3859 next = folio_next(new_folio);
3860 if (new_folio == page_folio(lock_at))
3861 continue;
3862
3863 folio_unlock(new_folio);
3864 /*
3865 * Subpages may be freed if there wasn't any mapping,
3866 * e.g. if add_to_swap() is running on an LRU page that
3867 * had its mapping zapped. Freeing these pages
3868 * requires taking the lru_lock, so we do the put_page
3869 * of the tail pages after the split is complete.
3870 */
3871 free_folio_and_swap_cache(new_folio);
3872 }
3873
3874 out_unlock:
3875 if (anon_vma) {
3876 anon_vma_unlock_write(anon_vma);
3877 put_anon_vma(anon_vma);
3878 }
3879 if (mapping)
3880 i_mmap_unlock_read(mapping);
3881 out:
3882 xas_destroy(&xas);
3883 if (order == HPAGE_PMD_ORDER)
3884 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3885 count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
3886 return ret;
3887 }
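
/*
 * Descriptive note on the locking in __folio_split() above: with the folio
 * locked by the caller, the anon_vma write lock (anonymous) or the i_mmap
 * read lock (file-backed) is taken first, then interrupts are disabled and,
 * for file-backed folios, the page cache xa_lock is acquired. The deferred
 * split queue lock is held only around the refcount freeze, after which the
 * optional swap cluster lock and the lruvec lock are taken for the actual
 * split. Everything is released again, in both the success and failure
 * paths, before remap_page() and the final unlock of the after-split folios.
 */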
3888
3889 /*
3890 * This function splits a large folio into smaller folios of order @new_order.
3891 * @page can point to any page of the large folio to split. The split operation
3892 * does not change the position of @page.
3893 *
3894 * Prerequisites:
3895 *
3896 * 1) The caller must hold a reference on the @page's owning folio, also known
3897 * as the large folio.
3898 *
3899 * 2) The large folio must be locked.
3900 *
3901 * 3) The folio must not be pinned. Any unexpected folio references, including
3902 * GUP pins, will result in the folio not getting split; instead, the caller
3903 * will receive an -EAGAIN.
3904 *
3905 * 4) @new_order must not be 1 for anonymous folios: splitting anonymous
3906 * folios to order-1 is not supported, because folio->_deferred_list, which
3907 * is used by partially mapped folios, is stored in subpage 2, but an order-1
3908 * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
3909 * since they do not use _deferred_list.
3910 *
3911 * After splitting, the caller's folio reference will be transferred to @page,
3912 * resulting in a raised refcount of @page after this call. The other pages may
3913 * be freed if they are not mapped.
3914 *
3915 * If @list is NULL, tail pages will be added to the LRU list; otherwise, to @list.
3916 *
3917 * Pages in @new_order will inherit the mapping, flags, and so on from the
3918 * huge page.
3919 *
3920 * Returns 0 if the huge page was split successfully.
3921 *
3922 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
3923 * the folio was concurrently removed from the page cache.
3924 *
3925 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
3926 * under writeback, if fs-specific folio metadata cannot currently be
3927 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
3928 * truncation).
3929 *
3930 * Callers should ensure that the order respects the address space mapping
3931 * min-order if one is set for non-anonymous folios.
3932 *
3933 * Returns -EINVAL when trying to split to an order that is incompatible
3934 * with the folio. Splitting to order 0 is compatible with all folios.
3935 */
3936 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
3937 unsigned int new_order)
3938 {
3939 struct folio *folio = page_folio(page);
3940
3941 return __folio_split(folio, new_order, &folio->page, page, list, true);
3942 }
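
/*
 * Minimal usage sketch (illustrative; assumes the caller already holds a
 * reference on the folio, and that split_huge_page() in huge_mm.h is the
 * usual wrapper passing list == NULL and new_order == 0):
 *
 *	folio_lock(folio);
 *	err = split_huge_page_to_list_to_order(&folio->page, NULL, 0);
 *	folio_unlock(folio);	// the folio containing the passed page stays locked
 *	if (err == -EAGAIN)
 *		;		// unexpected references, e.g. GUP pins; retry later
 */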
3943
3944 /*
3945 * folio_split: split a folio at @split_at to a @new_order folio
3946 * @folio: folio to split
3947 * @new_order: the order of the new folio
3948 * @split_at: a page within the new folio
3949 *
3950 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
3951 * split but not to @new_order, the caller needs to check)
3952 *
3953 * It has the same prerequisites and returns as
3954 * split_huge_page_to_list_to_order().
3955 *
3956 * Split a folio at @split_at to a @new_order folio, leaving the
3957 * remaining subpages of the original folio as large as possible. For example,
3958 * when splitting an order-9 folio at its third order-3 subpage to
3959 * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
3960 * After the split, there will be a group of folios with different orders and
3961 * the new folio containing @split_at is marked in bracket:
3962 * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
3963 *
3964 * After split, folio is left locked for caller.
3965 */
3966 int folio_split(struct folio *folio, unsigned int new_order,
3967 struct page *split_at, struct list_head *list)
3968 {
3969 return __folio_split(folio, new_order, split_at, &folio->page, list,
3970 false);
3971 }
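
/*
 * Minimal usage sketch (illustrative): split a locked order-9 page cache
 * folio so that the subpage at offset "off" (a made-up variable) lands in an
 * order-3 folio while the rest stays as large as possible:
 *
 *	struct page *split_at = folio_page(folio, off);
 *	int err = folio_split(folio, 3, split_at, NULL);
 *
 * On success @folio is the first after-split folio and is left locked for
 * the caller; the other after-split folios are unlocked and, with a NULL
 * list, put back on the LRU.
 */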
3972
3973 int min_order_for_split(struct folio *folio)
3974 {
3975 if (folio_test_anon(folio))
3976 return 0;
3977
3978 if (!folio->mapping) {
3979 if (folio_test_pmd_mappable(folio))
3980 count_vm_event(THP_SPLIT_PAGE_FAILED);
3981 return -EBUSY;
3982 }
3983
3984 return mapping_min_folio_order(folio->mapping);
3985 }
3986
3987 int split_folio_to_list(struct folio *folio, struct list_head *list)
3988 {
3989 int ret = min_order_for_split(folio);
3990
3991 if (ret < 0)
3992 return ret;
3993
3994 return split_huge_page_to_list_to_order(&folio->page, list, ret);
3995 }
3996
3997 /*
3998 * __folio_unqueue_deferred_split() is not to be called directly:
3999 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
4000 * limits its calls to those folios which may have a _deferred_list for
4001 * queueing THP splits, and that list is (racily observed to be) non-empty.
4002 *
4003 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
4004 * zero: because even when split_queue_lock is held, a non-empty _deferred_list
4005 * might be in use on deferred_split_scan()'s unlocked on-stack list.
4006 *
4007 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
4008 * therefore important to unqueue deferred split before changing folio memcg.
4009 */
4010 bool __folio_unqueue_deferred_split(struct folio *folio)
4011 {
4012 struct deferred_split *ds_queue;
4013 unsigned long flags;
4014 bool unqueued = false;
4015
4016 WARN_ON_ONCE(folio_ref_count(folio));
4017 WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
4018
4019 ds_queue = get_deferred_split_queue(folio);
4020 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4021 if (!list_empty(&folio->_deferred_list)) {
4022 ds_queue->split_queue_len--;
4023 if (folio_test_partially_mapped(folio)) {
4024 folio_clear_partially_mapped(folio);
4025 mod_mthp_stat(folio_order(folio),
4026 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
4027 }
4028 list_del_init(&folio->_deferred_list);
4029 unqueued = true;
4030 }
4031 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4032
4033 return unqueued; /* useful for debug warnings */
4034 }
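
/*
 * Sketch of the gating that the mm/internal.h wrapper mentioned above is
 * expected to perform before calling in here (illustrative only; the
 * authoritative checks live in mm/internal.h):
 *
 *	if (folio_order(folio) <= 1)				// no _deferred_list
 *		return false;
 *	if (data_race(list_empty(&folio->_deferred_list)))	// racy pre-check
 *		return false;
 *	return __folio_unqueue_deferred_split(folio);
 */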
4035
4036 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
4037 void deferred_split_folio(struct folio *folio, bool partially_mapped)
4038 {
4039 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
4040 #ifdef CONFIG_MEMCG
4041 struct mem_cgroup *memcg = folio_memcg(folio);
4042 #endif
4043 unsigned long flags;
4044
4045 /*
4046 * Order 1 folios have no space for a deferred list, but we also
4047 * won't waste much memory by not adding them to the deferred list.
4048 */
4049 if (folio_order(folio) <= 1)
4050 return;
4051
4052 if (!partially_mapped && !split_underused_thp)
4053 return;
4054
4055 /*
4056 * Exclude swapcache: originally to avoid a corrupt deferred split
4057 * queue. Nowadays that is fully prevented by memcg1_swapout();
4058 * but if page reclaim is already handling the same folio, it is
4059 * unnecessary to handle it again in the shrinker, so excluding
4060 * swapcache here may still be a useful optimization.
4061 */
4062 if (folio_test_swapcache(folio))
4063 return;
4064
4065 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4066 if (partially_mapped) {
4067 if (!folio_test_partially_mapped(folio)) {
4068 folio_set_partially_mapped(folio);
4069 if (folio_test_pmd_mappable(folio))
4070 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
4071 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
4072 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
4074 }
4075 } else {
4076 /* partially mapped folios cannot become non-partially mapped */
4077 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
4078 }
4079 if (list_empty(&folio->_deferred_list)) {
4080 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
4081 ds_queue->split_queue_len++;
4082 #ifdef CONFIG_MEMCG
4083 if (memcg)
4084 set_shrinker_bit(memcg, folio_nid(folio),
4085 deferred_split_shrinker->id);
4086 #endif
4087 }
4088 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4089 }
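
/*
 * Typical call pattern (hedged sketch; the real call sites live in the rmap
 * and fault paths): when unmapping leaves a large anonymous folio only
 * partially mapped, it is queued with partially_mapped == true; folios may
 * also be queued with partially_mapped == false so that the deferred split
 * shrinker below can later split underused (mostly zero-filled) THPs:
 *
 *	// "now_partially_mapped" is a made-up condition for illustration
 *	if (now_partially_mapped)
 *		deferred_split_folio(folio, true);
 */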
4090
4091 static unsigned long deferred_split_count(struct shrinker *shrink,
4092 struct shrink_control *sc)
4093 {
4094 struct pglist_data *pgdata = NODE_DATA(sc->nid);
4095 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4096
4097 #ifdef CONFIG_MEMCG
4098 if (sc->memcg)
4099 ds_queue = &sc->memcg->deferred_split_queue;
4100 #endif
4101 return READ_ONCE(ds_queue->split_queue_len);
4102 }
4103
4104 static bool thp_underused(struct folio *folio)
4105 {
4106 int num_zero_pages = 0, num_filled_pages = 0;
4107 void *kaddr;
4108 int i;
4109
4110 if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
4111 return false;
4112
4113 for (i = 0; i < folio_nr_pages(folio); i++) {
4114 kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
4115 if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
4116 num_zero_pages++;
4117 if (num_zero_pages > khugepaged_max_ptes_none) {
4118 kunmap_local(kaddr);
4119 return true;
4120 }
4121 } else {
4122 /*
4123 * Another path for early exit once the number
4124 * of non-zero filled pages exceeds threshold.
4125 */
4126 num_filled_pages++;
4127 if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
4128 kunmap_local(kaddr);
4129 return false;
4130 }
4131 }
4132 kunmap_local(kaddr);
4133 }
4134 return false;
4135 }
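
/*
 * Worked example of the thresholds above, assuming 4K pages so that
 * HPAGE_PMD_NR == 512: with the default khugepaged_max_ptes_none of 511 the
 * function returns false immediately and no THP is treated as underused;
 * with khugepaged_max_ptes_none == 255 a folio is reported underused once
 * 256 zero-filled subpages are found, and the scan gives up early once
 * 512 - 255 = 257 non-zero subpages have been seen.
 */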
4136
4137 static unsigned long deferred_split_scan(struct shrinker *shrink,
4138 struct shrink_control *sc)
4139 {
4140 struct pglist_data *pgdata = NODE_DATA(sc->nid);
4141 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4142 unsigned long flags;
4143 LIST_HEAD(list);
4144 struct folio *folio, *next, *prev = NULL;
4145 int split = 0, removed = 0;
4146
4147 #ifdef CONFIG_MEMCG
4148 if (sc->memcg)
4149 ds_queue = &sc->memcg->deferred_split_queue;
4150 #endif
4151
4152 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4153 /* Take pin on all head pages to avoid freeing them under us */
4154 list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
4155 _deferred_list) {
4156 if (folio_try_get(folio)) {
4157 list_move(&folio->_deferred_list, &list);
4158 } else {
4159 /* We lost race with folio_put() */
4160 if (folio_test_partially_mapped(folio)) {
4161 folio_clear_partially_mapped(folio);
4162 mod_mthp_stat(folio_order(folio),
4163 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
4164 }
4165 list_del_init(&folio->_deferred_list);
4166 ds_queue->split_queue_len--;
4167 }
4168 if (!--sc->nr_to_scan)
4169 break;
4170 }
4171 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4172
4173 list_for_each_entry_safe(folio, next, &list, _deferred_list) {
4174 bool did_split = false;
4175 bool underused = false;
4176
4177 if (!folio_test_partially_mapped(folio)) {
4178 /*
4179 * See try_to_map_unused_to_zeropage(): we cannot
4180 * optimize zero-filled pages after splitting an
4181 * mlocked folio.
4182 */
4183 if (folio_test_mlocked(folio))
4184 goto next;
4185 underused = thp_underused(folio);
4186 if (!underused)
4187 goto next;
4188 }
4189 if (!folio_trylock(folio))
4190 goto next;
4191 if (!split_folio(folio)) {
4192 did_split = true;
4193 if (underused)
4194 count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
4195 split++;
4196 }
4197 folio_unlock(folio);
4198 next:
4199 /*
4200 * split_folio() removes folio from list on success.
4201 * Only add back to the queue if folio is partially mapped.
4202 * If thp_underused returns false, or if split_folio fails
4203 * in the case it was underused, then consider it used and
4204 * don't add it back to split_queue.
4205 */
4206 if (did_split) {
4207 ; /* folio already removed from list */
4208 } else if (!folio_test_partially_mapped(folio)) {
4209 list_del_init(&folio->_deferred_list);
4210 removed++;
4211 } else {
4212 /*
4213 * That unlocked list_del_init() above would be unsafe,
4214 * unless its folio is separated from any earlier folios
4215 * left on the list (which may be concurrently unqueued)
4216 * by one safe folio with refcount still raised.
4217 */
4218 swap(folio, prev);
4219 }
4220 if (folio)
4221 folio_put(folio);
4222 }
4223
4224 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4225 list_splice_tail(&list, &ds_queue->split_queue);
4226 ds_queue->split_queue_len -= removed;
4227 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4228
4229 if (prev)
4230 folio_put(prev);
4231
4232 /*
4233 * Stop shrinker if we didn't split any page, but the queue is empty.
4234 * This can happen if pages were freed under us.
4235 */
4236 if (!split && list_empty(&ds_queue->split_queue))
4237 return SHRINK_STOP;
4238 return split;
4239 }
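
/*
 * The two callbacks above back the deferred_split_shrinker declared near the
 * top of this file. A rough registration sketch (the real code lives in this
 * file's init path; the exact flags are an assumption based on the per-node
 * and per-memcg queues used above):
 *
 *	shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
 *				  "thp-deferred_split");
 *	shrinker->count_objects = deferred_split_count;
 *	shrinker->scan_objects = deferred_split_scan;
 *	shrinker_register(shrinker);
 */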
4240
4241 #ifdef CONFIG_DEBUG_FS
4242 static void split_huge_pages_all(void)
4243 {
4244 struct zone *zone;
4245 struct page *page;
4246 struct folio *folio;
4247 unsigned long pfn, max_zone_pfn;
4248 unsigned long total = 0, split = 0;
4249
4250 pr_debug("Split all THPs\n");
4251 for_each_zone(zone) {
4252 if (!managed_zone(zone))
4253 continue;
4254 max_zone_pfn = zone_end_pfn(zone);
4255 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
4256 int nr_pages;
4257
4258 page = pfn_to_online_page(pfn);
4259 if (!page || PageTail(page))
4260 continue;
4261 folio = page_folio(page);
4262 if (!folio_try_get(folio))
4263 continue;
4264
4265 if (unlikely(page_folio(page) != folio))
4266 goto next;
4267
4268 if (zone != folio_zone(folio))
4269 goto next;
4270
4271 if (!folio_test_large(folio)
4272 || folio_test_hugetlb(folio)
4273 || !folio_test_lru(folio))
4274 goto next;
4275
4276 total++;
4277 folio_lock(folio);
4278 nr_pages = folio_nr_pages(folio);
4279 if (!split_folio(folio))
4280 split++;
4281 pfn += nr_pages - 1;
4282 folio_unlock(folio);
4283 next:
4284 folio_put(folio);
4285 cond_resched();
4286 }
4287 }
4288
4289 pr_debug("%lu of %lu THP split\n", split, total);
4290 }
4291
4292 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
4293 {
4294 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
4295 is_vm_hugetlb_page(vma);
4296 }
4297
4298 static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
4299 unsigned long vaddr_end, unsigned int new_order,
4300 long in_folio_offset)
4301 {
4302 int ret = 0;
4303 struct task_struct *task;
4304 struct mm_struct *mm;
4305 unsigned long total = 0, split = 0;
4306 unsigned long addr;
4307
4308 vaddr_start &= PAGE_MASK;
4309 vaddr_end &= PAGE_MASK;
4310
4311 task = find_get_task_by_vpid(pid);
4312 if (!task) {
4313 ret = -ESRCH;
4314 goto out;
4315 }
4316
4317 /* Find the mm_struct */
4318 mm = get_task_mm(task);
4319 put_task_struct(task);
4320
4321 if (!mm) {
4322 ret = -EINVAL;
4323 goto out;
4324 }
4325
4326 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
4327 pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
4328
4329 mmap_read_lock(mm);
4330 /*
4331 * always increase addr by PAGE_SIZE, since we could have a PTE page
4332 * table filled with PTE-mapped THPs, each of which is distinct.
4333 */
4334 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
4335 struct vm_area_struct *vma = vma_lookup(mm, addr);
4336 struct folio_walk fw;
4337 struct folio *folio;
4338 struct address_space *mapping;
4339 unsigned int target_order = new_order;
4340
4341 if (!vma)
4342 break;
4343
4344 /* skip special VMA and hugetlb VMA */
4345 if (vma_not_suitable_for_thp_split(vma)) {
4346 addr = vma->vm_end;
4347 continue;
4348 }
4349
4350 folio = folio_walk_start(&fw, vma, addr, 0);
4351 if (!folio)
4352 continue;
4353
4354 if (!is_transparent_hugepage(folio))
4355 goto next;
4356
4357 if (!folio_test_anon(folio)) {
4358 mapping = folio->mapping;
4359 target_order = max(new_order,
4360 mapping_min_folio_order(mapping));
4361 }
4362
4363 if (target_order >= folio_order(folio))
4364 goto next;
4365
4366 total++;
4367 /*
4368 * For folios with private, split_huge_page_to_list_to_order()
4369 * will try to drop it before split and then check if the folio
4370 * can be split or not. So skip the check here.
4371 */
4372 if (!folio_test_private(folio) &&
4373 !can_split_folio(folio, 0, NULL))
4374 goto next;
4375
4376 if (!folio_trylock(folio))
4377 goto next;
4378 folio_get(folio);
4379 folio_walk_end(&fw, vma);
4380
4381 if (!folio_test_anon(folio) && folio->mapping != mapping)
4382 goto unlock;
4383
4384 if (in_folio_offset < 0 ||
4385 in_folio_offset >= folio_nr_pages(folio)) {
4386 if (!split_folio_to_order(folio, target_order))
4387 split++;
4388 } else {
4389 struct page *split_at = folio_page(folio,
4390 in_folio_offset);
4391 if (!folio_split(folio, target_order, split_at, NULL))
4392 split++;
4393 }
4394
4395 unlock:
4397 folio_unlock(folio);
4398 folio_put(folio);
4399
4400 cond_resched();
4401 continue;
4402 next:
4403 folio_walk_end(&fw, vma);
4404 cond_resched();
4405 }
4406 mmap_read_unlock(mm);
4407 mmput(mm);
4408
4409 pr_debug("%lu of %lu THP split\n", split, total);
4410
4411 out:
4412 return ret;
4413 }
4414
4415 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
4416 pgoff_t off_end, unsigned int new_order,
4417 long in_folio_offset)
4418 {
4419 struct filename *file;
4420 struct file *candidate;
4421 struct address_space *mapping;
4422 int ret = -EINVAL;
4423 pgoff_t index;
4424 int nr_pages = 1;
4425 unsigned long total = 0, split = 0;
4426 unsigned int min_order;
4427 unsigned int target_order;
4428
4429 file = getname_kernel(file_path);
4430 if (IS_ERR(file))
4431 return ret;
4432
4433 candidate = file_open_name(file, O_RDONLY, 0);
4434 if (IS_ERR(candidate))
4435 goto out;
4436
4437 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
4438 file_path, off_start, off_end, new_order, in_folio_offset);
4439
4440 mapping = candidate->f_mapping;
4441 min_order = mapping_min_folio_order(mapping);
4442 target_order = max(new_order, min_order);
4443
4444 for (index = off_start; index < off_end; index += nr_pages) {
4445 struct folio *folio = filemap_get_folio(mapping, index);
4446
4447 nr_pages = 1;
4448 if (IS_ERR(folio))
4449 continue;
4450
4451 if (!folio_test_large(folio))
4452 goto next;
4453
4454 total++;
4455 nr_pages = folio_nr_pages(folio);
4456
4457 if (target_order >= folio_order(folio))
4458 goto next;
4459
4460 if (!folio_trylock(folio))
4461 goto next;
4462
4463 if (folio->mapping != mapping)
4464 goto unlock;
4465
4466 if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
4467 if (!split_folio_to_order(folio, target_order))
4468 split++;
4469 } else {
4470 struct page *split_at = folio_page(folio,
4471 in_folio_offset);
4472 if (!folio_split(folio, target_order, split_at, NULL))
4473 split++;
4474 }
4475
4476 unlock:
4477 folio_unlock(folio);
4478 next:
4479 folio_put(folio);
4480 cond_resched();
4481 }
4482
4483 filp_close(candidate, NULL);
4484 ret = 0;
4485
4486 pr_debug("%lu of %lu file-backed THP split\n", split, total);
4487 out:
4488 putname(file);
4489 return ret;
4490 }
4491
4492 #define MAX_INPUT_BUF_SZ 255
4493
4494 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
4495 size_t count, loff_t *ppops)
4496 {
4497 static DEFINE_MUTEX(split_debug_mutex);
4498 ssize_t ret;
4499 /*
4500 * hold pid, start_vaddr, end_vaddr, new_order or
4501 * file_path, off_start, off_end, new_order
4502 */
4503 char input_buf[MAX_INPUT_BUF_SZ];
4504 int pid;
4505 unsigned long vaddr_start, vaddr_end;
4506 unsigned int new_order = 0;
4507 long in_folio_offset = -1;
4508
4509 ret = mutex_lock_interruptible(&split_debug_mutex);
4510 if (ret)
4511 return ret;
4512
4513 ret = -EFAULT;
4514
4515 memset(input_buf, 0, MAX_INPUT_BUF_SZ);
4516 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
4517 goto out;
4518
4519 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
4520
4521 if (input_buf[0] == '/') {
4522 char *tok;
4523 char *tok_buf = input_buf;
4524 char file_path[MAX_INPUT_BUF_SZ];
4525 pgoff_t off_start = 0, off_end = 0;
4526 size_t input_len = strlen(input_buf);
4527
4528 tok = strsep(&tok_buf, ",");
4529 if (tok && tok_buf) {
4530 strscpy(file_path, tok);
4531 } else {
4532 ret = -EINVAL;
4533 goto out;
4534 }
4535
4536 ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
4537 &new_order, &in_folio_offset);
4538 if (ret != 2 && ret != 3 && ret != 4) {
4539 ret = -EINVAL;
4540 goto out;
4541 }
4542 ret = split_huge_pages_in_file(file_path, off_start, off_end,
4543 new_order, in_folio_offset);
4544 if (!ret)
4545 ret = input_len;
4546
4547 goto out;
4548 }
4549
4550 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
4551 &vaddr_end, &new_order, &in_folio_offset);
4552 if (ret == 1 && pid == 1) {
4553 split_huge_pages_all();
4554 ret = strlen(input_buf);
4555 goto out;
4556 } else if (ret != 3 && ret != 4 && ret != 5) {
4557 ret = -EINVAL;
4558 goto out;
4559 }
4560
4561 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
4562 in_folio_offset);
4563 if (!ret)
4564 ret = strlen(input_buf);
4565 out:
4566 mutex_unlock(&split_debug_mutex);
4567 return ret;
4569 }
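
/*
 * Accepted input formats, as parsed above (addresses and file offsets are
 * hex with a 0x prefix; <new_order> and <in_folio_offset> are optional and
 * default to 0 and -1). Assuming debugfs is mounted at /sys/kernel/debug:
 *
 *	# split all THPs system-wide
 *	echo 1 > /sys/kernel/debug/split_huge_pages
 *
 *	# split THPs mapped into [vaddr_start, vaddr_end) of process <pid>
 *	echo "<pid>,<vaddr_start>,<vaddr_end>[,<new_order>[,<in_folio_offset>]]" \
 *		> /sys/kernel/debug/split_huge_pages
 *
 *	# split file-backed THPs in a page cache range of an absolute <path>
 *	echo "<path>,<off_start>,<off_end>[,<new_order>[,<in_folio_offset>]]" \
 *		> /sys/kernel/debug/split_huge_pages
 */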
4570
4571 static const struct file_operations split_huge_pages_fops = {
4572 .owner = THIS_MODULE,
4573 .write = split_huge_pages_write,
4574 };
4575
4576 static int __init split_huge_pages_debugfs(void)
4577 {
4578 debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
4579 &split_huge_pages_fops);
4580 return 0;
4581 }
4582 late_initcall(split_huge_pages_debugfs);
4583 #endif
4584
4585 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
4586 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
4587 struct page *page)
4588 {
4589 struct folio *folio = page_folio(page);
4590 struct vm_area_struct *vma = pvmw->vma;
4591 struct mm_struct *mm = vma->vm_mm;
4592 unsigned long address = pvmw->address;
4593 bool anon_exclusive;
4594 pmd_t pmdval;
4595 swp_entry_t entry;
4596 pmd_t pmdswp;
4597
4598 if (!(pvmw->pmd && !pvmw->pte))
4599 return 0;
4600
4601 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
4602 pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
4603
4604 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
4605 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
4606 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
4607 set_pmd_at(mm, address, pvmw->pmd, pmdval);
4608 return -EBUSY;
4609 }
4610
4611 if (pmd_dirty(pmdval))
4612 folio_mark_dirty(folio);
4613 if (pmd_write(pmdval))
4614 entry = make_writable_migration_entry(page_to_pfn(page));
4615 else if (anon_exclusive)
4616 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
4617 else
4618 entry = make_readable_migration_entry(page_to_pfn(page));
4619 if (pmd_young(pmdval))
4620 entry = make_migration_entry_young(entry);
4621 if (pmd_dirty(pmdval))
4622 entry = make_migration_entry_dirty(entry);
4623 pmdswp = swp_entry_to_pmd(entry);
4624 if (pmd_soft_dirty(pmdval))
4625 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
4626 if (pmd_uffd_wp(pmdval))
4627 pmdswp = pmd_swp_mkuffd_wp(pmdswp);
4628 set_pmd_at(mm, address, pvmw->pmd, pmdswp);
4629 folio_remove_rmap_pmd(folio, page, vma);
4630 folio_put(folio);
4631 trace_set_migration_pmd(address, pmd_val(pmdswp));
4632
4633 return 0;
4634 }
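
/*
 * Descriptive note: the migration entry installed above is writable if the
 * PMD was writable, readable-exclusive if the page was anon-exclusive, and
 * plain readable otherwise; young, dirty, soft-dirty and uffd-wp state is
 * carried over so that remove_migration_pmd() below can reconstruct an
 * equivalent PMD for the destination page.
 */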
4635
4636 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
4637 {
4638 struct folio *folio = page_folio(new);
4639 struct vm_area_struct *vma = pvmw->vma;
4640 struct mm_struct *mm = vma->vm_mm;
4641 unsigned long address = pvmw->address;
4642 unsigned long haddr = address & HPAGE_PMD_MASK;
4643 pmd_t pmde;
4644 swp_entry_t entry;
4645
4646 if (!(pvmw->pmd && !pvmw->pte))
4647 return;
4648
4649 entry = pmd_to_swp_entry(*pvmw->pmd);
4650 folio_get(folio);
4651 pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
4652 if (pmd_swp_soft_dirty(*pvmw->pmd))
4653 pmde = pmd_mksoft_dirty(pmde);
4654 if (is_writable_migration_entry(entry))
4655 pmde = pmd_mkwrite(pmde, vma);
4656 if (pmd_swp_uffd_wp(*pvmw->pmd))
4657 pmde = pmd_mkuffd_wp(pmde);
4658 if (!is_migration_entry_young(entry))
4659 pmde = pmd_mkold(pmde);
4660 /* NOTE: this may contain setting soft-dirty on some archs */
4661 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
4662 pmde = pmd_mkdirty(pmde);
4663
4664 if (folio_test_anon(folio)) {
4665 rmap_t rmap_flags = RMAP_NONE;
4666
4667 if (!is_readable_migration_entry(entry))
4668 rmap_flags |= RMAP_EXCLUSIVE;
4669
4670 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
4671 } else {
4672 folio_add_file_rmap_pmd(folio, new, vma);
4673 }
4674 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
4675 set_pmd_at(mm, haddr, pvmw->pmd, pmde);
4676
4677 /* No need to invalidate - it was non-present before */
4678 update_mmu_cache_pmd(vma, address, pvmw->pmd);
4679 trace_remove_migration_pmd(address, pmd_val(pmde));
4680 }
4681 #endif
4682