xref: /linux/mm/huge_memory.c (revision 91b2978a348073db0e47b380fa66c865eb25f3d8)
120c8ccb1SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
271e3aac0SAndrea Arcangeli /*
371e3aac0SAndrea Arcangeli  *  Copyright (C) 2009  Red Hat, Inc.
471e3aac0SAndrea Arcangeli  */
571e3aac0SAndrea Arcangeli 
6ae3a8c1cSAndrew Morton #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7ae3a8c1cSAndrew Morton 
871e3aac0SAndrea Arcangeli #include <linux/mm.h>
971e3aac0SAndrea Arcangeli #include <linux/sched.h>
10fa6c0231SZi Yan #include <linux/sched/mm.h>
11f7ccbae4SIngo Molnar #include <linux/sched/coredump.h>
126a3827d7SIngo Molnar #include <linux/sched/numa_balancing.h>
1371e3aac0SAndrea Arcangeli #include <linux/highmem.h>
1471e3aac0SAndrea Arcangeli #include <linux/hugetlb.h>
1571e3aac0SAndrea Arcangeli #include <linux/mmu_notifier.h>
1671e3aac0SAndrea Arcangeli #include <linux/rmap.h>
1771e3aac0SAndrea Arcangeli #include <linux/swap.h>
1897ae1749SKirill A. Shutemov #include <linux/shrinker.h>
19ba76149fSAndrea Arcangeli #include <linux/mm_inline.h>
20e9b61f19SKirill A. Shutemov #include <linux/swapops.h>
21fb5c2029SMatthew Wilcox (Oracle) #include <linux/backing-dev.h>
224897c765SMatthew Wilcox #include <linux/dax.h>
23ba76149fSAndrea Arcangeli #include <linux/khugepaged.h>
24878aee7dSAndrea Arcangeli #include <linux/freezer.h>
25f25748e3SDan Williams #include <linux/pfn_t.h>
26a664b2d8SAndrea Arcangeli #include <linux/mman.h>
273565fce3SDan Williams #include <linux/memremap.h>
28325adeb5SRalf Baechle #include <linux/pagemap.h>
2949071d43SKirill A. Shutemov #include <linux/debugfs.h>
304daae3b4SMel Gorman #include <linux/migrate.h>
3143b5fbbdSSasha Levin #include <linux/hashtable.h>
326b251fc9SAndrea Arcangeli #include <linux/userfaultfd_k.h>
3333c3fc71SVladimir Davydov #include <linux/page_idle.h>
34baa355fdSKirill A. Shutemov #include <linux/shmem_fs.h>
356b31d595SMichal Hocko #include <linux/oom.h>
3698fa15f3SAnshuman Khandual #include <linux/numa.h>
37f7da677bSVlastimil Babka #include <linux/page_owner.h>
38a1a3a2fcSHuang Ying #include <linux/sched/sysctl.h>
39467b171aSAneesh Kumar K.V #include <linux/memory-tiers.h>
4097ae1749SKirill A. Shutemov 
4171e3aac0SAndrea Arcangeli #include <asm/tlb.h>
4271e3aac0SAndrea Arcangeli #include <asm/pgalloc.h>
4371e3aac0SAndrea Arcangeli #include "internal.h"
44014bb1deSNeilBrown #include "swap.h"
4571e3aac0SAndrea Arcangeli 
46283fd6feSAnshuman Khandual #define CREATE_TRACE_POINTS
47283fd6feSAnshuman Khandual #include <trace/events/thp.h>
48283fd6feSAnshuman Khandual 
49ba76149fSAndrea Arcangeli /*
50b14d595aSMichael DeGuzis  * By default, transparent hugepage support is disabled in order to avoid
51b14d595aSMichael DeGuzis  * risking an increased memory footprint for applications that are not
52b14d595aSMichael DeGuzis  * guaranteed to benefit from it. When transparent hugepage support is
53b14d595aSMichael DeGuzis  * enabled, it is for all mappings, and khugepaged scans all mappings.
548bfa3f9aSJianguo Wu  * Defrag is invoked by khugepaged hugepage allocations and by page faults
558bfa3f9aSJianguo Wu  * for all hugepage allocations.
56ba76149fSAndrea Arcangeli  */
5771e3aac0SAndrea Arcangeli unsigned long transparent_hugepage_flags __read_mostly =
5813ece886SAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
59ba76149fSAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
6013ece886SAndrea Arcangeli #endif
6113ece886SAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
6213ece886SAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
6313ece886SAndrea Arcangeli #endif
64444eb2a4SMel Gorman 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
6579da5407SKirill A. Shutemov 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
6679da5407SKirill A. Shutemov 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
67ba76149fSAndrea Arcangeli 
6854d91729SQi Zheng static struct shrinker *deferred_split_shrinker;
6954d91729SQi Zheng static unsigned long deferred_split_count(struct shrinker *shrink,
7054d91729SQi Zheng 					  struct shrink_control *sc);
7154d91729SQi Zheng static unsigned long deferred_split_scan(struct shrinker *shrink,
7254d91729SQi Zheng 					 struct shrink_control *sc);
73f000565aSAndrea Arcangeli 
7497ae1749SKirill A. Shutemov static atomic_t huge_zero_refcount;
7556873f43SWang, Yalin struct page *huge_zero_page __read_mostly;
763b77e8c8SHugh Dickins unsigned long huge_zero_pfn __read_mostly = ~0UL;
773485b883SRyan Roberts unsigned long huge_anon_orders_always __read_mostly;
783485b883SRyan Roberts unsigned long huge_anon_orders_madvise __read_mostly;
793485b883SRyan Roberts unsigned long huge_anon_orders_inherit __read_mostly;
804a6c1297SKirill A. Shutemov 
813485b883SRyan Roberts unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
823485b883SRyan Roberts 					 unsigned long vm_flags, bool smaps,
833485b883SRyan Roberts 					 bool in_pf, bool enforce_sysfs,
843485b883SRyan Roberts 					 unsigned long orders)
857635d9cbSMichal Hocko {
863485b883SRyan Roberts 	/* Check the intersection of requested and supported orders. */
873485b883SRyan Roberts 	orders &= vma_is_anonymous(vma) ?
883485b883SRyan Roberts 			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
893485b883SRyan Roberts 	if (!orders)
903485b883SRyan Roberts 		return 0;
913485b883SRyan Roberts 
929fec5168SYang Shi 	if (!vma->vm_mm)		/* vdso */
933485b883SRyan Roberts 		return 0;
949fec5168SYang Shi 
957da4e2cbSYang Shi 	/*
967da4e2cbSYang Shi 	 * Explicitly disabled through madvise or prctl, or some
977da4e2cbSYang Shi 	 * architectures may disable THP for some mappings, for
987da4e2cbSYang Shi 	 * example, s390 kvm.
997da4e2cbSYang Shi 	 * */
1007da4e2cbSYang Shi 	if ((vm_flags & VM_NOHUGEPAGE) ||
1017da4e2cbSYang Shi 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
1023485b883SRyan Roberts 		return 0;
1037da4e2cbSYang Shi 	/*
1047da4e2cbSYang Shi 	 * If the hardware/firmware marked hugepage support disabled.
1057da4e2cbSYang Shi 	 */
1063c556d24SPeter Xu 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
1073485b883SRyan Roberts 		return 0;
1089fec5168SYang Shi 
1097da4e2cbSYang Shi 	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
1107da4e2cbSYang Shi 	if (vma_is_dax(vma))
1113485b883SRyan Roberts 		return in_pf ? orders : 0;
1127da4e2cbSYang Shi 
1137da4e2cbSYang Shi 	/*
1147a81751fSZach O'Keefe 	 * khugepaged special VMA and hugetlb VMA.
1157da4e2cbSYang Shi 	 * Must be checked after dax since some dax mappings may have
1167da4e2cbSYang Shi 	 * VM_MIXEDMAP set.
1177da4e2cbSYang Shi 	 */
1187a81751fSZach O'Keefe 	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
1193485b883SRyan Roberts 		return 0;
1209fec5168SYang Shi 
1217da4e2cbSYang Shi 	/*
1223485b883SRyan Roberts 	 * Check alignment for file vma and size for both file and anon vma by
1233485b883SRyan Roberts 	 * filtering out the unsuitable orders.
1247da4e2cbSYang Shi 	 *
1257da4e2cbSYang Shi 	 * Skip the check for page fault. Huge fault does the check in fault
1263485b883SRyan Roberts 	 * handlers.
1277da4e2cbSYang Shi 	 */
1283485b883SRyan Roberts 	if (!in_pf) {
1293485b883SRyan Roberts 		int order = highest_order(orders);
1303485b883SRyan Roberts 		unsigned long addr;
1313485b883SRyan Roberts 
1323485b883SRyan Roberts 		while (orders) {
1333485b883SRyan Roberts 			addr = vma->vm_end - (PAGE_SIZE << order);
1343485b883SRyan Roberts 			if (thp_vma_suitable_order(vma, addr, order))
1353485b883SRyan Roberts 				break;
1363485b883SRyan Roberts 			order = next_order(&orders, order);
1373485b883SRyan Roberts 		}
1383485b883SRyan Roberts 
1393485b883SRyan Roberts 		if (!orders)
1403485b883SRyan Roberts 			return 0;
1413485b883SRyan Roberts 	}
1429fec5168SYang Shi 
1437da4e2cbSYang Shi 	/*
1447da4e2cbSYang Shi 	 * Enabled via shmem mount options or sysfs settings.
1457da4e2cbSYang Shi 	 * Must be done before hugepage flags check since shmem has its
1467da4e2cbSYang Shi 	 * own flags.
1477da4e2cbSYang Shi 	 */
1487da4e2cbSYang Shi 	if (!in_pf && shmem_file(vma->vm_file))
1492cf13384SDavid Stevens 		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
1503485b883SRyan Roberts 				     !enforce_sysfs, vma->vm_mm, vm_flags)
1513485b883SRyan Roberts 			? orders : 0;
1529fec5168SYang Shi 
1537a81751fSZach O'Keefe 	if (!vma_is_anonymous(vma)) {
1547a81751fSZach O'Keefe 		/*
1553485b883SRyan Roberts 		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
1563485b883SRyan Roberts 		 * were already handled in thp_vma_allowable_orders().
1573485b883SRyan Roberts 		 */
1583485b883SRyan Roberts 		if (enforce_sysfs &&
1593485b883SRyan Roberts 		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
1603485b883SRyan Roberts 						    !hugepage_global_always())))
1613485b883SRyan Roberts 			return 0;
1623485b883SRyan Roberts 
1633485b883SRyan Roberts 		/*
1647a81751fSZach O'Keefe 		 * Trust that ->huge_fault() handlers know what they are doing
1657a81751fSZach O'Keefe 		 * in fault path.
1667a81751fSZach O'Keefe 		 */
1677a81751fSZach O'Keefe 		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
1683485b883SRyan Roberts 			return orders;
1697a81751fSZach O'Keefe 		/* Only regular file is valid in collapse path */
1707a81751fSZach O'Keefe 		if (((!in_pf || smaps)) && file_thp_enabled(vma))
1713485b883SRyan Roberts 			return orders;
1723485b883SRyan Roberts 		return 0;
1737a81751fSZach O'Keefe 	}
1749fec5168SYang Shi 
1759fec5168SYang Shi 	if (vma_is_temporary_stack(vma))
1763485b883SRyan Roberts 		return 0;
1779fec5168SYang Shi 
1789fec5168SYang Shi 	/*
1799fec5168SYang Shi 	 * THPeligible bit of smaps should show 1 for proper VMAs even
1809fec5168SYang Shi 	 * though anon_vma is not initialized yet.
1817da4e2cbSYang Shi 	 *
1827da4e2cbSYang Shi 	 * Allow page fault since anon_vma may be not initialized until
1837da4e2cbSYang Shi 	 * the first page fault.
1849fec5168SYang Shi 	 */
1859fec5168SYang Shi 	if (!vma->anon_vma)
1863485b883SRyan Roberts 		return (smaps || in_pf) ? orders : 0;
1879fec5168SYang Shi 
1883485b883SRyan Roberts 	return orders;
1897635d9cbSMichal Hocko }
1907635d9cbSMichal Hocko 
191aaa9705bSMiaohe Lin static bool get_huge_zero_page(void)
19297ae1749SKirill A. Shutemov {
19397ae1749SKirill A. Shutemov 	struct page *zero_page;
19497ae1749SKirill A. Shutemov retry:
19597ae1749SKirill A. Shutemov 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
196aaa9705bSMiaohe Lin 		return true;
19797ae1749SKirill A. Shutemov 
19897ae1749SKirill A. Shutemov 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
19997ae1749SKirill A. Shutemov 			HPAGE_PMD_ORDER);
200d8a8e1f0SKirill A. Shutemov 	if (!zero_page) {
201d8a8e1f0SKirill A. Shutemov 		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
202aaa9705bSMiaohe Lin 		return false;
203d8a8e1f0SKirill A. Shutemov 	}
20497ae1749SKirill A. Shutemov 	preempt_disable();
2055918d10aSKirill A. Shutemov 	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
20697ae1749SKirill A. Shutemov 		preempt_enable();
2075ddacbe9SYu Zhao 		__free_pages(zero_page, compound_order(zero_page));
20897ae1749SKirill A. Shutemov 		goto retry;
20997ae1749SKirill A. Shutemov 	}
2103b77e8c8SHugh Dickins 	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
21197ae1749SKirill A. Shutemov 
21297ae1749SKirill A. Shutemov 	/* We take additional reference here. It will be put back by shrinker */
21397ae1749SKirill A. Shutemov 	atomic_set(&huge_zero_refcount, 2);
21497ae1749SKirill A. Shutemov 	preempt_enable();
215f4981502SLiu Shixin 	count_vm_event(THP_ZERO_PAGE_ALLOC);
216aaa9705bSMiaohe Lin 	return true;
21797ae1749SKirill A. Shutemov }
21897ae1749SKirill A. Shutemov 
2196fcb52a5SAaron Lu static void put_huge_zero_page(void)
22097ae1749SKirill A. Shutemov {
22197ae1749SKirill A. Shutemov 	/*
22297ae1749SKirill A. Shutemov 	 * Counter should never go to zero here. Only shrinker can put
22397ae1749SKirill A. Shutemov 	 * last reference.
22497ae1749SKirill A. Shutemov 	 */
22597ae1749SKirill A. Shutemov 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
22697ae1749SKirill A. Shutemov }
22797ae1749SKirill A. Shutemov 
2286fcb52a5SAaron Lu struct page *mm_get_huge_zero_page(struct mm_struct *mm)
2296fcb52a5SAaron Lu {
2306fcb52a5SAaron Lu 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
2316fcb52a5SAaron Lu 		return READ_ONCE(huge_zero_page);
2326fcb52a5SAaron Lu 
2336fcb52a5SAaron Lu 	if (!get_huge_zero_page())
2346fcb52a5SAaron Lu 		return NULL;
2356fcb52a5SAaron Lu 
2366fcb52a5SAaron Lu 	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
2376fcb52a5SAaron Lu 		put_huge_zero_page();
2386fcb52a5SAaron Lu 
2396fcb52a5SAaron Lu 	return READ_ONCE(huge_zero_page);
2406fcb52a5SAaron Lu }
2416fcb52a5SAaron Lu 
2426fcb52a5SAaron Lu void mm_put_huge_zero_page(struct mm_struct *mm)
2436fcb52a5SAaron Lu {
2446fcb52a5SAaron Lu 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
2456fcb52a5SAaron Lu 		put_huge_zero_page();
2466fcb52a5SAaron Lu }
2476fcb52a5SAaron Lu 
24848896466SGlauber Costa static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
24997ae1749SKirill A. Shutemov 					struct shrink_control *sc)
25097ae1749SKirill A. Shutemov {
25197ae1749SKirill A. Shutemov 	/* we can free zero page only if last reference remains */
25297ae1749SKirill A. Shutemov 	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
25348896466SGlauber Costa }
25497ae1749SKirill A. Shutemov 
25548896466SGlauber Costa static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
25648896466SGlauber Costa 				       struct shrink_control *sc)
25748896466SGlauber Costa {
25897ae1749SKirill A. Shutemov 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
2595918d10aSKirill A. Shutemov 		struct page *zero_page = xchg(&huge_zero_page, NULL);
2605918d10aSKirill A. Shutemov 		BUG_ON(zero_page == NULL);
2613b77e8c8SHugh Dickins 		WRITE_ONCE(huge_zero_pfn, ~0UL);
2625ddacbe9SYu Zhao 		__free_pages(zero_page, compound_order(zero_page));
26348896466SGlauber Costa 		return HPAGE_PMD_NR;
26497ae1749SKirill A. Shutemov 	}
26597ae1749SKirill A. Shutemov 
26697ae1749SKirill A. Shutemov 	return 0;
26797ae1749SKirill A. Shutemov }
26897ae1749SKirill A. Shutemov 
26954d91729SQi Zheng static struct shrinker *huge_zero_page_shrinker;
27097ae1749SKirill A. Shutemov 
27171e3aac0SAndrea Arcangeli #ifdef CONFIG_SYSFS
27271e3aac0SAndrea Arcangeli static ssize_t enabled_show(struct kobject *kobj,
27371e3aac0SAndrea Arcangeli 			    struct kobj_attribute *attr, char *buf)
27471e3aac0SAndrea Arcangeli {
275bfb0ffebSJoe Perches 	const char *output;
276bfb0ffebSJoe Perches 
277444eb2a4SMel Gorman 	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
278bfb0ffebSJoe Perches 		output = "[always] madvise never";
279bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
280bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
281bfb0ffebSJoe Perches 		output = "always [madvise] never";
282444eb2a4SMel Gorman 	else
283bfb0ffebSJoe Perches 		output = "always madvise [never]";
284bfb0ffebSJoe Perches 
285bfb0ffebSJoe Perches 	return sysfs_emit(buf, "%s\n", output);
28671e3aac0SAndrea Arcangeli }
287444eb2a4SMel Gorman 
28871e3aac0SAndrea Arcangeli static ssize_t enabled_store(struct kobject *kobj,
28971e3aac0SAndrea Arcangeli 			     struct kobj_attribute *attr,
29071e3aac0SAndrea Arcangeli 			     const char *buf, size_t count)
29171e3aac0SAndrea Arcangeli {
29221440d7eSDavid Rientjes 	ssize_t ret = count;
293ba76149fSAndrea Arcangeli 
294f42f2552SDavid Rientjes 	if (sysfs_streq(buf, "always")) {
29521440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
29621440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
297f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "madvise")) {
29821440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
29921440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
300f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "never")) {
30121440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
30221440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
30321440d7eSDavid Rientjes 	} else
30421440d7eSDavid Rientjes 		ret = -EINVAL;
305ba76149fSAndrea Arcangeli 
306ba76149fSAndrea Arcangeli 	if (ret > 0) {
307b46e756fSKirill A. Shutemov 		int err = start_stop_khugepaged();
308ba76149fSAndrea Arcangeli 		if (err)
309ba76149fSAndrea Arcangeli 			ret = err;
310ba76149fSAndrea Arcangeli 	}
311ba76149fSAndrea Arcangeli 	return ret;
31271e3aac0SAndrea Arcangeli }
31337139bb0SMiaohe Lin 
31437139bb0SMiaohe Lin static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
31571e3aac0SAndrea Arcangeli 
316b46e756fSKirill A. Shutemov ssize_t single_hugepage_flag_show(struct kobject *kobj,
31771e3aac0SAndrea Arcangeli 				  struct kobj_attribute *attr, char *buf,
31871e3aac0SAndrea Arcangeli 				  enum transparent_hugepage_flag flag)
31971e3aac0SAndrea Arcangeli {
320bfb0ffebSJoe Perches 	return sysfs_emit(buf, "%d\n",
321e27e6151SBen Hutchings 			  !!test_bit(flag, &transparent_hugepage_flags));
32271e3aac0SAndrea Arcangeli }
323e27e6151SBen Hutchings 
324b46e756fSKirill A. Shutemov ssize_t single_hugepage_flag_store(struct kobject *kobj,
32571e3aac0SAndrea Arcangeli 				 struct kobj_attribute *attr,
32671e3aac0SAndrea Arcangeli 				 const char *buf, size_t count,
32771e3aac0SAndrea Arcangeli 				 enum transparent_hugepage_flag flag)
32871e3aac0SAndrea Arcangeli {
329e27e6151SBen Hutchings 	unsigned long value;
330e27e6151SBen Hutchings 	int ret;
331e27e6151SBen Hutchings 
332e27e6151SBen Hutchings 	ret = kstrtoul(buf, 10, &value);
333e27e6151SBen Hutchings 	if (ret < 0)
334e27e6151SBen Hutchings 		return ret;
335e27e6151SBen Hutchings 	if (value > 1)
33671e3aac0SAndrea Arcangeli 		return -EINVAL;
33771e3aac0SAndrea Arcangeli 
338e27e6151SBen Hutchings 	if (value)
339e27e6151SBen Hutchings 		set_bit(flag, &transparent_hugepage_flags);
340e27e6151SBen Hutchings 	else
341e27e6151SBen Hutchings 		clear_bit(flag, &transparent_hugepage_flags);
342e27e6151SBen Hutchings 
34371e3aac0SAndrea Arcangeli 	return count;
34471e3aac0SAndrea Arcangeli }
34571e3aac0SAndrea Arcangeli 
34671e3aac0SAndrea Arcangeli static ssize_t defrag_show(struct kobject *kobj,
34771e3aac0SAndrea Arcangeli 			   struct kobj_attribute *attr, char *buf)
34871e3aac0SAndrea Arcangeli {
349bfb0ffebSJoe Perches 	const char *output;
350bfb0ffebSJoe Perches 
351bfb0ffebSJoe Perches 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
352bfb0ffebSJoe Perches 		     &transparent_hugepage_flags))
353bfb0ffebSJoe Perches 		output = "[always] defer defer+madvise madvise never";
354bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
355bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
356bfb0ffebSJoe Perches 		output = "always [defer] defer+madvise madvise never";
357bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
358bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
359bfb0ffebSJoe Perches 		output = "always defer [defer+madvise] madvise never";
360bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
361bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
362bfb0ffebSJoe Perches 		output = "always defer defer+madvise [madvise] never";
363bfb0ffebSJoe Perches 	else
364bfb0ffebSJoe Perches 		output = "always defer defer+madvise madvise [never]";
365bfb0ffebSJoe Perches 
366bfb0ffebSJoe Perches 	return sysfs_emit(buf, "%s\n", output);
36771e3aac0SAndrea Arcangeli }
36821440d7eSDavid Rientjes 
36971e3aac0SAndrea Arcangeli static ssize_t defrag_store(struct kobject *kobj,
37071e3aac0SAndrea Arcangeli 			    struct kobj_attribute *attr,
37171e3aac0SAndrea Arcangeli 			    const char *buf, size_t count)
37271e3aac0SAndrea Arcangeli {
373f42f2552SDavid Rientjes 	if (sysfs_streq(buf, "always")) {
37421440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
37521440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
37621440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
37721440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
378f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "defer+madvise")) {
37921440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
38021440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
38121440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
38221440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
383f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "defer")) {
3844fad7fb6SDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
3854fad7fb6SDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
3864fad7fb6SDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
3874fad7fb6SDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
388f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "madvise")) {
38921440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
39021440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
39121440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
39221440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
393f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "never")) {
39421440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
39521440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
39621440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
39721440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
39821440d7eSDavid Rientjes 	} else
39921440d7eSDavid Rientjes 		return -EINVAL;
40021440d7eSDavid Rientjes 
40121440d7eSDavid Rientjes 	return count;
40271e3aac0SAndrea Arcangeli }
40337139bb0SMiaohe Lin static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
40471e3aac0SAndrea Arcangeli 
40579da5407SKirill A. Shutemov static ssize_t use_zero_page_show(struct kobject *kobj,
40679da5407SKirill A. Shutemov 				  struct kobj_attribute *attr, char *buf)
40779da5407SKirill A. Shutemov {
408b46e756fSKirill A. Shutemov 	return single_hugepage_flag_show(kobj, attr, buf,
40979da5407SKirill A. Shutemov 					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41079da5407SKirill A. Shutemov }
41179da5407SKirill A. Shutemov static ssize_t use_zero_page_store(struct kobject *kobj,
41279da5407SKirill A. Shutemov 		struct kobj_attribute *attr, const char *buf, size_t count)
41379da5407SKirill A. Shutemov {
414b46e756fSKirill A. Shutemov 	return single_hugepage_flag_store(kobj, attr, buf, count,
41579da5407SKirill A. Shutemov 				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41679da5407SKirill A. Shutemov }
41737139bb0SMiaohe Lin static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
41849920d28SHugh Dickins 
41949920d28SHugh Dickins static ssize_t hpage_pmd_size_show(struct kobject *kobj,
42049920d28SHugh Dickins 				   struct kobj_attribute *attr, char *buf)
42149920d28SHugh Dickins {
422ae7a927dSJoe Perches 	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
42349920d28SHugh Dickins }
42449920d28SHugh Dickins static struct kobj_attribute hpage_pmd_size_attr =
42549920d28SHugh Dickins 	__ATTR_RO(hpage_pmd_size);
42649920d28SHugh Dickins 
42771e3aac0SAndrea Arcangeli static struct attribute *hugepage_attr[] = {
42871e3aac0SAndrea Arcangeli 	&enabled_attr.attr,
42971e3aac0SAndrea Arcangeli 	&defrag_attr.attr,
43079da5407SKirill A. Shutemov 	&use_zero_page_attr.attr,
43149920d28SHugh Dickins 	&hpage_pmd_size_attr.attr,
432396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_SHMEM
4335a6e75f8SKirill A. Shutemov 	&shmem_enabled_attr.attr,
4345a6e75f8SKirill A. Shutemov #endif
43571e3aac0SAndrea Arcangeli 	NULL,
43671e3aac0SAndrea Arcangeli };
43771e3aac0SAndrea Arcangeli 
4388aa95a21SArvind Yadav static const struct attribute_group hugepage_attr_group = {
43971e3aac0SAndrea Arcangeli 	.attrs = hugepage_attr,
440ba76149fSAndrea Arcangeli };
441ba76149fSAndrea Arcangeli 
4423485b883SRyan Roberts static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
4433485b883SRyan Roberts static void thpsize_release(struct kobject *kobj);
4443485b883SRyan Roberts static DEFINE_SPINLOCK(huge_anon_orders_lock);
4453485b883SRyan Roberts static LIST_HEAD(thpsize_list);
4463485b883SRyan Roberts 
4473485b883SRyan Roberts struct thpsize {
4483485b883SRyan Roberts 	struct kobject kobj;
4493485b883SRyan Roberts 	struct list_head node;
4503485b883SRyan Roberts 	int order;
4513485b883SRyan Roberts };
4523485b883SRyan Roberts 
4533485b883SRyan Roberts #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
4543485b883SRyan Roberts 
4553485b883SRyan Roberts static ssize_t thpsize_enabled_show(struct kobject *kobj,
4563485b883SRyan Roberts 				    struct kobj_attribute *attr, char *buf)
4573485b883SRyan Roberts {
4583485b883SRyan Roberts 	int order = to_thpsize(kobj)->order;
4593485b883SRyan Roberts 	const char *output;
4603485b883SRyan Roberts 
4613485b883SRyan Roberts 	if (test_bit(order, &huge_anon_orders_always))
4623485b883SRyan Roberts 		output = "[always] inherit madvise never";
4633485b883SRyan Roberts 	else if (test_bit(order, &huge_anon_orders_inherit))
4643485b883SRyan Roberts 		output = "always [inherit] madvise never";
4653485b883SRyan Roberts 	else if (test_bit(order, &huge_anon_orders_madvise))
4663485b883SRyan Roberts 		output = "always inherit [madvise] never";
4673485b883SRyan Roberts 	else
4683485b883SRyan Roberts 		output = "always inherit madvise [never]";
4693485b883SRyan Roberts 
4703485b883SRyan Roberts 	return sysfs_emit(buf, "%s\n", output);
4713485b883SRyan Roberts }
4723485b883SRyan Roberts 
4733485b883SRyan Roberts static ssize_t thpsize_enabled_store(struct kobject *kobj,
4743485b883SRyan Roberts 				     struct kobj_attribute *attr,
4753485b883SRyan Roberts 				     const char *buf, size_t count)
4763485b883SRyan Roberts {
4773485b883SRyan Roberts 	int order = to_thpsize(kobj)->order;
4783485b883SRyan Roberts 	ssize_t ret = count;
4793485b883SRyan Roberts 
4803485b883SRyan Roberts 	if (sysfs_streq(buf, "always")) {
4813485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
4823485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_inherit);
4833485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_madvise);
4843485b883SRyan Roberts 		set_bit(order, &huge_anon_orders_always);
4853485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
4863485b883SRyan Roberts 	} else if (sysfs_streq(buf, "inherit")) {
4873485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
4883485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_always);
4893485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_madvise);
4903485b883SRyan Roberts 		set_bit(order, &huge_anon_orders_inherit);
4913485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
4923485b883SRyan Roberts 	} else if (sysfs_streq(buf, "madvise")) {
4933485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
4943485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_always);
4953485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_inherit);
4963485b883SRyan Roberts 		set_bit(order, &huge_anon_orders_madvise);
4973485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
4983485b883SRyan Roberts 	} else if (sysfs_streq(buf, "never")) {
4993485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
5003485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_always);
5013485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_inherit);
5023485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_madvise);
5033485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
5043485b883SRyan Roberts 	} else
5053485b883SRyan Roberts 		ret = -EINVAL;
5063485b883SRyan Roberts 
5073485b883SRyan Roberts 	return ret;
5083485b883SRyan Roberts }
5093485b883SRyan Roberts 
5103485b883SRyan Roberts static struct kobj_attribute thpsize_enabled_attr =
5113485b883SRyan Roberts 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
5123485b883SRyan Roberts 
5133485b883SRyan Roberts static struct attribute *thpsize_attrs[] = {
5143485b883SRyan Roberts 	&thpsize_enabled_attr.attr,
5153485b883SRyan Roberts 	NULL,
5163485b883SRyan Roberts };
5173485b883SRyan Roberts 
5183485b883SRyan Roberts static const struct attribute_group thpsize_attr_group = {
5193485b883SRyan Roberts 	.attrs = thpsize_attrs,
5203485b883SRyan Roberts };
5213485b883SRyan Roberts 
5223485b883SRyan Roberts static const struct kobj_type thpsize_ktype = {
5233485b883SRyan Roberts 	.release = &thpsize_release,
5243485b883SRyan Roberts 	.sysfs_ops = &kobj_sysfs_ops,
5253485b883SRyan Roberts };
5263485b883SRyan Roberts 
5273485b883SRyan Roberts static struct thpsize *thpsize_create(int order, struct kobject *parent)
5283485b883SRyan Roberts {
5293485b883SRyan Roberts 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
5303485b883SRyan Roberts 	struct thpsize *thpsize;
5313485b883SRyan Roberts 	int ret;
5323485b883SRyan Roberts 
5333485b883SRyan Roberts 	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
5343485b883SRyan Roberts 	if (!thpsize)
5353485b883SRyan Roberts 		return ERR_PTR(-ENOMEM);
5363485b883SRyan Roberts 
5373485b883SRyan Roberts 	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
5383485b883SRyan Roberts 				   "hugepages-%lukB", size);
5393485b883SRyan Roberts 	if (ret) {
5403485b883SRyan Roberts 		kfree(thpsize);
5413485b883SRyan Roberts 		return ERR_PTR(ret);
5423485b883SRyan Roberts 	}
5433485b883SRyan Roberts 
5443485b883SRyan Roberts 	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
5453485b883SRyan Roberts 	if (ret) {
5463485b883SRyan Roberts 		kobject_put(&thpsize->kobj);
5473485b883SRyan Roberts 		return ERR_PTR(ret);
5483485b883SRyan Roberts 	}
5493485b883SRyan Roberts 
5503485b883SRyan Roberts 	thpsize->order = order;
5513485b883SRyan Roberts 	return thpsize;
5523485b883SRyan Roberts }
5533485b883SRyan Roberts 
5543485b883SRyan Roberts static void thpsize_release(struct kobject *kobj)
5553485b883SRyan Roberts {
5563485b883SRyan Roberts 	kfree(to_thpsize(kobj));
5573485b883SRyan Roberts }
5583485b883SRyan Roberts 
559569e5590SShaohua Li static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
560569e5590SShaohua Li {
561569e5590SShaohua Li 	int err;
5623485b883SRyan Roberts 	struct thpsize *thpsize;
5633485b883SRyan Roberts 	unsigned long orders;
5643485b883SRyan Roberts 	int order;
5653485b883SRyan Roberts 
5663485b883SRyan Roberts 	/*
5673485b883SRyan Roberts 	 * Default to setting PMD-sized THP to inherit the global setting and
5683485b883SRyan Roberts 	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
5693485b883SRyan Roberts 	 * constant so we have to do this here.
5703485b883SRyan Roberts 	 */
5713485b883SRyan Roberts 	huge_anon_orders_inherit = BIT(PMD_ORDER);
572569e5590SShaohua Li 
573569e5590SShaohua Li 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
574569e5590SShaohua Li 	if (unlikely(!*hugepage_kobj)) {
575ae3a8c1cSAndrew Morton 		pr_err("failed to create transparent hugepage kobject\n");
576569e5590SShaohua Li 		return -ENOMEM;
577569e5590SShaohua Li 	}
578569e5590SShaohua Li 
579569e5590SShaohua Li 	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
580569e5590SShaohua Li 	if (err) {
581ae3a8c1cSAndrew Morton 		pr_err("failed to register transparent hugepage group\n");
582569e5590SShaohua Li 		goto delete_obj;
583569e5590SShaohua Li 	}
584569e5590SShaohua Li 
585569e5590SShaohua Li 	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
586569e5590SShaohua Li 	if (err) {
587ae3a8c1cSAndrew Morton 		pr_err("failed to register transparent hugepage group\n");
588569e5590SShaohua Li 		goto remove_hp_group;
589569e5590SShaohua Li 	}
590569e5590SShaohua Li 
5913485b883SRyan Roberts 	orders = THP_ORDERS_ALL_ANON;
5923485b883SRyan Roberts 	order = highest_order(orders);
5933485b883SRyan Roberts 	while (orders) {
5943485b883SRyan Roberts 		thpsize = thpsize_create(order, *hugepage_kobj);
5953485b883SRyan Roberts 		if (IS_ERR(thpsize)) {
5963485b883SRyan Roberts 			pr_err("failed to create thpsize for order %d\n", order);
5973485b883SRyan Roberts 			err = PTR_ERR(thpsize);
5983485b883SRyan Roberts 			goto remove_all;
5993485b883SRyan Roberts 		}
6003485b883SRyan Roberts 		list_add(&thpsize->node, &thpsize_list);
6013485b883SRyan Roberts 		order = next_order(&orders, order);
6023485b883SRyan Roberts 	}
6033485b883SRyan Roberts 
604569e5590SShaohua Li 	return 0;
605569e5590SShaohua Li 
6063485b883SRyan Roberts remove_all:
6073485b883SRyan Roberts 	hugepage_exit_sysfs(*hugepage_kobj);
6083485b883SRyan Roberts 	return err;
609569e5590SShaohua Li remove_hp_group:
610569e5590SShaohua Li 	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
611569e5590SShaohua Li delete_obj:
612569e5590SShaohua Li 	kobject_put(*hugepage_kobj);
613569e5590SShaohua Li 	return err;
614569e5590SShaohua Li }
615569e5590SShaohua Li 
616569e5590SShaohua Li static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
617569e5590SShaohua Li {
6183485b883SRyan Roberts 	struct thpsize *thpsize, *tmp;
6193485b883SRyan Roberts 
6203485b883SRyan Roberts 	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
6213485b883SRyan Roberts 		list_del(&thpsize->node);
6223485b883SRyan Roberts 		kobject_put(&thpsize->kobj);
6233485b883SRyan Roberts 	}
6243485b883SRyan Roberts 
625569e5590SShaohua Li 	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
626569e5590SShaohua Li 	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
627569e5590SShaohua Li 	kobject_put(hugepage_kobj);
628569e5590SShaohua Li }
629569e5590SShaohua Li #else
630569e5590SShaohua Li static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
631569e5590SShaohua Li {
632569e5590SShaohua Li 	return 0;
633569e5590SShaohua Li }
634569e5590SShaohua Li 
635569e5590SShaohua Li static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
636569e5590SShaohua Li {
637569e5590SShaohua Li }
63871e3aac0SAndrea Arcangeli #endif /* CONFIG_SYSFS */
63971e3aac0SAndrea Arcangeli 
64054d91729SQi Zheng static int __init thp_shrinker_init(void)
64154d91729SQi Zheng {
64254d91729SQi Zheng 	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
64354d91729SQi Zheng 	if (!huge_zero_page_shrinker)
64454d91729SQi Zheng 		return -ENOMEM;
64554d91729SQi Zheng 
64654d91729SQi Zheng 	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
64754d91729SQi Zheng 						 SHRINKER_MEMCG_AWARE |
64854d91729SQi Zheng 						 SHRINKER_NONSLAB,
64954d91729SQi Zheng 						 "thp-deferred_split");
65054d91729SQi Zheng 	if (!deferred_split_shrinker) {
65154d91729SQi Zheng 		shrinker_free(huge_zero_page_shrinker);
65254d91729SQi Zheng 		return -ENOMEM;
65354d91729SQi Zheng 	}
65454d91729SQi Zheng 
65554d91729SQi Zheng 	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
65654d91729SQi Zheng 	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
65754d91729SQi Zheng 	shrinker_register(huge_zero_page_shrinker);
65854d91729SQi Zheng 
65954d91729SQi Zheng 	deferred_split_shrinker->count_objects = deferred_split_count;
66054d91729SQi Zheng 	deferred_split_shrinker->scan_objects = deferred_split_scan;
66154d91729SQi Zheng 	shrinker_register(deferred_split_shrinker);
66254d91729SQi Zheng 
66354d91729SQi Zheng 	return 0;
66454d91729SQi Zheng }
66554d91729SQi Zheng 
66654d91729SQi Zheng static void __init thp_shrinker_exit(void)
66754d91729SQi Zheng {
66854d91729SQi Zheng 	shrinker_free(huge_zero_page_shrinker);
66954d91729SQi Zheng 	shrinker_free(deferred_split_shrinker);
67054d91729SQi Zheng }
67154d91729SQi Zheng 
67271e3aac0SAndrea Arcangeli static int __init hugepage_init(void)
67371e3aac0SAndrea Arcangeli {
67471e3aac0SAndrea Arcangeli 	int err;
675569e5590SShaohua Li 	struct kobject *hugepage_kobj;
67671e3aac0SAndrea Arcangeli 
6774b7167b9SAndrea Arcangeli 	if (!has_transparent_hugepage()) {
6783c556d24SPeter Xu 		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
679569e5590SShaohua Li 		return -EINVAL;
6804b7167b9SAndrea Arcangeli 	}
6814b7167b9SAndrea Arcangeli 
682ff20c2e0SKirill A. Shutemov 	/*
683ff20c2e0SKirill A. Shutemov 	 * hugepages can't be allocated by the buddy allocator
684ff20c2e0SKirill A. Shutemov 	 */
68523baf831SKirill A. Shutemov 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
686ff20c2e0SKirill A. Shutemov 	/*
687ff20c2e0SKirill A. Shutemov 	 * we use page->mapping and page->index in second tail page
688ff20c2e0SKirill A. Shutemov 	 * as list_head: assuming THP order >= 2
689ff20c2e0SKirill A. Shutemov 	 */
690ff20c2e0SKirill A. Shutemov 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
691ff20c2e0SKirill A. Shutemov 
692569e5590SShaohua Li 	err = hugepage_init_sysfs(&hugepage_kobj);
693569e5590SShaohua Li 	if (err)
69465ebb64fSKirill A. Shutemov 		goto err_sysfs;
695ba76149fSAndrea Arcangeli 
696b46e756fSKirill A. Shutemov 	err = khugepaged_init();
697ba76149fSAndrea Arcangeli 	if (err)
69865ebb64fSKirill A. Shutemov 		goto err_slab;
699ba76149fSAndrea Arcangeli 
70054d91729SQi Zheng 	err = thp_shrinker_init();
70165ebb64fSKirill A. Shutemov 	if (err)
70254d91729SQi Zheng 		goto err_shrinker;
70397ae1749SKirill A. Shutemov 
70497562cd2SRik van Riel 	/*
70597562cd2SRik van Riel 	 * By default disable transparent hugepages on smaller systems,
70697562cd2SRik van Riel 	 * where the extra memory used could hurt more than TLB overhead
70797562cd2SRik van Riel 	 * is likely to save.  The admin can still enable it through /sys.
70897562cd2SRik van Riel 	 */
709ca79b0c2SArun KS 	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
71097562cd2SRik van Riel 		transparent_hugepage_flags = 0;
71179553da2SKirill A. Shutemov 		return 0;
71279553da2SKirill A. Shutemov 	}
71397562cd2SRik van Riel 
71479553da2SKirill A. Shutemov 	err = start_stop_khugepaged();
71565ebb64fSKirill A. Shutemov 	if (err)
71665ebb64fSKirill A. Shutemov 		goto err_khugepaged;
717ba76149fSAndrea Arcangeli 
718569e5590SShaohua Li 	return 0;
71965ebb64fSKirill A. Shutemov err_khugepaged:
72054d91729SQi Zheng 	thp_shrinker_exit();
72154d91729SQi Zheng err_shrinker:
722b46e756fSKirill A. Shutemov 	khugepaged_destroy();
72365ebb64fSKirill A. Shutemov err_slab:
724569e5590SShaohua Li 	hugepage_exit_sysfs(hugepage_kobj);
72565ebb64fSKirill A. Shutemov err_sysfs:
726ba76149fSAndrea Arcangeli 	return err;
72771e3aac0SAndrea Arcangeli }
728a64fb3cdSPaul Gortmaker subsys_initcall(hugepage_init);
72971e3aac0SAndrea Arcangeli 
73071e3aac0SAndrea Arcangeli static int __init setup_transparent_hugepage(char *str)
73171e3aac0SAndrea Arcangeli {
73271e3aac0SAndrea Arcangeli 	int ret = 0;
73371e3aac0SAndrea Arcangeli 	if (!str)
73471e3aac0SAndrea Arcangeli 		goto out;
73571e3aac0SAndrea Arcangeli 	if (!strcmp(str, "always")) {
73671e3aac0SAndrea Arcangeli 		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
73771e3aac0SAndrea Arcangeli 			&transparent_hugepage_flags);
73871e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
73971e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
74071e3aac0SAndrea Arcangeli 		ret = 1;
74171e3aac0SAndrea Arcangeli 	} else if (!strcmp(str, "madvise")) {
74271e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
74371e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
74471e3aac0SAndrea Arcangeli 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
74571e3aac0SAndrea Arcangeli 			&transparent_hugepage_flags);
74671e3aac0SAndrea Arcangeli 		ret = 1;
74771e3aac0SAndrea Arcangeli 	} else if (!strcmp(str, "never")) {
74871e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
74971e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
75071e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
75171e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
75271e3aac0SAndrea Arcangeli 		ret = 1;
75371e3aac0SAndrea Arcangeli 	}
75471e3aac0SAndrea Arcangeli out:
75571e3aac0SAndrea Arcangeli 	if (!ret)
756ae3a8c1cSAndrew Morton 		pr_warn("transparent_hugepage= cannot parse, ignored\n");
75771e3aac0SAndrea Arcangeli 	return ret;
75871e3aac0SAndrea Arcangeli }
75971e3aac0SAndrea Arcangeli __setup("transparent_hugepage=", setup_transparent_hugepage);
76071e3aac0SAndrea Arcangeli 
761f55e1014SLinus Torvalds pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
76271e3aac0SAndrea Arcangeli {
763f55e1014SLinus Torvalds 	if (likely(vma->vm_flags & VM_WRITE))
764161e393cSRick Edgecombe 		pmd = pmd_mkwrite(pmd, vma);
76571e3aac0SAndrea Arcangeli 	return pmd;
76671e3aac0SAndrea Arcangeli }
76771e3aac0SAndrea Arcangeli 
76887eaceb3SYang Shi #ifdef CONFIG_MEMCG
769f8baa6beSMatthew Wilcox (Oracle) static inline
770f8baa6beSMatthew Wilcox (Oracle) struct deferred_split *get_deferred_split_queue(struct folio *folio)
7719a982250SKirill A. Shutemov {
772f8baa6beSMatthew Wilcox (Oracle) 	struct mem_cgroup *memcg = folio_memcg(folio);
773f8baa6beSMatthew Wilcox (Oracle) 	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
77487eaceb3SYang Shi 
77587eaceb3SYang Shi 	if (memcg)
77687eaceb3SYang Shi 		return &memcg->deferred_split_queue;
77787eaceb3SYang Shi 	else
77887eaceb3SYang Shi 		return &pgdat->deferred_split_queue;
7799a982250SKirill A. Shutemov }
78087eaceb3SYang Shi #else
781f8baa6beSMatthew Wilcox (Oracle) static inline
782f8baa6beSMatthew Wilcox (Oracle) struct deferred_split *get_deferred_split_queue(struct folio *folio)
78387eaceb3SYang Shi {
784f8baa6beSMatthew Wilcox (Oracle) 	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
78587eaceb3SYang Shi 
78687eaceb3SYang Shi 	return &pgdat->deferred_split_queue;
78787eaceb3SYang Shi }
78887eaceb3SYang Shi #endif
7899a982250SKirill A. Shutemov 
790da6e7bf3SMatthew Wilcox (Oracle) void folio_prep_large_rmappable(struct folio *folio)
7919a982250SKirill A. Shutemov {
7928991de90SMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
7938991de90SMatthew Wilcox (Oracle) 	INIT_LIST_HEAD(&folio->_deferred_list);
794de53c05fSMatthew Wilcox (Oracle) 	folio_set_large_rmappable(folio);
7959a982250SKirill A. Shutemov }
7969a982250SKirill A. Shutemov 
797a644b0abSMatthew Wilcox (Oracle) static inline bool is_transparent_hugepage(struct folio *folio)
798005ba37cSSean Christopherson {
799a644b0abSMatthew Wilcox (Oracle) 	if (!folio_test_large(folio))
800fa1f68ccSZou Wei 		return false;
801005ba37cSSean Christopherson 
802f04029f3SMatthew Wilcox (Oracle) 	return is_huge_zero_page(&folio->page) ||
803de53c05fSMatthew Wilcox (Oracle) 		folio_test_large_rmappable(folio);
804005ba37cSSean Christopherson }
805005ba37cSSean Christopherson 
80697d3d0f9SKirill A. Shutemov static unsigned long __thp_get_unmapped_area(struct file *filp,
80797d3d0f9SKirill A. Shutemov 		unsigned long addr, unsigned long len,
80874d2fad1SToshi Kani 		loff_t off, unsigned long flags, unsigned long size)
80974d2fad1SToshi Kani {
81074d2fad1SToshi Kani 	loff_t off_end = off + len;
81174d2fad1SToshi Kani 	loff_t off_align = round_up(off, size);
81297d3d0f9SKirill A. Shutemov 	unsigned long len_pad, ret;
81374d2fad1SToshi Kani 
81474d2fad1SToshi Kani 	if (off_end <= off_align || (off_end - off_align) < size)
81574d2fad1SToshi Kani 		return 0;
81674d2fad1SToshi Kani 
81774d2fad1SToshi Kani 	len_pad = len + size;
81874d2fad1SToshi Kani 	if (len_pad < len || (off + len_pad) < off)
81974d2fad1SToshi Kani 		return 0;
82074d2fad1SToshi Kani 
82197d3d0f9SKirill A. Shutemov 	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
82274d2fad1SToshi Kani 					      off >> PAGE_SHIFT, flags);
82397d3d0f9SKirill A. Shutemov 
82497d3d0f9SKirill A. Shutemov 	/*
82597d3d0f9SKirill A. Shutemov 	 * The failure might be due to length padding. The caller will retry
82697d3d0f9SKirill A. Shutemov 	 * without the padding.
82797d3d0f9SKirill A. Shutemov 	 */
82897d3d0f9SKirill A. Shutemov 	if (IS_ERR_VALUE(ret))
82974d2fad1SToshi Kani 		return 0;
83074d2fad1SToshi Kani 
83197d3d0f9SKirill A. Shutemov 	/*
83297d3d0f9SKirill A. Shutemov 	 * Do not try to align to THP boundary if allocation at the address
83397d3d0f9SKirill A. Shutemov 	 * hint succeeds.
83497d3d0f9SKirill A. Shutemov 	 */
83597d3d0f9SKirill A. Shutemov 	if (ret == addr)
83674d2fad1SToshi Kani 		return addr;
83797d3d0f9SKirill A. Shutemov 
83897d3d0f9SKirill A. Shutemov 	ret += (off - ret) & (size - 1);
83997d3d0f9SKirill A. Shutemov 	return ret;
84074d2fad1SToshi Kani }
84174d2fad1SToshi Kani 
84274d2fad1SToshi Kani unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
84374d2fad1SToshi Kani 		unsigned long len, unsigned long pgoff, unsigned long flags)
84474d2fad1SToshi Kani {
84597d3d0f9SKirill A. Shutemov 	unsigned long ret;
84674d2fad1SToshi Kani 	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
84774d2fad1SToshi Kani 
84897d3d0f9SKirill A. Shutemov 	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
84997d3d0f9SKirill A. Shutemov 	if (ret)
85097d3d0f9SKirill A. Shutemov 		return ret;
8511854bc6eSWilliam Kucharski 
85274d2fad1SToshi Kani 	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
85374d2fad1SToshi Kani }
85474d2fad1SToshi Kani EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
85574d2fad1SToshi Kani 
8562b740303SSouptick Joarder static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
8572b740303SSouptick Joarder 			struct page *page, gfp_t gfp)
85871e3aac0SAndrea Arcangeli {
85982b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
860cfe3236dSKefeng Wang 	struct folio *folio = page_folio(page);
86171e3aac0SAndrea Arcangeli 	pgtable_t pgtable;
86282b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
8632b740303SSouptick Joarder 	vm_fault_t ret = 0;
86471e3aac0SAndrea Arcangeli 
865cfe3236dSKefeng Wang 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
86600501b53SJohannes Weiner 
867cfe3236dSKefeng Wang 	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
868cfe3236dSKefeng Wang 		folio_put(folio);
8696b251fc9SAndrea Arcangeli 		count_vm_event(THP_FAULT_FALLBACK);
87085b9f46eSDavid Rientjes 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
8716b251fc9SAndrea Arcangeli 		return VM_FAULT_FALLBACK;
8726b251fc9SAndrea Arcangeli 	}
873cfe3236dSKefeng Wang 	folio_throttle_swaprate(folio, gfp);
87471e3aac0SAndrea Arcangeli 
8754cf58924SJoel Fernandes (Google) 	pgtable = pte_alloc_one(vma->vm_mm);
87600501b53SJohannes Weiner 	if (unlikely(!pgtable)) {
8776b31d595SMichal Hocko 		ret = VM_FAULT_OOM;
8786b31d595SMichal Hocko 		goto release;
87900501b53SJohannes Weiner 	}
88000501b53SJohannes Weiner 
881c79b57e4SHuang Ying 	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
88252f37629SMinchan Kim 	/*
883cfe3236dSKefeng Wang 	 * The memory barrier inside __folio_mark_uptodate makes sure that
88452f37629SMinchan Kim 	 * clear_huge_page writes become visible before the set_pmd_at()
88552f37629SMinchan Kim 	 * write.
88652f37629SMinchan Kim 	 */
887cfe3236dSKefeng Wang 	__folio_mark_uptodate(folio);
88871e3aac0SAndrea Arcangeli 
88982b0f8c3SJan Kara 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
89082b0f8c3SJan Kara 	if (unlikely(!pmd_none(*vmf->pmd))) {
8916b31d595SMichal Hocko 		goto unlock_release;
89271e3aac0SAndrea Arcangeli 	} else {
89371e3aac0SAndrea Arcangeli 		pmd_t entry;
8946b251fc9SAndrea Arcangeli 
8956b31d595SMichal Hocko 		ret = check_stable_address_space(vma->vm_mm);
8966b31d595SMichal Hocko 		if (ret)
8976b31d595SMichal Hocko 			goto unlock_release;
8986b31d595SMichal Hocko 
8996b251fc9SAndrea Arcangeli 		/* Deliver the page fault to userland */
9006b251fc9SAndrea Arcangeli 		if (userfaultfd_missing(vma)) {
90182b0f8c3SJan Kara 			spin_unlock(vmf->ptl);
902cfe3236dSKefeng Wang 			folio_put(folio);
903bae473a4SKirill A. Shutemov 			pte_free(vma->vm_mm, pgtable);
9048fd5eda4SMiaohe Lin 			ret = handle_userfault(vmf, VM_UFFD_MISSING);
9058fd5eda4SMiaohe Lin 			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
9068fd5eda4SMiaohe Lin 			return ret;
9076b251fc9SAndrea Arcangeli 		}
9086b251fc9SAndrea Arcangeli 
9093122359aSKirill A. Shutemov 		entry = mk_huge_pmd(page, vma->vm_page_prot);
910f55e1014SLinus Torvalds 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
911cfe3236dSKefeng Wang 		folio_add_new_anon_rmap(folio, vma, haddr);
912cfe3236dSKefeng Wang 		folio_add_lru_vma(folio, vma);
91382b0f8c3SJan Kara 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
91482b0f8c3SJan Kara 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
915fca40573SBibo Mao 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
916bae473a4SKirill A. Shutemov 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
917c4812909SKirill A. Shutemov 		mm_inc_nr_ptes(vma->vm_mm);
91882b0f8c3SJan Kara 		spin_unlock(vmf->ptl);
9196b251fc9SAndrea Arcangeli 		count_vm_event(THP_FAULT_ALLOC);
9209d82c694SJohannes Weiner 		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
92171e3aac0SAndrea Arcangeli 	}
92271e3aac0SAndrea Arcangeli 
923aa2e878eSDavid Rientjes 	return 0;
9246b31d595SMichal Hocko unlock_release:
9256b31d595SMichal Hocko 	spin_unlock(vmf->ptl);
9266b31d595SMichal Hocko release:
9276b31d595SMichal Hocko 	if (pgtable)
9286b31d595SMichal Hocko 		pte_free(vma->vm_mm, pgtable);
929cfe3236dSKefeng Wang 	folio_put(folio);
9306b31d595SMichal Hocko 	return ret;
9316b31d595SMichal Hocko 
93271e3aac0SAndrea Arcangeli }
93371e3aac0SAndrea Arcangeli 
934444eb2a4SMel Gorman /*
93521440d7eSDavid Rientjes  * always: directly stall for all thp allocations
93621440d7eSDavid Rientjes  * defer: wake kswapd and fail if not immediately available
93721440d7eSDavid Rientjes  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
93821440d7eSDavid Rientjes  *		  fail if not immediately available
93921440d7eSDavid Rientjes  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
94021440d7eSDavid Rientjes  *	    available
94121440d7eSDavid Rientjes  * never: never stall for any thp allocation
942444eb2a4SMel Gorman  */
943164cc4feSRik van Riel gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
9440bbbc0b3SAndrea Arcangeli {
945164cc4feSRik van Riel 	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
94689c83fb5SMichal Hocko 
947ac79f78dSDavid Rientjes 	/* Always do synchronous compaction */
94821440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
949a8282608SAndrea Arcangeli 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
950ac79f78dSDavid Rientjes 
951ac79f78dSDavid Rientjes 	/* Kick kcompactd and fail quickly */
95221440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
95319deb769SDavid Rientjes 		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
954ac79f78dSDavid Rientjes 
955ac79f78dSDavid Rientjes 	/* Synchronous compaction if madvised, otherwise kick kcompactd */
95621440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
95719deb769SDavid Rientjes 		return GFP_TRANSHUGE_LIGHT |
95819deb769SDavid Rientjes 			(vma_madvised ? __GFP_DIRECT_RECLAIM :
959ac79f78dSDavid Rientjes 					__GFP_KSWAPD_RECLAIM);
960ac79f78dSDavid Rientjes 
961ac79f78dSDavid Rientjes 	/* Only do synchronous compaction if madvised */
96221440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
96319deb769SDavid Rientjes 		return GFP_TRANSHUGE_LIGHT |
96419deb769SDavid Rientjes 		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
965ac79f78dSDavid Rientjes 
96619deb769SDavid Rientjes 	return GFP_TRANSHUGE_LIGHT;
967444eb2a4SMel Gorman }
968444eb2a4SMel Gorman 
969c4088ebdSKirill A. Shutemov /* Caller must hold page table lock. */
9702efeb8daSMiaohe Lin static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
97197ae1749SKirill A. Shutemov 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
9725918d10aSKirill A. Shutemov 		struct page *zero_page)
973fc9fe822SKirill A. Shutemov {
974fc9fe822SKirill A. Shutemov 	pmd_t entry;
9757c414164SAndrew Morton 	if (!pmd_none(*pmd))
9762efeb8daSMiaohe Lin 		return;
9775918d10aSKirill A. Shutemov 	entry = mk_pmd(zero_page, vma->vm_page_prot);
978fc9fe822SKirill A. Shutemov 	entry = pmd_mkhuge(entry);
9796b0b50b0SAneesh Kumar K.V 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
980fc9fe822SKirill A. Shutemov 	set_pmd_at(mm, haddr, pmd, entry);
981c4812909SKirill A. Shutemov 	mm_inc_nr_ptes(mm);
982fc9fe822SKirill A. Shutemov }
983fc9fe822SKirill A. Shutemov 
9842b740303SSouptick Joarder vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
98571e3aac0SAndrea Arcangeli {
98682b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
987077fcf11SAneesh Kumar K.V 	gfp_t gfp;
988cb196ee1SMatthew Wilcox (Oracle) 	struct folio *folio;
98982b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
99071e3aac0SAndrea Arcangeli 
9913485b883SRyan Roberts 	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
992c0292554SKirill A. Shutemov 		return VM_FAULT_FALLBACK;
99371e3aac0SAndrea Arcangeli 	if (unlikely(anon_vma_prepare(vma)))
99471e3aac0SAndrea Arcangeli 		return VM_FAULT_OOM;
9954fa6893fSYang Shi 	khugepaged_enter_vma(vma, vma->vm_flags);
996d2081b2bSYang Shi 
99782b0f8c3SJan Kara 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
998bae473a4SKirill A. Shutemov 			!mm_forbids_zeropage(vma->vm_mm) &&
99979da5407SKirill A. Shutemov 			transparent_hugepage_use_zero_page()) {
100080371957SKirill A. Shutemov 		pgtable_t pgtable;
10015918d10aSKirill A. Shutemov 		struct page *zero_page;
10022b740303SSouptick Joarder 		vm_fault_t ret;
10034cf58924SJoel Fernandes (Google) 		pgtable = pte_alloc_one(vma->vm_mm);
100480371957SKirill A. Shutemov 		if (unlikely(!pgtable))
100580371957SKirill A. Shutemov 			return VM_FAULT_OOM;
10066fcb52a5SAaron Lu 		zero_page = mm_get_huge_zero_page(vma->vm_mm);
10075918d10aSKirill A. Shutemov 		if (unlikely(!zero_page)) {
1008bae473a4SKirill A. Shutemov 			pte_free(vma->vm_mm, pgtable);
100997ae1749SKirill A. Shutemov 			count_vm_event(THP_FAULT_FALLBACK);
1010c0292554SKirill A. Shutemov 			return VM_FAULT_FALLBACK;
101197ae1749SKirill A. Shutemov 		}
101282b0f8c3SJan Kara 		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
10136b251fc9SAndrea Arcangeli 		ret = 0;
101482b0f8c3SJan Kara 		if (pmd_none(*vmf->pmd)) {
10156b31d595SMichal Hocko 			ret = check_stable_address_space(vma->vm_mm);
10166b31d595SMichal Hocko 			if (ret) {
10176b31d595SMichal Hocko 				spin_unlock(vmf->ptl);
1018bfe8cc1dSGerald Schaefer 				pte_free(vma->vm_mm, pgtable);
10196b31d595SMichal Hocko 			} else if (userfaultfd_missing(vma)) {
102082b0f8c3SJan Kara 				spin_unlock(vmf->ptl);
1021bfe8cc1dSGerald Schaefer 				pte_free(vma->vm_mm, pgtable);
102282b0f8c3SJan Kara 				ret = handle_userfault(vmf, VM_UFFD_MISSING);
10236b251fc9SAndrea Arcangeli 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
10246b251fc9SAndrea Arcangeli 			} else {
1025bae473a4SKirill A. Shutemov 				set_huge_zero_page(pgtable, vma->vm_mm, vma,
102682b0f8c3SJan Kara 						   haddr, vmf->pmd, zero_page);
1027fca40573SBibo Mao 				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
102882b0f8c3SJan Kara 				spin_unlock(vmf->ptl);
10296b251fc9SAndrea Arcangeli 			}
1030bfe8cc1dSGerald Schaefer 		} else {
103182b0f8c3SJan Kara 			spin_unlock(vmf->ptl);
1032bae473a4SKirill A. Shutemov 			pte_free(vma->vm_mm, pgtable);
1033bfe8cc1dSGerald Schaefer 		}
10346b251fc9SAndrea Arcangeli 		return ret;
103580371957SKirill A. Shutemov 	}
1036164cc4feSRik van Riel 	gfp = vma_thp_gfp_mask(vma);
1037cb196ee1SMatthew Wilcox (Oracle) 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
1038cb196ee1SMatthew Wilcox (Oracle) 	if (unlikely(!folio)) {
103981ab4201SAndi Kleen 		count_vm_event(THP_FAULT_FALLBACK);
1040c0292554SKirill A. Shutemov 		return VM_FAULT_FALLBACK;
104181ab4201SAndi Kleen 	}
1042cb196ee1SMatthew Wilcox (Oracle) 	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
104371e3aac0SAndrea Arcangeli }
104471e3aac0SAndrea Arcangeli 
1045ae18d6dcSMatthew Wilcox static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
10463b6521f5SOliver O'Halloran 		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
10473b6521f5SOliver O'Halloran 		pgtable_t pgtable)
10485cad465dSMatthew Wilcox {
10495cad465dSMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
10505cad465dSMatthew Wilcox 	pmd_t entry;
10515cad465dSMatthew Wilcox 	spinlock_t *ptl;
10525cad465dSMatthew Wilcox 
10535cad465dSMatthew Wilcox 	ptl = pmd_lock(mm, pmd);
1054c6f3c5eeSAneesh Kumar K.V 	if (!pmd_none(*pmd)) {
1055c6f3c5eeSAneesh Kumar K.V 		if (write) {
1056c6f3c5eeSAneesh Kumar K.V 			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
1057c6f3c5eeSAneesh Kumar K.V 				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1058c6f3c5eeSAneesh Kumar K.V 				goto out_unlock;
1059c6f3c5eeSAneesh Kumar K.V 			}
1060c6f3c5eeSAneesh Kumar K.V 			entry = pmd_mkyoung(*pmd);
1061c6f3c5eeSAneesh Kumar K.V 			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1062c6f3c5eeSAneesh Kumar K.V 			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1063c6f3c5eeSAneesh Kumar K.V 				update_mmu_cache_pmd(vma, addr, pmd);
1064c6f3c5eeSAneesh Kumar K.V 		}
1065c6f3c5eeSAneesh Kumar K.V 
1066c6f3c5eeSAneesh Kumar K.V 		goto out_unlock;
1067c6f3c5eeSAneesh Kumar K.V 	}
1068c6f3c5eeSAneesh Kumar K.V 
1069f25748e3SDan Williams 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1070f25748e3SDan Williams 	if (pfn_t_devmap(pfn))
1071f25748e3SDan Williams 		entry = pmd_mkdevmap(entry);
10725cad465dSMatthew Wilcox 	if (write) {
1073f55e1014SLinus Torvalds 		entry = pmd_mkyoung(pmd_mkdirty(entry));
1074f55e1014SLinus Torvalds 		entry = maybe_pmd_mkwrite(entry, vma);
10755cad465dSMatthew Wilcox 	}
10763b6521f5SOliver O'Halloran 
10773b6521f5SOliver O'Halloran 	if (pgtable) {
10783b6521f5SOliver O'Halloran 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
1079c4812909SKirill A. Shutemov 		mm_inc_nr_ptes(mm);
1080c6f3c5eeSAneesh Kumar K.V 		pgtable = NULL;
10813b6521f5SOliver O'Halloran 	}
10823b6521f5SOliver O'Halloran 
10835cad465dSMatthew Wilcox 	set_pmd_at(mm, addr, pmd, entry);
10845cad465dSMatthew Wilcox 	update_mmu_cache_pmd(vma, addr, pmd);
1085c6f3c5eeSAneesh Kumar K.V 
1086c6f3c5eeSAneesh Kumar K.V out_unlock:
10875cad465dSMatthew Wilcox 	spin_unlock(ptl);
1088c6f3c5eeSAneesh Kumar K.V 	if (pgtable)
1089c6f3c5eeSAneesh Kumar K.V 		pte_free(mm, pgtable);
10905cad465dSMatthew Wilcox }
10915cad465dSMatthew Wilcox 
10929a9731b1SThomas Hellstrom (VMware) /**
10937b806d22SLorenzo Stoakes  * vmf_insert_pfn_pmd - insert a pmd size pfn
10949a9731b1SThomas Hellstrom (VMware)  * @vmf: Structure describing the fault
10959a9731b1SThomas Hellstrom (VMware)  * @pfn: pfn to insert
10969a9731b1SThomas Hellstrom (VMware)  * @write: whether it's a write fault
10979a9731b1SThomas Hellstrom (VMware)  *
10987b806d22SLorenzo Stoakes  * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
10999a9731b1SThomas Hellstrom (VMware)  *
11009a9731b1SThomas Hellstrom (VMware)  * Return: vm_fault_t value.
11019a9731b1SThomas Hellstrom (VMware)  */
11027b806d22SLorenzo Stoakes vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
11035cad465dSMatthew Wilcox {
1104fce86ff5SDan Williams 	unsigned long addr = vmf->address & PMD_MASK;
1105fce86ff5SDan Williams 	struct vm_area_struct *vma = vmf->vma;
11067b806d22SLorenzo Stoakes 	pgprot_t pgprot = vma->vm_page_prot;
11073b6521f5SOliver O'Halloran 	pgtable_t pgtable = NULL;
1108fce86ff5SDan Williams 
11095cad465dSMatthew Wilcox 	/*
11105cad465dSMatthew Wilcox 	 * If we had pmd_special, we could avoid all these restrictions,
11115cad465dSMatthew Wilcox 	 * but we need to be consistent with PTEs and architectures that
11125cad465dSMatthew Wilcox 	 * can't support a 'special' bit.
11135cad465dSMatthew Wilcox 	 */
1114e1fb4a08SDave Jiang 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1115e1fb4a08SDave Jiang 			!pfn_t_devmap(pfn));
11165cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
11175cad465dSMatthew Wilcox 						(VM_PFNMAP|VM_MIXEDMAP));
11185cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
11195cad465dSMatthew Wilcox 
11205cad465dSMatthew Wilcox 	if (addr < vma->vm_start || addr >= vma->vm_end)
11215cad465dSMatthew Wilcox 		return VM_FAULT_SIGBUS;
1122308a047cSBorislav Petkov 
11233b6521f5SOliver O'Halloran 	if (arch_needs_pgtable_deposit()) {
11244cf58924SJoel Fernandes (Google) 		pgtable = pte_alloc_one(vma->vm_mm);
11253b6521f5SOliver O'Halloran 		if (!pgtable)
11263b6521f5SOliver O'Halloran 			return VM_FAULT_OOM;
11273b6521f5SOliver O'Halloran 	}
11283b6521f5SOliver O'Halloran 
1129308a047cSBorislav Petkov 	track_pfn_insert(vma, &pgprot, pfn);
1130308a047cSBorislav Petkov 
1131fce86ff5SDan Williams 	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
1132ae18d6dcSMatthew Wilcox 	return VM_FAULT_NOPAGE;
11335cad465dSMatthew Wilcox }
11347b806d22SLorenzo Stoakes EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
11355cad465dSMatthew Wilcox 
1136a00cc7d9SMatthew Wilcox #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1137f55e1014SLinus Torvalds static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1138a00cc7d9SMatthew Wilcox {
1139f55e1014SLinus Torvalds 	if (likely(vma->vm_flags & VM_WRITE))
1140a00cc7d9SMatthew Wilcox 		pud = pud_mkwrite(pud);
1141a00cc7d9SMatthew Wilcox 	return pud;
1142a00cc7d9SMatthew Wilcox }
1143a00cc7d9SMatthew Wilcox 
1144a00cc7d9SMatthew Wilcox static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
11457b806d22SLorenzo Stoakes 		pud_t *pud, pfn_t pfn, bool write)
1146a00cc7d9SMatthew Wilcox {
1147a00cc7d9SMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
11487b806d22SLorenzo Stoakes 	pgprot_t prot = vma->vm_page_prot;
1149a00cc7d9SMatthew Wilcox 	pud_t entry;
1150a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
1151a00cc7d9SMatthew Wilcox 
1152a00cc7d9SMatthew Wilcox 	ptl = pud_lock(mm, pud);
1153c6f3c5eeSAneesh Kumar K.V 	if (!pud_none(*pud)) {
1154c6f3c5eeSAneesh Kumar K.V 		if (write) {
1155c6f3c5eeSAneesh Kumar K.V 			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
1156c6f3c5eeSAneesh Kumar K.V 				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
1157c6f3c5eeSAneesh Kumar K.V 				goto out_unlock;
1158c6f3c5eeSAneesh Kumar K.V 			}
1159c6f3c5eeSAneesh Kumar K.V 			entry = pud_mkyoung(*pud);
1160c6f3c5eeSAneesh Kumar K.V 			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1161c6f3c5eeSAneesh Kumar K.V 			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1162c6f3c5eeSAneesh Kumar K.V 				update_mmu_cache_pud(vma, addr, pud);
1163c6f3c5eeSAneesh Kumar K.V 		}
1164c6f3c5eeSAneesh Kumar K.V 		goto out_unlock;
1165c6f3c5eeSAneesh Kumar K.V 	}
1166c6f3c5eeSAneesh Kumar K.V 
1167a00cc7d9SMatthew Wilcox 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1168a00cc7d9SMatthew Wilcox 	if (pfn_t_devmap(pfn))
1169a00cc7d9SMatthew Wilcox 		entry = pud_mkdevmap(entry);
1170a00cc7d9SMatthew Wilcox 	if (write) {
1171f55e1014SLinus Torvalds 		entry = pud_mkyoung(pud_mkdirty(entry));
1172f55e1014SLinus Torvalds 		entry = maybe_pud_mkwrite(entry, vma);
1173a00cc7d9SMatthew Wilcox 	}
1174a00cc7d9SMatthew Wilcox 	set_pud_at(mm, addr, pud, entry);
1175a00cc7d9SMatthew Wilcox 	update_mmu_cache_pud(vma, addr, pud);
1176c6f3c5eeSAneesh Kumar K.V 
1177c6f3c5eeSAneesh Kumar K.V out_unlock:
1178a00cc7d9SMatthew Wilcox 	spin_unlock(ptl);
1179a00cc7d9SMatthew Wilcox }
1180a00cc7d9SMatthew Wilcox 
11819a9731b1SThomas Hellstrom (VMware) /**
11827b806d22SLorenzo Stoakes  * vmf_insert_pfn_pud - insert a pud size pfn
11839a9731b1SThomas Hellstrom (VMware)  * @vmf: Structure describing the fault
11849a9731b1SThomas Hellstrom (VMware)  * @pfn: pfn to insert
11859a9731b1SThomas Hellstrom (VMware)  * @write: whether it's a write fault
11869a9731b1SThomas Hellstrom (VMware)  *
11877b806d22SLorenzo Stoakes  * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
11889a9731b1SThomas Hellstrom (VMware)  *
11899a9731b1SThomas Hellstrom (VMware)  * Return: vm_fault_t value.
11909a9731b1SThomas Hellstrom (VMware)  */
11917b806d22SLorenzo Stoakes vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
1192a00cc7d9SMatthew Wilcox {
1193fce86ff5SDan Williams 	unsigned long addr = vmf->address & PUD_MASK;
1194fce86ff5SDan Williams 	struct vm_area_struct *vma = vmf->vma;
11957b806d22SLorenzo Stoakes 	pgprot_t pgprot = vma->vm_page_prot;
1196fce86ff5SDan Williams 
1197a00cc7d9SMatthew Wilcox 	/*
1198a00cc7d9SMatthew Wilcox 	 * If we had pud_special, we could avoid all these restrictions,
1199a00cc7d9SMatthew Wilcox 	 * but we need to be consistent with PTEs and architectures that
1200a00cc7d9SMatthew Wilcox 	 * can't support a 'special' bit.
1201a00cc7d9SMatthew Wilcox 	 */
120262ec0d8cSDave Jiang 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
120362ec0d8cSDave Jiang 			!pfn_t_devmap(pfn));
1204a00cc7d9SMatthew Wilcox 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1205a00cc7d9SMatthew Wilcox 						(VM_PFNMAP|VM_MIXEDMAP));
1206a00cc7d9SMatthew Wilcox 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1207a00cc7d9SMatthew Wilcox 
1208a00cc7d9SMatthew Wilcox 	if (addr < vma->vm_start || addr >= vma->vm_end)
1209a00cc7d9SMatthew Wilcox 		return VM_FAULT_SIGBUS;
1210a00cc7d9SMatthew Wilcox 
1211a00cc7d9SMatthew Wilcox 	track_pfn_insert(vma, &pgprot, pfn);
1212a00cc7d9SMatthew Wilcox 
12137b806d22SLorenzo Stoakes 	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
1214a00cc7d9SMatthew Wilcox 	return VM_FAULT_NOPAGE;
1215a00cc7d9SMatthew Wilcox }
12167b806d22SLorenzo Stoakes EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1217a00cc7d9SMatthew Wilcox #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1218a00cc7d9SMatthew Wilcox 
12193565fce3SDan Williams static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1220a69e4717SMiaohe Lin 		      pmd_t *pmd, bool write)
12213565fce3SDan Williams {
12223565fce3SDan Williams 	pmd_t _pmd;
12233565fce3SDan Williams 
1224a8f97366SKirill A. Shutemov 	_pmd = pmd_mkyoung(*pmd);
1225a69e4717SMiaohe Lin 	if (write)
1226a8f97366SKirill A. Shutemov 		_pmd = pmd_mkdirty(_pmd);
12273565fce3SDan Williams 	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1228a69e4717SMiaohe Lin 				  pmd, _pmd, write))
12293565fce3SDan Williams 		update_mmu_cache_pmd(vma, addr, pmd);
12303565fce3SDan Williams }
12313565fce3SDan Williams 
12323565fce3SDan Williams struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1233df06b37fSKeith Busch 		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
12343565fce3SDan Williams {
12353565fce3SDan Williams 	unsigned long pfn = pmd_pfn(*pmd);
12363565fce3SDan Williams 	struct mm_struct *mm = vma->vm_mm;
12373565fce3SDan Williams 	struct page *page;
12380f089235SLogan Gunthorpe 	int ret;
12393565fce3SDan Williams 
12403565fce3SDan Williams 	assert_spin_locked(pmd_lockptr(mm, pmd));
12413565fce3SDan Williams 
1242f6f37321SLinus Torvalds 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
12433565fce3SDan Williams 		return NULL;
12443565fce3SDan Williams 
12453565fce3SDan Williams 	if (pmd_present(*pmd) && pmd_devmap(*pmd))
12463565fce3SDan Williams 		/* pass */;
12473565fce3SDan Williams 	else
12483565fce3SDan Williams 		return NULL;
12493565fce3SDan Williams 
12503565fce3SDan Williams 	if (flags & FOLL_TOUCH)
1251a69e4717SMiaohe Lin 		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
12523565fce3SDan Williams 
12533565fce3SDan Williams 	/*
12543565fce3SDan Williams 	 * device mapped pages can only be returned if the
12553565fce3SDan Williams 	 * caller will manage the page reference count.
12563565fce3SDan Williams 	 */
12573faa52c0SJohn Hubbard 	if (!(flags & (FOLL_GET | FOLL_PIN)))
12583565fce3SDan Williams 		return ERR_PTR(-EEXIST);
12593565fce3SDan Williams 
12603565fce3SDan Williams 	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1261df06b37fSKeith Busch 	*pgmap = get_dev_pagemap(pfn, *pgmap);
1262df06b37fSKeith Busch 	if (!*pgmap)
12633565fce3SDan Williams 		return ERR_PTR(-EFAULT);
12643565fce3SDan Williams 	page = pfn_to_page(pfn);
12650f089235SLogan Gunthorpe 	ret = try_grab_page(page, flags);
12660f089235SLogan Gunthorpe 	if (ret)
12670f089235SLogan Gunthorpe 		page = ERR_PTR(ret);
12683565fce3SDan Williams 
12693565fce3SDan Williams 	return page;
12703565fce3SDan Williams }
12713565fce3SDan Williams 
127271e3aac0SAndrea Arcangeli int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
127371e3aac0SAndrea Arcangeli 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
12748f34f1eaSPeter Xu 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
127571e3aac0SAndrea Arcangeli {
1276c4088ebdSKirill A. Shutemov 	spinlock_t *dst_ptl, *src_ptl;
127771e3aac0SAndrea Arcangeli 	struct page *src_page;
127871e3aac0SAndrea Arcangeli 	pmd_t pmd;
127912c9d70bSMatthew Wilcox 	pgtable_t pgtable = NULL;
1280628d47ceSKirill A. Shutemov 	int ret = -ENOMEM;
128171e3aac0SAndrea Arcangeli 
1282628d47ceSKirill A. Shutemov 	/* Skip if can be re-fill on fault */
12838f34f1eaSPeter Xu 	if (!vma_is_anonymous(dst_vma))
1284628d47ceSKirill A. Shutemov 		return 0;
1285628d47ceSKirill A. Shutemov 
12864cf58924SJoel Fernandes (Google) 	pgtable = pte_alloc_one(dst_mm);
128771e3aac0SAndrea Arcangeli 	if (unlikely(!pgtable))
128871e3aac0SAndrea Arcangeli 		goto out;
128971e3aac0SAndrea Arcangeli 
1290c4088ebdSKirill A. Shutemov 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
1291c4088ebdSKirill A. Shutemov 	src_ptl = pmd_lockptr(src_mm, src_pmd);
1292c4088ebdSKirill A. Shutemov 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
129371e3aac0SAndrea Arcangeli 
129471e3aac0SAndrea Arcangeli 	ret = -EAGAIN;
129571e3aac0SAndrea Arcangeli 	pmd = *src_pmd;
129684c3fc4eSZi Yan 
129784c3fc4eSZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
129884c3fc4eSZi Yan 	if (unlikely(is_swap_pmd(pmd))) {
129984c3fc4eSZi Yan 		swp_entry_t entry = pmd_to_swp_entry(pmd);
130084c3fc4eSZi Yan 
130184c3fc4eSZi Yan 		VM_BUG_ON(!is_pmd_migration_entry(pmd));
13026c287605SDavid Hildenbrand 		if (!is_readable_migration_entry(entry)) {
13034dd845b5SAlistair Popple 			entry = make_readable_migration_entry(
13044dd845b5SAlistair Popple 							swp_offset(entry));
130584c3fc4eSZi Yan 			pmd = swp_entry_to_pmd(entry);
1306ab6e3d09SNaoya Horiguchi 			if (pmd_swp_soft_dirty(*src_pmd))
1307ab6e3d09SNaoya Horiguchi 				pmd = pmd_swp_mksoft_dirty(pmd);
13088f34f1eaSPeter Xu 			if (pmd_swp_uffd_wp(*src_pmd))
13098f34f1eaSPeter Xu 				pmd = pmd_swp_mkuffd_wp(pmd);
131084c3fc4eSZi Yan 			set_pmd_at(src_mm, addr, src_pmd, pmd);
131184c3fc4eSZi Yan 		}
1312dd8a67f9SZi Yan 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1313af5b0f6aSKirill A. Shutemov 		mm_inc_nr_ptes(dst_mm);
1314dd8a67f9SZi Yan 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
13158f34f1eaSPeter Xu 		if (!userfaultfd_wp(dst_vma))
13168f34f1eaSPeter Xu 			pmd = pmd_swp_clear_uffd_wp(pmd);
131784c3fc4eSZi Yan 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
131884c3fc4eSZi Yan 		ret = 0;
131984c3fc4eSZi Yan 		goto out_unlock;
132084c3fc4eSZi Yan 	}
132184c3fc4eSZi Yan #endif
132284c3fc4eSZi Yan 
1323628d47ceSKirill A. Shutemov 	if (unlikely(!pmd_trans_huge(pmd))) {
132471e3aac0SAndrea Arcangeli 		pte_free(dst_mm, pgtable);
132571e3aac0SAndrea Arcangeli 		goto out_unlock;
132671e3aac0SAndrea Arcangeli 	}
1327fc9fe822SKirill A. Shutemov 	/*
1328c4088ebdSKirill A. Shutemov 	 * When page table lock is held, the huge zero pmd should not be
1329fc9fe822SKirill A. Shutemov 	 * under splitting since we don't split the page itself, only pmd to
1330fc9fe822SKirill A. Shutemov 	 * a page table.
1331fc9fe822SKirill A. Shutemov 	 */
1332fc9fe822SKirill A. Shutemov 	if (is_huge_zero_pmd(pmd)) {
133397ae1749SKirill A. Shutemov 		/*
133497ae1749SKirill A. Shutemov 		 * get_huge_zero_page() will never allocate a new page here,
133597ae1749SKirill A. Shutemov 		 * since we already have a zero page to copy. It just takes a
133697ae1749SKirill A. Shutemov 		 * reference.
133797ae1749SKirill A. Shutemov 		 */
13385fc7a5f6SPeter Xu 		mm_get_huge_zero_page(dst_mm);
13395fc7a5f6SPeter Xu 		goto out_zero_page;
1340fc9fe822SKirill A. Shutemov 	}
1341de466bd6SMel Gorman 
134271e3aac0SAndrea Arcangeli 	src_page = pmd_page(pmd);
1343309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1344d042035eSPeter Xu 
1345fb3d824dSDavid Hildenbrand 	get_page(src_page);
1346fb3d824dSDavid Hildenbrand 	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
1347fb3d824dSDavid Hildenbrand 		/* Page maybe pinned: split and retry the fault on PTEs. */
1348fb3d824dSDavid Hildenbrand 		put_page(src_page);
1349d042035eSPeter Xu 		pte_free(dst_mm, pgtable);
1350d042035eSPeter Xu 		spin_unlock(src_ptl);
1351d042035eSPeter Xu 		spin_unlock(dst_ptl);
13528f34f1eaSPeter Xu 		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
1353d042035eSPeter Xu 		return -EAGAIN;
1354d042035eSPeter Xu 	}
135571e3aac0SAndrea Arcangeli 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
13565fc7a5f6SPeter Xu out_zero_page:
1357c4812909SKirill A. Shutemov 	mm_inc_nr_ptes(dst_mm);
13585c7fb56eSDan Williams 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
135971e3aac0SAndrea Arcangeli 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
13608f34f1eaSPeter Xu 	if (!userfaultfd_wp(dst_vma))
13618f34f1eaSPeter Xu 		pmd = pmd_clear_uffd_wp(pmd);
136271e3aac0SAndrea Arcangeli 	pmd = pmd_mkold(pmd_wrprotect(pmd));
136371e3aac0SAndrea Arcangeli 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
136471e3aac0SAndrea Arcangeli 
136571e3aac0SAndrea Arcangeli 	ret = 0;
136671e3aac0SAndrea Arcangeli out_unlock:
1367c4088ebdSKirill A. Shutemov 	spin_unlock(src_ptl);
1368c4088ebdSKirill A. Shutemov 	spin_unlock(dst_ptl);
136971e3aac0SAndrea Arcangeli out:
137071e3aac0SAndrea Arcangeli 	return ret;
137171e3aac0SAndrea Arcangeli }
137271e3aac0SAndrea Arcangeli 
1373a00cc7d9SMatthew Wilcox #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1374a00cc7d9SMatthew Wilcox static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
13755fe653e9SMiaohe Lin 		      pud_t *pud, bool write)
1376a00cc7d9SMatthew Wilcox {
1377a00cc7d9SMatthew Wilcox 	pud_t _pud;
1378a00cc7d9SMatthew Wilcox 
1379a8f97366SKirill A. Shutemov 	_pud = pud_mkyoung(*pud);
13805fe653e9SMiaohe Lin 	if (write)
1381a8f97366SKirill A. Shutemov 		_pud = pud_mkdirty(_pud);
1382a00cc7d9SMatthew Wilcox 	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
13835fe653e9SMiaohe Lin 				  pud, _pud, write))
1384a00cc7d9SMatthew Wilcox 		update_mmu_cache_pud(vma, addr, pud);
1385a00cc7d9SMatthew Wilcox }
1386a00cc7d9SMatthew Wilcox 
1387a00cc7d9SMatthew Wilcox struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1388df06b37fSKeith Busch 		pud_t *pud, int flags, struct dev_pagemap **pgmap)
1389a00cc7d9SMatthew Wilcox {
1390a00cc7d9SMatthew Wilcox 	unsigned long pfn = pud_pfn(*pud);
1391a00cc7d9SMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
1392a00cc7d9SMatthew Wilcox 	struct page *page;
13930f089235SLogan Gunthorpe 	int ret;
1394a00cc7d9SMatthew Wilcox 
1395a00cc7d9SMatthew Wilcox 	assert_spin_locked(pud_lockptr(mm, pud));
1396a00cc7d9SMatthew Wilcox 
1397f6f37321SLinus Torvalds 	if (flags & FOLL_WRITE && !pud_write(*pud))
1398a00cc7d9SMatthew Wilcox 		return NULL;
1399a00cc7d9SMatthew Wilcox 
1400a00cc7d9SMatthew Wilcox 	if (pud_present(*pud) && pud_devmap(*pud))
1401a00cc7d9SMatthew Wilcox 		/* pass */;
1402a00cc7d9SMatthew Wilcox 	else
1403a00cc7d9SMatthew Wilcox 		return NULL;
1404a00cc7d9SMatthew Wilcox 
1405a00cc7d9SMatthew Wilcox 	if (flags & FOLL_TOUCH)
14065fe653e9SMiaohe Lin 		touch_pud(vma, addr, pud, flags & FOLL_WRITE);
1407a00cc7d9SMatthew Wilcox 
1408a00cc7d9SMatthew Wilcox 	/*
1409a00cc7d9SMatthew Wilcox 	 * device mapped pages can only be returned if the
1410a00cc7d9SMatthew Wilcox 	 * caller will manage the page reference count.
14113faa52c0SJohn Hubbard 	 *
14123faa52c0SJohn Hubbard 	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
1413a00cc7d9SMatthew Wilcox 	 */
14143faa52c0SJohn Hubbard 	if (!(flags & (FOLL_GET | FOLL_PIN)))
1415a00cc7d9SMatthew Wilcox 		return ERR_PTR(-EEXIST);
1416a00cc7d9SMatthew Wilcox 
1417a00cc7d9SMatthew Wilcox 	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1418df06b37fSKeith Busch 	*pgmap = get_dev_pagemap(pfn, *pgmap);
1419df06b37fSKeith Busch 	if (!*pgmap)
1420a00cc7d9SMatthew Wilcox 		return ERR_PTR(-EFAULT);
1421a00cc7d9SMatthew Wilcox 	page = pfn_to_page(pfn);
14220f089235SLogan Gunthorpe 
14230f089235SLogan Gunthorpe 	ret = try_grab_page(page, flags);
14240f089235SLogan Gunthorpe 	if (ret)
14250f089235SLogan Gunthorpe 		page = ERR_PTR(ret);
1426a00cc7d9SMatthew Wilcox 
1427a00cc7d9SMatthew Wilcox 	return page;
1428a00cc7d9SMatthew Wilcox }
1429a00cc7d9SMatthew Wilcox 
1430a00cc7d9SMatthew Wilcox int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1431a00cc7d9SMatthew Wilcox 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1432a00cc7d9SMatthew Wilcox 		  struct vm_area_struct *vma)
1433a00cc7d9SMatthew Wilcox {
1434a00cc7d9SMatthew Wilcox 	spinlock_t *dst_ptl, *src_ptl;
1435a00cc7d9SMatthew Wilcox 	pud_t pud;
1436a00cc7d9SMatthew Wilcox 	int ret;
1437a00cc7d9SMatthew Wilcox 
1438a00cc7d9SMatthew Wilcox 	dst_ptl = pud_lock(dst_mm, dst_pud);
1439a00cc7d9SMatthew Wilcox 	src_ptl = pud_lockptr(src_mm, src_pud);
1440a00cc7d9SMatthew Wilcox 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1441a00cc7d9SMatthew Wilcox 
1442a00cc7d9SMatthew Wilcox 	ret = -EAGAIN;
1443a00cc7d9SMatthew Wilcox 	pud = *src_pud;
1444a00cc7d9SMatthew Wilcox 	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1445a00cc7d9SMatthew Wilcox 		goto out_unlock;
1446a00cc7d9SMatthew Wilcox 
1447a00cc7d9SMatthew Wilcox 	/*
1448a00cc7d9SMatthew Wilcox 	 * When page table lock is held, the huge zero pud should not be
1449a00cc7d9SMatthew Wilcox 	 * under splitting since we don't split the page itself, only pud to
1450a00cc7d9SMatthew Wilcox 	 * a page table.
1451a00cc7d9SMatthew Wilcox 	 */
1452a00cc7d9SMatthew Wilcox 	if (is_huge_zero_pud(pud)) {
1453a00cc7d9SMatthew Wilcox 		/* No huge zero pud yet */
1454a00cc7d9SMatthew Wilcox 	}
1455a00cc7d9SMatthew Wilcox 
1456fb3d824dSDavid Hildenbrand 	/*
1457fb3d824dSDavid Hildenbrand 	 * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
1458fb3d824dSDavid Hildenbrand 	 * and split if duplicating fails.
1459fb3d824dSDavid Hildenbrand 	 */
1460a00cc7d9SMatthew Wilcox 	pudp_set_wrprotect(src_mm, addr, src_pud);
1461a00cc7d9SMatthew Wilcox 	pud = pud_mkold(pud_wrprotect(pud));
1462a00cc7d9SMatthew Wilcox 	set_pud_at(dst_mm, addr, dst_pud, pud);
1463a00cc7d9SMatthew Wilcox 
1464a00cc7d9SMatthew Wilcox 	ret = 0;
1465a00cc7d9SMatthew Wilcox out_unlock:
1466a00cc7d9SMatthew Wilcox 	spin_unlock(src_ptl);
1467a00cc7d9SMatthew Wilcox 	spin_unlock(dst_ptl);
1468a00cc7d9SMatthew Wilcox 	return ret;
1469a00cc7d9SMatthew Wilcox }
1470a00cc7d9SMatthew Wilcox 
1471a00cc7d9SMatthew Wilcox void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1472a00cc7d9SMatthew Wilcox {
1473a00cc7d9SMatthew Wilcox 	bool write = vmf->flags & FAULT_FLAG_WRITE;
1474a00cc7d9SMatthew Wilcox 
1475a00cc7d9SMatthew Wilcox 	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1476a00cc7d9SMatthew Wilcox 	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1477a00cc7d9SMatthew Wilcox 		goto unlock;
1478a00cc7d9SMatthew Wilcox 
14795fe653e9SMiaohe Lin 	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1480a00cc7d9SMatthew Wilcox unlock:
1481a00cc7d9SMatthew Wilcox 	spin_unlock(vmf->ptl);
1482a00cc7d9SMatthew Wilcox }
1483a00cc7d9SMatthew Wilcox #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1484a00cc7d9SMatthew Wilcox 
14855db4f15cSYang Shi void huge_pmd_set_accessed(struct vm_fault *vmf)
1486a1dd450bSWill Deacon {
148720f664aaSMinchan Kim 	bool write = vmf->flags & FAULT_FLAG_WRITE;
1488a1dd450bSWill Deacon 
148982b0f8c3SJan Kara 	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1490a69e4717SMiaohe Lin 	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1491a1dd450bSWill Deacon 		goto unlock;
1492a1dd450bSWill Deacon 
1493a69e4717SMiaohe Lin 	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1494a1dd450bSWill Deacon 
1495a1dd450bSWill Deacon unlock:
149682b0f8c3SJan Kara 	spin_unlock(vmf->ptl);
1497a1dd450bSWill Deacon }
1498a1dd450bSWill Deacon 
14995db4f15cSYang Shi vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
150071e3aac0SAndrea Arcangeli {
1501c89357e2SDavid Hildenbrand 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
150282b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
15032fad3d14SMatthew Wilcox (Oracle) 	struct folio *folio;
15043917c802SKirill A. Shutemov 	struct page *page;
150582b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
15065db4f15cSYang Shi 	pmd_t orig_pmd = vmf->orig_pmd;
150771e3aac0SAndrea Arcangeli 
150882b0f8c3SJan Kara 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
150981d1b09cSSasha Levin 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
15103917c802SKirill A. Shutemov 
151193b4796dSKirill A. Shutemov 	if (is_huge_zero_pmd(orig_pmd))
15123917c802SKirill A. Shutemov 		goto fallback;
15133917c802SKirill A. Shutemov 
151482b0f8c3SJan Kara 	spin_lock(vmf->ptl);
15153917c802SKirill A. Shutemov 
15163917c802SKirill A. Shutemov 	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
15173917c802SKirill A. Shutemov 		spin_unlock(vmf->ptl);
15183917c802SKirill A. Shutemov 		return 0;
15193917c802SKirill A. Shutemov 	}
152071e3aac0SAndrea Arcangeli 
152171e3aac0SAndrea Arcangeli 	page = pmd_page(orig_pmd);
15222fad3d14SMatthew Wilcox (Oracle) 	folio = page_folio(page);
1523f6004e73SMiaohe Lin 	VM_BUG_ON_PAGE(!PageHead(page), page);
15243917c802SKirill A. Shutemov 
15256c287605SDavid Hildenbrand 	/* Early check when only holding the PT lock. */
15266c287605SDavid Hildenbrand 	if (PageAnonExclusive(page))
15276c287605SDavid Hildenbrand 		goto reuse;
15286c287605SDavid Hildenbrand 
15292fad3d14SMatthew Wilcox (Oracle) 	if (!folio_trylock(folio)) {
15302fad3d14SMatthew Wilcox (Oracle) 		folio_get(folio);
1531ba3c4ce6SHuang Ying 		spin_unlock(vmf->ptl);
15322fad3d14SMatthew Wilcox (Oracle) 		folio_lock(folio);
1533ba3c4ce6SHuang Ying 		spin_lock(vmf->ptl);
1534ba3c4ce6SHuang Ying 		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
15353917c802SKirill A. Shutemov 			spin_unlock(vmf->ptl);
15362fad3d14SMatthew Wilcox (Oracle) 			folio_unlock(folio);
15372fad3d14SMatthew Wilcox (Oracle) 			folio_put(folio);
15383917c802SKirill A. Shutemov 			return 0;
1539ba3c4ce6SHuang Ying 		}
15402fad3d14SMatthew Wilcox (Oracle) 		folio_put(folio);
1541ba3c4ce6SHuang Ying 	}
15423917c802SKirill A. Shutemov 
15436c287605SDavid Hildenbrand 	/* Recheck after temporarily dropping the PT lock. */
15446c287605SDavid Hildenbrand 	if (PageAnonExclusive(page)) {
15452fad3d14SMatthew Wilcox (Oracle) 		folio_unlock(folio);
15466c287605SDavid Hildenbrand 		goto reuse;
15476c287605SDavid Hildenbrand 	}
15486c287605SDavid Hildenbrand 
15493917c802SKirill A. Shutemov 	/*
15502fad3d14SMatthew Wilcox (Oracle) 	 * See do_wp_page(): we can only reuse the folio exclusively if
15512fad3d14SMatthew Wilcox (Oracle) 	 * there are no additional references. Note that we always drain
15521fec6890SMatthew Wilcox (Oracle) 	 * the LRU cache immediately after adding a THP.
15533917c802SKirill A. Shutemov 	 */
15542fad3d14SMatthew Wilcox (Oracle) 	if (folio_ref_count(folio) >
15552fad3d14SMatthew Wilcox (Oracle) 			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
15563bff7e3fSDavid Hildenbrand 		goto unlock_fallback;
15572fad3d14SMatthew Wilcox (Oracle) 	if (folio_test_swapcache(folio))
15582fad3d14SMatthew Wilcox (Oracle) 		folio_free_swap(folio);
15592fad3d14SMatthew Wilcox (Oracle) 	if (folio_ref_count(folio) == 1) {
156071e3aac0SAndrea Arcangeli 		pmd_t entry;
15616c54dc6cSDavid Hildenbrand 
156206968625SDavid Hildenbrand 		folio_move_anon_rmap(folio, vma);
15635ca43289SDavid Hildenbrand 		SetPageAnonExclusive(page);
15642fad3d14SMatthew Wilcox (Oracle) 		folio_unlock(folio);
15656c287605SDavid Hildenbrand reuse:
1566c89357e2SDavid Hildenbrand 		if (unlikely(unshare)) {
1567c89357e2SDavid Hildenbrand 			spin_unlock(vmf->ptl);
1568c89357e2SDavid Hildenbrand 			return 0;
1569c89357e2SDavid Hildenbrand 		}
157071e3aac0SAndrea Arcangeli 		entry = pmd_mkyoung(orig_pmd);
1571f55e1014SLinus Torvalds 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
157282b0f8c3SJan Kara 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
157382b0f8c3SJan Kara 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
15743917c802SKirill A. Shutemov 		spin_unlock(vmf->ptl);
1575cb8d8633SDavid Hildenbrand 		return 0;
157671e3aac0SAndrea Arcangeli 	}
15773917c802SKirill A. Shutemov 
15783bff7e3fSDavid Hildenbrand unlock_fallback:
15792fad3d14SMatthew Wilcox (Oracle) 	folio_unlock(folio);
158082b0f8c3SJan Kara 	spin_unlock(vmf->ptl);
15813917c802SKirill A. Shutemov fallback:
15823917c802SKirill A. Shutemov 	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
15833917c802SKirill A. Shutemov 	return VM_FAULT_FALLBACK;
158471e3aac0SAndrea Arcangeli }
158571e3aac0SAndrea Arcangeli 
1586c27f479eSDavid Hildenbrand static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1587c27f479eSDavid Hildenbrand 					   unsigned long addr, pmd_t pmd)
1588c27f479eSDavid Hildenbrand {
1589c27f479eSDavid Hildenbrand 	struct page *page;
1590c27f479eSDavid Hildenbrand 
1591c27f479eSDavid Hildenbrand 	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1592c27f479eSDavid Hildenbrand 		return false;
1593c27f479eSDavid Hildenbrand 
1594c27f479eSDavid Hildenbrand 	/* Don't touch entries that are not even readable (NUMA hinting). */
1595c27f479eSDavid Hildenbrand 	if (pmd_protnone(pmd))
1596c27f479eSDavid Hildenbrand 		return false;
1597c27f479eSDavid Hildenbrand 
1598c27f479eSDavid Hildenbrand 	/* Do we need write faults for softdirty tracking? */
1599c27f479eSDavid Hildenbrand 	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1600c27f479eSDavid Hildenbrand 		return false;
1601c27f479eSDavid Hildenbrand 
1602c27f479eSDavid Hildenbrand 	/* Do we need write faults for uffd-wp tracking? */
1603c27f479eSDavid Hildenbrand 	if (userfaultfd_huge_pmd_wp(vma, pmd))
1604c27f479eSDavid Hildenbrand 		return false;
1605c27f479eSDavid Hildenbrand 
1606c27f479eSDavid Hildenbrand 	if (!(vma->vm_flags & VM_SHARED)) {
1607c27f479eSDavid Hildenbrand 		/* See can_change_pte_writable(). */
1608c27f479eSDavid Hildenbrand 		page = vm_normal_page_pmd(vma, addr, pmd);
1609c27f479eSDavid Hildenbrand 		return page && PageAnon(page) && PageAnonExclusive(page);
1610c27f479eSDavid Hildenbrand 	}
1611c27f479eSDavid Hildenbrand 
1612c27f479eSDavid Hildenbrand 	/* See can_change_pte_writable(). */
1613c27f479eSDavid Hildenbrand 	return pmd_dirty(pmd);
1614c27f479eSDavid Hildenbrand }
1615c27f479eSDavid Hildenbrand 
16165535be30SDavid Hildenbrand /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
16175535be30SDavid Hildenbrand static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
16185535be30SDavid Hildenbrand 					struct vm_area_struct *vma,
16195535be30SDavid Hildenbrand 					unsigned int flags)
16208310d48bSKeno Fischer {
16215535be30SDavid Hildenbrand 	/* If the pmd is writable, we can write to the page. */
16225535be30SDavid Hildenbrand 	if (pmd_write(pmd))
16235535be30SDavid Hildenbrand 		return true;
16245535be30SDavid Hildenbrand 
16255535be30SDavid Hildenbrand 	/* Maybe FOLL_FORCE is set to override it? */
16265535be30SDavid Hildenbrand 	if (!(flags & FOLL_FORCE))
16275535be30SDavid Hildenbrand 		return false;
16285535be30SDavid Hildenbrand 
16295535be30SDavid Hildenbrand 	/* But FOLL_FORCE has no effect on shared mappings */
16305535be30SDavid Hildenbrand 	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
16315535be30SDavid Hildenbrand 		return false;
16325535be30SDavid Hildenbrand 
16335535be30SDavid Hildenbrand 	/* ... or read-only private ones */
16345535be30SDavid Hildenbrand 	if (!(vma->vm_flags & VM_MAYWRITE))
16355535be30SDavid Hildenbrand 		return false;
16365535be30SDavid Hildenbrand 
16375535be30SDavid Hildenbrand 	/* ... or already writable ones that just need to take a write fault */
16385535be30SDavid Hildenbrand 	if (vma->vm_flags & VM_WRITE)
16395535be30SDavid Hildenbrand 		return false;
16405535be30SDavid Hildenbrand 
16415535be30SDavid Hildenbrand 	/*
16425535be30SDavid Hildenbrand 	 * See can_change_pte_writable(): we broke COW and could map the page
16435535be30SDavid Hildenbrand 	 * writable if we have an exclusive anonymous page ...
16445535be30SDavid Hildenbrand 	 */
16455535be30SDavid Hildenbrand 	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
16465535be30SDavid Hildenbrand 		return false;
16475535be30SDavid Hildenbrand 
16485535be30SDavid Hildenbrand 	/* ... and a write-fault isn't required for other reasons. */
16495535be30SDavid Hildenbrand 	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
16505535be30SDavid Hildenbrand 		return false;
16515535be30SDavid Hildenbrand 	return !userfaultfd_huge_pmd_wp(vma, pmd);
16528310d48bSKeno Fischer }
16538310d48bSKeno Fischer 
1654b676b293SDavid Rientjes struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
165571e3aac0SAndrea Arcangeli 				   unsigned long addr,
165671e3aac0SAndrea Arcangeli 				   pmd_t *pmd,
165771e3aac0SAndrea Arcangeli 				   unsigned int flags)
165871e3aac0SAndrea Arcangeli {
1659b676b293SDavid Rientjes 	struct mm_struct *mm = vma->vm_mm;
16605535be30SDavid Hildenbrand 	struct page *page;
16610f089235SLogan Gunthorpe 	int ret;
166271e3aac0SAndrea Arcangeli 
1663c4088ebdSKirill A. Shutemov 	assert_spin_locked(pmd_lockptr(mm, pmd));
166471e3aac0SAndrea Arcangeli 
16655535be30SDavid Hildenbrand 	page = pmd_page(*pmd);
16665535be30SDavid Hildenbrand 	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
16675535be30SDavid Hildenbrand 
16685535be30SDavid Hildenbrand 	if ((flags & FOLL_WRITE) &&
16695535be30SDavid Hildenbrand 	    !can_follow_write_pmd(*pmd, page, vma, flags))
16705535be30SDavid Hildenbrand 		return NULL;
167171e3aac0SAndrea Arcangeli 
167285facf25SKirill A. Shutemov 	/* Avoid dumping huge zero page */
167385facf25SKirill A. Shutemov 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
167485facf25SKirill A. Shutemov 		return ERR_PTR(-EFAULT);
167585facf25SKirill A. Shutemov 
1676d74943a2SDavid Hildenbrand 	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
16775535be30SDavid Hildenbrand 		return NULL;
16783faa52c0SJohn Hubbard 
167984209e87SDavid Hildenbrand 	if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
1680a7f22660SDavid Hildenbrand 		return ERR_PTR(-EMLINK);
1681a7f22660SDavid Hildenbrand 
1682b6a2619cSDavid Hildenbrand 	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
1683b6a2619cSDavid Hildenbrand 			!PageAnonExclusive(page), page);
1684b6a2619cSDavid Hildenbrand 
16850f089235SLogan Gunthorpe 	ret = try_grab_page(page, flags);
16860f089235SLogan Gunthorpe 	if (ret)
16870f089235SLogan Gunthorpe 		return ERR_PTR(ret);
16883faa52c0SJohn Hubbard 
16893565fce3SDan Williams 	if (flags & FOLL_TOUCH)
1690a69e4717SMiaohe Lin 		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
16913faa52c0SJohn Hubbard 
169271e3aac0SAndrea Arcangeli 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1693ca120cf6SDan Williams 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
169471e3aac0SAndrea Arcangeli 
169571e3aac0SAndrea Arcangeli 	return page;
169671e3aac0SAndrea Arcangeli }
169771e3aac0SAndrea Arcangeli 
1698d10e63f2SMel Gorman /* NUMA hinting page fault entry point for trans huge pmds */
16995db4f15cSYang Shi vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
1700d10e63f2SMel Gorman {
170182b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
1702c5b5a3ddSYang Shi 	pmd_t oldpmd = vmf->orig_pmd;
1703c5b5a3ddSYang Shi 	pmd_t pmd;
1704667ffc31SKefeng Wang 	struct folio *folio;
170582b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1706667ffc31SKefeng Wang 	int nid = NUMA_NO_NODE;
170733024536SHuang Ying 	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
17086a56ccbcSDavid Hildenbrand 	bool migrated = false, writable = false;
17096688cc05SPeter Zijlstra 	int flags = 0;
1710d10e63f2SMel Gorman 
171182b0f8c3SJan Kara 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1712c5b5a3ddSYang Shi 	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
171382b0f8c3SJan Kara 		spin_unlock(vmf->ptl);
1714de466bd6SMel Gorman 		goto out;
1715de466bd6SMel Gorman 	}
1716de466bd6SMel Gorman 
1717c5b5a3ddSYang Shi 	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
17186a56ccbcSDavid Hildenbrand 
17196a56ccbcSDavid Hildenbrand 	/*
17206a56ccbcSDavid Hildenbrand 	 * Detect now whether the PMD could be writable; this information
17216a56ccbcSDavid Hildenbrand 	 * is only valid while holding the PT lock.
17226a56ccbcSDavid Hildenbrand 	 */
17236a56ccbcSDavid Hildenbrand 	writable = pmd_write(pmd);
17246a56ccbcSDavid Hildenbrand 	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
17256a56ccbcSDavid Hildenbrand 	    can_change_pmd_writable(vma, vmf->address, pmd))
17266a56ccbcSDavid Hildenbrand 		writable = true;
17276a56ccbcSDavid Hildenbrand 
1728667ffc31SKefeng Wang 	folio = vm_normal_folio_pmd(vma, haddr, pmd);
1729667ffc31SKefeng Wang 	if (!folio)
1730c5b5a3ddSYang Shi 		goto out_map;
1731c5b5a3ddSYang Shi 
1732c5b5a3ddSYang Shi 	/* See similar comment in do_numa_page for explanation */
17336a56ccbcSDavid Hildenbrand 	if (!writable)
1734c5b5a3ddSYang Shi 		flags |= TNF_NO_GROUP;
1735c5b5a3ddSYang Shi 
1736667ffc31SKefeng Wang 	nid = folio_nid(folio);
173733024536SHuang Ying 	/*
173833024536SHuang Ying 	 * For memory tiering mode, cpupid of slow memory page is used
173933024536SHuang Ying 	 * to record page access time.  So use default value.
174033024536SHuang Ying 	 */
1741667ffc31SKefeng Wang 	if (node_is_toptier(nid))
1742c4a8d2faSKefeng Wang 		last_cpupid = folio_last_cpupid(folio);
1743cda6d936SKefeng Wang 	target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
1744c5b5a3ddSYang Shi 	if (target_nid == NUMA_NO_NODE) {
1745667ffc31SKefeng Wang 		folio_put(folio);
1746c5b5a3ddSYang Shi 		goto out_map;
1747c5b5a3ddSYang Shi 	}
1748c5b5a3ddSYang Shi 
174982b0f8c3SJan Kara 	spin_unlock(vmf->ptl);
17506a56ccbcSDavid Hildenbrand 	writable = false;
17518b1b436dSPeter Zijlstra 
1752667ffc31SKefeng Wang 	migrated = migrate_misplaced_folio(folio, vma, target_nid);
17536688cc05SPeter Zijlstra 	if (migrated) {
17546688cc05SPeter Zijlstra 		flags |= TNF_MIGRATED;
1755667ffc31SKefeng Wang 		nid = target_nid;
1756c5b5a3ddSYang Shi 	} else {
1757074c2381SMel Gorman 		flags |= TNF_MIGRATE_FAIL;
1758c5b5a3ddSYang Shi 		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1759c5b5a3ddSYang Shi 		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
176082b0f8c3SJan Kara 			spin_unlock(vmf->ptl);
1761c5b5a3ddSYang Shi 			goto out;
1762c5b5a3ddSYang Shi 		}
1763c5b5a3ddSYang Shi 		goto out_map;
1764c5b5a3ddSYang Shi 	}
1765b8916634SMel Gorman 
1766b8916634SMel Gorman out:
1767667ffc31SKefeng Wang 	if (nid != NUMA_NO_NODE)
1768667ffc31SKefeng Wang 		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
17698191acbdSMel Gorman 
1770d10e63f2SMel Gorman 	return 0;
1771c5b5a3ddSYang Shi 
1772c5b5a3ddSYang Shi out_map:
1773c5b5a3ddSYang Shi 	/* Restore the PMD */
1774c5b5a3ddSYang Shi 	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1775c5b5a3ddSYang Shi 	pmd = pmd_mkyoung(pmd);
17766a56ccbcSDavid Hildenbrand 	if (writable)
1777161e393cSRick Edgecombe 		pmd = pmd_mkwrite(pmd, vma);
1778c5b5a3ddSYang Shi 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1779c5b5a3ddSYang Shi 	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1780c5b5a3ddSYang Shi 	spin_unlock(vmf->ptl);
1781c5b5a3ddSYang Shi 	goto out;
1782d10e63f2SMel Gorman }
1783d10e63f2SMel Gorman 
1784319904adSHuang Ying /*
1785319904adSHuang Ying  * Return true if we do MADV_FREE successfully on entire pmd page.
1786319904adSHuang Ying  * Otherwise, return false.
1787319904adSHuang Ying  */
1788319904adSHuang Ying bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1789b8d3c4c3SMinchan Kim 		pmd_t *pmd, unsigned long addr, unsigned long next)
1790b8d3c4c3SMinchan Kim {
1791b8d3c4c3SMinchan Kim 	spinlock_t *ptl;
1792b8d3c4c3SMinchan Kim 	pmd_t orig_pmd;
1793fc986a38SKefeng Wang 	struct folio *folio;
1794b8d3c4c3SMinchan Kim 	struct mm_struct *mm = tlb->mm;
1795319904adSHuang Ying 	bool ret = false;
1796b8d3c4c3SMinchan Kim 
1797ed6a7935SPeter Zijlstra 	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
179807e32661SAneesh Kumar K.V 
1799b6ec57f4SKirill A. Shutemov 	ptl = pmd_trans_huge_lock(pmd, vma);
1800b6ec57f4SKirill A. Shutemov 	if (!ptl)
180125eedabeSLinus Torvalds 		goto out_unlocked;
1802b8d3c4c3SMinchan Kim 
1803b8d3c4c3SMinchan Kim 	orig_pmd = *pmd;
1804319904adSHuang Ying 	if (is_huge_zero_pmd(orig_pmd))
1805b8d3c4c3SMinchan Kim 		goto out;
1806b8d3c4c3SMinchan Kim 
180784c3fc4eSZi Yan 	if (unlikely(!pmd_present(orig_pmd))) {
180884c3fc4eSZi Yan 		VM_BUG_ON(thp_migration_supported() &&
180984c3fc4eSZi Yan 				  !is_pmd_migration_entry(orig_pmd));
181084c3fc4eSZi Yan 		goto out;
181184c3fc4eSZi Yan 	}
181284c3fc4eSZi Yan 
1813fc986a38SKefeng Wang 	folio = pfn_folio(pmd_pfn(orig_pmd));
1814b8d3c4c3SMinchan Kim 	/*
1815fc986a38SKefeng Wang 	 * If other processes are mapping this folio, we couldn't discard
1816fc986a38SKefeng Wang 	 * the folio unless they all do MADV_FREE so let's skip the folio.
1817b8d3c4c3SMinchan Kim 	 */
181820b18aadSYin Fengwei 	if (folio_estimated_sharers(folio) != 1)
1819b8d3c4c3SMinchan Kim 		goto out;
1820b8d3c4c3SMinchan Kim 
1821fc986a38SKefeng Wang 	if (!folio_trylock(folio))
1822b8d3c4c3SMinchan Kim 		goto out;
1823b8d3c4c3SMinchan Kim 
1824b8d3c4c3SMinchan Kim 	/*
1825b8d3c4c3SMinchan Kim 	 * If user want to discard part-pages of THP, split it so MADV_FREE
1826b8d3c4c3SMinchan Kim 	 * will deactivate only them.
1827b8d3c4c3SMinchan Kim 	 */
1828b8d3c4c3SMinchan Kim 	if (next - addr != HPAGE_PMD_SIZE) {
1829fc986a38SKefeng Wang 		folio_get(folio);
1830b8d3c4c3SMinchan Kim 		spin_unlock(ptl);
1831fc986a38SKefeng Wang 		split_folio(folio);
1832fc986a38SKefeng Wang 		folio_unlock(folio);
1833fc986a38SKefeng Wang 		folio_put(folio);
1834b8d3c4c3SMinchan Kim 		goto out_unlocked;
1835b8d3c4c3SMinchan Kim 	}
1836b8d3c4c3SMinchan Kim 
1837fc986a38SKefeng Wang 	if (folio_test_dirty(folio))
1838fc986a38SKefeng Wang 		folio_clear_dirty(folio);
1839fc986a38SKefeng Wang 	folio_unlock(folio);
1840b8d3c4c3SMinchan Kim 
1841b8d3c4c3SMinchan Kim 	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
184258ceeb6bSKirill A. Shutemov 		pmdp_invalidate(vma, addr, pmd);
1843b8d3c4c3SMinchan Kim 		orig_pmd = pmd_mkold(orig_pmd);
1844b8d3c4c3SMinchan Kim 		orig_pmd = pmd_mkclean(orig_pmd);
1845b8d3c4c3SMinchan Kim 
1846b8d3c4c3SMinchan Kim 		set_pmd_at(mm, addr, pmd, orig_pmd);
1847b8d3c4c3SMinchan Kim 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1848b8d3c4c3SMinchan Kim 	}
1849802a3a92SShaohua Li 
18506a6fe9ebSKefeng Wang 	folio_mark_lazyfree(folio);
1851319904adSHuang Ying 	ret = true;
1852b8d3c4c3SMinchan Kim out:
1853b8d3c4c3SMinchan Kim 	spin_unlock(ptl);
1854b8d3c4c3SMinchan Kim out_unlocked:
1855b8d3c4c3SMinchan Kim 	return ret;
1856b8d3c4c3SMinchan Kim }
1857b8d3c4c3SMinchan Kim 
1858953c66c2SAneesh Kumar K.V static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1859953c66c2SAneesh Kumar K.V {
1860953c66c2SAneesh Kumar K.V 	pgtable_t pgtable;
1861953c66c2SAneesh Kumar K.V 
1862953c66c2SAneesh Kumar K.V 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1863953c66c2SAneesh Kumar K.V 	pte_free(mm, pgtable);
1864c4812909SKirill A. Shutemov 	mm_dec_nr_ptes(mm);
1865953c66c2SAneesh Kumar K.V }
1866953c66c2SAneesh Kumar K.V 
186771e3aac0SAndrea Arcangeli int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1868f21760b1SShaohua Li 		 pmd_t *pmd, unsigned long addr)
186971e3aac0SAndrea Arcangeli {
1870f5c8ad47SDavid Miller 	pmd_t orig_pmd;
1871da146769SKirill A. Shutemov 	spinlock_t *ptl;
1872da146769SKirill A. Shutemov 
1873ed6a7935SPeter Zijlstra 	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
187407e32661SAneesh Kumar K.V 
1875b6ec57f4SKirill A. Shutemov 	ptl = __pmd_trans_huge_lock(pmd, vma);
1876b6ec57f4SKirill A. Shutemov 	if (!ptl)
1877da146769SKirill A. Shutemov 		return 0;
1878a6bf2bb0SAneesh Kumar K.V 	/*
1879a6bf2bb0SAneesh Kumar K.V 	 * For architectures like ppc64 we look at deposited pgtable
18808809aa2dSAneesh Kumar K.V 	 * when calling pmdp_huge_get_and_clear. So do the
1881a6bf2bb0SAneesh Kumar K.V 	 * pgtable_trans_huge_withdraw after finishing pmdp related
1882a6bf2bb0SAneesh Kumar K.V 	 * operations.
1883a6bf2bb0SAneesh Kumar K.V 	 */
188493a98695SAneesh Kumar K.V 	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
1885fcbe08d6SMartin Schwidefsky 						tlb->fullmm);
1886e5136e87SRick Edgecombe 	arch_check_zapped_pmd(vma, orig_pmd);
1887f21760b1SShaohua Li 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
18882484ca9bSThomas Hellstrom (VMware) 	if (vma_is_special_huge(vma)) {
18893b6521f5SOliver O'Halloran 		if (arch_needs_pgtable_deposit())
18903b6521f5SOliver O'Halloran 			zap_deposited_table(tlb->mm, pmd);
18914897c765SMatthew Wilcox 		spin_unlock(ptl);
1892da146769SKirill A. Shutemov 	} else if (is_huge_zero_pmd(orig_pmd)) {
1893c14a6eb4SOliver O'Halloran 		zap_deposited_table(tlb->mm, pmd);
1894bf929152SKirill A. Shutemov 		spin_unlock(ptl);
1895479f0abbSKirill A. Shutemov 	} else {
1896616b8371SZi Yan 		struct page *page = NULL;
1897616b8371SZi Yan 		int flush_needed = 1;
1898616b8371SZi Yan 
1899616b8371SZi Yan 		if (pmd_present(orig_pmd)) {
1900616b8371SZi Yan 			page = pmd_page(orig_pmd);
1901cea86fe2SHugh Dickins 			page_remove_rmap(page, vma, true);
1902309381feSSasha Levin 			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1903309381feSSasha Levin 			VM_BUG_ON_PAGE(!PageHead(page), page);
1904616b8371SZi Yan 		} else if (thp_migration_supported()) {
1905616b8371SZi Yan 			swp_entry_t entry;
1906616b8371SZi Yan 
1907616b8371SZi Yan 			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1908616b8371SZi Yan 			entry = pmd_to_swp_entry(orig_pmd);
1909af5cdaf8SAlistair Popple 			page = pfn_swap_entry_to_page(entry);
1910616b8371SZi Yan 			flush_needed = 0;
1911616b8371SZi Yan 		} else
1912616b8371SZi Yan 			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1913616b8371SZi Yan 
1914b5072380SKirill A. Shutemov 		if (PageAnon(page)) {
1915c14a6eb4SOliver O'Halloran 			zap_deposited_table(tlb->mm, pmd);
1916b5072380SKirill A. Shutemov 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1917b5072380SKirill A. Shutemov 		} else {
1918953c66c2SAneesh Kumar K.V 			if (arch_needs_pgtable_deposit())
1919953c66c2SAneesh Kumar K.V 				zap_deposited_table(tlb->mm, pmd);
1920fadae295SYang Shi 			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
1921b5072380SKirill A. Shutemov 		}
1922616b8371SZi Yan 
1923bf929152SKirill A. Shutemov 		spin_unlock(ptl);
1924616b8371SZi Yan 		if (flush_needed)
1925e77b0852SAneesh Kumar K.V 			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1926479f0abbSKirill A. Shutemov 	}
1927da146769SKirill A. Shutemov 	return 1;
192871e3aac0SAndrea Arcangeli }
192971e3aac0SAndrea Arcangeli 
19301dd38b6cSAneesh Kumar K.V #ifndef pmd_move_must_withdraw
19311dd38b6cSAneesh Kumar K.V static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
19321dd38b6cSAneesh Kumar K.V 					 spinlock_t *old_pmd_ptl,
19331dd38b6cSAneesh Kumar K.V 					 struct vm_area_struct *vma)
19341dd38b6cSAneesh Kumar K.V {
19351dd38b6cSAneesh Kumar K.V 	/*
19361dd38b6cSAneesh Kumar K.V 	 * With split pmd lock we also need to move preallocated
19371dd38b6cSAneesh Kumar K.V 	 * PTE page table if new_pmd is on different PMD page table.
19381dd38b6cSAneesh Kumar K.V 	 *
19391dd38b6cSAneesh Kumar K.V 	 * We also don't deposit and withdraw tables for file pages.
19401dd38b6cSAneesh Kumar K.V 	 */
19411dd38b6cSAneesh Kumar K.V 	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
19421dd38b6cSAneesh Kumar K.V }
19431dd38b6cSAneesh Kumar K.V #endif
19441dd38b6cSAneesh Kumar K.V 
1945ab6e3d09SNaoya Horiguchi static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1946ab6e3d09SNaoya Horiguchi {
1947ab6e3d09SNaoya Horiguchi #ifdef CONFIG_MEM_SOFT_DIRTY
1948ab6e3d09SNaoya Horiguchi 	if (unlikely(is_pmd_migration_entry(pmd)))
1949ab6e3d09SNaoya Horiguchi 		pmd = pmd_swp_mksoft_dirty(pmd);
1950ab6e3d09SNaoya Horiguchi 	else if (pmd_present(pmd))
1951ab6e3d09SNaoya Horiguchi 		pmd = pmd_mksoft_dirty(pmd);
1952ab6e3d09SNaoya Horiguchi #endif
1953ab6e3d09SNaoya Horiguchi 	return pmd;
1954ab6e3d09SNaoya Horiguchi }
1955ab6e3d09SNaoya Horiguchi 
1956bf8616d5SHugh Dickins bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1957b8aa9d9dSWei Yang 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
195837a1c49aSAndrea Arcangeli {
1959bf929152SKirill A. Shutemov 	spinlock_t *old_ptl, *new_ptl;
196037a1c49aSAndrea Arcangeli 	pmd_t pmd;
196137a1c49aSAndrea Arcangeli 	struct mm_struct *mm = vma->vm_mm;
19625d190420SAaron Lu 	bool force_flush = false;
196337a1c49aSAndrea Arcangeli 
196437a1c49aSAndrea Arcangeli 	/*
196537a1c49aSAndrea Arcangeli 	 * The destination pmd shouldn't be established, free_pgtables()
1966a5be621eSHugh Dickins 	 * should have released it; but move_page_tables() might have already
1967a5be621eSHugh Dickins 	 * inserted a page table, if racing against shmem/file collapse.
196837a1c49aSAndrea Arcangeli 	 */
1969a5be621eSHugh Dickins 	if (!pmd_none(*new_pmd)) {
197037a1c49aSAndrea Arcangeli 		VM_BUG_ON(pmd_trans_huge(*new_pmd));
19714b471e88SKirill A. Shutemov 		return false;
197237a1c49aSAndrea Arcangeli 	}
197337a1c49aSAndrea Arcangeli 
1974bf929152SKirill A. Shutemov 	/*
1975bf929152SKirill A. Shutemov 	 * We don't have to worry about the ordering of src and dst
1976c1e8d7c6SMichel Lespinasse 	 * ptlocks because exclusive mmap_lock prevents deadlock.
1977bf929152SKirill A. Shutemov 	 */
1978b6ec57f4SKirill A. Shutemov 	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1979b6ec57f4SKirill A. Shutemov 	if (old_ptl) {
1980bf929152SKirill A. Shutemov 		new_ptl = pmd_lockptr(mm, new_pmd);
1981bf929152SKirill A. Shutemov 		if (new_ptl != old_ptl)
1982bf929152SKirill A. Shutemov 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
19838809aa2dSAneesh Kumar K.V 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1984eb66ae03SLinus Torvalds 		if (pmd_present(pmd))
1985a2ce2666SAaron Lu 			force_flush = true;
198637a1c49aSAndrea Arcangeli 		VM_BUG_ON(!pmd_none(*new_pmd));
19873592806cSKirill A. Shutemov 
19881dd38b6cSAneesh Kumar K.V 		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1989b3084f4dSAneesh Kumar K.V 			pgtable_t pgtable;
19903592806cSKirill A. Shutemov 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
19913592806cSKirill A. Shutemov 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
19923592806cSKirill A. Shutemov 		}
1993ab6e3d09SNaoya Horiguchi 		pmd = move_soft_dirty_pmd(pmd);
1994ab6e3d09SNaoya Horiguchi 		set_pmd_at(mm, new_addr, new_pmd, pmd);
19955d190420SAaron Lu 		if (force_flush)
19967c38f181SMiaohe Lin 			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1997eb66ae03SLinus Torvalds 		if (new_ptl != old_ptl)
1998eb66ae03SLinus Torvalds 			spin_unlock(new_ptl);
1999bf929152SKirill A. Shutemov 		spin_unlock(old_ptl);
20004b471e88SKirill A. Shutemov 		return true;
200137a1c49aSAndrea Arcangeli 	}
20024b471e88SKirill A. Shutemov 	return false;
200337a1c49aSAndrea Arcangeli }
200437a1c49aSAndrea Arcangeli 
2005f123d74aSMel Gorman /*
2006f123d74aSMel Gorman  * Returns
2007f123d74aSMel Gorman  *  - 0 if PMD could not be locked
2008f0953a1bSIngo Molnar  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2009e346e668SYang Shi  *      or if prot_numa but THP migration is not supported
2010f0953a1bSIngo Molnar  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
2011f123d74aSMel Gorman  */
20124a18419fSNadav Amit int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
20134a18419fSNadav Amit 		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
20144a18419fSNadav Amit 		    unsigned long cp_flags)
2015cd7548abSJohannes Weiner {
2016cd7548abSJohannes Weiner 	struct mm_struct *mm = vma->vm_mm;
2017bf929152SKirill A. Shutemov 	spinlock_t *ptl;
2018c9fe6656SNadav Amit 	pmd_t oldpmd, entry;
201958705444SPeter Xu 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2020292924b2SPeter Xu 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2021292924b2SPeter Xu 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
20226a56ccbcSDavid Hildenbrand 	int ret = 1;
2023cd7548abSJohannes Weiner 
20244a18419fSNadav Amit 	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
20254a18419fSNadav Amit 
2026e346e668SYang Shi 	if (prot_numa && !thp_migration_supported())
2027e346e668SYang Shi 		return 1;
2028e346e668SYang Shi 
2029b6ec57f4SKirill A. Shutemov 	ptl = __pmd_trans_huge_lock(pmd, vma);
20300a85e51dSKirill A. Shutemov 	if (!ptl)
20310a85e51dSKirill A. Shutemov 		return 0;
20320a85e51dSKirill A. Shutemov 
203384c3fc4eSZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
203484c3fc4eSZi Yan 	if (is_swap_pmd(*pmd)) {
203584c3fc4eSZi Yan 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
2036d986ba2bSKefeng Wang 		struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
203724bf08c4SDavid Hildenbrand 		pmd_t newpmd;
203884c3fc4eSZi Yan 
203984c3fc4eSZi Yan 		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
20404dd845b5SAlistair Popple 		if (is_writable_migration_entry(entry)) {
204184c3fc4eSZi Yan 			/*
204284c3fc4eSZi Yan 			 * A protection check is difficult so
204384c3fc4eSZi Yan 			 * just be safe and disable write
204484c3fc4eSZi Yan 			 */
2045d986ba2bSKefeng Wang 			if (folio_test_anon(folio))
20466c287605SDavid Hildenbrand 				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
20476c287605SDavid Hildenbrand 			else
20486c287605SDavid Hildenbrand 				entry = make_readable_migration_entry(swp_offset(entry));
204984c3fc4eSZi Yan 			newpmd = swp_entry_to_pmd(entry);
2050ab6e3d09SNaoya Horiguchi 			if (pmd_swp_soft_dirty(*pmd))
2051ab6e3d09SNaoya Horiguchi 				newpmd = pmd_swp_mksoft_dirty(newpmd);
205224bf08c4SDavid Hildenbrand 		} else {
205324bf08c4SDavid Hildenbrand 			newpmd = *pmd;
205484c3fc4eSZi Yan 		}
205524bf08c4SDavid Hildenbrand 
205624bf08c4SDavid Hildenbrand 		if (uffd_wp)
205724bf08c4SDavid Hildenbrand 			newpmd = pmd_swp_mkuffd_wp(newpmd);
205824bf08c4SDavid Hildenbrand 		else if (uffd_wp_resolve)
205924bf08c4SDavid Hildenbrand 			newpmd = pmd_swp_clear_uffd_wp(newpmd);
206024bf08c4SDavid Hildenbrand 		if (!pmd_same(*pmd, newpmd))
206124bf08c4SDavid Hildenbrand 			set_pmd_at(mm, addr, pmd, newpmd);
206284c3fc4eSZi Yan 		goto unlock;
206384c3fc4eSZi Yan 	}
206484c3fc4eSZi Yan #endif
206584c3fc4eSZi Yan 
2066a1a3a2fcSHuang Ying 	if (prot_numa) {
2067d986ba2bSKefeng Wang 		struct folio *folio;
206833024536SHuang Ying 		bool toptier;
2069e944fd67SMel Gorman 		/*
2070e944fd67SMel Gorman 		 * Avoid trapping faults against the zero page. The read-only
2071e944fd67SMel Gorman 		 * data is likely to be read-cached on the local CPU and
2072e944fd67SMel Gorman 		 * local/remote hits to the zero page are not interesting.
2073e944fd67SMel Gorman 		 */
2074a1a3a2fcSHuang Ying 		if (is_huge_zero_pmd(*pmd))
20750a85e51dSKirill A. Shutemov 			goto unlock;
2076e944fd67SMel Gorman 
2077a1a3a2fcSHuang Ying 		if (pmd_protnone(*pmd))
20780a85e51dSKirill A. Shutemov 			goto unlock;
20790a85e51dSKirill A. Shutemov 
2080d986ba2bSKefeng Wang 		folio = page_folio(pmd_page(*pmd));
2081d986ba2bSKefeng Wang 		toptier = node_is_toptier(folio_nid(folio));
2082a1a3a2fcSHuang Ying 		/*
2083a1a3a2fcSHuang Ying 		 * Skip scanning top tier node if normal numa
2084a1a3a2fcSHuang Ying 		 * balancing is disabled
2085a1a3a2fcSHuang Ying 		 */
2086a1a3a2fcSHuang Ying 		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
208733024536SHuang Ying 		    toptier)
2088a1a3a2fcSHuang Ying 			goto unlock;
208933024536SHuang Ying 
209033024536SHuang Ying 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
209133024536SHuang Ying 		    !toptier)
2092d986ba2bSKefeng Wang 			folio_xchg_access_time(folio,
2093d986ba2bSKefeng Wang 					       jiffies_to_msecs(jiffies));
2094a1a3a2fcSHuang Ying 	}
2095ced10803SKirill A. Shutemov 	/*
20963e4e28c5SMichel Lespinasse 	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
2097ced10803SKirill A. Shutemov 	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
20983e4e28c5SMichel Lespinasse 	 * which is also under mmap_read_lock(mm):
2099ced10803SKirill A. Shutemov 	 *
2100ced10803SKirill A. Shutemov 	 *	CPU0:				CPU1:
2101ced10803SKirill A. Shutemov 	 *				change_huge_pmd(prot_numa=1)
2102ced10803SKirill A. Shutemov 	 *				 pmdp_huge_get_and_clear_notify()
2103ced10803SKirill A. Shutemov 	 * madvise_dontneed()
2104ced10803SKirill A. Shutemov 	 *  zap_pmd_range()
2105ced10803SKirill A. Shutemov 	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
2106ced10803SKirill A. Shutemov 	 *   // skip the pmd
2107ced10803SKirill A. Shutemov 	 *				 set_pmd_at();
2108ced10803SKirill A. Shutemov 	 *				 // pmd is re-established
2109ced10803SKirill A. Shutemov 	 *
2110ced10803SKirill A. Shutemov 	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
2111ced10803SKirill A. Shutemov 	 * which may break userspace.
2112ced10803SKirill A. Shutemov 	 *
21134f831457SNadav Amit 	 * pmdp_invalidate_ad() is required to make sure we don't miss
2114ced10803SKirill A. Shutemov 	 * dirty/young flags set by hardware.
2115ced10803SKirill A. Shutemov 	 */
21164f831457SNadav Amit 	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2117ced10803SKirill A. Shutemov 
2118c9fe6656SNadav Amit 	entry = pmd_modify(oldpmd, newprot);
2119f1eb1bacSPeter Xu 	if (uffd_wp)
2120292924b2SPeter Xu 		entry = pmd_mkuffd_wp(entry);
2121f1eb1bacSPeter Xu 	else if (uffd_wp_resolve)
2122292924b2SPeter Xu 		/*
2123292924b2SPeter Xu 		 * Leave the write bit to be handled by PF interrupt
2124292924b2SPeter Xu 		 * handler, then things like COW could be properly
2125292924b2SPeter Xu 		 * handled.
2126292924b2SPeter Xu 		 */
2127292924b2SPeter Xu 		entry = pmd_clear_uffd_wp(entry);
2128c27f479eSDavid Hildenbrand 
2129c27f479eSDavid Hildenbrand 	/* See change_pte_range(). */
2130c27f479eSDavid Hildenbrand 	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2131c27f479eSDavid Hildenbrand 	    can_change_pmd_writable(vma, addr, entry))
2132161e393cSRick Edgecombe 		entry = pmd_mkwrite(entry, vma);
2133c27f479eSDavid Hildenbrand 
2134f123d74aSMel Gorman 	ret = HPAGE_PMD_NR;
213556eecdb9SAneesh Kumar K.V 	set_pmd_at(mm, addr, pmd, entry);
21364a18419fSNadav Amit 
2137c9fe6656SNadav Amit 	if (huge_pmd_needs_flush(oldpmd, entry))
21384a18419fSNadav Amit 		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
21390a85e51dSKirill A. Shutemov unlock:
2140bf929152SKirill A. Shutemov 	spin_unlock(ptl);
2141cd7548abSJohannes Weiner 	return ret;
2142cd7548abSJohannes Weiner }
2143cd7548abSJohannes Weiner 
2144adef4406SAndrea Arcangeli #ifdef CONFIG_USERFAULTFD
2145adef4406SAndrea Arcangeli /*
2146adef4406SAndrea Arcangeli  * The PT lock for src_pmd and the mmap_lock for reading are held by
2147adef4406SAndrea Arcangeli  * the caller, but it must return after releasing the page_table_lock.
2148adef4406SAndrea Arcangeli  * Just move the page from src_pmd to dst_pmd if possible.
2149adef4406SAndrea Arcangeli  * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2150adef4406SAndrea Arcangeli  * repeated by the caller, or other errors in case of failure.
2151adef4406SAndrea Arcangeli  */
2152adef4406SAndrea Arcangeli int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2153adef4406SAndrea Arcangeli 			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2154adef4406SAndrea Arcangeli 			unsigned long dst_addr, unsigned long src_addr)
2155adef4406SAndrea Arcangeli {
2156adef4406SAndrea Arcangeli 	pmd_t _dst_pmd, src_pmdval;
2157adef4406SAndrea Arcangeli 	struct page *src_page;
2158adef4406SAndrea Arcangeli 	struct folio *src_folio;
2159adef4406SAndrea Arcangeli 	struct anon_vma *src_anon_vma;
2160adef4406SAndrea Arcangeli 	spinlock_t *src_ptl, *dst_ptl;
2161adef4406SAndrea Arcangeli 	pgtable_t src_pgtable;
2162adef4406SAndrea Arcangeli 	struct mmu_notifier_range range;
2163adef4406SAndrea Arcangeli 	int err = 0;
2164adef4406SAndrea Arcangeli 
2165adef4406SAndrea Arcangeli 	src_pmdval = *src_pmd;
2166adef4406SAndrea Arcangeli 	src_ptl = pmd_lockptr(mm, src_pmd);
2167adef4406SAndrea Arcangeli 
2168adef4406SAndrea Arcangeli 	lockdep_assert_held(src_ptl);
2169adef4406SAndrea Arcangeli 	mmap_assert_locked(mm);
2170adef4406SAndrea Arcangeli 
2171adef4406SAndrea Arcangeli 	/* Sanity checks before the operation */
2172adef4406SAndrea Arcangeli 	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2173adef4406SAndrea Arcangeli 	    WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2174adef4406SAndrea Arcangeli 		spin_unlock(src_ptl);
2175adef4406SAndrea Arcangeli 		return -EINVAL;
2176adef4406SAndrea Arcangeli 	}
2177adef4406SAndrea Arcangeli 
2178adef4406SAndrea Arcangeli 	if (!pmd_trans_huge(src_pmdval)) {
2179adef4406SAndrea Arcangeli 		spin_unlock(src_ptl);
2180adef4406SAndrea Arcangeli 		if (is_pmd_migration_entry(src_pmdval)) {
2181adef4406SAndrea Arcangeli 			pmd_migration_entry_wait(mm, &src_pmdval);
2182adef4406SAndrea Arcangeli 			return -EAGAIN;
2183adef4406SAndrea Arcangeli 		}
2184adef4406SAndrea Arcangeli 		return -ENOENT;
2185adef4406SAndrea Arcangeli 	}
2186adef4406SAndrea Arcangeli 
2187adef4406SAndrea Arcangeli 	src_page = pmd_page(src_pmdval);
2188adef4406SAndrea Arcangeli 	if (unlikely(!PageAnonExclusive(src_page))) {
2189adef4406SAndrea Arcangeli 		spin_unlock(src_ptl);
2190adef4406SAndrea Arcangeli 		return -EBUSY;
2191adef4406SAndrea Arcangeli 	}
2192adef4406SAndrea Arcangeli 
2193adef4406SAndrea Arcangeli 	src_folio = page_folio(src_page);
2194adef4406SAndrea Arcangeli 	folio_get(src_folio);
2195adef4406SAndrea Arcangeli 	spin_unlock(src_ptl);
2196adef4406SAndrea Arcangeli 
2197adef4406SAndrea Arcangeli 	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2198adef4406SAndrea Arcangeli 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2199adef4406SAndrea Arcangeli 				src_addr + HPAGE_PMD_SIZE);
2200adef4406SAndrea Arcangeli 	mmu_notifier_invalidate_range_start(&range);
2201adef4406SAndrea Arcangeli 
2202adef4406SAndrea Arcangeli 	folio_lock(src_folio);
2203adef4406SAndrea Arcangeli 
2204adef4406SAndrea Arcangeli 	/*
2205adef4406SAndrea Arcangeli 	 * split_huge_page walks the anon_vma chain without the page
2206adef4406SAndrea Arcangeli 	 * lock. Serialize against it with the anon_vma lock, the page
2207adef4406SAndrea Arcangeli 	 * lock is not enough.
2208adef4406SAndrea Arcangeli 	 */
2209adef4406SAndrea Arcangeli 	src_anon_vma = folio_get_anon_vma(src_folio);
2210adef4406SAndrea Arcangeli 	if (!src_anon_vma) {
2211adef4406SAndrea Arcangeli 		err = -EAGAIN;
2212adef4406SAndrea Arcangeli 		goto unlock_folio;
2213adef4406SAndrea Arcangeli 	}
2214adef4406SAndrea Arcangeli 	anon_vma_lock_write(src_anon_vma);
2215adef4406SAndrea Arcangeli 
2216adef4406SAndrea Arcangeli 	dst_ptl = pmd_lockptr(mm, dst_pmd);
2217adef4406SAndrea Arcangeli 	double_pt_lock(src_ptl, dst_ptl);
2218adef4406SAndrea Arcangeli 	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2219adef4406SAndrea Arcangeli 		     !pmd_same(*dst_pmd, dst_pmdval))) {
2220adef4406SAndrea Arcangeli 		err = -EAGAIN;
2221adef4406SAndrea Arcangeli 		goto unlock_ptls;
2222adef4406SAndrea Arcangeli 	}
2223adef4406SAndrea Arcangeli 	if (folio_maybe_dma_pinned(src_folio) ||
2224adef4406SAndrea Arcangeli 	    !PageAnonExclusive(&src_folio->page)) {
2225adef4406SAndrea Arcangeli 		err = -EBUSY;
2226adef4406SAndrea Arcangeli 		goto unlock_ptls;
2227adef4406SAndrea Arcangeli 	}
2228adef4406SAndrea Arcangeli 
2229adef4406SAndrea Arcangeli 	if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2230adef4406SAndrea Arcangeli 	    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2231adef4406SAndrea Arcangeli 		err = -EBUSY;
2232adef4406SAndrea Arcangeli 		goto unlock_ptls;
2233adef4406SAndrea Arcangeli 	}
2234adef4406SAndrea Arcangeli 
2235adef4406SAndrea Arcangeli 	folio_move_anon_rmap(src_folio, dst_vma);
2236adef4406SAndrea Arcangeli 	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
2237adef4406SAndrea Arcangeli 
2238adef4406SAndrea Arcangeli 	src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2239adef4406SAndrea Arcangeli 	/* Folio got pinned from under us. Put it back and fail the move. */
2240adef4406SAndrea Arcangeli 	if (folio_maybe_dma_pinned(src_folio)) {
2241adef4406SAndrea Arcangeli 		set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2242adef4406SAndrea Arcangeli 		err = -EBUSY;
2243adef4406SAndrea Arcangeli 		goto unlock_ptls;
2244adef4406SAndrea Arcangeli 	}
2245adef4406SAndrea Arcangeli 
2246adef4406SAndrea Arcangeli 	_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
2247adef4406SAndrea Arcangeli 	/* Follow mremap() behavior and treat the entry dirty after the move */
2248adef4406SAndrea Arcangeli 	_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2249adef4406SAndrea Arcangeli 	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2250adef4406SAndrea Arcangeli 
2251adef4406SAndrea Arcangeli 	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2252adef4406SAndrea Arcangeli 	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2253adef4406SAndrea Arcangeli unlock_ptls:
2254adef4406SAndrea Arcangeli 	double_pt_unlock(src_ptl, dst_ptl);
2255adef4406SAndrea Arcangeli 	anon_vma_unlock_write(src_anon_vma);
2256adef4406SAndrea Arcangeli 	put_anon_vma(src_anon_vma);
2257adef4406SAndrea Arcangeli unlock_folio:
2258adef4406SAndrea Arcangeli 	/* unblock rmap walks */
2259adef4406SAndrea Arcangeli 	folio_unlock(src_folio);
2260adef4406SAndrea Arcangeli 	mmu_notifier_invalidate_range_end(&range);
2261adef4406SAndrea Arcangeli 	folio_put(src_folio);
2262adef4406SAndrea Arcangeli 	return err;
2263adef4406SAndrea Arcangeli }
2264adef4406SAndrea Arcangeli #endif /* CONFIG_USERFAULTFD */
2265adef4406SAndrea Arcangeli 
2266025c5b24SNaoya Horiguchi /*
22678f19b0c0SHuang Ying  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2268025c5b24SNaoya Horiguchi  *
22698f19b0c0SHuang Ying  * Note that if it returns page table lock pointer, this routine returns without
22708f19b0c0SHuang Ying  * unlocking page table lock. So callers must unlock it.
2271025c5b24SNaoya Horiguchi  */
2272b6ec57f4SKirill A. Shutemov spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2273025c5b24SNaoya Horiguchi {
2274b6ec57f4SKirill A. Shutemov 	spinlock_t *ptl;
2275b6ec57f4SKirill A. Shutemov 	ptl = pmd_lock(vma->vm_mm, pmd);
227684c3fc4eSZi Yan 	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
227784c3fc4eSZi Yan 			pmd_devmap(*pmd)))
2278b6ec57f4SKirill A. Shutemov 		return ptl;
2279b6ec57f4SKirill A. Shutemov 	spin_unlock(ptl);
2280b6ec57f4SKirill A. Shutemov 	return NULL;
2281025c5b24SNaoya Horiguchi }
2282025c5b24SNaoya Horiguchi 
2283a00cc7d9SMatthew Wilcox /*
2284d965e390SMiaohe Lin  * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2285a00cc7d9SMatthew Wilcox  *
2286d965e390SMiaohe Lin  * Note that if it returns page table lock pointer, this routine returns without
2287d965e390SMiaohe Lin  * unlocking page table lock. So callers must unlock it.
2288a00cc7d9SMatthew Wilcox  */
2289a00cc7d9SMatthew Wilcox spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2290a00cc7d9SMatthew Wilcox {
2291a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
2292a00cc7d9SMatthew Wilcox 
2293a00cc7d9SMatthew Wilcox 	ptl = pud_lock(vma->vm_mm, pud);
2294a00cc7d9SMatthew Wilcox 	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2295a00cc7d9SMatthew Wilcox 		return ptl;
2296a00cc7d9SMatthew Wilcox 	spin_unlock(ptl);
2297a00cc7d9SMatthew Wilcox 	return NULL;
2298a00cc7d9SMatthew Wilcox }
2299a00cc7d9SMatthew Wilcox 
2300a00cc7d9SMatthew Wilcox #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2301a00cc7d9SMatthew Wilcox int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2302a00cc7d9SMatthew Wilcox 		 pud_t *pud, unsigned long addr)
2303a00cc7d9SMatthew Wilcox {
2304a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
2305a00cc7d9SMatthew Wilcox 
2306a00cc7d9SMatthew Wilcox 	ptl = __pud_trans_huge_lock(pud, vma);
2307a00cc7d9SMatthew Wilcox 	if (!ptl)
2308a00cc7d9SMatthew Wilcox 		return 0;
230974929079SMiaohe Lin 
2310f32928abSAneesh Kumar K.V 	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2311a00cc7d9SMatthew Wilcox 	tlb_remove_pud_tlb_entry(tlb, pud, addr);
23122484ca9bSThomas Hellstrom (VMware) 	if (vma_is_special_huge(vma)) {
2313a00cc7d9SMatthew Wilcox 		spin_unlock(ptl);
2314a00cc7d9SMatthew Wilcox 		/* No zero page support yet */
2315a00cc7d9SMatthew Wilcox 	} else {
2316a00cc7d9SMatthew Wilcox 		/* No support for anonymous PUD pages yet */
2317a00cc7d9SMatthew Wilcox 		BUG();
2318a00cc7d9SMatthew Wilcox 	}
2319a00cc7d9SMatthew Wilcox 	return 1;
2320a00cc7d9SMatthew Wilcox }
2321a00cc7d9SMatthew Wilcox 
2322a00cc7d9SMatthew Wilcox static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2323a00cc7d9SMatthew Wilcox 		unsigned long haddr)
2324a00cc7d9SMatthew Wilcox {
2325a00cc7d9SMatthew Wilcox 	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2326a00cc7d9SMatthew Wilcox 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2327a00cc7d9SMatthew Wilcox 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2328a00cc7d9SMatthew Wilcox 	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2329a00cc7d9SMatthew Wilcox 
2330ce9311cfSYisheng Xie 	count_vm_event(THP_SPLIT_PUD);
2331a00cc7d9SMatthew Wilcox 
2332ec8832d0SAlistair Popple 	pudp_huge_clear_flush(vma, haddr, pud);
2333a00cc7d9SMatthew Wilcox }
2334a00cc7d9SMatthew Wilcox 
2335a00cc7d9SMatthew Wilcox void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2336a00cc7d9SMatthew Wilcox 		unsigned long address)
2337a00cc7d9SMatthew Wilcox {
2338a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
2339ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
2340a00cc7d9SMatthew Wilcox 
23417d4a8be0SAlistair Popple 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
23426f4f13e8SJérôme Glisse 				address & HPAGE_PUD_MASK,
2343ac46d4f3SJérôme Glisse 				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2344ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
2345ac46d4f3SJérôme Glisse 	ptl = pud_lock(vma->vm_mm, pud);
2346a00cc7d9SMatthew Wilcox 	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2347a00cc7d9SMatthew Wilcox 		goto out;
2348ac46d4f3SJérôme Glisse 	__split_huge_pud_locked(vma, pud, range.start);
2349a00cc7d9SMatthew Wilcox 
2350a00cc7d9SMatthew Wilcox out:
2351a00cc7d9SMatthew Wilcox 	spin_unlock(ptl);
2352ec8832d0SAlistair Popple 	mmu_notifier_invalidate_range_end(&range);
2353a00cc7d9SMatthew Wilcox }
2354a00cc7d9SMatthew Wilcox #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2355a00cc7d9SMatthew Wilcox 
2356eef1b3baSKirill A. Shutemov static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2357eef1b3baSKirill A. Shutemov 		unsigned long haddr, pmd_t *pmd)
2358eef1b3baSKirill A. Shutemov {
2359eef1b3baSKirill A. Shutemov 	struct mm_struct *mm = vma->vm_mm;
2360eef1b3baSKirill A. Shutemov 	pgtable_t pgtable;
236142b2af2cSDavid Hildenbrand 	pmd_t _pmd, old_pmd;
2362c9c1ee20SHugh Dickins 	unsigned long addr;
2363c9c1ee20SHugh Dickins 	pte_t *pte;
2364eef1b3baSKirill A. Shutemov 	int i;
2365eef1b3baSKirill A. Shutemov 
23660f10851eSJérôme Glisse 	/*
23670f10851eSJérôme Glisse 	 * Leave pmd empty until pte is filled note that it is fine to delay
23680f10851eSJérôme Glisse 	 * notification until mmu_notifier_invalidate_range_end() as we are
23690f10851eSJérôme Glisse 	 * replacing a zero pmd write protected page with a zero pte write
23700f10851eSJérôme Glisse 	 * protected page.
23710f10851eSJérôme Glisse 	 *
2372ee65728eSMike Rapoport 	 * See Documentation/mm/mmu_notifier.rst
23730f10851eSJérôme Glisse 	 */
237442b2af2cSDavid Hildenbrand 	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2375eef1b3baSKirill A. Shutemov 
2376eef1b3baSKirill A. Shutemov 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2377eef1b3baSKirill A. Shutemov 	pmd_populate(mm, &_pmd, pgtable);
2378eef1b3baSKirill A. Shutemov 
2379c9c1ee20SHugh Dickins 	pte = pte_offset_map(&_pmd, haddr);
2380c9c1ee20SHugh Dickins 	VM_BUG_ON(!pte);
2381c9c1ee20SHugh Dickins 	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2382c9c1ee20SHugh Dickins 		pte_t entry;
2383c9c1ee20SHugh Dickins 
2384c9c1ee20SHugh Dickins 		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2385eef1b3baSKirill A. Shutemov 		entry = pte_mkspecial(entry);
238642b2af2cSDavid Hildenbrand 		if (pmd_uffd_wp(old_pmd))
238742b2af2cSDavid Hildenbrand 			entry = pte_mkuffd_wp(entry);
2388c33c7948SRyan Roberts 		VM_BUG_ON(!pte_none(ptep_get(pte)));
2389c9c1ee20SHugh Dickins 		set_pte_at(mm, addr, pte, entry);
2390c9c1ee20SHugh Dickins 		pte++;
2391eef1b3baSKirill A. Shutemov 	}
2392c9c1ee20SHugh Dickins 	pte_unmap(pte - 1);
2393eef1b3baSKirill A. Shutemov 	smp_wmb(); /* make pte visible before pmd */
2394eef1b3baSKirill A. Shutemov 	pmd_populate(mm, pmd, pgtable);
2395eef1b3baSKirill A. Shutemov }
2396eef1b3baSKirill A. Shutemov 
2397eef1b3baSKirill A. Shutemov static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2398ba988280SKirill A. Shutemov 		unsigned long haddr, bool freeze)
2399eef1b3baSKirill A. Shutemov {
2400eef1b3baSKirill A. Shutemov 	struct mm_struct *mm = vma->vm_mm;
2401*91b2978aSDavid Hildenbrand 	struct folio *folio;
2402eef1b3baSKirill A. Shutemov 	struct page *page;
2403eef1b3baSKirill A. Shutemov 	pgtable_t pgtable;
2404423ac9afSAneesh Kumar K.V 	pmd_t old_pmd, _pmd;
2405292924b2SPeter Xu 	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
24060ccf7f16SPeter Xu 	bool anon_exclusive = false, dirty = false;
24072ac015e2SKirill A. Shutemov 	unsigned long addr;
2408c9c1ee20SHugh Dickins 	pte_t *pte;
2409eef1b3baSKirill A. Shutemov 	int i;
2410eef1b3baSKirill A. Shutemov 
2411eef1b3baSKirill A. Shutemov 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2412eef1b3baSKirill A. Shutemov 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2413eef1b3baSKirill A. Shutemov 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
241484c3fc4eSZi Yan 	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
241584c3fc4eSZi Yan 				&& !pmd_devmap(*pmd));
2416eef1b3baSKirill A. Shutemov 
2417eef1b3baSKirill A. Shutemov 	count_vm_event(THP_SPLIT_PMD);
2418eef1b3baSKirill A. Shutemov 
2419d21b9e57SKirill A. Shutemov 	if (!vma_is_anonymous(vma)) {
2420ec8832d0SAlistair Popple 		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2421953c66c2SAneesh Kumar K.V 		/*
2422953c66c2SAneesh Kumar K.V 		 * We are going to unmap this huge page. So
2423953c66c2SAneesh Kumar K.V 		 * just go ahead and zap it
2424953c66c2SAneesh Kumar K.V 		 */
2425953c66c2SAneesh Kumar K.V 		if (arch_needs_pgtable_deposit())
2426953c66c2SAneesh Kumar K.V 			zap_deposited_table(mm, pmd);
24272484ca9bSThomas Hellstrom (VMware) 		if (vma_is_special_huge(vma))
2428d21b9e57SKirill A. Shutemov 			return;
242999fa8a48SHugh Dickins 		if (unlikely(is_pmd_migration_entry(old_pmd))) {
243099fa8a48SHugh Dickins 			swp_entry_t entry;
243199fa8a48SHugh Dickins 
243299fa8a48SHugh Dickins 			entry = pmd_to_swp_entry(old_pmd);
2433af5cdaf8SAlistair Popple 			page = pfn_swap_entry_to_page(entry);
243499fa8a48SHugh Dickins 		} else {
243599fa8a48SHugh Dickins 			page = pmd_page(old_pmd);
243699fa8a48SHugh Dickins 			if (!PageDirty(page) && pmd_dirty(old_pmd))
2437e1f1b157SHugh Dickins 				set_page_dirty(page);
243899fa8a48SHugh Dickins 			if (!PageReferenced(page) && pmd_young(old_pmd))
2439d21b9e57SKirill A. Shutemov 				SetPageReferenced(page);
2440cea86fe2SHugh Dickins 			page_remove_rmap(page, vma, true);
2441d21b9e57SKirill A. Shutemov 			put_page(page);
244299fa8a48SHugh Dickins 		}
2443fadae295SYang Shi 		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
2444eef1b3baSKirill A. Shutemov 		return;
244599fa8a48SHugh Dickins 	}
244699fa8a48SHugh Dickins 
24473b77e8c8SHugh Dickins 	if (is_huge_zero_pmd(*pmd)) {
24484645b9feSJérôme Glisse 		/*
24494645b9feSJérôme Glisse 		 * FIXME: Do we want to invalidate secondary mmu by calling
24501af5a810SAlistair Popple 		 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
24511af5a810SAlistair Popple 		 * inside __split_huge_pmd() ?
24524645b9feSJérôme Glisse 		 *
24534645b9feSJérôme Glisse 		 * We are going from a zero huge page write protected to zero
24544645b9feSJérôme Glisse 		 * small page also write protected so it does not seems useful
24554645b9feSJérôme Glisse 		 * to invalidate secondary mmu at this time.
24564645b9feSJérôme Glisse 		 */
2457eef1b3baSKirill A. Shutemov 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
2458eef1b3baSKirill A. Shutemov 	}
2459eef1b3baSKirill A. Shutemov 
2460423ac9afSAneesh Kumar K.V 	/*
2461423ac9afSAneesh Kumar K.V 	 * Up to this point the pmd is present and huge and userland has the
2462423ac9afSAneesh Kumar K.V 	 * whole access to the hugepage during the split (which happens in
2463423ac9afSAneesh Kumar K.V 	 * place). If we overwrite the pmd with the not-huge version pointing
2464423ac9afSAneesh Kumar K.V 	 * to the pte here (which of course we could if all CPUs were bug
2465423ac9afSAneesh Kumar K.V 	 * free), userland could trigger a small page size TLB miss on the
2466423ac9afSAneesh Kumar K.V 	 * small sized TLB while the hugepage TLB entry is still established in
2467423ac9afSAneesh Kumar K.V 	 * the huge TLB. Some CPU doesn't like that.
246842742d9bSAlexander A. Klimov 	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
246942742d9bSAlexander A. Klimov 	 * 383 on page 105. Intel should be safe but is also warns that it's
2470423ac9afSAneesh Kumar K.V 	 * only safe if the permission and cache attributes of the two entries
2471423ac9afSAneesh Kumar K.V 	 * loaded in the two TLB is identical (which should be the case here).
2472423ac9afSAneesh Kumar K.V 	 * But it is generally safer to never allow small and huge TLB entries
2473423ac9afSAneesh Kumar K.V 	 * for the same virtual address to be loaded simultaneously. So instead
2474423ac9afSAneesh Kumar K.V 	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2475423ac9afSAneesh Kumar K.V 	 * current pmd notpresent (atomically because here the pmd_trans_huge
2476423ac9afSAneesh Kumar K.V 	 * must remain set at all times on the pmd until the split is complete
2477423ac9afSAneesh Kumar K.V 	 * for this pmd), then we flush the SMP TLB and finally we write the
2478423ac9afSAneesh Kumar K.V 	 * non-huge version of the pmd entry with pmd_populate.
2479423ac9afSAneesh Kumar K.V 	 */
2480423ac9afSAneesh Kumar K.V 	old_pmd = pmdp_invalidate(vma, haddr, pmd);
2481423ac9afSAneesh Kumar K.V 
2482423ac9afSAneesh Kumar K.V 	pmd_migration = is_pmd_migration_entry(old_pmd);
24832e83ee1dSPeter Xu 	if (unlikely(pmd_migration)) {
248484c3fc4eSZi Yan 		swp_entry_t entry;
248584c3fc4eSZi Yan 
2486423ac9afSAneesh Kumar K.V 		entry = pmd_to_swp_entry(old_pmd);
2487af5cdaf8SAlistair Popple 		page = pfn_swap_entry_to_page(entry);
24884dd845b5SAlistair Popple 		write = is_writable_migration_entry(entry);
24896c287605SDavid Hildenbrand 		if (PageAnon(page))
24906c287605SDavid Hildenbrand 			anon_exclusive = is_readable_exclusive_migration_entry(entry);
24912e346877SPeter Xu 		young = is_migration_entry_young(entry);
24922e346877SPeter Xu 		dirty = is_migration_entry_dirty(entry);
24932e83ee1dSPeter Xu 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
2494f45ec5ffSPeter Xu 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
24952e83ee1dSPeter Xu 	} else {
2496423ac9afSAneesh Kumar K.V 		page = pmd_page(old_pmd);
2497*91b2978aSDavid Hildenbrand 		folio = page_folio(page);
24980ccf7f16SPeter Xu 		if (pmd_dirty(old_pmd)) {
24990ccf7f16SPeter Xu 			dirty = true;
2500*91b2978aSDavid Hildenbrand 			folio_set_dirty(folio);
25010ccf7f16SPeter Xu 		}
2502423ac9afSAneesh Kumar K.V 		write = pmd_write(old_pmd);
2503423ac9afSAneesh Kumar K.V 		young = pmd_young(old_pmd);
2504423ac9afSAneesh Kumar K.V 		soft_dirty = pmd_soft_dirty(old_pmd);
2505292924b2SPeter Xu 		uffd_wp = pmd_uffd_wp(old_pmd);
25066c287605SDavid Hildenbrand 
2507*91b2978aSDavid Hildenbrand 		VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2508*91b2978aSDavid Hildenbrand 		VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
25096c287605SDavid Hildenbrand 
25106c287605SDavid Hildenbrand 		/*
25116c287605SDavid Hildenbrand 		 * Without "freeze", we'll simply split the PMD, propagating the
25126c287605SDavid Hildenbrand 		 * PageAnonExclusive() flag for each PTE by setting it for
25136c287605SDavid Hildenbrand 		 * each subpage -- no need to (temporarily) clear.
25146c287605SDavid Hildenbrand 		 *
25156c287605SDavid Hildenbrand 		 * With "freeze" we want to replace mapped pages by
25166c287605SDavid Hildenbrand 		 * migration entries right away. This is only possible if we
25176c287605SDavid Hildenbrand 		 * managed to clear PageAnonExclusive() -- see
25186c287605SDavid Hildenbrand 		 * set_pmd_migration_entry().
25196c287605SDavid Hildenbrand 		 *
25206c287605SDavid Hildenbrand 		 * In case we cannot clear PageAnonExclusive(), split the PMD
25216c287605SDavid Hildenbrand 		 * only and let try_to_migrate_one() fail later.
2522088b8aa5SDavid Hildenbrand 		 *
2523088b8aa5SDavid Hildenbrand 		 * See page_try_share_anon_rmap(): invalidate PMD first.
25246c287605SDavid Hildenbrand 		 */
2525*91b2978aSDavid Hildenbrand 		anon_exclusive = PageAnonExclusive(page);
25266c287605SDavid Hildenbrand 		if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
25276c287605SDavid Hildenbrand 			freeze = false;
2528*91b2978aSDavid Hildenbrand 		if (!freeze) {
2529*91b2978aSDavid Hildenbrand 			rmap_t rmap_flags = RMAP_NONE;
2530*91b2978aSDavid Hildenbrand 
2531*91b2978aSDavid Hildenbrand 			folio_ref_add(folio, HPAGE_PMD_NR - 1);
2532*91b2978aSDavid Hildenbrand 			if (anon_exclusive)
2533*91b2978aSDavid Hildenbrand 				rmap_flags |= RMAP_EXCLUSIVE;
2534*91b2978aSDavid Hildenbrand 			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2535*91b2978aSDavid Hildenbrand 						 vma, haddr, rmap_flags);
2536*91b2978aSDavid Hildenbrand 		}
25379d84604bSHugh Dickins 	}
2538eef1b3baSKirill A. Shutemov 
2539423ac9afSAneesh Kumar K.V 	/*
2540423ac9afSAneesh Kumar K.V 	 * Withdraw the table only after we mark the pmd entry invalid.
2541423ac9afSAneesh Kumar K.V 	 * This's critical for some architectures (Power).
2542423ac9afSAneesh Kumar K.V 	 */
2543eef1b3baSKirill A. Shutemov 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2544eef1b3baSKirill A. Shutemov 	pmd_populate(mm, &_pmd, pgtable);
2545eef1b3baSKirill A. Shutemov 
2546c9c1ee20SHugh Dickins 	pte = pte_offset_map(&_pmd, haddr);
2547c9c1ee20SHugh Dickins 	VM_BUG_ON(!pte);
25482ac015e2SKirill A. Shutemov 	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2549c9c1ee20SHugh Dickins 		pte_t entry;
2550eef1b3baSKirill A. Shutemov 		/*
2551eef1b3baSKirill A. Shutemov 		 * Note that NUMA hinting access restrictions are not
2552eef1b3baSKirill A. Shutemov 		 * transferred to avoid any possibility of altering
2553eef1b3baSKirill A. Shutemov 		 * permissions across VMAs.
2554eef1b3baSKirill A. Shutemov 		 */
255584c3fc4eSZi Yan 		if (freeze || pmd_migration) {
2556ba988280SKirill A. Shutemov 			swp_entry_t swp_entry;
25574dd845b5SAlistair Popple 			if (write)
25584dd845b5SAlistair Popple 				swp_entry = make_writable_migration_entry(
25594dd845b5SAlistair Popple 							page_to_pfn(page + i));
25606c287605SDavid Hildenbrand 			else if (anon_exclusive)
25616c287605SDavid Hildenbrand 				swp_entry = make_readable_exclusive_migration_entry(
25626c287605SDavid Hildenbrand 							page_to_pfn(page + i));
25634dd845b5SAlistair Popple 			else
25644dd845b5SAlistair Popple 				swp_entry = make_readable_migration_entry(
25654dd845b5SAlistair Popple 							page_to_pfn(page + i));
25662e346877SPeter Xu 			if (young)
25672e346877SPeter Xu 				swp_entry = make_migration_entry_young(swp_entry);
25682e346877SPeter Xu 			if (dirty)
25692e346877SPeter Xu 				swp_entry = make_migration_entry_dirty(swp_entry);
2570ba988280SKirill A. Shutemov 			entry = swp_entry_to_pte(swp_entry);
2571804dd150SAndrea Arcangeli 			if (soft_dirty)
2572804dd150SAndrea Arcangeli 				entry = pte_swp_mksoft_dirty(entry);
2573f45ec5ffSPeter Xu 			if (uffd_wp)
2574f45ec5ffSPeter Xu 				entry = pte_swp_mkuffd_wp(entry);
2575ba988280SKirill A. Shutemov 		} else {
25766d2329f8SAndrea Arcangeli 			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
25771462c52eSDavid Hildenbrand 			if (write)
2578161e393cSRick Edgecombe 				entry = pte_mkwrite(entry, vma);
2579eef1b3baSKirill A. Shutemov 			if (!young)
2580eef1b3baSKirill A. Shutemov 				entry = pte_mkold(entry);
2581e833bc50SPeter Xu 			/* NOTE: this may set soft-dirty too on some archs */
2582e833bc50SPeter Xu 			if (dirty)
2583e833bc50SPeter Xu 				entry = pte_mkdirty(entry);
2584804dd150SAndrea Arcangeli 			if (soft_dirty)
2585804dd150SAndrea Arcangeli 				entry = pte_mksoft_dirty(entry);
2586292924b2SPeter Xu 			if (uffd_wp)
2587292924b2SPeter Xu 				entry = pte_mkuffd_wp(entry);
2588ba988280SKirill A. Shutemov 		}
2589c33c7948SRyan Roberts 		VM_BUG_ON(!pte_none(ptep_get(pte)));
25902ac015e2SKirill A. Shutemov 		set_pte_at(mm, addr, pte, entry);
2591c9c1ee20SHugh Dickins 		pte++;
2592eef1b3baSKirill A. Shutemov 	}
2593c9c1ee20SHugh Dickins 	pte_unmap(pte - 1);
2594eef1b3baSKirill A. Shutemov 
2595cb67f428SHugh Dickins 	if (!pmd_migration)
2596cb67f428SHugh Dickins 		page_remove_rmap(page, vma, true);
259796d82debSHugh Dickins 	if (freeze)
259896d82debSHugh Dickins 		put_page(page);
2599eef1b3baSKirill A. Shutemov 
2600eef1b3baSKirill A. Shutemov 	smp_wmb(); /* make pte visible before pmd */
2601eef1b3baSKirill A. Shutemov 	pmd_populate(mm, pmd, pgtable);
2602eef1b3baSKirill A. Shutemov }
2603eef1b3baSKirill A. Shutemov 
2604eef1b3baSKirill A. Shutemov void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2605af28a988SMatthew Wilcox (Oracle) 		unsigned long address, bool freeze, struct folio *folio)
2606eef1b3baSKirill A. Shutemov {
2607eef1b3baSKirill A. Shutemov 	spinlock_t *ptl;
2608ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
2609eef1b3baSKirill A. Shutemov 
26107d4a8be0SAlistair Popple 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
26116f4f13e8SJérôme Glisse 				address & HPAGE_PMD_MASK,
2612ac46d4f3SJérôme Glisse 				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2613ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
2614ac46d4f3SJérôme Glisse 	ptl = pmd_lock(vma->vm_mm, pmd);
261533f4751eSNaoya Horiguchi 
261633f4751eSNaoya Horiguchi 	/*
2617af28a988SMatthew Wilcox (Oracle) 	 * If caller asks to setup a migration entry, we need a folio to check
2618af28a988SMatthew Wilcox (Oracle) 	 * pmd against. Otherwise we can end up replacing wrong folio.
261933f4751eSNaoya Horiguchi 	 */
2620af28a988SMatthew Wilcox (Oracle) 	VM_BUG_ON(freeze && !folio);
262183a8441fSMatthew Wilcox (Oracle) 	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
262233f4751eSNaoya Horiguchi 
26237f760917SDavid Hildenbrand 	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
262483a8441fSMatthew Wilcox (Oracle) 	    is_pmd_migration_entry(*pmd)) {
2625cea33328SMiaohe Lin 		/*
2626cea33328SMiaohe Lin 		 * It's safe to call pmd_page when folio is set because it's
2627cea33328SMiaohe Lin 		 * guaranteed that pmd is present.
2628cea33328SMiaohe Lin 		 */
262983a8441fSMatthew Wilcox (Oracle) 		if (folio && folio != page_folio(pmd_page(*pmd)))
263083a8441fSMatthew Wilcox (Oracle) 			goto out;
2631ac46d4f3SJérôme Glisse 		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
263283a8441fSMatthew Wilcox (Oracle) 	}
26337f760917SDavid Hildenbrand 
2634e90309c9SKirill A. Shutemov out:
2635eef1b3baSKirill A. Shutemov 	spin_unlock(ptl);
2636ec8832d0SAlistair Popple 	mmu_notifier_invalidate_range_end(&range);
2637eef1b3baSKirill A. Shutemov }
2638eef1b3baSKirill A. Shutemov 
2639fec89c10SKirill A. Shutemov void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2640af28a988SMatthew Wilcox (Oracle) 		bool freeze, struct folio *folio)
264194fcc585SAndrea Arcangeli {
264250722804SZach O'Keefe 	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
264394fcc585SAndrea Arcangeli 
264450722804SZach O'Keefe 	if (!pmd)
2645f72e7dcdSHugh Dickins 		return;
2646f72e7dcdSHugh Dickins 
2647af28a988SMatthew Wilcox (Oracle) 	__split_huge_pmd(vma, pmd, address, freeze, folio);
264894fcc585SAndrea Arcangeli }
264994fcc585SAndrea Arcangeli 
265071f9e58eSMiaohe Lin static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
265171f9e58eSMiaohe Lin {
265271f9e58eSMiaohe Lin 	/*
265371f9e58eSMiaohe Lin 	 * If the new address isn't hpage aligned and it could previously
265471f9e58eSMiaohe Lin 	 * contain an hugepage: check if we need to split an huge pmd.
265571f9e58eSMiaohe Lin 	 */
265671f9e58eSMiaohe Lin 	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
265771f9e58eSMiaohe Lin 	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
265871f9e58eSMiaohe Lin 			 ALIGN(address, HPAGE_PMD_SIZE)))
265971f9e58eSMiaohe Lin 		split_huge_pmd_address(vma, address, false, NULL);
266071f9e58eSMiaohe Lin }
266171f9e58eSMiaohe Lin 
2662e1b9996bSKirill A. Shutemov void vma_adjust_trans_huge(struct vm_area_struct *vma,
266394fcc585SAndrea Arcangeli 			     unsigned long start,
266494fcc585SAndrea Arcangeli 			     unsigned long end,
266594fcc585SAndrea Arcangeli 			     long adjust_next)
266694fcc585SAndrea Arcangeli {
266771f9e58eSMiaohe Lin 	/* Check if we need to split start first. */
266871f9e58eSMiaohe Lin 	split_huge_pmd_if_needed(vma, start);
266971f9e58eSMiaohe Lin 
267071f9e58eSMiaohe Lin 	/* Check if we need to split end next. */
267171f9e58eSMiaohe Lin 	split_huge_pmd_if_needed(vma, end);
267294fcc585SAndrea Arcangeli 
267394fcc585SAndrea Arcangeli 	/*
267468540502SMatthew Wilcox (Oracle) 	 * If we're also updating the next vma vm_start,
267571f9e58eSMiaohe Lin 	 * check if we need to split it.
267694fcc585SAndrea Arcangeli 	 */
267794fcc585SAndrea Arcangeli 	if (adjust_next > 0) {
267868540502SMatthew Wilcox (Oracle) 		struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
267994fcc585SAndrea Arcangeli 		unsigned long nstart = next->vm_start;
2680f9d86a60SWei Yang 		nstart += adjust_next;
268171f9e58eSMiaohe Lin 		split_huge_pmd_if_needed(next, nstart);
268294fcc585SAndrea Arcangeli 	}
268394fcc585SAndrea Arcangeli }
2684e9b61f19SKirill A. Shutemov 
2685684555aaSMatthew Wilcox (Oracle) static void unmap_folio(struct folio *folio)
2686e9b61f19SKirill A. Shutemov {
2687a98a2f0cSAlistair Popple 	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
26883027c6f8SBaolin Wang 		TTU_SYNC | TTU_BATCH_FLUSH;
2689e9b61f19SKirill A. Shutemov 
2690684555aaSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2691e9b61f19SKirill A. Shutemov 
2692a98a2f0cSAlistair Popple 	/*
2693a98a2f0cSAlistair Popple 	 * Anon pages need migration entries to preserve them, but file
2694a98a2f0cSAlistair Popple 	 * pages can simply be left unmapped, then faulted back on demand.
2695a98a2f0cSAlistair Popple 	 * If that is ever changed (perhaps for mlock), update remap_page().
2696a98a2f0cSAlistair Popple 	 */
26974b8554c5SMatthew Wilcox (Oracle) 	if (folio_test_anon(folio))
26984b8554c5SMatthew Wilcox (Oracle) 		try_to_migrate(folio, ttu_flags);
2699a98a2f0cSAlistair Popple 	else
2700869f7ee6SMatthew Wilcox (Oracle) 		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
27013027c6f8SBaolin Wang 
27023027c6f8SBaolin Wang 	try_to_unmap_flush();
2703bd56086fSKirill A. Shutemov }
2704bd56086fSKirill A. Shutemov 
27054eecb8b9SMatthew Wilcox (Oracle) static void remap_page(struct folio *folio, unsigned long nr)
2706e9b61f19SKirill A. Shutemov {
27074eecb8b9SMatthew Wilcox (Oracle) 	int i = 0;
2708ab02c252SHugh Dickins 
2709684555aaSMatthew Wilcox (Oracle) 	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
27104eecb8b9SMatthew Wilcox (Oracle) 	if (!folio_test_anon(folio))
2711ab02c252SHugh Dickins 		return;
27124eecb8b9SMatthew Wilcox (Oracle) 	for (;;) {
27134eecb8b9SMatthew Wilcox (Oracle) 		remove_migration_ptes(folio, folio, true);
27144eecb8b9SMatthew Wilcox (Oracle) 		i += folio_nr_pages(folio);
27154eecb8b9SMatthew Wilcox (Oracle) 		if (i >= nr)
27164eecb8b9SMatthew Wilcox (Oracle) 			break;
27174eecb8b9SMatthew Wilcox (Oracle) 		folio = folio_next(folio);
2718e9b61f19SKirill A. Shutemov 	}
2719ace71a19SKirill A. Shutemov }
2720e9b61f19SKirill A. Shutemov 
272194866635SAlex Shi static void lru_add_page_tail(struct page *head, struct page *tail,
272288dcb9a3SAlex Shi 		struct lruvec *lruvec, struct list_head *list)
272388dcb9a3SAlex Shi {
272494866635SAlex Shi 	VM_BUG_ON_PAGE(!PageHead(head), head);
272594866635SAlex Shi 	VM_BUG_ON_PAGE(PageCompound(tail), head);
272694866635SAlex Shi 	VM_BUG_ON_PAGE(PageLRU(tail), head);
27276168d0daSAlex Shi 	lockdep_assert_held(&lruvec->lru_lock);
272888dcb9a3SAlex Shi 
27296dbb5741SAlex Shi 	if (list) {
273088dcb9a3SAlex Shi 		/* page reclaim is reclaiming a huge page */
27316dbb5741SAlex Shi 		VM_WARN_ON(PageLRU(head));
273294866635SAlex Shi 		get_page(tail);
273394866635SAlex Shi 		list_add_tail(&tail->lru, list);
273488dcb9a3SAlex Shi 	} else {
27356dbb5741SAlex Shi 		/* head is still on lru (and we have it frozen) */
27366dbb5741SAlex Shi 		VM_WARN_ON(!PageLRU(head));
273707ca7606SHugh Dickins 		if (PageUnevictable(tail))
273807ca7606SHugh Dickins 			tail->mlock_count = 0;
273907ca7606SHugh Dickins 		else
27406dbb5741SAlex Shi 			list_add_tail(&tail->lru, &head->lru);
274107ca7606SHugh Dickins 		SetPageLRU(tail);
274288dcb9a3SAlex Shi 	}
274388dcb9a3SAlex Shi }
274488dcb9a3SAlex Shi 
274507e09c48SDavid Hildenbrand static void __split_huge_page_tail(struct folio *folio, int tail,
2746e9b61f19SKirill A. Shutemov 		struct lruvec *lruvec, struct list_head *list)
2747e9b61f19SKirill A. Shutemov {
274807e09c48SDavid Hildenbrand 	struct page *head = &folio->page;
2749e9b61f19SKirill A. Shutemov 	struct page *page_tail = head + tail;
275007e09c48SDavid Hildenbrand 	/*
275107e09c48SDavid Hildenbrand 	 * Careful: new_folio is not a "real" folio before we cleared PageTail.
275207e09c48SDavid Hildenbrand 	 * Don't pass it around before clear_compound_head().
275307e09c48SDavid Hildenbrand 	 */
275407e09c48SDavid Hildenbrand 	struct folio *new_folio = (struct folio *)page_tail;
2755e9b61f19SKirill A. Shutemov 
27568df651c7SKirill A. Shutemov 	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2757e9b61f19SKirill A. Shutemov 
2758e9b61f19SKirill A. Shutemov 	/*
2759605ca5edSKonstantin Khlebnikov 	 * Clone page flags before unfreezing refcount.
2760605ca5edSKonstantin Khlebnikov 	 *
2761605ca5edSKonstantin Khlebnikov 	 * After successful get_page_unless_zero() might follow flags change,
27628958b249SHaitao Shi 	 * for example lock_page() which set PG_waiters.
27636c287605SDavid Hildenbrand 	 *
27646c287605SDavid Hildenbrand 	 * Note that for mapped sub-pages of an anonymous THP,
2765684555aaSMatthew Wilcox (Oracle) 	 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
27666c287605SDavid Hildenbrand 	 * the migration entry instead from where remap_page() will restore it.
27676c287605SDavid Hildenbrand 	 * We can still have PG_anon_exclusive set on effectively unmapped and
27686c287605SDavid Hildenbrand 	 * unreferenced sub-pages of an anonymous THP: we can simply drop
27696c287605SDavid Hildenbrand 	 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2770e9b61f19SKirill A. Shutemov 	 */
2771e9b61f19SKirill A. Shutemov 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2772e9b61f19SKirill A. Shutemov 	page_tail->flags |= (head->flags &
2773e9b61f19SKirill A. Shutemov 			((1L << PG_referenced) |
2774e9b61f19SKirill A. Shutemov 			 (1L << PG_swapbacked) |
277538d8b4e6SHuang Ying 			 (1L << PG_swapcache) |
2776e9b61f19SKirill A. Shutemov 			 (1L << PG_mlocked) |
2777e9b61f19SKirill A. Shutemov 			 (1L << PG_uptodate) |
2778e9b61f19SKirill A. Shutemov 			 (1L << PG_active) |
27791899ad18SJohannes Weiner 			 (1L << PG_workingset) |
2780e9b61f19SKirill A. Shutemov 			 (1L << PG_locked) |
2781b8d3c4c3SMinchan Kim 			 (1L << PG_unevictable) |
2782b0284cd2SCatalin Marinas #ifdef CONFIG_ARCH_USES_PG_ARCH_X
278372e6afa0SCatalin Marinas 			 (1L << PG_arch_2) |
2784ef6458b1SPeter Collingbourne 			 (1L << PG_arch_3) |
278572e6afa0SCatalin Marinas #endif
2786ec1c86b2SYu Zhao 			 (1L << PG_dirty) |
2787ec1c86b2SYu Zhao 			 LRU_GEN_MASK | LRU_REFS_MASK));
2788e9b61f19SKirill A. Shutemov 
2789cb67f428SHugh Dickins 	/* ->mapping in first and second tail page is replaced by other uses */
2790173d9d9fSHugh Dickins 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2791173d9d9fSHugh Dickins 			page_tail);
2792173d9d9fSHugh Dickins 	page_tail->mapping = head->mapping;
2793173d9d9fSHugh Dickins 	page_tail->index = head->index + tail;
279471e2d666SMel Gorman 
279571e2d666SMel Gorman 	/*
2796cfeed8ffSDavid Hildenbrand 	 * page->private should not be set in tail pages. Fix up and warn once
2797cfeed8ffSDavid Hildenbrand 	 * if private is unexpectedly set.
279871e2d666SMel Gorman 	 */
2799cfeed8ffSDavid Hildenbrand 	if (unlikely(page_tail->private)) {
2800cfeed8ffSDavid Hildenbrand 		VM_WARN_ON_ONCE_PAGE(true, page_tail);
2801b653db77SMatthew Wilcox (Oracle) 		page_tail->private = 0;
280271e2d666SMel Gorman 	}
280307e09c48SDavid Hildenbrand 	if (folio_test_swapcache(folio))
280407e09c48SDavid Hildenbrand 		new_folio->swap.val = folio->swap.val + tail;
2805173d9d9fSHugh Dickins 
2806605ca5edSKonstantin Khlebnikov 	/* Page flags must be visible before we make the page non-compound. */
2807e9b61f19SKirill A. Shutemov 	smp_wmb();
2808e9b61f19SKirill A. Shutemov 
2809605ca5edSKonstantin Khlebnikov 	/*
2810605ca5edSKonstantin Khlebnikov 	 * Clear PageTail before unfreezing page refcount.
2811605ca5edSKonstantin Khlebnikov 	 *
2812605ca5edSKonstantin Khlebnikov 	 * After successful get_page_unless_zero() might follow put_page()
2813605ca5edSKonstantin Khlebnikov 	 * which needs correct compound_head().
2814605ca5edSKonstantin Khlebnikov 	 */
2815e9b61f19SKirill A. Shutemov 	clear_compound_head(page_tail);
2816e9b61f19SKirill A. Shutemov 
2817605ca5edSKonstantin Khlebnikov 	/* Finally unfreeze refcount. Additional reference from page cache. */
2818b7542769SKefeng Wang 	page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) ||
2819b7542769SKefeng Wang 					  folio_test_swapcache(folio)));
2820605ca5edSKonstantin Khlebnikov 
2821b7542769SKefeng Wang 	if (folio_test_young(folio))
2822b7542769SKefeng Wang 		folio_set_young(new_folio);
2823b7542769SKefeng Wang 	if (folio_test_idle(folio))
2824b7542769SKefeng Wang 		folio_set_idle(new_folio);
2825e9b61f19SKirill A. Shutemov 
2826c8253011SKefeng Wang 	folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
282794723aafSMichal Hocko 
282894723aafSMichal Hocko 	/*
282994723aafSMichal Hocko 	 * always add to the tail because some iterators expect new
283094723aafSMichal Hocko 	 * pages to show after the currently processed elements - e.g.
283194723aafSMichal Hocko 	 * migrate_pages
283294723aafSMichal Hocko 	 */
2833e9b61f19SKirill A. Shutemov 	lru_add_page_tail(head, page_tail, lruvec, list);
2834e9b61f19SKirill A. Shutemov }
2835e9b61f19SKirill A. Shutemov 
2836baa355fdSKirill A. Shutemov static void __split_huge_page(struct page *page, struct list_head *list,
2837b6769834SAlex Shi 		pgoff_t end)
2838e9b61f19SKirill A. Shutemov {
2839e809c3feSMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
2840e809c3feSMatthew Wilcox (Oracle) 	struct page *head = &folio->page;
2841e9b61f19SKirill A. Shutemov 	struct lruvec *lruvec;
28424101196bSMatthew Wilcox (Oracle) 	struct address_space *swap_cache = NULL;
28434101196bSMatthew Wilcox (Oracle) 	unsigned long offset = 0;
28448cce5475SKirill A. Shutemov 	unsigned int nr = thp_nr_pages(head);
2845509f0069SHugh Dickins 	int i, nr_dropped = 0;
2846e9b61f19SKirill A. Shutemov 
2847e9b61f19SKirill A. Shutemov 	/* complete memcg works before add pages to LRU */
2848be6c8982SZhou Guanghui 	split_page_memcg(head, nr);
2849e9b61f19SKirill A. Shutemov 
285007e09c48SDavid Hildenbrand 	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
285107e09c48SDavid Hildenbrand 		offset = swp_offset(folio->swap);
285207e09c48SDavid Hildenbrand 		swap_cache = swap_address_space(folio->swap);
28534101196bSMatthew Wilcox (Oracle) 		xa_lock(&swap_cache->i_pages);
28544101196bSMatthew Wilcox (Oracle) 	}
28554101196bSMatthew Wilcox (Oracle) 
2856f0953a1bSIngo Molnar 	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
2857e809c3feSMatthew Wilcox (Oracle) 	lruvec = folio_lruvec_lock(folio);
2858b6769834SAlex Shi 
2859eac96c3eSYang Shi 	ClearPageHasHWPoisoned(head);
2860eac96c3eSYang Shi 
28618cce5475SKirill A. Shutemov 	for (i = nr - 1; i >= 1; i--) {
286207e09c48SDavid Hildenbrand 		__split_huge_page_tail(folio, i, lruvec, list);
2863d144bf62SHugh Dickins 		/* Some pages can be beyond EOF: drop them from page cache */
2864baa355fdSKirill A. Shutemov 		if (head[i].index >= end) {
2865fb5c2029SMatthew Wilcox (Oracle) 			struct folio *tail = page_folio(head + i);
2866fb5c2029SMatthew Wilcox (Oracle) 
2867d144bf62SHugh Dickins 			if (shmem_mapping(head->mapping))
2868509f0069SHugh Dickins 				nr_dropped++;
2869fb5c2029SMatthew Wilcox (Oracle) 			else if (folio_test_clear_dirty(tail))
2870fb5c2029SMatthew Wilcox (Oracle) 				folio_account_cleaned(tail,
2871fb5c2029SMatthew Wilcox (Oracle) 					inode_to_wb(folio->mapping->host));
2872fb5c2029SMatthew Wilcox (Oracle) 			__filemap_remove_folio(tail, NULL);
2873fb5c2029SMatthew Wilcox (Oracle) 			folio_put(tail);
28744101196bSMatthew Wilcox (Oracle) 		} else if (!PageAnon(page)) {
28754101196bSMatthew Wilcox (Oracle) 			__xa_store(&head->mapping->i_pages, head[i].index,
28764101196bSMatthew Wilcox (Oracle) 					head + i, 0);
28774101196bSMatthew Wilcox (Oracle) 		} else if (swap_cache) {
28784101196bSMatthew Wilcox (Oracle) 			__xa_store(&swap_cache->i_pages, offset + i,
28794101196bSMatthew Wilcox (Oracle) 					head + i, 0);
2880baa355fdSKirill A. Shutemov 		}
2881baa355fdSKirill A. Shutemov 	}
2882e9b61f19SKirill A. Shutemov 
2883e9b61f19SKirill A. Shutemov 	ClearPageCompound(head);
28846168d0daSAlex Shi 	unlock_page_lruvec(lruvec);
2885b6769834SAlex Shi 	/* Caller disabled irqs, so they are still disabled here */
2886f7da677bSVlastimil Babka 
28878cce5475SKirill A. Shutemov 	split_page_owner(head, nr);
2888f7da677bSVlastimil Babka 
2889baa355fdSKirill A. Shutemov 	/* See comment in __split_huge_page_tail() */
2890baa355fdSKirill A. Shutemov 	if (PageAnon(head)) {
2891aa5dc07fSMatthew Wilcox 		/* Additional pin to swap cache */
28924101196bSMatthew Wilcox (Oracle) 		if (PageSwapCache(head)) {
289338d8b4e6SHuang Ying 			page_ref_add(head, 2);
28944101196bSMatthew Wilcox (Oracle) 			xa_unlock(&swap_cache->i_pages);
28954101196bSMatthew Wilcox (Oracle) 		} else {
2896baa355fdSKirill A. Shutemov 			page_ref_inc(head);
28974101196bSMatthew Wilcox (Oracle) 		}
2898baa355fdSKirill A. Shutemov 	} else {
2899aa5dc07fSMatthew Wilcox 		/* Additional pin to page cache */
2900baa355fdSKirill A. Shutemov 		page_ref_add(head, 2);
2901b93b0163SMatthew Wilcox 		xa_unlock(&head->mapping->i_pages);
2902baa355fdSKirill A. Shutemov 	}
2903b6769834SAlex Shi 	local_irq_enable();
2904e9b61f19SKirill A. Shutemov 
2905509f0069SHugh Dickins 	if (nr_dropped)
2906509f0069SHugh Dickins 		shmem_uncharge(head->mapping->host, nr_dropped);
29074eecb8b9SMatthew Wilcox (Oracle) 	remap_page(folio, nr);
2908e9b61f19SKirill A. Shutemov 
290907e09c48SDavid Hildenbrand 	if (folio_test_swapcache(folio))
291007e09c48SDavid Hildenbrand 		split_swap_cluster(folio->swap);
2911c4f9c701SHuang Ying 
29128cce5475SKirill A. Shutemov 	for (i = 0; i < nr; i++) {
2913e9b61f19SKirill A. Shutemov 		struct page *subpage = head + i;
2914e9b61f19SKirill A. Shutemov 		if (subpage == page)
2915e9b61f19SKirill A. Shutemov 			continue;
2916e9b61f19SKirill A. Shutemov 		unlock_page(subpage);
2917e9b61f19SKirill A. Shutemov 
2918e9b61f19SKirill A. Shutemov 		/*
2919e9b61f19SKirill A. Shutemov 		 * Subpages may be freed if there wasn't any mapping
2920e9b61f19SKirill A. Shutemov 		 * like if add_to_swap() is running on a lru page that
2921e9b61f19SKirill A. Shutemov 		 * had its mapping zapped. And freeing these pages
2922e9b61f19SKirill A. Shutemov 		 * requires taking the lru_lock so we do the put_page
2923e9b61f19SKirill A. Shutemov 		 * of the tail pages after the split is complete.
2924e9b61f19SKirill A. Shutemov 		 */
29250b175468SMiaohe Lin 		free_page_and_swap_cache(subpage);
2926e9b61f19SKirill A. Shutemov 	}
2927e9b61f19SKirill A. Shutemov }
2928e9b61f19SKirill A. Shutemov 
2929b8f593cdSHuang Ying /* Racy check whether the huge page can be split */
2930d4b4084aSMatthew Wilcox (Oracle) bool can_split_folio(struct folio *folio, int *pextra_pins)
2931b8f593cdSHuang Ying {
2932b8f593cdSHuang Ying 	int extra_pins;
2933b8f593cdSHuang Ying 
2934aa5dc07fSMatthew Wilcox 	/* Additional pins from page cache */
2935d4b4084aSMatthew Wilcox (Oracle) 	if (folio_test_anon(folio))
2936d4b4084aSMatthew Wilcox (Oracle) 		extra_pins = folio_test_swapcache(folio) ?
2937d4b4084aSMatthew Wilcox (Oracle) 				folio_nr_pages(folio) : 0;
2938b8f593cdSHuang Ying 	else
2939d4b4084aSMatthew Wilcox (Oracle) 		extra_pins = folio_nr_pages(folio);
2940b8f593cdSHuang Ying 	if (pextra_pins)
2941b8f593cdSHuang Ying 		*pextra_pins = extra_pins;
2942d4b4084aSMatthew Wilcox (Oracle) 	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
2943b8f593cdSHuang Ying }
2944b8f593cdSHuang Ying 
29456d0a07edSAndrea Arcangeli /*
2946e9b61f19SKirill A. Shutemov  * This function splits huge page into normal pages. @page can point to any
2947e9b61f19SKirill A. Shutemov  * subpage of huge page to split. Split doesn't change the position of @page.
2948e9b61f19SKirill A. Shutemov  *
2949e9b61f19SKirill A. Shutemov  * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
2950e9b61f19SKirill A. Shutemov  * The huge page must be locked.
2951e9b61f19SKirill A. Shutemov  *
2952e9b61f19SKirill A. Shutemov  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2953e9b61f19SKirill A. Shutemov  *
2954e9b61f19SKirill A. Shutemov  * Both head page and tail pages will inherit mapping, flags, and so on from
2955e9b61f19SKirill A. Shutemov  * the hugepage.
2956e9b61f19SKirill A. Shutemov  *
2957e9b61f19SKirill A. Shutemov  * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
2958e9b61f19SKirill A. Shutemov  * they are not mapped.
2959e9b61f19SKirill A. Shutemov  *
2960e9b61f19SKirill A. Shutemov  * Returns 0 if the hugepage is split successfully.
2961e9b61f19SKirill A. Shutemov  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
2962e9b61f19SKirill A. Shutemov  * us.
2963e9b61f19SKirill A. Shutemov  */
2964e9b61f19SKirill A. Shutemov int split_huge_page_to_list(struct page *page, struct list_head *list)
2965e9b61f19SKirill A. Shutemov {
29664eecb8b9SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
2967f8baa6beSMatthew Wilcox (Oracle) 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
29683e9a13daSMatthew Wilcox (Oracle) 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
2969baa355fdSKirill A. Shutemov 	struct anon_vma *anon_vma = NULL;
2970baa355fdSKirill A. Shutemov 	struct address_space *mapping = NULL;
2971504e070dSYang Shi 	int extra_pins, ret;
2972006d3ff2SHugh Dickins 	pgoff_t end;
2973478d134eSXu Yu 	bool is_hzp;
2974e9b61f19SKirill A. Shutemov 
29753e9a13daSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
29763e9a13daSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2977e9b61f19SKirill A. Shutemov 
29783e9a13daSMatthew Wilcox (Oracle) 	is_hzp = is_huge_zero_page(&folio->page);
29794737edbbSNaoya Horiguchi 	if (is_hzp) {
29804737edbbSNaoya Horiguchi 		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
2981478d134eSXu Yu 		return -EBUSY;
29824737edbbSNaoya Horiguchi 	}
2983478d134eSXu Yu 
29843e9a13daSMatthew Wilcox (Oracle) 	if (folio_test_writeback(folio))
298559807685SHuang Ying 		return -EBUSY;
298659807685SHuang Ying 
29873e9a13daSMatthew Wilcox (Oracle) 	if (folio_test_anon(folio)) {
2988e9b61f19SKirill A. Shutemov 		/*
2989c1e8d7c6SMichel Lespinasse 		 * The caller does not necessarily hold an mmap_lock that would
2990baa355fdSKirill A. Shutemov 		 * prevent the anon_vma disappearing so we first we take a
2991baa355fdSKirill A. Shutemov 		 * reference to it and then lock the anon_vma for write. This
29922f031c6fSMatthew Wilcox (Oracle) 		 * is similar to folio_lock_anon_vma_read except the write lock
2993baa355fdSKirill A. Shutemov 		 * is taken to serialise against parallel split or collapse
2994baa355fdSKirill A. Shutemov 		 * operations.
2995e9b61f19SKirill A. Shutemov 		 */
299629eea9b5SMatthew Wilcox (Oracle) 		anon_vma = folio_get_anon_vma(folio);
2997e9b61f19SKirill A. Shutemov 		if (!anon_vma) {
2998e9b61f19SKirill A. Shutemov 			ret = -EBUSY;
2999e9b61f19SKirill A. Shutemov 			goto out;
3000e9b61f19SKirill A. Shutemov 		}
3001006d3ff2SHugh Dickins 		end = -1;
3002baa355fdSKirill A. Shutemov 		mapping = NULL;
3003e9b61f19SKirill A. Shutemov 		anon_vma_lock_write(anon_vma);
3004baa355fdSKirill A. Shutemov 	} else {
30056a3edd29SYin Fengwei 		gfp_t gfp;
30066a3edd29SYin Fengwei 
30073e9a13daSMatthew Wilcox (Oracle) 		mapping = folio->mapping;
3008baa355fdSKirill A. Shutemov 
3009baa355fdSKirill A. Shutemov 		/* Truncated ? */
3010baa355fdSKirill A. Shutemov 		if (!mapping) {
3011baa355fdSKirill A. Shutemov 			ret = -EBUSY;
3012baa355fdSKirill A. Shutemov 			goto out;
3013baa355fdSKirill A. Shutemov 		}
3014baa355fdSKirill A. Shutemov 
30156a3edd29SYin Fengwei 		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
30166a3edd29SYin Fengwei 							GFP_RECLAIM_MASK);
30176a3edd29SYin Fengwei 
30180201ebf2SDavid Howells 		if (!filemap_release_folio(folio, gfp)) {
30196a3edd29SYin Fengwei 			ret = -EBUSY;
30206a3edd29SYin Fengwei 			goto out;
30216a3edd29SYin Fengwei 		}
30226a3edd29SYin Fengwei 
30233e9a13daSMatthew Wilcox (Oracle) 		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
30246b24ca4aSMatthew Wilcox (Oracle) 		if (xas_error(&xas)) {
30256b24ca4aSMatthew Wilcox (Oracle) 			ret = xas_error(&xas);
30266b24ca4aSMatthew Wilcox (Oracle) 			goto out;
30276b24ca4aSMatthew Wilcox (Oracle) 		}
30286b24ca4aSMatthew Wilcox (Oracle) 
3029baa355fdSKirill A. Shutemov 		anon_vma = NULL;
3030baa355fdSKirill A. Shutemov 		i_mmap_lock_read(mapping);
3031006d3ff2SHugh Dickins 
3032006d3ff2SHugh Dickins 		/*
3033006d3ff2SHugh Dickins 		 *__split_huge_page() may need to trim off pages beyond EOF:
3034006d3ff2SHugh Dickins 		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3035006d3ff2SHugh Dickins 		 * which cannot be nested inside the page tree lock. So note
3036006d3ff2SHugh Dickins 		 * end now: i_size itself may be changed at any moment, but
30373e9a13daSMatthew Wilcox (Oracle) 		 * folio lock is good enough to serialize the trimming.
3038006d3ff2SHugh Dickins 		 */
3039006d3ff2SHugh Dickins 		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3040d144bf62SHugh Dickins 		if (shmem_mapping(mapping))
3041d144bf62SHugh Dickins 			end = shmem_fallocend(mapping->host, end);
3042baa355fdSKirill A. Shutemov 	}
3043e9b61f19SKirill A. Shutemov 
3044e9b61f19SKirill A. Shutemov 	/*
3045684555aaSMatthew Wilcox (Oracle) 	 * Racy check if we can split the page, before unmap_folio() will
3046e9b61f19SKirill A. Shutemov 	 * split PMDs
3047e9b61f19SKirill A. Shutemov 	 */
3048d4b4084aSMatthew Wilcox (Oracle) 	if (!can_split_folio(folio, &extra_pins)) {
3049fd4a7ac3SBaolin Wang 		ret = -EAGAIN;
3050e9b61f19SKirill A. Shutemov 		goto out_unlock;
3051e9b61f19SKirill A. Shutemov 	}
3052e9b61f19SKirill A. Shutemov 
3053684555aaSMatthew Wilcox (Oracle) 	unmap_folio(folio);
3054e9b61f19SKirill A. Shutemov 
3055b6769834SAlex Shi 	/* block interrupt reentry in xa_lock and spinlock */
3056b6769834SAlex Shi 	local_irq_disable();
3057baa355fdSKirill A. Shutemov 	if (mapping) {
3058baa355fdSKirill A. Shutemov 		/*
30593e9a13daSMatthew Wilcox (Oracle) 		 * Check if the folio is present in page cache.
30603e9a13daSMatthew Wilcox (Oracle) 		 * We assume all tail are present too, if folio is there.
3061baa355fdSKirill A. Shutemov 		 */
30626b24ca4aSMatthew Wilcox (Oracle) 		xas_lock(&xas);
30636b24ca4aSMatthew Wilcox (Oracle) 		xas_reset(&xas);
30643e9a13daSMatthew Wilcox (Oracle) 		if (xas_load(&xas) != folio)
3065baa355fdSKirill A. Shutemov 			goto fail;
3066baa355fdSKirill A. Shutemov 	}
3067baa355fdSKirill A. Shutemov 
30680139aa7bSJoonsoo Kim 	/* Prevent deferred_split_scan() touching ->_refcount */
3069364c1eebSYang Shi 	spin_lock(&ds_queue->split_queue_lock);
30703e9a13daSMatthew Wilcox (Oracle) 	if (folio_ref_freeze(folio, 1 + extra_pins)) {
30714375a553SMatthew Wilcox (Oracle) 		if (!list_empty(&folio->_deferred_list)) {
3072364c1eebSYang Shi 			ds_queue->split_queue_len--;
30734375a553SMatthew Wilcox (Oracle) 			list_del(&folio->_deferred_list);
30749a982250SKirill A. Shutemov 		}
3075afb97172SWei Yang 		spin_unlock(&ds_queue->split_queue_lock);
307606d3eff6SKirill A. Shutemov 		if (mapping) {
30773e9a13daSMatthew Wilcox (Oracle) 			int nr = folio_nr_pages(folio);
3078bf9eceadSMuchun Song 
30793e9a13daSMatthew Wilcox (Oracle) 			xas_split(&xas, folio, folio_order(folio));
3080a48d5bdcSStefan Roesch 			if (folio_test_pmd_mappable(folio)) {
30813e9a13daSMatthew Wilcox (Oracle) 				if (folio_test_swapbacked(folio)) {
3082a48d5bdcSStefan Roesch 					__lruvec_stat_mod_folio(folio,
3083a48d5bdcSStefan Roesch 							NR_SHMEM_THPS, -nr);
30841ca7554dSMarek Szyprowski 				} else {
3085a48d5bdcSStefan Roesch 					__lruvec_stat_mod_folio(folio,
3086a48d5bdcSStefan Roesch 							NR_FILE_THPS, -nr);
30871ca7554dSMarek Szyprowski 					filemap_nr_thps_dec(mapping);
30881ca7554dSMarek Szyprowski 				}
308906d3eff6SKirill A. Shutemov 			}
3090a48d5bdcSStefan Roesch 		}
309106d3eff6SKirill A. Shutemov 
3092b6769834SAlex Shi 		__split_huge_page(page, list, end);
3093e9b61f19SKirill A. Shutemov 		ret = 0;
3094baa355fdSKirill A. Shutemov 	} else {
3095364c1eebSYang Shi 		spin_unlock(&ds_queue->split_queue_lock);
3096504e070dSYang Shi fail:
3097504e070dSYang Shi 		if (mapping)
30986b24ca4aSMatthew Wilcox (Oracle) 			xas_unlock(&xas);
3099b6769834SAlex Shi 		local_irq_enable();
31004eecb8b9SMatthew Wilcox (Oracle) 		remap_page(folio, folio_nr_pages(folio));
3101fd4a7ac3SBaolin Wang 		ret = -EAGAIN;
3102e9b61f19SKirill A. Shutemov 	}
3103e9b61f19SKirill A. Shutemov 
3104e9b61f19SKirill A. Shutemov out_unlock:
3105baa355fdSKirill A. Shutemov 	if (anon_vma) {
3106e9b61f19SKirill A. Shutemov 		anon_vma_unlock_write(anon_vma);
3107e9b61f19SKirill A. Shutemov 		put_anon_vma(anon_vma);
3108baa355fdSKirill A. Shutemov 	}
3109baa355fdSKirill A. Shutemov 	if (mapping)
3110baa355fdSKirill A. Shutemov 		i_mmap_unlock_read(mapping);
3111e9b61f19SKirill A. Shutemov out:
311269a37a8bSMatthew Wilcox (Oracle) 	xas_destroy(&xas);
3113e9b61f19SKirill A. Shutemov 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3114e9b61f19SKirill A. Shutemov 	return ret;
3115e9b61f19SKirill A. Shutemov }
31169a982250SKirill A. Shutemov 
31178dc4a8f1SMatthew Wilcox (Oracle) void folio_undo_large_rmappable(struct folio *folio)
31189a982250SKirill A. Shutemov {
31198dc4a8f1SMatthew Wilcox (Oracle) 	struct deferred_split *ds_queue;
31209a982250SKirill A. Shutemov 	unsigned long flags;
31219a982250SKirill A. Shutemov 
3122deedad80SYin Fengwei 	/*
3123deedad80SYin Fengwei 	 * At this point, there is no one trying to add the folio to
3124deedad80SYin Fengwei 	 * deferred_list. If folio is not in deferred_list, it's safe
3125deedad80SYin Fengwei 	 * to check without acquiring the split_queue_lock.
3126deedad80SYin Fengwei 	 */
31278dc4a8f1SMatthew Wilcox (Oracle) 	if (data_race(list_empty(&folio->_deferred_list)))
31288dc4a8f1SMatthew Wilcox (Oracle) 		return;
31298dc4a8f1SMatthew Wilcox (Oracle) 
31308dc4a8f1SMatthew Wilcox (Oracle) 	ds_queue = get_deferred_split_queue(folio);
3131364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
31328991de90SMatthew Wilcox (Oracle) 	if (!list_empty(&folio->_deferred_list)) {
3133364c1eebSYang Shi 		ds_queue->split_queue_len--;
31348991de90SMatthew Wilcox (Oracle) 		list_del(&folio->_deferred_list);
31359a982250SKirill A. Shutemov 	}
3136364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3137deedad80SYin Fengwei }
31389a982250SKirill A. Shutemov 
3139f158ed61SMatthew Wilcox (Oracle) void deferred_split_folio(struct folio *folio)
31409a982250SKirill A. Shutemov {
3141f8baa6beSMatthew Wilcox (Oracle) 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
314287eaceb3SYang Shi #ifdef CONFIG_MEMCG
31438991de90SMatthew Wilcox (Oracle) 	struct mem_cgroup *memcg = folio_memcg(folio);
314487eaceb3SYang Shi #endif
31459a982250SKirill A. Shutemov 	unsigned long flags;
31469a982250SKirill A. Shutemov 
31478991de90SMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
31489a982250SKirill A. Shutemov 
314987eaceb3SYang Shi 	/*
315087eaceb3SYang Shi 	 * The try_to_unmap() in page reclaim path might reach here too,
315187eaceb3SYang Shi 	 * this may cause a race condition to corrupt deferred split queue.
31528991de90SMatthew Wilcox (Oracle) 	 * And, if page reclaim is already handling the same folio, it is
315387eaceb3SYang Shi 	 * unnecessary to handle it again in shrinker.
315487eaceb3SYang Shi 	 *
31558991de90SMatthew Wilcox (Oracle) 	 * Check the swapcache flag to determine if the folio is being
31568991de90SMatthew Wilcox (Oracle) 	 * handled by page reclaim since THP swap would add the folio into
315787eaceb3SYang Shi 	 * swap cache before calling try_to_unmap().
315887eaceb3SYang Shi 	 */
31598991de90SMatthew Wilcox (Oracle) 	if (folio_test_swapcache(folio))
316087eaceb3SYang Shi 		return;
316187eaceb3SYang Shi 
31628991de90SMatthew Wilcox (Oracle) 	if (!list_empty(&folio->_deferred_list))
31639a982250SKirill A. Shutemov 		return;
31649a982250SKirill A. Shutemov 
3165364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
31668991de90SMatthew Wilcox (Oracle) 	if (list_empty(&folio->_deferred_list)) {
3167f9719a03SKirill A. Shutemov 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
31688991de90SMatthew Wilcox (Oracle) 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
3169364c1eebSYang Shi 		ds_queue->split_queue_len++;
317087eaceb3SYang Shi #ifdef CONFIG_MEMCG
317187eaceb3SYang Shi 		if (memcg)
31728991de90SMatthew Wilcox (Oracle) 			set_shrinker_bit(memcg, folio_nid(folio),
317354d91729SQi Zheng 					 deferred_split_shrinker->id);
317487eaceb3SYang Shi #endif
31759a982250SKirill A. Shutemov 	}
3176364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
31779a982250SKirill A. Shutemov }
31789a982250SKirill A. Shutemov 
31799a982250SKirill A. Shutemov static unsigned long deferred_split_count(struct shrinker *shrink,
31809a982250SKirill A. Shutemov 		struct shrink_control *sc)
31819a982250SKirill A. Shutemov {
3182a3d0a918SKirill A. Shutemov 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
3183364c1eebSYang Shi 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
318487eaceb3SYang Shi 
318587eaceb3SYang Shi #ifdef CONFIG_MEMCG
318687eaceb3SYang Shi 	if (sc->memcg)
318787eaceb3SYang Shi 		ds_queue = &sc->memcg->deferred_split_queue;
318887eaceb3SYang Shi #endif
3189364c1eebSYang Shi 	return READ_ONCE(ds_queue->split_queue_len);
31909a982250SKirill A. Shutemov }
31919a982250SKirill A. Shutemov 
31929a982250SKirill A. Shutemov static unsigned long deferred_split_scan(struct shrinker *shrink,
31939a982250SKirill A. Shutemov 		struct shrink_control *sc)
31949a982250SKirill A. Shutemov {
3195a3d0a918SKirill A. Shutemov 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
3196364c1eebSYang Shi 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
31979a982250SKirill A. Shutemov 	unsigned long flags;
31984375a553SMatthew Wilcox (Oracle) 	LIST_HEAD(list);
31994375a553SMatthew Wilcox (Oracle) 	struct folio *folio, *next;
32009a982250SKirill A. Shutemov 	int split = 0;
32019a982250SKirill A. Shutemov 
320287eaceb3SYang Shi #ifdef CONFIG_MEMCG
320387eaceb3SYang Shi 	if (sc->memcg)
320487eaceb3SYang Shi 		ds_queue = &sc->memcg->deferred_split_queue;
320587eaceb3SYang Shi #endif
320687eaceb3SYang Shi 
3207364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
32089a982250SKirill A. Shutemov 	/* Take pin on all head pages to avoid freeing them under us */
32094375a553SMatthew Wilcox (Oracle) 	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
32104375a553SMatthew Wilcox (Oracle) 							_deferred_list) {
32114375a553SMatthew Wilcox (Oracle) 		if (folio_try_get(folio)) {
32124375a553SMatthew Wilcox (Oracle) 			list_move(&folio->_deferred_list, &list);
3213e3ae1953SKirill A. Shutemov 		} else {
32144375a553SMatthew Wilcox (Oracle) 			/* We lost race with folio_put() */
32154375a553SMatthew Wilcox (Oracle) 			list_del_init(&folio->_deferred_list);
3216364c1eebSYang Shi 			ds_queue->split_queue_len--;
32179a982250SKirill A. Shutemov 		}
3218e3ae1953SKirill A. Shutemov 		if (!--sc->nr_to_scan)
3219e3ae1953SKirill A. Shutemov 			break;
32209a982250SKirill A. Shutemov 	}
3221364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
32229a982250SKirill A. Shutemov 
32234375a553SMatthew Wilcox (Oracle) 	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
32244375a553SMatthew Wilcox (Oracle) 		if (!folio_trylock(folio))
3225fa41b900SKirill A. Shutemov 			goto next;
32269a982250SKirill A. Shutemov 		/* split_huge_page() removes page from list on success */
32274375a553SMatthew Wilcox (Oracle) 		if (!split_folio(folio))
32289a982250SKirill A. Shutemov 			split++;
32294375a553SMatthew Wilcox (Oracle) 		folio_unlock(folio);
3230fa41b900SKirill A. Shutemov next:
32314375a553SMatthew Wilcox (Oracle) 		folio_put(folio);
32329a982250SKirill A. Shutemov 	}
32339a982250SKirill A. Shutemov 
3234364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3235364c1eebSYang Shi 	list_splice_tail(&list, &ds_queue->split_queue);
3236364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
32379a982250SKirill A. Shutemov 
3238cb8d68ecSKirill A. Shutemov 	/*
3239cb8d68ecSKirill A. Shutemov 	 * Stop shrinker if we didn't split any page, but the queue is empty.
3240cb8d68ecSKirill A. Shutemov 	 * This can happen if pages were freed under us.
3241cb8d68ecSKirill A. Shutemov 	 */
3242364c1eebSYang Shi 	if (!split && list_empty(&ds_queue->split_queue))
3243cb8d68ecSKirill A. Shutemov 		return SHRINK_STOP;
3244cb8d68ecSKirill A. Shutemov 	return split;
32459a982250SKirill A. Shutemov }
32469a982250SKirill A. Shutemov 
324749071d43SKirill A. Shutemov #ifdef CONFIG_DEBUG_FS
3248fa6c0231SZi Yan static void split_huge_pages_all(void)
324949071d43SKirill A. Shutemov {
325049071d43SKirill A. Shutemov 	struct zone *zone;
325149071d43SKirill A. Shutemov 	struct page *page;
3252630e7c5eSKefeng Wang 	struct folio *folio;
325349071d43SKirill A. Shutemov 	unsigned long pfn, max_zone_pfn;
325449071d43SKirill A. Shutemov 	unsigned long total = 0, split = 0;
325549071d43SKirill A. Shutemov 
3256fa6c0231SZi Yan 	pr_debug("Split all THPs\n");
3257a17206daSMiaohe Lin 	for_each_zone(zone) {
3258a17206daSMiaohe Lin 		if (!managed_zone(zone))
3259a17206daSMiaohe Lin 			continue;
326049071d43SKirill A. Shutemov 		max_zone_pfn = zone_end_pfn(zone);
326149071d43SKirill A. Shutemov 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3262a17206daSMiaohe Lin 			int nr_pages;
326349071d43SKirill A. Shutemov 
32642b7aa91bSNaoya Horiguchi 			page = pfn_to_online_page(pfn);
3265630e7c5eSKefeng Wang 			if (!page || PageTail(page))
3266630e7c5eSKefeng Wang 				continue;
3267630e7c5eSKefeng Wang 			folio = page_folio(page);
3268630e7c5eSKefeng Wang 			if (!folio_try_get(folio))
326949071d43SKirill A. Shutemov 				continue;
327049071d43SKirill A. Shutemov 
3271630e7c5eSKefeng Wang 			if (unlikely(page_folio(page) != folio))
327249071d43SKirill A. Shutemov 				goto next;
327349071d43SKirill A. Shutemov 
3274630e7c5eSKefeng Wang 			if (zone != folio_zone(folio))
3275630e7c5eSKefeng Wang 				goto next;
3276630e7c5eSKefeng Wang 
3277630e7c5eSKefeng Wang 			if (!folio_test_large(folio)
3278630e7c5eSKefeng Wang 				|| folio_test_hugetlb(folio)
3279630e7c5eSKefeng Wang 				|| !folio_test_lru(folio))
328049071d43SKirill A. Shutemov 				goto next;
328149071d43SKirill A. Shutemov 
328249071d43SKirill A. Shutemov 			total++;
3283630e7c5eSKefeng Wang 			folio_lock(folio);
3284630e7c5eSKefeng Wang 			nr_pages = folio_nr_pages(folio);
3285630e7c5eSKefeng Wang 			if (!split_folio(folio))
328649071d43SKirill A. Shutemov 				split++;
3287a17206daSMiaohe Lin 			pfn += nr_pages - 1;
3288630e7c5eSKefeng Wang 			folio_unlock(folio);
328949071d43SKirill A. Shutemov next:
3290630e7c5eSKefeng Wang 			folio_put(folio);
3291fa6c0231SZi Yan 			cond_resched();
329249071d43SKirill A. Shutemov 		}
329349071d43SKirill A. Shutemov 	}
329449071d43SKirill A. Shutemov 
3295fa6c0231SZi Yan 	pr_debug("%lu of %lu THP split\n", split, total);
329649071d43SKirill A. Shutemov }
3297fa6c0231SZi Yan 
3298fa6c0231SZi Yan static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
3299fa6c0231SZi Yan {
3300fa6c0231SZi Yan 	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
3301fa6c0231SZi Yan 		    is_vm_hugetlb_page(vma);
3302fa6c0231SZi Yan }
3303fa6c0231SZi Yan 
3304fa6c0231SZi Yan static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
3305fa6c0231SZi Yan 				unsigned long vaddr_end)
3306fa6c0231SZi Yan {
3307fa6c0231SZi Yan 	int ret = 0;
3308fa6c0231SZi Yan 	struct task_struct *task;
3309fa6c0231SZi Yan 	struct mm_struct *mm;
3310fa6c0231SZi Yan 	unsigned long total = 0, split = 0;
3311fa6c0231SZi Yan 	unsigned long addr;
3312fa6c0231SZi Yan 
3313fa6c0231SZi Yan 	vaddr_start &= PAGE_MASK;
3314fa6c0231SZi Yan 	vaddr_end &= PAGE_MASK;
3315fa6c0231SZi Yan 
3316fa6c0231SZi Yan 	/* Find the task_struct from pid */
3317fa6c0231SZi Yan 	rcu_read_lock();
3318fa6c0231SZi Yan 	task = find_task_by_vpid(pid);
3319fa6c0231SZi Yan 	if (!task) {
3320fa6c0231SZi Yan 		rcu_read_unlock();
3321fa6c0231SZi Yan 		ret = -ESRCH;
3322fa6c0231SZi Yan 		goto out;
3323fa6c0231SZi Yan 	}
3324fa6c0231SZi Yan 	get_task_struct(task);
3325fa6c0231SZi Yan 	rcu_read_unlock();
3326fa6c0231SZi Yan 
3327fa6c0231SZi Yan 	/* Find the mm_struct */
3328fa6c0231SZi Yan 	mm = get_task_mm(task);
3329fa6c0231SZi Yan 	put_task_struct(task);
3330fa6c0231SZi Yan 
3331fa6c0231SZi Yan 	if (!mm) {
3332fa6c0231SZi Yan 		ret = -EINVAL;
3333fa6c0231SZi Yan 		goto out;
3334fa6c0231SZi Yan 	}
3335fa6c0231SZi Yan 
3336fa6c0231SZi Yan 	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3337fa6c0231SZi Yan 		 pid, vaddr_start, vaddr_end);
3338fa6c0231SZi Yan 
3339fa6c0231SZi Yan 	mmap_read_lock(mm);
3340fa6c0231SZi Yan 	/*
3341fa6c0231SZi Yan 	 * always increase addr by PAGE_SIZE, since we could have a PTE page
3342fa6c0231SZi Yan 	 * table filled with PTE-mapped THPs, each of which is distinct.
3343fa6c0231SZi Yan 	 */
3344fa6c0231SZi Yan 	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
334574ba2b38SMiaohe Lin 		struct vm_area_struct *vma = vma_lookup(mm, addr);
3346fa6c0231SZi Yan 		struct page *page;
3347a644b0abSMatthew Wilcox (Oracle) 		struct folio *folio;
3348fa6c0231SZi Yan 
334974ba2b38SMiaohe Lin 		if (!vma)
3350fa6c0231SZi Yan 			break;
3351fa6c0231SZi Yan 
3352fa6c0231SZi Yan 		/* skip special VMA and hugetlb VMA */
3353fa6c0231SZi Yan 		if (vma_not_suitable_for_thp_split(vma)) {
3354fa6c0231SZi Yan 			addr = vma->vm_end;
3355fa6c0231SZi Yan 			continue;
3356fa6c0231SZi Yan 		}
3357fa6c0231SZi Yan 
3358fa6c0231SZi Yan 		/* FOLL_DUMP to ignore special (like zero) pages */
335987d2762eSMiaohe Lin 		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
3360fa6c0231SZi Yan 
3361f7091ed6SHaiyue Wang 		if (IS_ERR_OR_NULL(page))
3362fa6c0231SZi Yan 			continue;
3363fa6c0231SZi Yan 
3364a644b0abSMatthew Wilcox (Oracle) 		folio = page_folio(page);
3365a644b0abSMatthew Wilcox (Oracle) 		if (!is_transparent_hugepage(folio))
3366fa6c0231SZi Yan 			goto next;
3367fa6c0231SZi Yan 
3368fa6c0231SZi Yan 		total++;
3369a644b0abSMatthew Wilcox (Oracle) 		if (!can_split_folio(folio, NULL))
3370fa6c0231SZi Yan 			goto next;
3371fa6c0231SZi Yan 
3372a644b0abSMatthew Wilcox (Oracle) 		if (!folio_trylock(folio))
3373fa6c0231SZi Yan 			goto next;
3374fa6c0231SZi Yan 
3375a644b0abSMatthew Wilcox (Oracle) 		if (!split_folio(folio))
3376fa6c0231SZi Yan 			split++;
3377fa6c0231SZi Yan 
3378a644b0abSMatthew Wilcox (Oracle) 		folio_unlock(folio);
3379fa6c0231SZi Yan next:
3380a644b0abSMatthew Wilcox (Oracle) 		folio_put(folio);
3381fa6c0231SZi Yan 		cond_resched();
3382fa6c0231SZi Yan 	}
3383fa6c0231SZi Yan 	mmap_read_unlock(mm);
3384fa6c0231SZi Yan 	mmput(mm);
3385fa6c0231SZi Yan 
3386fa6c0231SZi Yan 	pr_debug("%lu of %lu THP split\n", split, total);
3387fa6c0231SZi Yan 
3388fa6c0231SZi Yan out:
3389fa6c0231SZi Yan 	return ret;
3390fa6c0231SZi Yan }
3391fa6c0231SZi Yan 
3392fbe37501SZi Yan static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
3393fbe37501SZi Yan 				pgoff_t off_end)
3394fbe37501SZi Yan {
3395fbe37501SZi Yan 	struct filename *file;
3396fbe37501SZi Yan 	struct file *candidate;
3397fbe37501SZi Yan 	struct address_space *mapping;
3398fbe37501SZi Yan 	int ret = -EINVAL;
3399fbe37501SZi Yan 	pgoff_t index;
3400fbe37501SZi Yan 	int nr_pages = 1;
3401fbe37501SZi Yan 	unsigned long total = 0, split = 0;
3402fbe37501SZi Yan 
3403fbe37501SZi Yan 	file = getname_kernel(file_path);
3404fbe37501SZi Yan 	if (IS_ERR(file))
3405fbe37501SZi Yan 		return ret;
3406fbe37501SZi Yan 
3407fbe37501SZi Yan 	candidate = file_open_name(file, O_RDONLY, 0);
3408fbe37501SZi Yan 	if (IS_ERR(candidate))
3409fbe37501SZi Yan 		goto out;
3410fbe37501SZi Yan 
3411fbe37501SZi Yan 	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3412fbe37501SZi Yan 		 file_path, off_start, off_end);
3413fbe37501SZi Yan 
3414fbe37501SZi Yan 	mapping = candidate->f_mapping;
3415fbe37501SZi Yan 
3416fbe37501SZi Yan 	for (index = off_start; index < off_end; index += nr_pages) {
34171fb130b2SChristoph Hellwig 		struct folio *folio = filemap_get_folio(mapping, index);
3418fbe37501SZi Yan 
3419fbe37501SZi Yan 		nr_pages = 1;
342066dabbb6SChristoph Hellwig 		if (IS_ERR(folio))
3421fbe37501SZi Yan 			continue;
3422fbe37501SZi Yan 
34239ee2c086SMatthew Wilcox (Oracle) 		if (!folio_test_large(folio))
3424fbe37501SZi Yan 			goto next;
3425fbe37501SZi Yan 
3426fbe37501SZi Yan 		total++;
34279ee2c086SMatthew Wilcox (Oracle) 		nr_pages = folio_nr_pages(folio);
3428fbe37501SZi Yan 
34299ee2c086SMatthew Wilcox (Oracle) 		if (!folio_trylock(folio))
3430fbe37501SZi Yan 			goto next;
3431fbe37501SZi Yan 
34329ee2c086SMatthew Wilcox (Oracle) 		if (!split_folio(folio))
3433fbe37501SZi Yan 			split++;
3434fbe37501SZi Yan 
34359ee2c086SMatthew Wilcox (Oracle) 		folio_unlock(folio);
3436fbe37501SZi Yan next:
34379ee2c086SMatthew Wilcox (Oracle) 		folio_put(folio);
3438fbe37501SZi Yan 		cond_resched();
3439fbe37501SZi Yan 	}
3440fbe37501SZi Yan 
3441fbe37501SZi Yan 	filp_close(candidate, NULL);
3442fbe37501SZi Yan 	ret = 0;
3443fbe37501SZi Yan 
3444fbe37501SZi Yan 	pr_debug("%lu of %lu file-backed THP split\n", split, total);
3445fbe37501SZi Yan out:
3446fbe37501SZi Yan 	putname(file);
3447fbe37501SZi Yan 	return ret;
3448fbe37501SZi Yan }
3449fbe37501SZi Yan 
3450fa6c0231SZi Yan #define MAX_INPUT_BUF_SZ 255
3451fa6c0231SZi Yan 
3452fa6c0231SZi Yan static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
3453fa6c0231SZi Yan 				size_t count, loff_t *ppops)
3454fa6c0231SZi Yan {
3455fa6c0231SZi Yan 	static DEFINE_MUTEX(split_debug_mutex);
3456fa6c0231SZi Yan 	ssize_t ret;
3457fbe37501SZi Yan 	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
3458fbe37501SZi Yan 	char input_buf[MAX_INPUT_BUF_SZ];
3459fa6c0231SZi Yan 	int pid;
3460fa6c0231SZi Yan 	unsigned long vaddr_start, vaddr_end;
3461fa6c0231SZi Yan 
3462fa6c0231SZi Yan 	ret = mutex_lock_interruptible(&split_debug_mutex);
3463fa6c0231SZi Yan 	if (ret)
3464fa6c0231SZi Yan 		return ret;
3465fa6c0231SZi Yan 
3466fa6c0231SZi Yan 	ret = -EFAULT;
3467fa6c0231SZi Yan 
3468fa6c0231SZi Yan 	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
3469fa6c0231SZi Yan 	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3470fa6c0231SZi Yan 		goto out;
3471fa6c0231SZi Yan 
3472fa6c0231SZi Yan 	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
3473fbe37501SZi Yan 
3474fbe37501SZi Yan 	if (input_buf[0] == '/') {
3475fbe37501SZi Yan 		char *tok;
3476fbe37501SZi Yan 		char *buf = input_buf;
3477fbe37501SZi Yan 		char file_path[MAX_INPUT_BUF_SZ];
3478fbe37501SZi Yan 		pgoff_t off_start = 0, off_end = 0;
3479fbe37501SZi Yan 		size_t input_len = strlen(input_buf);
3480fbe37501SZi Yan 
3481fbe37501SZi Yan 		tok = strsep(&buf, ",");
3482fbe37501SZi Yan 		if (tok) {
34831212e00cSMatthew Wilcox (Oracle) 			strcpy(file_path, tok);
3484fbe37501SZi Yan 		} else {
3485fbe37501SZi Yan 			ret = -EINVAL;
3486fbe37501SZi Yan 			goto out;
3487fbe37501SZi Yan 		}
3488fbe37501SZi Yan 
3489fbe37501SZi Yan 		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
3490fbe37501SZi Yan 		if (ret != 2) {
3491fbe37501SZi Yan 			ret = -EINVAL;
3492fbe37501SZi Yan 			goto out;
3493fbe37501SZi Yan 		}
3494fbe37501SZi Yan 		ret = split_huge_pages_in_file(file_path, off_start, off_end);
3495fbe37501SZi Yan 		if (!ret)
3496fbe37501SZi Yan 			ret = input_len;
3497fbe37501SZi Yan 
3498fbe37501SZi Yan 		goto out;
3499fbe37501SZi Yan 	}
3500fbe37501SZi Yan 
3501fa6c0231SZi Yan 	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
3502fa6c0231SZi Yan 	if (ret == 1 && pid == 1) {
3503fa6c0231SZi Yan 		split_huge_pages_all();
3504fa6c0231SZi Yan 		ret = strlen(input_buf);
3505fa6c0231SZi Yan 		goto out;
3506fa6c0231SZi Yan 	} else if (ret != 3) {
3507fa6c0231SZi Yan 		ret = -EINVAL;
3508fa6c0231SZi Yan 		goto out;
3509fa6c0231SZi Yan 	}
3510fa6c0231SZi Yan 
3511fa6c0231SZi Yan 	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
3512fa6c0231SZi Yan 	if (!ret)
3513fa6c0231SZi Yan 		ret = strlen(input_buf);
3514fa6c0231SZi Yan out:
3515fa6c0231SZi Yan 	mutex_unlock(&split_debug_mutex);
3516fa6c0231SZi Yan 	return ret;
3517fa6c0231SZi Yan 
3518fa6c0231SZi Yan }
3519fa6c0231SZi Yan 
3520fa6c0231SZi Yan static const struct file_operations split_huge_pages_fops = {
3521fa6c0231SZi Yan 	.owner	 = THIS_MODULE,
3522fa6c0231SZi Yan 	.write	 = split_huge_pages_write,
3523fa6c0231SZi Yan 	.llseek  = no_llseek,
3524fa6c0231SZi Yan };
352549071d43SKirill A. Shutemov 
352649071d43SKirill A. Shutemov static int __init split_huge_pages_debugfs(void)
352749071d43SKirill A. Shutemov {
3528d9f7979cSGreg Kroah-Hartman 	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
352949071d43SKirill A. Shutemov 			    &split_huge_pages_fops);
353049071d43SKirill A. Shutemov 	return 0;
353149071d43SKirill A. Shutemov }
353249071d43SKirill A. Shutemov late_initcall(split_huge_pages_debugfs);
353349071d43SKirill A. Shutemov #endif
3534616b8371SZi Yan 
3535616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
35367f5abe60SDavid Hildenbrand int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3537616b8371SZi Yan 		struct page *page)
3538616b8371SZi Yan {
3539616b8371SZi Yan 	struct vm_area_struct *vma = pvmw->vma;
3540616b8371SZi Yan 	struct mm_struct *mm = vma->vm_mm;
3541616b8371SZi Yan 	unsigned long address = pvmw->address;
35426c287605SDavid Hildenbrand 	bool anon_exclusive;
3543616b8371SZi Yan 	pmd_t pmdval;
3544616b8371SZi Yan 	swp_entry_t entry;
3545ab6e3d09SNaoya Horiguchi 	pmd_t pmdswp;
3546616b8371SZi Yan 
3547616b8371SZi Yan 	if (!(pvmw->pmd && !pvmw->pte))
35487f5abe60SDavid Hildenbrand 		return 0;
3549616b8371SZi Yan 
3550616b8371SZi Yan 	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
35518a8683adSHuang Ying 	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
35526c287605SDavid Hildenbrand 
3553088b8aa5SDavid Hildenbrand 	/* See page_try_share_anon_rmap(): invalidate PMD first. */
35546c287605SDavid Hildenbrand 	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
35556c287605SDavid Hildenbrand 	if (anon_exclusive && page_try_share_anon_rmap(page)) {
35566c287605SDavid Hildenbrand 		set_pmd_at(mm, address, pvmw->pmd, pmdval);
35577f5abe60SDavid Hildenbrand 		return -EBUSY;
35586c287605SDavid Hildenbrand 	}
35596c287605SDavid Hildenbrand 
3560616b8371SZi Yan 	if (pmd_dirty(pmdval))
3561616b8371SZi Yan 		set_page_dirty(page);
35624dd845b5SAlistair Popple 	if (pmd_write(pmdval))
35634dd845b5SAlistair Popple 		entry = make_writable_migration_entry(page_to_pfn(page));
35646c287605SDavid Hildenbrand 	else if (anon_exclusive)
35656c287605SDavid Hildenbrand 		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
35664dd845b5SAlistair Popple 	else
35674dd845b5SAlistair Popple 		entry = make_readable_migration_entry(page_to_pfn(page));
35682e346877SPeter Xu 	if (pmd_young(pmdval))
35692e346877SPeter Xu 		entry = make_migration_entry_young(entry);
35702e346877SPeter Xu 	if (pmd_dirty(pmdval))
35712e346877SPeter Xu 		entry = make_migration_entry_dirty(entry);
3572ab6e3d09SNaoya Horiguchi 	pmdswp = swp_entry_to_pmd(entry);
3573ab6e3d09SNaoya Horiguchi 	if (pmd_soft_dirty(pmdval))
3574ab6e3d09SNaoya Horiguchi 		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
357524bf08c4SDavid Hildenbrand 	if (pmd_uffd_wp(pmdval))
357624bf08c4SDavid Hildenbrand 		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
3577ab6e3d09SNaoya Horiguchi 	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3578cea86fe2SHugh Dickins 	page_remove_rmap(page, vma, true);
3579616b8371SZi Yan 	put_page(page);
3580283fd6feSAnshuman Khandual 	trace_set_migration_pmd(address, pmd_val(pmdswp));
35817f5abe60SDavid Hildenbrand 
35827f5abe60SDavid Hildenbrand 	return 0;
3583616b8371SZi Yan }
3584616b8371SZi Yan 
3585616b8371SZi Yan void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3586616b8371SZi Yan {
358714d85a6eSDavid Hildenbrand 	struct folio *folio = page_folio(new);
3588616b8371SZi Yan 	struct vm_area_struct *vma = pvmw->vma;
3589616b8371SZi Yan 	struct mm_struct *mm = vma->vm_mm;
3590616b8371SZi Yan 	unsigned long address = pvmw->address;
35914fba8f2aSMiaohe Lin 	unsigned long haddr = address & HPAGE_PMD_MASK;
3592616b8371SZi Yan 	pmd_t pmde;
3593616b8371SZi Yan 	swp_entry_t entry;
3594616b8371SZi Yan 
3595616b8371SZi Yan 	if (!(pvmw->pmd && !pvmw->pte))
3596616b8371SZi Yan 		return;
3597616b8371SZi Yan 
3598616b8371SZi Yan 	entry = pmd_to_swp_entry(*pvmw->pmd);
359914d85a6eSDavid Hildenbrand 	folio_get(folio);
36002e346877SPeter Xu 	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3601ab6e3d09SNaoya Horiguchi 	if (pmd_swp_soft_dirty(*pvmw->pmd))
3602ab6e3d09SNaoya Horiguchi 		pmde = pmd_mksoft_dirty(pmde);
36033c811f78SDavid Hildenbrand 	if (is_writable_migration_entry(entry))
3604161e393cSRick Edgecombe 		pmde = pmd_mkwrite(pmde, vma);
36058f34f1eaSPeter Xu 	if (pmd_swp_uffd_wp(*pvmw->pmd))
3606f1eb1bacSPeter Xu 		pmde = pmd_mkuffd_wp(pmde);
36072e346877SPeter Xu 	if (!is_migration_entry_young(entry))
36082e346877SPeter Xu 		pmde = pmd_mkold(pmde);
36092e346877SPeter Xu 	/* NOTE: this may contain setting soft-dirty on some archs */
361014d85a6eSDavid Hildenbrand 	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
36112e346877SPeter Xu 		pmde = pmd_mkdirty(pmde);
3612616b8371SZi Yan 
361314d85a6eSDavid Hildenbrand 	if (folio_test_anon(folio)) {
36146c287605SDavid Hildenbrand 		rmap_t rmap_flags = RMAP_COMPOUND;
36156c287605SDavid Hildenbrand 
36166c287605SDavid Hildenbrand 		if (!is_readable_migration_entry(entry))
36176c287605SDavid Hildenbrand 			rmap_flags |= RMAP_EXCLUSIVE;
36186c287605SDavid Hildenbrand 
36194fba8f2aSMiaohe Lin 		page_add_anon_rmap(new, vma, haddr, rmap_flags);
36206c287605SDavid Hildenbrand 	} else {
362114d85a6eSDavid Hildenbrand 		folio_add_file_rmap_pmd(folio, new, vma);
36226c287605SDavid Hildenbrand 	}
362314d85a6eSDavid Hildenbrand 	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
36244fba8f2aSMiaohe Lin 	set_pmd_at(mm, haddr, pvmw->pmd, pmde);
36255cbcf225SMuchun Song 
36265cbcf225SMuchun Song 	/* No need to invalidate - it was non-present before */
3627616b8371SZi Yan 	update_mmu_cache_pmd(vma, address, pvmw->pmd);
3628283fd6feSAnshuman Khandual 	trace_remove_migration_pmd(address, pmd_val(pmde));
3629616b8371SZi Yan }
3630616b8371SZi Yan #endif
3631