xref: /linux/mm/huge_memory.c (revision 54c23548e0f5609f55b353bcd3c1aa295852f383)
120c8ccb1SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
271e3aac0SAndrea Arcangeli /*
371e3aac0SAndrea Arcangeli  *  Copyright (C) 2009  Red Hat, Inc.
471e3aac0SAndrea Arcangeli  */
571e3aac0SAndrea Arcangeli 
6ae3a8c1cSAndrew Morton #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7ae3a8c1cSAndrew Morton 
871e3aac0SAndrea Arcangeli #include <linux/mm.h>
971e3aac0SAndrea Arcangeli #include <linux/sched.h>
10fa6c0231SZi Yan #include <linux/sched/mm.h>
11f7ccbae4SIngo Molnar #include <linux/sched/coredump.h>
126a3827d7SIngo Molnar #include <linux/sched/numa_balancing.h>
1371e3aac0SAndrea Arcangeli #include <linux/highmem.h>
1471e3aac0SAndrea Arcangeli #include <linux/hugetlb.h>
1571e3aac0SAndrea Arcangeli #include <linux/mmu_notifier.h>
1671e3aac0SAndrea Arcangeli #include <linux/rmap.h>
1771e3aac0SAndrea Arcangeli #include <linux/swap.h>
1897ae1749SKirill A. Shutemov #include <linux/shrinker.h>
19ba76149fSAndrea Arcangeli #include <linux/mm_inline.h>
20e9b61f19SKirill A. Shutemov #include <linux/swapops.h>
21fb5c2029SMatthew Wilcox (Oracle) #include <linux/backing-dev.h>
224897c765SMatthew Wilcox #include <linux/dax.h>
23ba76149fSAndrea Arcangeli #include <linux/khugepaged.h>
24878aee7dSAndrea Arcangeli #include <linux/freezer.h>
25f25748e3SDan Williams #include <linux/pfn_t.h>
26a664b2d8SAndrea Arcangeli #include <linux/mman.h>
273565fce3SDan Williams #include <linux/memremap.h>
28325adeb5SRalf Baechle #include <linux/pagemap.h>
2949071d43SKirill A. Shutemov #include <linux/debugfs.h>
304daae3b4SMel Gorman #include <linux/migrate.h>
3143b5fbbdSSasha Levin #include <linux/hashtable.h>
326b251fc9SAndrea Arcangeli #include <linux/userfaultfd_k.h>
3333c3fc71SVladimir Davydov #include <linux/page_idle.h>
34baa355fdSKirill A. Shutemov #include <linux/shmem_fs.h>
356b31d595SMichal Hocko #include <linux/oom.h>
3698fa15f3SAnshuman Khandual #include <linux/numa.h>
37f7da677bSVlastimil Babka #include <linux/page_owner.h>
38a1a3a2fcSHuang Ying #include <linux/sched/sysctl.h>
39467b171aSAneesh Kumar K.V #include <linux/memory-tiers.h>
404ef9ad19SYang Shi #include <linux/compat.h>
4197ae1749SKirill A. Shutemov 
4271e3aac0SAndrea Arcangeli #include <asm/tlb.h>
4371e3aac0SAndrea Arcangeli #include <asm/pgalloc.h>
4471e3aac0SAndrea Arcangeli #include "internal.h"
45014bb1deSNeilBrown #include "swap.h"
4671e3aac0SAndrea Arcangeli 
47283fd6feSAnshuman Khandual #define CREATE_TRACE_POINTS
48283fd6feSAnshuman Khandual #include <trace/events/thp.h>
49283fd6feSAnshuman Khandual 
50ba76149fSAndrea Arcangeli /*
51b14d595aSMichael DeGuzis  * By default, transparent hugepage support is disabled in order to avoid
52b14d595aSMichael DeGuzis  * risking an increased memory footprint for applications that are not
53b14d595aSMichael DeGuzis  * guaranteed to benefit from it. When transparent hugepage support is
54b14d595aSMichael DeGuzis  * enabled, it is for all mappings, and khugepaged scans all mappings.
558bfa3f9aSJianguo Wu  * Defrag is invoked by khugepaged hugepage allocations and by page faults
568bfa3f9aSJianguo Wu  * for all hugepage allocations.
57ba76149fSAndrea Arcangeli  */
5871e3aac0SAndrea Arcangeli unsigned long transparent_hugepage_flags __read_mostly =
5913ece886SAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
60ba76149fSAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
6113ece886SAndrea Arcangeli #endif
6213ece886SAndrea Arcangeli #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
6313ece886SAndrea Arcangeli 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
6413ece886SAndrea Arcangeli #endif
65444eb2a4SMel Gorman 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
6679da5407SKirill A. Shutemov 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
6779da5407SKirill A. Shutemov 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
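
/*
 * A minimal sketch (hypothetical helper, not part of the upstream file): the
 * global policy is encoded in the two mode bits above, which is exactly what
 * enabled_show() below tests as well.
 */
static inline bool thp_global_mode_is_madvise_sketch(void)
{
	/* "madvise" mode: only the REQ_MADV bit is set, the "always" bit is clear. */
	return !test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags) &&
	       test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
}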
68ba76149fSAndrea Arcangeli 
6954d91729SQi Zheng static struct shrinker *deferred_split_shrinker;
7054d91729SQi Zheng static unsigned long deferred_split_count(struct shrinker *shrink,
7154d91729SQi Zheng 					  struct shrink_control *sc);
7254d91729SQi Zheng static unsigned long deferred_split_scan(struct shrinker *shrink,
7354d91729SQi Zheng 					 struct shrink_control *sc);
74f000565aSAndrea Arcangeli 
7597ae1749SKirill A. Shutemov static atomic_t huge_zero_refcount;
7656873f43SWang, Yalin struct page *huge_zero_page __read_mostly;
773b77e8c8SHugh Dickins unsigned long huge_zero_pfn __read_mostly = ~0UL;
783485b883SRyan Roberts unsigned long huge_anon_orders_always __read_mostly;
793485b883SRyan Roberts unsigned long huge_anon_orders_madvise __read_mostly;
803485b883SRyan Roberts unsigned long huge_anon_orders_inherit __read_mostly;
814a6c1297SKirill A. Shutemov 
823485b883SRyan Roberts unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
833485b883SRyan Roberts 					 unsigned long vm_flags, bool smaps,
843485b883SRyan Roberts 					 bool in_pf, bool enforce_sysfs,
853485b883SRyan Roberts 					 unsigned long orders)
867635d9cbSMichal Hocko {
873485b883SRyan Roberts 	/* Check the intersection of requested and supported orders. */
883485b883SRyan Roberts 	orders &= vma_is_anonymous(vma) ?
893485b883SRyan Roberts 			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
903485b883SRyan Roberts 	if (!orders)
913485b883SRyan Roberts 		return 0;
923485b883SRyan Roberts 
939fec5168SYang Shi 	if (!vma->vm_mm)		/* vdso */
943485b883SRyan Roberts 		return 0;
959fec5168SYang Shi 
967da4e2cbSYang Shi 	/*
977da4e2cbSYang Shi 	 * THP may be explicitly disabled through madvise or prctl, and some
987da4e2cbSYang Shi 	 * architectures may disable THP for some mappings, for
997da4e2cbSYang Shi 	 * example, s390 kvm.
1007da4e2cbSYang Shi 	 */
1017da4e2cbSYang Shi 	if ((vm_flags & VM_NOHUGEPAGE) ||
1027da4e2cbSYang Shi 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
1033485b883SRyan Roberts 		return 0;
1047da4e2cbSYang Shi 	/*
1057da4e2cbSYang Shi 	 * Bail out if the hardware or firmware has marked hugepage support disabled.
1067da4e2cbSYang Shi 	 */
1073c556d24SPeter Xu 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
1083485b883SRyan Roberts 		return 0;
1099fec5168SYang Shi 
1107da4e2cbSYang Shi 	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
1117da4e2cbSYang Shi 	if (vma_is_dax(vma))
1123485b883SRyan Roberts 		return in_pf ? orders : 0;
1137da4e2cbSYang Shi 
1147da4e2cbSYang Shi 	/*
1157a81751fSZach O'Keefe 	 * Skip VMAs that khugepaged must not touch (special VMAs and hugetlb VMAs).
1167da4e2cbSYang Shi 	 * Must be checked after dax since some dax mappings may have
1177da4e2cbSYang Shi 	 * VM_MIXEDMAP set.
1187da4e2cbSYang Shi 	 */
1197a81751fSZach O'Keefe 	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
1203485b883SRyan Roberts 		return 0;
1219fec5168SYang Shi 
1227da4e2cbSYang Shi 	/*
1233485b883SRyan Roberts 	 * Check alignment for file vma and size for both file and anon vma by
1243485b883SRyan Roberts 	 * filtering out the unsuitable orders.
1257da4e2cbSYang Shi 	 *
1267da4e2cbSYang Shi 	 * Skip the check for page fault. Huge fault does the check in fault
1273485b883SRyan Roberts 	 * handlers.
1287da4e2cbSYang Shi 	 */
1293485b883SRyan Roberts 	if (!in_pf) {
1303485b883SRyan Roberts 		int order = highest_order(orders);
1313485b883SRyan Roberts 		unsigned long addr;
1323485b883SRyan Roberts 
1333485b883SRyan Roberts 		while (orders) {
1343485b883SRyan Roberts 			addr = vma->vm_end - (PAGE_SIZE << order);
1353485b883SRyan Roberts 			if (thp_vma_suitable_order(vma, addr, order))
1363485b883SRyan Roberts 				break;
1373485b883SRyan Roberts 			order = next_order(&orders, order);
1383485b883SRyan Roberts 		}
1393485b883SRyan Roberts 
1403485b883SRyan Roberts 		if (!orders)
1413485b883SRyan Roberts 			return 0;
1423485b883SRyan Roberts 	}
1439fec5168SYang Shi 
1447da4e2cbSYang Shi 	/*
1457da4e2cbSYang Shi 	 * Enabled via shmem mount options or sysfs settings.
1467da4e2cbSYang Shi 	 * Must be done before hugepage flags check since shmem has its
1477da4e2cbSYang Shi 	 * own flags.
1487da4e2cbSYang Shi 	 */
1497da4e2cbSYang Shi 	if (!in_pf && shmem_file(vma->vm_file))
1502cf13384SDavid Stevens 		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
1513485b883SRyan Roberts 				     !enforce_sysfs, vma->vm_mm, vm_flags)
1523485b883SRyan Roberts 			? orders : 0;
1539fec5168SYang Shi 
1547a81751fSZach O'Keefe 	if (!vma_is_anonymous(vma)) {
1557a81751fSZach O'Keefe 		/*
1563485b883SRyan Roberts 		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
1573485b883SRyan Roberts 		 * were already handled in thp_vma_allowable_orders().
1583485b883SRyan Roberts 		 */
1593485b883SRyan Roberts 		if (enforce_sysfs &&
1603485b883SRyan Roberts 		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
1613485b883SRyan Roberts 						    !hugepage_global_always())))
1623485b883SRyan Roberts 			return 0;
1633485b883SRyan Roberts 
1643485b883SRyan Roberts 		/*
1657a81751fSZach O'Keefe 		 * Trust that ->huge_fault() handlers know what they are doing
1667a81751fSZach O'Keefe 		 * Trust that ->huge_fault() handlers know what they are doing
1677a81751fSZach O'Keefe 		 * in the fault path.
1687a81751fSZach O'Keefe 		if ((in_pf || smaps) && vma->vm_ops->huge_fault)
1693485b883SRyan Roberts 			return orders;
1707a81751fSZach O'Keefe 		/* Only regular file is valid in collapse path */
1717a81751fSZach O'Keefe 		if ((!in_pf || smaps) && file_thp_enabled(vma))
1723485b883SRyan Roberts 			return orders;
1733485b883SRyan Roberts 		return 0;
1747a81751fSZach O'Keefe 	}
1759fec5168SYang Shi 
1769fec5168SYang Shi 	if (vma_is_temporary_stack(vma))
1773485b883SRyan Roberts 		return 0;
1789fec5168SYang Shi 
1799fec5168SYang Shi 	/*
1809fec5168SYang Shi 	 * The THPeligible bit in smaps should show 1 for proper VMAs even
1819fec5168SYang Shi 	 * though the anon_vma is not initialized yet.
1827da4e2cbSYang Shi 	 *
1837da4e2cbSYang Shi 	 * Allow the page fault since the anon_vma may not be initialized
1847da4e2cbSYang Shi 	 * until the first page fault.
1859fec5168SYang Shi 	 */
1869fec5168SYang Shi 	if (!vma->anon_vma)
1873485b883SRyan Roberts 		return (smaps || in_pf) ? orders : 0;
1889fec5168SYang Shi 
1893485b883SRyan Roberts 	return orders;
1907635d9cbSMichal Hocko }
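
/*
 * A minimal sketch (hypothetical helper, modelled on the anonymous fault
 * path) of how a caller consumes the bitmap returned above: request all
 * orders below PMD level, then take the highest allowed order.  A full walk
 * from largest to smallest would use the same highest_order()/next_order()
 * iteration seen in the function above.
 */
static inline int thp_pick_fault_order_sketch(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long orders;

	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true,
					  true, BIT(PMD_ORDER) - 1);

	/* A return value of 0 means: fall back to a single base page. */
	return orders ? highest_order(orders) : 0;
}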
1917635d9cbSMichal Hocko 
192aaa9705bSMiaohe Lin static bool get_huge_zero_page(void)
19397ae1749SKirill A. Shutemov {
19497ae1749SKirill A. Shutemov 	struct page *zero_page;
19597ae1749SKirill A. Shutemov retry:
19697ae1749SKirill A. Shutemov 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
197aaa9705bSMiaohe Lin 		return true;
19897ae1749SKirill A. Shutemov 
19997ae1749SKirill A. Shutemov 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
20097ae1749SKirill A. Shutemov 			HPAGE_PMD_ORDER);
201d8a8e1f0SKirill A. Shutemov 	if (!zero_page) {
202d8a8e1f0SKirill A. Shutemov 		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
203aaa9705bSMiaohe Lin 		return false;
204d8a8e1f0SKirill A. Shutemov 	}
20597ae1749SKirill A. Shutemov 	preempt_disable();
2065918d10aSKirill A. Shutemov 	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
20797ae1749SKirill A. Shutemov 		preempt_enable();
2085ddacbe9SYu Zhao 		__free_pages(zero_page, compound_order(zero_page));
20997ae1749SKirill A. Shutemov 		goto retry;
21097ae1749SKirill A. Shutemov 	}
2113b77e8c8SHugh Dickins 	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
21297ae1749SKirill A. Shutemov 
21397ae1749SKirill A. Shutemov 	/* We take an additional reference here. It will be put back by the shrinker. */
21497ae1749SKirill A. Shutemov 	atomic_set(&huge_zero_refcount, 2);
21597ae1749SKirill A. Shutemov 	preempt_enable();
216f4981502SLiu Shixin 	count_vm_event(THP_ZERO_PAGE_ALLOC);
217aaa9705bSMiaohe Lin 	return true;
21897ae1749SKirill A. Shutemov }
21997ae1749SKirill A. Shutemov 
2206fcb52a5SAaron Lu static void put_huge_zero_page(void)
22197ae1749SKirill A. Shutemov {
22297ae1749SKirill A. Shutemov 	/*
22397ae1749SKirill A. Shutemov 	 * The counter should never go to zero here. Only the shrinker can
22497ae1749SKirill A. Shutemov 	 * put the last reference.
22597ae1749SKirill A. Shutemov 	 */
22697ae1749SKirill A. Shutemov 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
22797ae1749SKirill A. Shutemov }
22897ae1749SKirill A. Shutemov 
2296fcb52a5SAaron Lu struct page *mm_get_huge_zero_page(struct mm_struct *mm)
2306fcb52a5SAaron Lu {
2316fcb52a5SAaron Lu 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
2326fcb52a5SAaron Lu 		return READ_ONCE(huge_zero_page);
2336fcb52a5SAaron Lu 
2346fcb52a5SAaron Lu 	if (!get_huge_zero_page())
2356fcb52a5SAaron Lu 		return NULL;
2366fcb52a5SAaron Lu 
2376fcb52a5SAaron Lu 	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
2386fcb52a5SAaron Lu 		put_huge_zero_page();
2396fcb52a5SAaron Lu 
2406fcb52a5SAaron Lu 	return READ_ONCE(huge_zero_page);
2416fcb52a5SAaron Lu }
2426fcb52a5SAaron Lu 
2436fcb52a5SAaron Lu void mm_put_huge_zero_page(struct mm_struct *mm)
2446fcb52a5SAaron Lu {
2456fcb52a5SAaron Lu 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
2466fcb52a5SAaron Lu 		put_huge_zero_page();
2476fcb52a5SAaron Lu }
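
/*
 * A minimal usage sketch (hypothetical caller): the pair above implements a
 * cached, per-mm reference on the huge zero page.  The first caller for an
 * mm pins the page and sets MMF_HUGE_ZERO_PAGE so later callers are free of
 * charge; the reference is dropped again via mm_put_huge_zero_page() when
 * the mm is torn down, after which the shrinker may reclaim the page.
 */
static inline bool map_huge_zero_sketch(struct mm_struct *mm)
{
	struct page *zero_page = mm_get_huge_zero_page(mm);

	/* NULL means the zero page could not be allocated; fall back to small pages. */
	return zero_page != NULL;
}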
2486fcb52a5SAaron Lu 
24948896466SGlauber Costa static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
25097ae1749SKirill A. Shutemov 					struct shrink_control *sc)
25197ae1749SKirill A. Shutemov {
25297ae1749SKirill A. Shutemov 	/* We can free the zero page only if the last reference remains. */
25397ae1749SKirill A. Shutemov 	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
25448896466SGlauber Costa }
25597ae1749SKirill A. Shutemov 
25648896466SGlauber Costa static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
25748896466SGlauber Costa 				       struct shrink_control *sc)
25848896466SGlauber Costa {
25997ae1749SKirill A. Shutemov 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
2605918d10aSKirill A. Shutemov 		struct page *zero_page = xchg(&huge_zero_page, NULL);
2615918d10aSKirill A. Shutemov 		BUG_ON(zero_page == NULL);
2623b77e8c8SHugh Dickins 		WRITE_ONCE(huge_zero_pfn, ~0UL);
2635ddacbe9SYu Zhao 		__free_pages(zero_page, compound_order(zero_page));
26448896466SGlauber Costa 		return HPAGE_PMD_NR;
26597ae1749SKirill A. Shutemov 	}
26697ae1749SKirill A. Shutemov 
26797ae1749SKirill A. Shutemov 	return 0;
26897ae1749SKirill A. Shutemov }
26997ae1749SKirill A. Shutemov 
27054d91729SQi Zheng static struct shrinker *huge_zero_page_shrinker;
27197ae1749SKirill A. Shutemov 
27271e3aac0SAndrea Arcangeli #ifdef CONFIG_SYSFS
27371e3aac0SAndrea Arcangeli static ssize_t enabled_show(struct kobject *kobj,
27471e3aac0SAndrea Arcangeli 			    struct kobj_attribute *attr, char *buf)
27571e3aac0SAndrea Arcangeli {
276bfb0ffebSJoe Perches 	const char *output;
277bfb0ffebSJoe Perches 
278444eb2a4SMel Gorman 	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
279bfb0ffebSJoe Perches 		output = "[always] madvise never";
280bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
281bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
282bfb0ffebSJoe Perches 		output = "always [madvise] never";
283444eb2a4SMel Gorman 	else
284bfb0ffebSJoe Perches 		output = "always madvise [never]";
285bfb0ffebSJoe Perches 
286bfb0ffebSJoe Perches 	return sysfs_emit(buf, "%s\n", output);
28771e3aac0SAndrea Arcangeli }
288444eb2a4SMel Gorman 
28971e3aac0SAndrea Arcangeli static ssize_t enabled_store(struct kobject *kobj,
29071e3aac0SAndrea Arcangeli 			     struct kobj_attribute *attr,
29171e3aac0SAndrea Arcangeli 			     const char *buf, size_t count)
29271e3aac0SAndrea Arcangeli {
29321440d7eSDavid Rientjes 	ssize_t ret = count;
294ba76149fSAndrea Arcangeli 
295f42f2552SDavid Rientjes 	if (sysfs_streq(buf, "always")) {
29621440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
29721440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
298f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "madvise")) {
29921440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
30021440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
301f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "never")) {
30221440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
30321440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
30421440d7eSDavid Rientjes 	} else
30521440d7eSDavid Rientjes 		ret = -EINVAL;
306ba76149fSAndrea Arcangeli 
307ba76149fSAndrea Arcangeli 	if (ret > 0) {
308b46e756fSKirill A. Shutemov 		int err = start_stop_khugepaged();
309ba76149fSAndrea Arcangeli 		if (err)
310ba76149fSAndrea Arcangeli 			ret = err;
311ba76149fSAndrea Arcangeli 	}
312ba76149fSAndrea Arcangeli 	return ret;
31371e3aac0SAndrea Arcangeli }
31437139bb0SMiaohe Lin 
31537139bb0SMiaohe Lin static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
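
/*
 * Userspace counterpart (illustrative sketch, guarded out of the kernel
 * build): writing one of the tokens parsed by enabled_store() above to the
 * sysfs file created below under /sys/kernel/mm/transparent_hugepage.
 * Requires root.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *mode = "madvise";
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_WRONLY);

	if (fd < 0 || write(fd, mode, strlen(mode)) < 0) {
		perror("transparent_hugepage/enabled");
		return 1;
	}
	close(fd);
	return 0;
}
#endif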
31671e3aac0SAndrea Arcangeli 
317b46e756fSKirill A. Shutemov ssize_t single_hugepage_flag_show(struct kobject *kobj,
31871e3aac0SAndrea Arcangeli 				  struct kobj_attribute *attr, char *buf,
31971e3aac0SAndrea Arcangeli 				  enum transparent_hugepage_flag flag)
32071e3aac0SAndrea Arcangeli {
321bfb0ffebSJoe Perches 	return sysfs_emit(buf, "%d\n",
322e27e6151SBen Hutchings 			  !!test_bit(flag, &transparent_hugepage_flags));
32371e3aac0SAndrea Arcangeli }
324e27e6151SBen Hutchings 
325b46e756fSKirill A. Shutemov ssize_t single_hugepage_flag_store(struct kobject *kobj,
32671e3aac0SAndrea Arcangeli 				 struct kobj_attribute *attr,
32771e3aac0SAndrea Arcangeli 				 const char *buf, size_t count,
32871e3aac0SAndrea Arcangeli 				 enum transparent_hugepage_flag flag)
32971e3aac0SAndrea Arcangeli {
330e27e6151SBen Hutchings 	unsigned long value;
331e27e6151SBen Hutchings 	int ret;
332e27e6151SBen Hutchings 
333e27e6151SBen Hutchings 	ret = kstrtoul(buf, 10, &value);
334e27e6151SBen Hutchings 	if (ret < 0)
335e27e6151SBen Hutchings 		return ret;
336e27e6151SBen Hutchings 	if (value > 1)
33771e3aac0SAndrea Arcangeli 		return -EINVAL;
33871e3aac0SAndrea Arcangeli 
339e27e6151SBen Hutchings 	if (value)
340e27e6151SBen Hutchings 		set_bit(flag, &transparent_hugepage_flags);
341e27e6151SBen Hutchings 	else
342e27e6151SBen Hutchings 		clear_bit(flag, &transparent_hugepage_flags);
343e27e6151SBen Hutchings 
34471e3aac0SAndrea Arcangeli 	return count;
34571e3aac0SAndrea Arcangeli }
34671e3aac0SAndrea Arcangeli 
34771e3aac0SAndrea Arcangeli static ssize_t defrag_show(struct kobject *kobj,
34871e3aac0SAndrea Arcangeli 			   struct kobj_attribute *attr, char *buf)
34971e3aac0SAndrea Arcangeli {
350bfb0ffebSJoe Perches 	const char *output;
351bfb0ffebSJoe Perches 
352bfb0ffebSJoe Perches 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
353bfb0ffebSJoe Perches 		     &transparent_hugepage_flags))
354bfb0ffebSJoe Perches 		output = "[always] defer defer+madvise madvise never";
355bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
356bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
357bfb0ffebSJoe Perches 		output = "always [defer] defer+madvise madvise never";
358bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
359bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
360bfb0ffebSJoe Perches 		output = "always defer [defer+madvise] madvise never";
361bfb0ffebSJoe Perches 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
362bfb0ffebSJoe Perches 			  &transparent_hugepage_flags))
363bfb0ffebSJoe Perches 		output = "always defer defer+madvise [madvise] never";
364bfb0ffebSJoe Perches 	else
365bfb0ffebSJoe Perches 		output = "always defer defer+madvise madvise [never]";
366bfb0ffebSJoe Perches 
367bfb0ffebSJoe Perches 	return sysfs_emit(buf, "%s\n", output);
36871e3aac0SAndrea Arcangeli }
36921440d7eSDavid Rientjes 
37071e3aac0SAndrea Arcangeli static ssize_t defrag_store(struct kobject *kobj,
37171e3aac0SAndrea Arcangeli 			    struct kobj_attribute *attr,
37271e3aac0SAndrea Arcangeli 			    const char *buf, size_t count)
37371e3aac0SAndrea Arcangeli {
374f42f2552SDavid Rientjes 	if (sysfs_streq(buf, "always")) {
37521440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
37621440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
37721440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
37821440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
379f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "defer+madvise")) {
38021440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
38121440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
38221440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
38321440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
384f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "defer")) {
3854fad7fb6SDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
3864fad7fb6SDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
3874fad7fb6SDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
3884fad7fb6SDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
389f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "madvise")) {
39021440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
39121440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
39221440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
39321440d7eSDavid Rientjes 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
394f42f2552SDavid Rientjes 	} else if (sysfs_streq(buf, "never")) {
39521440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
39621440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
39721440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
39821440d7eSDavid Rientjes 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
39921440d7eSDavid Rientjes 	} else
40021440d7eSDavid Rientjes 		return -EINVAL;
40121440d7eSDavid Rientjes 
40221440d7eSDavid Rientjes 	return count;
40371e3aac0SAndrea Arcangeli }
40437139bb0SMiaohe Lin static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
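
/*
 * Userspace counterpart (illustrative sketch, guarded out of the kernel
 * build): reading back the defrag policy printed by defrag_show() above and
 * extracting the bracketed (active) token.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[128] = "", *start, *end;
	int fd = open("/sys/kernel/mm/transparent_hugepage/defrag", O_RDONLY);

	if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0) {
		perror("transparent_hugepage/defrag");
		return 1;
	}
	close(fd);

	start = strchr(buf, '[');
	end = start ? strchr(start, ']') : NULL;
	if (start && end) {
		*end = '\0';
		printf("active defrag mode: %s\n", start + 1);
	}
	return 0;
}
#endif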
40571e3aac0SAndrea Arcangeli 
40679da5407SKirill A. Shutemov static ssize_t use_zero_page_show(struct kobject *kobj,
40779da5407SKirill A. Shutemov 				  struct kobj_attribute *attr, char *buf)
40879da5407SKirill A. Shutemov {
409b46e756fSKirill A. Shutemov 	return single_hugepage_flag_show(kobj, attr, buf,
41079da5407SKirill A. Shutemov 					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41179da5407SKirill A. Shutemov }
41279da5407SKirill A. Shutemov static ssize_t use_zero_page_store(struct kobject *kobj,
41379da5407SKirill A. Shutemov 		struct kobj_attribute *attr, const char *buf, size_t count)
41479da5407SKirill A. Shutemov {
415b46e756fSKirill A. Shutemov 	return single_hugepage_flag_store(kobj, attr, buf, count,
41679da5407SKirill A. Shutemov 				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41779da5407SKirill A. Shutemov }
41837139bb0SMiaohe Lin static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
41949920d28SHugh Dickins 
42049920d28SHugh Dickins static ssize_t hpage_pmd_size_show(struct kobject *kobj,
42149920d28SHugh Dickins 				   struct kobj_attribute *attr, char *buf)
42249920d28SHugh Dickins {
423ae7a927dSJoe Perches 	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
42449920d28SHugh Dickins }
42549920d28SHugh Dickins static struct kobj_attribute hpage_pmd_size_attr =
42649920d28SHugh Dickins 	__ATTR_RO(hpage_pmd_size);
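
/*
 * Userspace counterpart (illustrative sketch, guarded out of the kernel
 * build): discovering the PMD THP size at run time from hpage_pmd_size
 * instead of hard-coding 2 MiB, e.g. for aligning allocations.
 */
#if 0
#include <stdio.h>

static unsigned long read_hpage_pmd_size(void)
{
	unsigned long size = 0;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");

	if (f) {
		if (fscanf(f, "%lu", &size) != 1)
			size = 0;
		fclose(f);
	}
	return size;	/* 0 on failure; typically 2097152 on x86_64 */
}
#endif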
42749920d28SHugh Dickins 
42871e3aac0SAndrea Arcangeli static struct attribute *hugepage_attr[] = {
42971e3aac0SAndrea Arcangeli 	&enabled_attr.attr,
43071e3aac0SAndrea Arcangeli 	&defrag_attr.attr,
43179da5407SKirill A. Shutemov 	&use_zero_page_attr.attr,
43249920d28SHugh Dickins 	&hpage_pmd_size_attr.attr,
433396bcc52SMatthew Wilcox (Oracle) #ifdef CONFIG_SHMEM
4345a6e75f8SKirill A. Shutemov 	&shmem_enabled_attr.attr,
4355a6e75f8SKirill A. Shutemov #endif
43671e3aac0SAndrea Arcangeli 	NULL,
43771e3aac0SAndrea Arcangeli };
43871e3aac0SAndrea Arcangeli 
4398aa95a21SArvind Yadav static const struct attribute_group hugepage_attr_group = {
44071e3aac0SAndrea Arcangeli 	.attrs = hugepage_attr,
441ba76149fSAndrea Arcangeli };
442ba76149fSAndrea Arcangeli 
4433485b883SRyan Roberts static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
4443485b883SRyan Roberts static void thpsize_release(struct kobject *kobj);
4453485b883SRyan Roberts static DEFINE_SPINLOCK(huge_anon_orders_lock);
4463485b883SRyan Roberts static LIST_HEAD(thpsize_list);
4473485b883SRyan Roberts 
4483485b883SRyan Roberts struct thpsize {
4493485b883SRyan Roberts 	struct kobject kobj;
4503485b883SRyan Roberts 	struct list_head node;
4513485b883SRyan Roberts 	int order;
4523485b883SRyan Roberts };
4533485b883SRyan Roberts 
4543485b883SRyan Roberts #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
4553485b883SRyan Roberts 
4563485b883SRyan Roberts static ssize_t thpsize_enabled_show(struct kobject *kobj,
4573485b883SRyan Roberts 				    struct kobj_attribute *attr, char *buf)
4583485b883SRyan Roberts {
4593485b883SRyan Roberts 	int order = to_thpsize(kobj)->order;
4603485b883SRyan Roberts 	const char *output;
4613485b883SRyan Roberts 
4623485b883SRyan Roberts 	if (test_bit(order, &huge_anon_orders_always))
4633485b883SRyan Roberts 		output = "[always] inherit madvise never";
4643485b883SRyan Roberts 	else if (test_bit(order, &huge_anon_orders_inherit))
4653485b883SRyan Roberts 		output = "always [inherit] madvise never";
4663485b883SRyan Roberts 	else if (test_bit(order, &huge_anon_orders_madvise))
4673485b883SRyan Roberts 		output = "always inherit [madvise] never";
4683485b883SRyan Roberts 	else
4693485b883SRyan Roberts 		output = "always inherit madvise [never]";
4703485b883SRyan Roberts 
4713485b883SRyan Roberts 	return sysfs_emit(buf, "%s\n", output);
4723485b883SRyan Roberts }
4733485b883SRyan Roberts 
4743485b883SRyan Roberts static ssize_t thpsize_enabled_store(struct kobject *kobj,
4753485b883SRyan Roberts 				     struct kobj_attribute *attr,
4763485b883SRyan Roberts 				     const char *buf, size_t count)
4773485b883SRyan Roberts {
4783485b883SRyan Roberts 	int order = to_thpsize(kobj)->order;
4793485b883SRyan Roberts 	ssize_t ret = count;
4803485b883SRyan Roberts 
4813485b883SRyan Roberts 	if (sysfs_streq(buf, "always")) {
4823485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
4833485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_inherit);
4843485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_madvise);
4853485b883SRyan Roberts 		set_bit(order, &huge_anon_orders_always);
4863485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
4873485b883SRyan Roberts 	} else if (sysfs_streq(buf, "inherit")) {
4883485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
4893485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_always);
4903485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_madvise);
4913485b883SRyan Roberts 		set_bit(order, &huge_anon_orders_inherit);
4923485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
4933485b883SRyan Roberts 	} else if (sysfs_streq(buf, "madvise")) {
4943485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
4953485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_always);
4963485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_inherit);
4973485b883SRyan Roberts 		set_bit(order, &huge_anon_orders_madvise);
4983485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
4993485b883SRyan Roberts 	} else if (sysfs_streq(buf, "never")) {
5003485b883SRyan Roberts 		spin_lock(&huge_anon_orders_lock);
5013485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_always);
5023485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_inherit);
5033485b883SRyan Roberts 		clear_bit(order, &huge_anon_orders_madvise);
5043485b883SRyan Roberts 		spin_unlock(&huge_anon_orders_lock);
5053485b883SRyan Roberts 	} else
5063485b883SRyan Roberts 		ret = -EINVAL;
5073485b883SRyan Roberts 
5083485b883SRyan Roberts 	return ret;
5093485b883SRyan Roberts }
5103485b883SRyan Roberts 
5113485b883SRyan Roberts static struct kobj_attribute thpsize_enabled_attr =
5123485b883SRyan Roberts 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
5133485b883SRyan Roberts 
5143485b883SRyan Roberts static struct attribute *thpsize_attrs[] = {
5153485b883SRyan Roberts 	&thpsize_enabled_attr.attr,
5163485b883SRyan Roberts 	NULL,
5173485b883SRyan Roberts };
5183485b883SRyan Roberts 
5193485b883SRyan Roberts static const struct attribute_group thpsize_attr_group = {
5203485b883SRyan Roberts 	.attrs = thpsize_attrs,
5213485b883SRyan Roberts };
5223485b883SRyan Roberts 
5233485b883SRyan Roberts static const struct kobj_type thpsize_ktype = {
5243485b883SRyan Roberts 	.release = &thpsize_release,
5253485b883SRyan Roberts 	.sysfs_ops = &kobj_sysfs_ops,
5263485b883SRyan Roberts };
5273485b883SRyan Roberts 
5283485b883SRyan Roberts static struct thpsize *thpsize_create(int order, struct kobject *parent)
5293485b883SRyan Roberts {
5303485b883SRyan Roberts 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
5313485b883SRyan Roberts 	struct thpsize *thpsize;
5323485b883SRyan Roberts 	int ret;
5333485b883SRyan Roberts 
5343485b883SRyan Roberts 	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
5353485b883SRyan Roberts 	if (!thpsize)
5363485b883SRyan Roberts 		return ERR_PTR(-ENOMEM);
5373485b883SRyan Roberts 
5383485b883SRyan Roberts 	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
5393485b883SRyan Roberts 				   "hugepages-%lukB", size);
5403485b883SRyan Roberts 	if (ret) {
5413485b883SRyan Roberts 		kfree(thpsize);
5423485b883SRyan Roberts 		return ERR_PTR(ret);
5433485b883SRyan Roberts 	}
5443485b883SRyan Roberts 
5453485b883SRyan Roberts 	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
5463485b883SRyan Roberts 	if (ret) {
5473485b883SRyan Roberts 		kobject_put(&thpsize->kobj);
5483485b883SRyan Roberts 		return ERR_PTR(ret);
5493485b883SRyan Roberts 	}
5503485b883SRyan Roberts 
5513485b883SRyan Roberts 	thpsize->order = order;
5523485b883SRyan Roberts 	return thpsize;
5533485b883SRyan Roberts }
5543485b883SRyan Roberts 
5553485b883SRyan Roberts static void thpsize_release(struct kobject *kobj)
5563485b883SRyan Roberts {
5573485b883SRyan Roberts 	kfree(to_thpsize(kobj));
5583485b883SRyan Roberts }
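
/*
 * Userspace counterpart (illustrative sketch, guarded out of the kernel
 * build): enumerating the per-size "hugepages-<size>kB" directories created
 * by thpsize_create() above and printing each size's "enabled" policy.
 */
#if 0
#include <glob.h>
#include <stdio.h>

int main(void)
{
	glob_t g;
	size_t i;

	if (glob("/sys/kernel/mm/transparent_hugepage/hugepages-*kB/enabled",
		 0, NULL, &g))
		return 1;
	for (i = 0; i < g.gl_pathc; i++) {
		char buf[128] = "";
		FILE *f = fopen(g.gl_pathv[i], "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("%s: %s", g.gl_pathv[i], buf);
		if (f)
			fclose(f);
	}
	globfree(&g);
	return 0;
}
#endif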
5593485b883SRyan Roberts 
560569e5590SShaohua Li static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
561569e5590SShaohua Li {
562569e5590SShaohua Li 	int err;
5633485b883SRyan Roberts 	struct thpsize *thpsize;
5643485b883SRyan Roberts 	unsigned long orders;
5653485b883SRyan Roberts 	int order;
5663485b883SRyan Roberts 
5673485b883SRyan Roberts 	/*
5683485b883SRyan Roberts 	 * Default to setting PMD-sized THP to inherit the global setting and
5693485b883SRyan Roberts 	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
5703485b883SRyan Roberts 	 * constant so we have to do this here.
5713485b883SRyan Roberts 	 */
5723485b883SRyan Roberts 	huge_anon_orders_inherit = BIT(PMD_ORDER);
573569e5590SShaohua Li 
574569e5590SShaohua Li 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
575569e5590SShaohua Li 	if (unlikely(!*hugepage_kobj)) {
576ae3a8c1cSAndrew Morton 		pr_err("failed to create transparent hugepage kobject\n");
577569e5590SShaohua Li 		return -ENOMEM;
578569e5590SShaohua Li 	}
579569e5590SShaohua Li 
580569e5590SShaohua Li 	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
581569e5590SShaohua Li 	if (err) {
582ae3a8c1cSAndrew Morton 		pr_err("failed to register transparent hugepage group\n");
583569e5590SShaohua Li 		goto delete_obj;
584569e5590SShaohua Li 	}
585569e5590SShaohua Li 
586569e5590SShaohua Li 	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
587569e5590SShaohua Li 	if (err) {
588ae3a8c1cSAndrew Morton 		pr_err("failed to register transparent hugepage group\n");
589569e5590SShaohua Li 		goto remove_hp_group;
590569e5590SShaohua Li 	}
591569e5590SShaohua Li 
5923485b883SRyan Roberts 	orders = THP_ORDERS_ALL_ANON;
5933485b883SRyan Roberts 	order = highest_order(orders);
5943485b883SRyan Roberts 	while (orders) {
5953485b883SRyan Roberts 		thpsize = thpsize_create(order, *hugepage_kobj);
5963485b883SRyan Roberts 		if (IS_ERR(thpsize)) {
5973485b883SRyan Roberts 			pr_err("failed to create thpsize for order %d\n", order);
5983485b883SRyan Roberts 			err = PTR_ERR(thpsize);
5993485b883SRyan Roberts 			goto remove_all;
6003485b883SRyan Roberts 		}
6013485b883SRyan Roberts 		list_add(&thpsize->node, &thpsize_list);
6023485b883SRyan Roberts 		order = next_order(&orders, order);
6033485b883SRyan Roberts 	}
6043485b883SRyan Roberts 
605569e5590SShaohua Li 	return 0;
606569e5590SShaohua Li 
6073485b883SRyan Roberts remove_all:
6083485b883SRyan Roberts 	hugepage_exit_sysfs(*hugepage_kobj);
6093485b883SRyan Roberts 	return err;
610569e5590SShaohua Li remove_hp_group:
611569e5590SShaohua Li 	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
612569e5590SShaohua Li delete_obj:
613569e5590SShaohua Li 	kobject_put(*hugepage_kobj);
614569e5590SShaohua Li 	return err;
615569e5590SShaohua Li }
616569e5590SShaohua Li 
617569e5590SShaohua Li static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
618569e5590SShaohua Li {
6193485b883SRyan Roberts 	struct thpsize *thpsize, *tmp;
6203485b883SRyan Roberts 
6213485b883SRyan Roberts 	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
6223485b883SRyan Roberts 		list_del(&thpsize->node);
6233485b883SRyan Roberts 		kobject_put(&thpsize->kobj);
6243485b883SRyan Roberts 	}
6253485b883SRyan Roberts 
626569e5590SShaohua Li 	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
627569e5590SShaohua Li 	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
628569e5590SShaohua Li 	kobject_put(hugepage_kobj);
629569e5590SShaohua Li }
630569e5590SShaohua Li #else
631569e5590SShaohua Li static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
632569e5590SShaohua Li {
633569e5590SShaohua Li 	return 0;
634569e5590SShaohua Li }
635569e5590SShaohua Li 
636569e5590SShaohua Li static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
637569e5590SShaohua Li {
638569e5590SShaohua Li }
63971e3aac0SAndrea Arcangeli #endif /* CONFIG_SYSFS */
64071e3aac0SAndrea Arcangeli 
64154d91729SQi Zheng static int __init thp_shrinker_init(void)
64254d91729SQi Zheng {
64354d91729SQi Zheng 	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
64454d91729SQi Zheng 	if (!huge_zero_page_shrinker)
64554d91729SQi Zheng 		return -ENOMEM;
64654d91729SQi Zheng 
64754d91729SQi Zheng 	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
64854d91729SQi Zheng 						 SHRINKER_MEMCG_AWARE |
64954d91729SQi Zheng 						 SHRINKER_NONSLAB,
65054d91729SQi Zheng 						 "thp-deferred_split");
65154d91729SQi Zheng 	if (!deferred_split_shrinker) {
65254d91729SQi Zheng 		shrinker_free(huge_zero_page_shrinker);
65354d91729SQi Zheng 		return -ENOMEM;
65454d91729SQi Zheng 	}
65554d91729SQi Zheng 
65654d91729SQi Zheng 	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
65754d91729SQi Zheng 	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
65854d91729SQi Zheng 	shrinker_register(huge_zero_page_shrinker);
65954d91729SQi Zheng 
66054d91729SQi Zheng 	deferred_split_shrinker->count_objects = deferred_split_count;
66154d91729SQi Zheng 	deferred_split_shrinker->scan_objects = deferred_split_scan;
66254d91729SQi Zheng 	shrinker_register(deferred_split_shrinker);
66354d91729SQi Zheng 
66454d91729SQi Zheng 	return 0;
66554d91729SQi Zheng }
66654d91729SQi Zheng 
66754d91729SQi Zheng static void __init thp_shrinker_exit(void)
66854d91729SQi Zheng {
66954d91729SQi Zheng 	shrinker_free(huge_zero_page_shrinker);
67054d91729SQi Zheng 	shrinker_free(deferred_split_shrinker);
67154d91729SQi Zheng }
67254d91729SQi Zheng 
67371e3aac0SAndrea Arcangeli static int __init hugepage_init(void)
67471e3aac0SAndrea Arcangeli {
67571e3aac0SAndrea Arcangeli 	int err;
676569e5590SShaohua Li 	struct kobject *hugepage_kobj;
67771e3aac0SAndrea Arcangeli 
6784b7167b9SAndrea Arcangeli 	if (!has_transparent_hugepage()) {
6793c556d24SPeter Xu 		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
680569e5590SShaohua Li 		return -EINVAL;
6814b7167b9SAndrea Arcangeli 	}
6824b7167b9SAndrea Arcangeli 
683ff20c2e0SKirill A. Shutemov 	/*
684ff20c2e0SKirill A. Shutemov 	 * PMD-sized hugepages must be allocatable by the buddy allocator
685ff20c2e0SKirill A. Shutemov 	 */
6865e0a760bSKirill A. Shutemov 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
687ff20c2e0SKirill A. Shutemov 	/*
688ff20c2e0SKirill A. Shutemov 	 * We use page->mapping and page->index in the second tail page
689ff20c2e0SKirill A. Shutemov 	 * as a list_head; this assumes THP order >= 2.
690ff20c2e0SKirill A. Shutemov 	 */
691ff20c2e0SKirill A. Shutemov 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
692ff20c2e0SKirill A. Shutemov 
693569e5590SShaohua Li 	err = hugepage_init_sysfs(&hugepage_kobj);
694569e5590SShaohua Li 	if (err)
69565ebb64fSKirill A. Shutemov 		goto err_sysfs;
696ba76149fSAndrea Arcangeli 
697b46e756fSKirill A. Shutemov 	err = khugepaged_init();
698ba76149fSAndrea Arcangeli 	if (err)
69965ebb64fSKirill A. Shutemov 		goto err_slab;
700ba76149fSAndrea Arcangeli 
70154d91729SQi Zheng 	err = thp_shrinker_init();
70265ebb64fSKirill A. Shutemov 	if (err)
70354d91729SQi Zheng 		goto err_shrinker;
70497ae1749SKirill A. Shutemov 
70597562cd2SRik van Riel 	/*
70697562cd2SRik van Riel 	 * By default, disable transparent hugepages on smaller systems,
70797562cd2SRik van Riel 	 * where the extra memory used is likely to hurt more than the
70897562cd2SRik van Riel 	 * reduced TLB overhead helps.  The admin can still enable it through /sys.
70997562cd2SRik van Riel 	 */
710ca79b0c2SArun KS 	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
71197562cd2SRik van Riel 		transparent_hugepage_flags = 0;
71279553da2SKirill A. Shutemov 		return 0;
71379553da2SKirill A. Shutemov 	}
71497562cd2SRik van Riel 
71579553da2SKirill A. Shutemov 	err = start_stop_khugepaged();
71665ebb64fSKirill A. Shutemov 	if (err)
71765ebb64fSKirill A. Shutemov 		goto err_khugepaged;
718ba76149fSAndrea Arcangeli 
719569e5590SShaohua Li 	return 0;
72065ebb64fSKirill A. Shutemov err_khugepaged:
72154d91729SQi Zheng 	thp_shrinker_exit();
72254d91729SQi Zheng err_shrinker:
723b46e756fSKirill A. Shutemov 	khugepaged_destroy();
72465ebb64fSKirill A. Shutemov err_slab:
725569e5590SShaohua Li 	hugepage_exit_sysfs(hugepage_kobj);
72665ebb64fSKirill A. Shutemov err_sysfs:
727ba76149fSAndrea Arcangeli 	return err;
72871e3aac0SAndrea Arcangeli }
729a64fb3cdSPaul Gortmaker subsys_initcall(hugepage_init);
73071e3aac0SAndrea Arcangeli 
73171e3aac0SAndrea Arcangeli static int __init setup_transparent_hugepage(char *str)
73271e3aac0SAndrea Arcangeli {
73371e3aac0SAndrea Arcangeli 	int ret = 0;
73471e3aac0SAndrea Arcangeli 	if (!str)
73571e3aac0SAndrea Arcangeli 		goto out;
73671e3aac0SAndrea Arcangeli 	if (!strcmp(str, "always")) {
73771e3aac0SAndrea Arcangeli 		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
73871e3aac0SAndrea Arcangeli 			&transparent_hugepage_flags);
73971e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
74071e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
74171e3aac0SAndrea Arcangeli 		ret = 1;
74271e3aac0SAndrea Arcangeli 	} else if (!strcmp(str, "madvise")) {
74371e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
74471e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
74571e3aac0SAndrea Arcangeli 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
74671e3aac0SAndrea Arcangeli 			&transparent_hugepage_flags);
74771e3aac0SAndrea Arcangeli 		ret = 1;
74871e3aac0SAndrea Arcangeli 	} else if (!strcmp(str, "never")) {
74971e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
75071e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
75171e3aac0SAndrea Arcangeli 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
75271e3aac0SAndrea Arcangeli 			  &transparent_hugepage_flags);
75371e3aac0SAndrea Arcangeli 		ret = 1;
75471e3aac0SAndrea Arcangeli 	}
75571e3aac0SAndrea Arcangeli out:
75671e3aac0SAndrea Arcangeli 	if (!ret)
757ae3a8c1cSAndrew Morton 		pr_warn("transparent_hugepage= cannot parse, ignored\n");
75871e3aac0SAndrea Arcangeli 	return ret;
75971e3aac0SAndrea Arcangeli }
76071e3aac0SAndrea Arcangeli __setup("transparent_hugepage=", setup_transparent_hugepage);
76171e3aac0SAndrea Arcangeli 
762f55e1014SLinus Torvalds pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
76371e3aac0SAndrea Arcangeli {
764f55e1014SLinus Torvalds 	if (likely(vma->vm_flags & VM_WRITE))
765161e393cSRick Edgecombe 		pmd = pmd_mkwrite(pmd, vma);
76671e3aac0SAndrea Arcangeli 	return pmd;
76771e3aac0SAndrea Arcangeli }
76871e3aac0SAndrea Arcangeli 
76987eaceb3SYang Shi #ifdef CONFIG_MEMCG
770f8baa6beSMatthew Wilcox (Oracle) static inline
771f8baa6beSMatthew Wilcox (Oracle) struct deferred_split *get_deferred_split_queue(struct folio *folio)
7729a982250SKirill A. Shutemov {
773f8baa6beSMatthew Wilcox (Oracle) 	struct mem_cgroup *memcg = folio_memcg(folio);
774f8baa6beSMatthew Wilcox (Oracle) 	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
77587eaceb3SYang Shi 
77687eaceb3SYang Shi 	if (memcg)
77787eaceb3SYang Shi 		return &memcg->deferred_split_queue;
77887eaceb3SYang Shi 	else
77987eaceb3SYang Shi 		return &pgdat->deferred_split_queue;
7809a982250SKirill A. Shutemov }
78187eaceb3SYang Shi #else
782f8baa6beSMatthew Wilcox (Oracle) static inline
783f8baa6beSMatthew Wilcox (Oracle) struct deferred_split *get_deferred_split_queue(struct folio *folio)
78487eaceb3SYang Shi {
785f8baa6beSMatthew Wilcox (Oracle) 	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
78687eaceb3SYang Shi 
78787eaceb3SYang Shi 	return &pgdat->deferred_split_queue;
78887eaceb3SYang Shi }
78987eaceb3SYang Shi #endif
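
/*
 * A minimal usage sketch (hypothetical caller): the helper above returns the
 * memcg-local queue when the folio is charged to a memcg, otherwise the
 * node-level queue.  Real callers (e.g. deferred_split_folio() later in this
 * file) take the queue's split_queue_lock before touching
 * folio->_deferred_list.
 */
static inline void lock_split_queue_sketch(struct folio *folio)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
	unsigned long flags;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* ... queue or unqueue folio->_deferred_list here ... */
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}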
7909a982250SKirill A. Shutemov 
791da6e7bf3SMatthew Wilcox (Oracle) void folio_prep_large_rmappable(struct folio *folio)
7929a982250SKirill A. Shutemov {
7938897277aSMatthew Wilcox (Oracle) 	if (!folio || !folio_test_large(folio))
7948897277aSMatthew Wilcox (Oracle) 		return;
7958897277aSMatthew Wilcox (Oracle) 	if (folio_order(folio) > 1)
7968991de90SMatthew Wilcox (Oracle) 		INIT_LIST_HEAD(&folio->_deferred_list);
797de53c05fSMatthew Wilcox (Oracle) 	folio_set_large_rmappable(folio);
7989a982250SKirill A. Shutemov }
7999a982250SKirill A. Shutemov 
800a644b0abSMatthew Wilcox (Oracle) static inline bool is_transparent_hugepage(struct folio *folio)
801005ba37cSSean Christopherson {
802a644b0abSMatthew Wilcox (Oracle) 	if (!folio_test_large(folio))
803fa1f68ccSZou Wei 		return false;
804005ba37cSSean Christopherson 
805f04029f3SMatthew Wilcox (Oracle) 	return is_huge_zero_page(&folio->page) ||
806de53c05fSMatthew Wilcox (Oracle) 		folio_test_large_rmappable(folio);
807005ba37cSSean Christopherson }
808005ba37cSSean Christopherson 
80997d3d0f9SKirill A. Shutemov static unsigned long __thp_get_unmapped_area(struct file *filp,
81097d3d0f9SKirill A. Shutemov 		unsigned long addr, unsigned long len,
81174d2fad1SToshi Kani 		loff_t off, unsigned long flags, unsigned long size)
81274d2fad1SToshi Kani {
81374d2fad1SToshi Kani 	loff_t off_end = off + len;
81474d2fad1SToshi Kani 	loff_t off_align = round_up(off, size);
81596204e15SRyan Roberts 	unsigned long len_pad, ret, off_sub;
81674d2fad1SToshi Kani 
8174ef9ad19SYang Shi 	if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
8184ef9ad19SYang Shi 		return 0;
8194ef9ad19SYang Shi 
82074d2fad1SToshi Kani 	if (off_end <= off_align || (off_end - off_align) < size)
82174d2fad1SToshi Kani 		return 0;
82274d2fad1SToshi Kani 
82374d2fad1SToshi Kani 	len_pad = len + size;
82474d2fad1SToshi Kani 	if (len_pad < len || (off + len_pad) < off)
82574d2fad1SToshi Kani 		return 0;
82674d2fad1SToshi Kani 
82797d3d0f9SKirill A. Shutemov 	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
82874d2fad1SToshi Kani 					      off >> PAGE_SHIFT, flags);
82997d3d0f9SKirill A. Shutemov 
83097d3d0f9SKirill A. Shutemov 	/*
83197d3d0f9SKirill A. Shutemov 	 * The failure might be due to length padding. The caller will retry
83297d3d0f9SKirill A. Shutemov 	 * without the padding.
83397d3d0f9SKirill A. Shutemov 	 */
83497d3d0f9SKirill A. Shutemov 	if (IS_ERR_VALUE(ret))
83574d2fad1SToshi Kani 		return 0;
83674d2fad1SToshi Kani 
83797d3d0f9SKirill A. Shutemov 	/*
83897d3d0f9SKirill A. Shutemov 	 * Do not try to align to THP boundary if allocation at the address
83997d3d0f9SKirill A. Shutemov 	 * hint succeeds.
84097d3d0f9SKirill A. Shutemov 	 */
84197d3d0f9SKirill A. Shutemov 	if (ret == addr)
84274d2fad1SToshi Kani 		return addr;
84397d3d0f9SKirill A. Shutemov 
84496204e15SRyan Roberts 	off_sub = (off - ret) & (size - 1);
84596204e15SRyan Roberts 
84696204e15SRyan Roberts 	if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
84796204e15SRyan Roberts 	    !off_sub)
84896204e15SRyan Roberts 		return ret + size;
84996204e15SRyan Roberts 
85096204e15SRyan Roberts 	ret += off_sub;
85197d3d0f9SKirill A. Shutemov 	return ret;
85274d2fad1SToshi Kani }
85374d2fad1SToshi Kani 
85474d2fad1SToshi Kani unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
85574d2fad1SToshi Kani 		unsigned long len, unsigned long pgoff, unsigned long flags)
85674d2fad1SToshi Kani {
85797d3d0f9SKirill A. Shutemov 	unsigned long ret;
85874d2fad1SToshi Kani 	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
85974d2fad1SToshi Kani 
86097d3d0f9SKirill A. Shutemov 	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
86197d3d0f9SKirill A. Shutemov 	if (ret)
86297d3d0f9SKirill A. Shutemov 		return ret;
8631854bc6eSWilliam Kucharski 
86474d2fad1SToshi Kani 	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
86574d2fad1SToshi Kani }
86674d2fad1SToshi Kani EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
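
/*
 * A minimal sketch (hypothetical helper) of the property the padding above
 * establishes: PMD-sized page cache mappings are only possible when the
 * virtual address and the file offset are congruent modulo PMD_SIZE, which
 * is what adding off_sub back onto the padded area achieves.
 */
static inline bool thp_file_mapping_aligned_sketch(unsigned long addr, loff_t off)
{
	return ((addr ^ (unsigned long)off) & ~PMD_MASK) == 0;
}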
86774d2fad1SToshi Kani 
8682b740303SSouptick Joarder static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
8692b740303SSouptick Joarder 			struct page *page, gfp_t gfp)
87071e3aac0SAndrea Arcangeli {
87182b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
872cfe3236dSKefeng Wang 	struct folio *folio = page_folio(page);
87371e3aac0SAndrea Arcangeli 	pgtable_t pgtable;
87482b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
8752b740303SSouptick Joarder 	vm_fault_t ret = 0;
87671e3aac0SAndrea Arcangeli 
877cfe3236dSKefeng Wang 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
87800501b53SJohannes Weiner 
879cfe3236dSKefeng Wang 	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
880cfe3236dSKefeng Wang 		folio_put(folio);
8816b251fc9SAndrea Arcangeli 		count_vm_event(THP_FAULT_FALLBACK);
88285b9f46eSDavid Rientjes 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
8836b251fc9SAndrea Arcangeli 		return VM_FAULT_FALLBACK;
8846b251fc9SAndrea Arcangeli 	}
885cfe3236dSKefeng Wang 	folio_throttle_swaprate(folio, gfp);
88671e3aac0SAndrea Arcangeli 
8874cf58924SJoel Fernandes (Google) 	pgtable = pte_alloc_one(vma->vm_mm);
88800501b53SJohannes Weiner 	if (unlikely(!pgtable)) {
8896b31d595SMichal Hocko 		ret = VM_FAULT_OOM;
8906b31d595SMichal Hocko 		goto release;
89100501b53SJohannes Weiner 	}
89200501b53SJohannes Weiner 
893c79b57e4SHuang Ying 	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
89452f37629SMinchan Kim 	/*
895cfe3236dSKefeng Wang 	 * The memory barrier inside __folio_mark_uptodate makes sure that
89652f37629SMinchan Kim 	 * clear_huge_page writes become visible before the set_pmd_at()
89752f37629SMinchan Kim 	 * write.
89852f37629SMinchan Kim 	 */
899cfe3236dSKefeng Wang 	__folio_mark_uptodate(folio);
90071e3aac0SAndrea Arcangeli 
90182b0f8c3SJan Kara 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
90282b0f8c3SJan Kara 	if (unlikely(!pmd_none(*vmf->pmd))) {
9036b31d595SMichal Hocko 		goto unlock_release;
90471e3aac0SAndrea Arcangeli 	} else {
90571e3aac0SAndrea Arcangeli 		pmd_t entry;
9066b251fc9SAndrea Arcangeli 
9076b31d595SMichal Hocko 		ret = check_stable_address_space(vma->vm_mm);
9086b31d595SMichal Hocko 		if (ret)
9096b31d595SMichal Hocko 			goto unlock_release;
9106b31d595SMichal Hocko 
9116b251fc9SAndrea Arcangeli 		/* Deliver the page fault to userland */
9126b251fc9SAndrea Arcangeli 		if (userfaultfd_missing(vma)) {
91382b0f8c3SJan Kara 			spin_unlock(vmf->ptl);
914cfe3236dSKefeng Wang 			folio_put(folio);
915bae473a4SKirill A. Shutemov 			pte_free(vma->vm_mm, pgtable);
9168fd5eda4SMiaohe Lin 			ret = handle_userfault(vmf, VM_UFFD_MISSING);
9178fd5eda4SMiaohe Lin 			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
9188fd5eda4SMiaohe Lin 			return ret;
9196b251fc9SAndrea Arcangeli 		}
9206b251fc9SAndrea Arcangeli 
9213122359aSKirill A. Shutemov 		entry = mk_huge_pmd(page, vma->vm_page_prot);
922f55e1014SLinus Torvalds 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
923cfe3236dSKefeng Wang 		folio_add_new_anon_rmap(folio, vma, haddr);
924cfe3236dSKefeng Wang 		folio_add_lru_vma(folio, vma);
92582b0f8c3SJan Kara 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
92682b0f8c3SJan Kara 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
927fca40573SBibo Mao 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
928bae473a4SKirill A. Shutemov 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
929c4812909SKirill A. Shutemov 		mm_inc_nr_ptes(vma->vm_mm);
93082b0f8c3SJan Kara 		spin_unlock(vmf->ptl);
9316b251fc9SAndrea Arcangeli 		count_vm_event(THP_FAULT_ALLOC);
9329d82c694SJohannes Weiner 		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
93371e3aac0SAndrea Arcangeli 	}
93471e3aac0SAndrea Arcangeli 
935aa2e878eSDavid Rientjes 	return 0;
9366b31d595SMichal Hocko unlock_release:
9376b31d595SMichal Hocko 	spin_unlock(vmf->ptl);
9386b31d595SMichal Hocko release:
9396b31d595SMichal Hocko 	if (pgtable)
9406b31d595SMichal Hocko 		pte_free(vma->vm_mm, pgtable);
941cfe3236dSKefeng Wang 	folio_put(folio);
9426b31d595SMichal Hocko 	return ret;
9436b31d595SMichal Hocko 
94471e3aac0SAndrea Arcangeli }
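
/*
 * Userspace counterpart (illustrative sketch, guarded out of the kernel
 * build): triggering the anonymous PMD fault path above.  Carve a PMD-sized,
 * PMD-aligned block out of a larger anonymous mapping, mark it
 * MADV_HUGEPAGE, and write to it; with the "madvise" or "always" policy the
 * first write fault may be served by a THP, visible as AnonHugePages in
 * /proc/self/smaps.  The 2 MiB size is an assumption; see hpage_pmd_size.
 */
#if 0
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t pmd = 2UL << 20;		/* assumed PMD size */
	size_t len = 2 * pmd;		/* over-allocate so we can align */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *aligned;

	if (p == MAP_FAILED)
		return 1;
	aligned = (char *)(((unsigned long)p + pmd - 1) & ~(pmd - 1));
	madvise(aligned, pmd, MADV_HUGEPAGE);
	aligned[0] = 1;		/* write fault -> do_huge_pmd_anonymous_page() */
	printf("touched %p; check AnonHugePages in /proc/self/smaps\n", aligned);
	return 0;
}
#endif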
94571e3aac0SAndrea Arcangeli 
946444eb2a4SMel Gorman /*
94721440d7eSDavid Rientjes  * always: directly stall for all thp allocations
94821440d7eSDavid Rientjes  * defer: wake kswapd and fail if not immediately available
94921440d7eSDavid Rientjes  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
95021440d7eSDavid Rientjes  *		  fail if not immediately available
95121440d7eSDavid Rientjes  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
95221440d7eSDavid Rientjes  *	    available
95321440d7eSDavid Rientjes  * never: never stall for any thp allocation
954444eb2a4SMel Gorman  */
955164cc4feSRik van Riel gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
9560bbbc0b3SAndrea Arcangeli {
957164cc4feSRik van Riel 	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
95889c83fb5SMichal Hocko 
959ac79f78dSDavid Rientjes 	/* Always do synchronous compaction */
96021440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
961a8282608SAndrea Arcangeli 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
962ac79f78dSDavid Rientjes 
963ac79f78dSDavid Rientjes 	/* Kick kcompactd and fail quickly */
96421440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
96519deb769SDavid Rientjes 		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
966ac79f78dSDavid Rientjes 
967ac79f78dSDavid Rientjes 	/* Synchronous compaction if madvised, otherwise kick kcompactd */
96821440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
96919deb769SDavid Rientjes 		return GFP_TRANSHUGE_LIGHT |
97019deb769SDavid Rientjes 			(vma_madvised ? __GFP_DIRECT_RECLAIM :
971ac79f78dSDavid Rientjes 					__GFP_KSWAPD_RECLAIM);
972ac79f78dSDavid Rientjes 
973ac79f78dSDavid Rientjes 	/* Only do synchronous compaction if madvised */
97421440d7eSDavid Rientjes 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
97519deb769SDavid Rientjes 		return GFP_TRANSHUGE_LIGHT |
97619deb769SDavid Rientjes 		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
977ac79f78dSDavid Rientjes 
97819deb769SDavid Rientjes 	return GFP_TRANSHUGE_LIGHT;
979444eb2a4SMel Gorman }
980444eb2a4SMel Gorman 
981c4088ebdSKirill A. Shutemov /* Caller must hold page table lock. */
9822efeb8daSMiaohe Lin static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
98397ae1749SKirill A. Shutemov 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
9845918d10aSKirill A. Shutemov 		struct page *zero_page)
985fc9fe822SKirill A. Shutemov {
986fc9fe822SKirill A. Shutemov 	pmd_t entry;
9877c414164SAndrew Morton 	if (!pmd_none(*pmd))
9882efeb8daSMiaohe Lin 		return;
9895918d10aSKirill A. Shutemov 	entry = mk_pmd(zero_page, vma->vm_page_prot);
990fc9fe822SKirill A. Shutemov 	entry = pmd_mkhuge(entry);
9916b0b50b0SAneesh Kumar K.V 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
992fc9fe822SKirill A. Shutemov 	set_pmd_at(mm, haddr, pmd, entry);
993c4812909SKirill A. Shutemov 	mm_inc_nr_ptes(mm);
994fc9fe822SKirill A. Shutemov }
995fc9fe822SKirill A. Shutemov 
9962b740303SSouptick Joarder vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
99771e3aac0SAndrea Arcangeli {
99882b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
999077fcf11SAneesh Kumar K.V 	gfp_t gfp;
1000cb196ee1SMatthew Wilcox (Oracle) 	struct folio *folio;
100182b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
100271e3aac0SAndrea Arcangeli 
10033485b883SRyan Roberts 	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1004c0292554SKirill A. Shutemov 		return VM_FAULT_FALLBACK;
100571e3aac0SAndrea Arcangeli 	if (unlikely(anon_vma_prepare(vma)))
100671e3aac0SAndrea Arcangeli 		return VM_FAULT_OOM;
10074fa6893fSYang Shi 	khugepaged_enter_vma(vma, vma->vm_flags);
1008d2081b2bSYang Shi 
100982b0f8c3SJan Kara 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1010bae473a4SKirill A. Shutemov 			!mm_forbids_zeropage(vma->vm_mm) &&
101179da5407SKirill A. Shutemov 			transparent_hugepage_use_zero_page()) {
101280371957SKirill A. Shutemov 		pgtable_t pgtable;
10135918d10aSKirill A. Shutemov 		struct page *zero_page;
10142b740303SSouptick Joarder 		vm_fault_t ret;
10154cf58924SJoel Fernandes (Google) 		pgtable = pte_alloc_one(vma->vm_mm);
101680371957SKirill A. Shutemov 		if (unlikely(!pgtable))
101780371957SKirill A. Shutemov 			return VM_FAULT_OOM;
10186fcb52a5SAaron Lu 		zero_page = mm_get_huge_zero_page(vma->vm_mm);
10195918d10aSKirill A. Shutemov 		if (unlikely(!zero_page)) {
1020bae473a4SKirill A. Shutemov 			pte_free(vma->vm_mm, pgtable);
102197ae1749SKirill A. Shutemov 			count_vm_event(THP_FAULT_FALLBACK);
1022c0292554SKirill A. Shutemov 			return VM_FAULT_FALLBACK;
102397ae1749SKirill A. Shutemov 		}
102482b0f8c3SJan Kara 		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
10256b251fc9SAndrea Arcangeli 		ret = 0;
102682b0f8c3SJan Kara 		if (pmd_none(*vmf->pmd)) {
10276b31d595SMichal Hocko 			ret = check_stable_address_space(vma->vm_mm);
10286b31d595SMichal Hocko 			if (ret) {
10296b31d595SMichal Hocko 				spin_unlock(vmf->ptl);
1030bfe8cc1dSGerald Schaefer 				pte_free(vma->vm_mm, pgtable);
10316b31d595SMichal Hocko 			} else if (userfaultfd_missing(vma)) {
103282b0f8c3SJan Kara 				spin_unlock(vmf->ptl);
1033bfe8cc1dSGerald Schaefer 				pte_free(vma->vm_mm, pgtable);
103482b0f8c3SJan Kara 				ret = handle_userfault(vmf, VM_UFFD_MISSING);
10356b251fc9SAndrea Arcangeli 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
10366b251fc9SAndrea Arcangeli 			} else {
1037bae473a4SKirill A. Shutemov 				set_huge_zero_page(pgtable, vma->vm_mm, vma,
103882b0f8c3SJan Kara 						   haddr, vmf->pmd, zero_page);
1039fca40573SBibo Mao 				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
104082b0f8c3SJan Kara 				spin_unlock(vmf->ptl);
10416b251fc9SAndrea Arcangeli 			}
1042bfe8cc1dSGerald Schaefer 		} else {
104382b0f8c3SJan Kara 			spin_unlock(vmf->ptl);
1044bae473a4SKirill A. Shutemov 			pte_free(vma->vm_mm, pgtable);
1045bfe8cc1dSGerald Schaefer 		}
10466b251fc9SAndrea Arcangeli 		return ret;
104780371957SKirill A. Shutemov 	}
1048164cc4feSRik van Riel 	gfp = vma_thp_gfp_mask(vma);
1049cb196ee1SMatthew Wilcox (Oracle) 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
1050cb196ee1SMatthew Wilcox (Oracle) 	if (unlikely(!folio)) {
105181ab4201SAndi Kleen 		count_vm_event(THP_FAULT_FALLBACK);
1052c0292554SKirill A. Shutemov 		return VM_FAULT_FALLBACK;
105381ab4201SAndi Kleen 	}
1054cb196ee1SMatthew Wilcox (Oracle) 	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
105571e3aac0SAndrea Arcangeli }
105671e3aac0SAndrea Arcangeli 
1057ae18d6dcSMatthew Wilcox static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
10583b6521f5SOliver O'Halloran 		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
10593b6521f5SOliver O'Halloran 		pgtable_t pgtable)
10605cad465dSMatthew Wilcox {
10615cad465dSMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
10625cad465dSMatthew Wilcox 	pmd_t entry;
10635cad465dSMatthew Wilcox 	spinlock_t *ptl;
10645cad465dSMatthew Wilcox 
10655cad465dSMatthew Wilcox 	ptl = pmd_lock(mm, pmd);
1066c6f3c5eeSAneesh Kumar K.V 	if (!pmd_none(*pmd)) {
1067c6f3c5eeSAneesh Kumar K.V 		if (write) {
1068c6f3c5eeSAneesh Kumar K.V 			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
1069c6f3c5eeSAneesh Kumar K.V 				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1070c6f3c5eeSAneesh Kumar K.V 				goto out_unlock;
1071c6f3c5eeSAneesh Kumar K.V 			}
1072c6f3c5eeSAneesh Kumar K.V 			entry = pmd_mkyoung(*pmd);
1073c6f3c5eeSAneesh Kumar K.V 			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1074c6f3c5eeSAneesh Kumar K.V 			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1075c6f3c5eeSAneesh Kumar K.V 				update_mmu_cache_pmd(vma, addr, pmd);
1076c6f3c5eeSAneesh Kumar K.V 		}
1077c6f3c5eeSAneesh Kumar K.V 
1078c6f3c5eeSAneesh Kumar K.V 		goto out_unlock;
1079c6f3c5eeSAneesh Kumar K.V 	}
1080c6f3c5eeSAneesh Kumar K.V 
1081f25748e3SDan Williams 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1082f25748e3SDan Williams 	if (pfn_t_devmap(pfn))
1083f25748e3SDan Williams 		entry = pmd_mkdevmap(entry);
10845cad465dSMatthew Wilcox 	if (write) {
1085f55e1014SLinus Torvalds 		entry = pmd_mkyoung(pmd_mkdirty(entry));
1086f55e1014SLinus Torvalds 		entry = maybe_pmd_mkwrite(entry, vma);
10875cad465dSMatthew Wilcox 	}
10883b6521f5SOliver O'Halloran 
10893b6521f5SOliver O'Halloran 	if (pgtable) {
10903b6521f5SOliver O'Halloran 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
1091c4812909SKirill A. Shutemov 		mm_inc_nr_ptes(mm);
1092c6f3c5eeSAneesh Kumar K.V 		pgtable = NULL;
10933b6521f5SOliver O'Halloran 	}
10943b6521f5SOliver O'Halloran 
10955cad465dSMatthew Wilcox 	set_pmd_at(mm, addr, pmd, entry);
10965cad465dSMatthew Wilcox 	update_mmu_cache_pmd(vma, addr, pmd);
1097c6f3c5eeSAneesh Kumar K.V 
1098c6f3c5eeSAneesh Kumar K.V out_unlock:
10995cad465dSMatthew Wilcox 	spin_unlock(ptl);
1100c6f3c5eeSAneesh Kumar K.V 	if (pgtable)
1101c6f3c5eeSAneesh Kumar K.V 		pte_free(mm, pgtable);
11025cad465dSMatthew Wilcox }
11035cad465dSMatthew Wilcox 
11049a9731b1SThomas Hellstrom (VMware) /**
11057b806d22SLorenzo Stoakes  * vmf_insert_pfn_pmd - insert a pmd size pfn
11069a9731b1SThomas Hellstrom (VMware)  * @vmf: Structure describing the fault
11079a9731b1SThomas Hellstrom (VMware)  * @pfn: pfn to insert
11089a9731b1SThomas Hellstrom (VMware)  * @write: whether it's a write fault
11099a9731b1SThomas Hellstrom (VMware)  *
11107b806d22SLorenzo Stoakes  * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
11119a9731b1SThomas Hellstrom (VMware)  *
11129a9731b1SThomas Hellstrom (VMware)  * Return: vm_fault_t value.
11139a9731b1SThomas Hellstrom (VMware)  */
11147b806d22SLorenzo Stoakes vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
11155cad465dSMatthew Wilcox {
1116fce86ff5SDan Williams 	unsigned long addr = vmf->address & PMD_MASK;
1117fce86ff5SDan Williams 	struct vm_area_struct *vma = vmf->vma;
11187b806d22SLorenzo Stoakes 	pgprot_t pgprot = vma->vm_page_prot;
11193b6521f5SOliver O'Halloran 	pgtable_t pgtable = NULL;
1120fce86ff5SDan Williams 
11215cad465dSMatthew Wilcox 	/*
11225cad465dSMatthew Wilcox 	 * If we had pmd_special, we could avoid all these restrictions,
11235cad465dSMatthew Wilcox 	 * but we need to be consistent with PTEs and architectures that
11245cad465dSMatthew Wilcox 	 * can't support a 'special' bit.
11255cad465dSMatthew Wilcox 	 */
1126e1fb4a08SDave Jiang 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1127e1fb4a08SDave Jiang 			!pfn_t_devmap(pfn));
11285cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
11295cad465dSMatthew Wilcox 						(VM_PFNMAP|VM_MIXEDMAP));
11305cad465dSMatthew Wilcox 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
11315cad465dSMatthew Wilcox 
11325cad465dSMatthew Wilcox 	if (addr < vma->vm_start || addr >= vma->vm_end)
11335cad465dSMatthew Wilcox 		return VM_FAULT_SIGBUS;
1134308a047cSBorislav Petkov 
11353b6521f5SOliver O'Halloran 	if (arch_needs_pgtable_deposit()) {
11364cf58924SJoel Fernandes (Google) 		pgtable = pte_alloc_one(vma->vm_mm);
11373b6521f5SOliver O'Halloran 		if (!pgtable)
11383b6521f5SOliver O'Halloran 			return VM_FAULT_OOM;
11393b6521f5SOliver O'Halloran 	}
11403b6521f5SOliver O'Halloran 
1141308a047cSBorislav Petkov 	track_pfn_insert(vma, &pgprot, pfn);
1142308a047cSBorislav Petkov 
1143fce86ff5SDan Williams 	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
1144ae18d6dcSMatthew Wilcox 	return VM_FAULT_NOPAGE;
11455cad465dSMatthew Wilcox }
11467b806d22SLorenzo Stoakes EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
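/*
 * A minimal sketch of how a device driver's huge fault handler might use
 * vmf_insert_pfn_pmd(); example_lookup_pfn() is a hypothetical stand-in for
 * the driver's own pfn lookup, and the ->huge_fault(vmf, order) form is
 * assumed here:
 *
 *	static vm_fault_t example_huge_fault(struct vm_fault *vmf,
 *					     unsigned int order)
 *	{
 *		pfn_t pfn;
 *
 *		if (order != PMD_ORDER)
 *			return VM_FAULT_FALLBACK;
 *		pfn = example_lookup_pfn(vmf->vma, vmf->address & PMD_MASK);
 *		return vmf_insert_pfn_pmd(vmf, pfn,
 *					  vmf->flags & FAULT_FLAG_WRITE);
 *	}
 */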
11475cad465dSMatthew Wilcox 
1148a00cc7d9SMatthew Wilcox #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1149f55e1014SLinus Torvalds static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1150a00cc7d9SMatthew Wilcox {
1151f55e1014SLinus Torvalds 	if (likely(vma->vm_flags & VM_WRITE))
1152a00cc7d9SMatthew Wilcox 		pud = pud_mkwrite(pud);
1153a00cc7d9SMatthew Wilcox 	return pud;
1154a00cc7d9SMatthew Wilcox }
1155a00cc7d9SMatthew Wilcox 
1156a00cc7d9SMatthew Wilcox static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
11577b806d22SLorenzo Stoakes 		pud_t *pud, pfn_t pfn, bool write)
1158a00cc7d9SMatthew Wilcox {
1159a00cc7d9SMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
11607b806d22SLorenzo Stoakes 	pgprot_t prot = vma->vm_page_prot;
1161a00cc7d9SMatthew Wilcox 	pud_t entry;
1162a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
1163a00cc7d9SMatthew Wilcox 
1164a00cc7d9SMatthew Wilcox 	ptl = pud_lock(mm, pud);
1165c6f3c5eeSAneesh Kumar K.V 	if (!pud_none(*pud)) {
1166c6f3c5eeSAneesh Kumar K.V 		if (write) {
1167c6f3c5eeSAneesh Kumar K.V 			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
1168c6f3c5eeSAneesh Kumar K.V 				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
1169c6f3c5eeSAneesh Kumar K.V 				goto out_unlock;
1170c6f3c5eeSAneesh Kumar K.V 			}
1171c6f3c5eeSAneesh Kumar K.V 			entry = pud_mkyoung(*pud);
1172c6f3c5eeSAneesh Kumar K.V 			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1173c6f3c5eeSAneesh Kumar K.V 			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1174c6f3c5eeSAneesh Kumar K.V 				update_mmu_cache_pud(vma, addr, pud);
1175c6f3c5eeSAneesh Kumar K.V 		}
1176c6f3c5eeSAneesh Kumar K.V 		goto out_unlock;
1177c6f3c5eeSAneesh Kumar K.V 	}
1178c6f3c5eeSAneesh Kumar K.V 
1179a00cc7d9SMatthew Wilcox 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1180a00cc7d9SMatthew Wilcox 	if (pfn_t_devmap(pfn))
1181a00cc7d9SMatthew Wilcox 		entry = pud_mkdevmap(entry);
1182a00cc7d9SMatthew Wilcox 	if (write) {
1183f55e1014SLinus Torvalds 		entry = pud_mkyoung(pud_mkdirty(entry));
1184f55e1014SLinus Torvalds 		entry = maybe_pud_mkwrite(entry, vma);
1185a00cc7d9SMatthew Wilcox 	}
1186a00cc7d9SMatthew Wilcox 	set_pud_at(mm, addr, pud, entry);
1187a00cc7d9SMatthew Wilcox 	update_mmu_cache_pud(vma, addr, pud);
1188c6f3c5eeSAneesh Kumar K.V 
1189c6f3c5eeSAneesh Kumar K.V out_unlock:
1190a00cc7d9SMatthew Wilcox 	spin_unlock(ptl);
1191a00cc7d9SMatthew Wilcox }
1192a00cc7d9SMatthew Wilcox 
11939a9731b1SThomas Hellstrom (VMware) /**
11947b806d22SLorenzo Stoakes  * vmf_insert_pfn_pud - insert a pud size pfn
11959a9731b1SThomas Hellstrom (VMware)  * @vmf: Structure describing the fault
11969a9731b1SThomas Hellstrom (VMware)  * @pfn: pfn to insert
11979a9731b1SThomas Hellstrom (VMware)  * @write: whether it's a write fault
11989a9731b1SThomas Hellstrom (VMware)  *
11997b806d22SLorenzo Stoakes  * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
12009a9731b1SThomas Hellstrom (VMware)  *
12019a9731b1SThomas Hellstrom (VMware)  * Return: vm_fault_t value.
12029a9731b1SThomas Hellstrom (VMware)  */
12037b806d22SLorenzo Stoakes vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
1204a00cc7d9SMatthew Wilcox {
1205fce86ff5SDan Williams 	unsigned long addr = vmf->address & PUD_MASK;
1206fce86ff5SDan Williams 	struct vm_area_struct *vma = vmf->vma;
12077b806d22SLorenzo Stoakes 	pgprot_t pgprot = vma->vm_page_prot;
1208fce86ff5SDan Williams 
1209a00cc7d9SMatthew Wilcox 	/*
1210a00cc7d9SMatthew Wilcox 	 * If we had pud_special, we could avoid all these restrictions,
1211a00cc7d9SMatthew Wilcox 	 * but we need to be consistent with PTEs and architectures that
1212a00cc7d9SMatthew Wilcox 	 * can't support a 'special' bit.
1213a00cc7d9SMatthew Wilcox 	 */
121462ec0d8cSDave Jiang 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
121562ec0d8cSDave Jiang 			!pfn_t_devmap(pfn));
1216a00cc7d9SMatthew Wilcox 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1217a00cc7d9SMatthew Wilcox 						(VM_PFNMAP|VM_MIXEDMAP));
1218a00cc7d9SMatthew Wilcox 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1219a00cc7d9SMatthew Wilcox 
1220a00cc7d9SMatthew Wilcox 	if (addr < vma->vm_start || addr >= vma->vm_end)
1221a00cc7d9SMatthew Wilcox 		return VM_FAULT_SIGBUS;
1222a00cc7d9SMatthew Wilcox 
1223a00cc7d9SMatthew Wilcox 	track_pfn_insert(vma, &pgprot, pfn);
1224a00cc7d9SMatthew Wilcox 
12257b806d22SLorenzo Stoakes 	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
1226a00cc7d9SMatthew Wilcox 	return VM_FAULT_NOPAGE;
1227a00cc7d9SMatthew Wilcox }
12287b806d22SLorenzo Stoakes EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1229a00cc7d9SMatthew Wilcox #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1230a00cc7d9SMatthew Wilcox 
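/*
 * touch_pmd() implements FOLL_TOUCH for huge pmds: mark the entry young (and
 * dirty for write access), and flush the MMU cache for the pmd if the access
 * flags actually changed.
 */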
12313565fce3SDan Williams static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1232a69e4717SMiaohe Lin 		      pmd_t *pmd, bool write)
12333565fce3SDan Williams {
12343565fce3SDan Williams 	pmd_t _pmd;
12353565fce3SDan Williams 
1236a8f97366SKirill A. Shutemov 	_pmd = pmd_mkyoung(*pmd);
1237a69e4717SMiaohe Lin 	if (write)
1238a8f97366SKirill A. Shutemov 		_pmd = pmd_mkdirty(_pmd);
12393565fce3SDan Williams 	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1240a69e4717SMiaohe Lin 				  pmd, _pmd, write))
12413565fce3SDan Williams 		update_mmu_cache_pmd(vma, addr, pmd);
12423565fce3SDan Williams }
12433565fce3SDan Williams 
12443565fce3SDan Williams struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1245df06b37fSKeith Busch 		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
12463565fce3SDan Williams {
12473565fce3SDan Williams 	unsigned long pfn = pmd_pfn(*pmd);
12483565fce3SDan Williams 	struct mm_struct *mm = vma->vm_mm;
12493565fce3SDan Williams 	struct page *page;
12500f089235SLogan Gunthorpe 	int ret;
12513565fce3SDan Williams 
12523565fce3SDan Williams 	assert_spin_locked(pmd_lockptr(mm, pmd));
12533565fce3SDan Williams 
1254f6f37321SLinus Torvalds 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
12553565fce3SDan Williams 		return NULL;
12563565fce3SDan Williams 
12573565fce3SDan Williams 	if (pmd_present(*pmd) && pmd_devmap(*pmd))
12583565fce3SDan Williams 		/* pass */;
12593565fce3SDan Williams 	else
12603565fce3SDan Williams 		return NULL;
12613565fce3SDan Williams 
12623565fce3SDan Williams 	if (flags & FOLL_TOUCH)
1263a69e4717SMiaohe Lin 		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
12643565fce3SDan Williams 
12653565fce3SDan Williams 	/*
12663565fce3SDan Williams 	 * device mapped pages can only be returned if the
12673565fce3SDan Williams 	 * caller will manage the page reference count.
12683565fce3SDan Williams 	 */
12693faa52c0SJohn Hubbard 	if (!(flags & (FOLL_GET | FOLL_PIN)))
12703565fce3SDan Williams 		return ERR_PTR(-EEXIST);
12713565fce3SDan Williams 
12723565fce3SDan Williams 	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1273df06b37fSKeith Busch 	*pgmap = get_dev_pagemap(pfn, *pgmap);
1274df06b37fSKeith Busch 	if (!*pgmap)
12753565fce3SDan Williams 		return ERR_PTR(-EFAULT);
12763565fce3SDan Williams 	page = pfn_to_page(pfn);
12770f089235SLogan Gunthorpe 	ret = try_grab_page(page, flags);
12780f089235SLogan Gunthorpe 	if (ret)
12790f089235SLogan Gunthorpe 		page = ERR_PTR(ret);
12803565fce3SDan Williams 
12813565fce3SDan Williams 	return page;
12823565fce3SDan Williams }
12833565fce3SDan Williams 
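/*
 * Copy one huge pmd at fork time.  Only anonymous mappings are copied here;
 * file mappings are skipped and simply re-faulted in the child.  Pmd
 * migration entries and the huge zero pmd get special handling, and a
 * source page that may be pinned forces a split so the copy is retried at
 * the PTE level (-EAGAIN).  On success both parent and child end up with a
 * write-protected pmd, so a later write goes through the COW path.
 */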
128471e3aac0SAndrea Arcangeli int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
128571e3aac0SAndrea Arcangeli 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
12868f34f1eaSPeter Xu 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
128771e3aac0SAndrea Arcangeli {
1288c4088ebdSKirill A. Shutemov 	spinlock_t *dst_ptl, *src_ptl;
128971e3aac0SAndrea Arcangeli 	struct page *src_page;
129096c772c2SDavid Hildenbrand 	struct folio *src_folio;
129171e3aac0SAndrea Arcangeli 	pmd_t pmd;
129212c9d70bSMatthew Wilcox 	pgtable_t pgtable = NULL;
1293628d47ceSKirill A. Shutemov 	int ret = -ENOMEM;
129471e3aac0SAndrea Arcangeli 
1295628d47ceSKirill A. Shutemov 	/* Skip if the pmd can simply be re-filled on fault */
12968f34f1eaSPeter Xu 	if (!vma_is_anonymous(dst_vma))
1297628d47ceSKirill A. Shutemov 		return 0;
1298628d47ceSKirill A. Shutemov 
12994cf58924SJoel Fernandes (Google) 	pgtable = pte_alloc_one(dst_mm);
130071e3aac0SAndrea Arcangeli 	if (unlikely(!pgtable))
130171e3aac0SAndrea Arcangeli 		goto out;
130271e3aac0SAndrea Arcangeli 
1303c4088ebdSKirill A. Shutemov 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
1304c4088ebdSKirill A. Shutemov 	src_ptl = pmd_lockptr(src_mm, src_pmd);
1305c4088ebdSKirill A. Shutemov 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
130671e3aac0SAndrea Arcangeli 
130771e3aac0SAndrea Arcangeli 	ret = -EAGAIN;
130871e3aac0SAndrea Arcangeli 	pmd = *src_pmd;
130984c3fc4eSZi Yan 
131084c3fc4eSZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
131184c3fc4eSZi Yan 	if (unlikely(is_swap_pmd(pmd))) {
131284c3fc4eSZi Yan 		swp_entry_t entry = pmd_to_swp_entry(pmd);
131384c3fc4eSZi Yan 
131484c3fc4eSZi Yan 		VM_BUG_ON(!is_pmd_migration_entry(pmd));
13156c287605SDavid Hildenbrand 		if (!is_readable_migration_entry(entry)) {
13164dd845b5SAlistair Popple 			entry = make_readable_migration_entry(
13174dd845b5SAlistair Popple 							swp_offset(entry));
131884c3fc4eSZi Yan 			pmd = swp_entry_to_pmd(entry);
1319ab6e3d09SNaoya Horiguchi 			if (pmd_swp_soft_dirty(*src_pmd))
1320ab6e3d09SNaoya Horiguchi 				pmd = pmd_swp_mksoft_dirty(pmd);
13218f34f1eaSPeter Xu 			if (pmd_swp_uffd_wp(*src_pmd))
13228f34f1eaSPeter Xu 				pmd = pmd_swp_mkuffd_wp(pmd);
132384c3fc4eSZi Yan 			set_pmd_at(src_mm, addr, src_pmd, pmd);
132484c3fc4eSZi Yan 		}
1325dd8a67f9SZi Yan 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1326af5b0f6aSKirill A. Shutemov 		mm_inc_nr_ptes(dst_mm);
1327dd8a67f9SZi Yan 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
13288f34f1eaSPeter Xu 		if (!userfaultfd_wp(dst_vma))
13298f34f1eaSPeter Xu 			pmd = pmd_swp_clear_uffd_wp(pmd);
133084c3fc4eSZi Yan 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
133184c3fc4eSZi Yan 		ret = 0;
133284c3fc4eSZi Yan 		goto out_unlock;
133384c3fc4eSZi Yan 	}
133484c3fc4eSZi Yan #endif
133584c3fc4eSZi Yan 
1336628d47ceSKirill A. Shutemov 	if (unlikely(!pmd_trans_huge(pmd))) {
133771e3aac0SAndrea Arcangeli 		pte_free(dst_mm, pgtable);
133871e3aac0SAndrea Arcangeli 		goto out_unlock;
133971e3aac0SAndrea Arcangeli 	}
1340fc9fe822SKirill A. Shutemov 	/*
1341c4088ebdSKirill A. Shutemov 	 * While the page table lock is held, the huge zero pmd cannot be
1342fc9fe822SKirill A. Shutemov 	 * split under us: splitting never touches the zero page itself, it
1343fc9fe822SKirill A. Shutemov 	 * only turns the pmd into a page table.
1344fc9fe822SKirill A. Shutemov 	 */
1345fc9fe822SKirill A. Shutemov 	if (is_huge_zero_pmd(pmd)) {
134697ae1749SKirill A. Shutemov 		/*
134797ae1749SKirill A. Shutemov 		 * get_huge_zero_page() will never allocate a new page here,
134897ae1749SKirill A. Shutemov 		 * since we already have a zero page to copy. It just takes a
134997ae1749SKirill A. Shutemov 		 * reference.
135097ae1749SKirill A. Shutemov 		 */
13515fc7a5f6SPeter Xu 		mm_get_huge_zero_page(dst_mm);
13525fc7a5f6SPeter Xu 		goto out_zero_page;
1353fc9fe822SKirill A. Shutemov 	}
1354de466bd6SMel Gorman 
135571e3aac0SAndrea Arcangeli 	src_page = pmd_page(pmd);
1356309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
135796c772c2SDavid Hildenbrand 	src_folio = page_folio(src_page);
1358d042035eSPeter Xu 
135996c772c2SDavid Hildenbrand 	folio_get(src_folio);
136096c772c2SDavid Hildenbrand 	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
1361fb3d824dSDavid Hildenbrand 		/* Page maybe pinned: split and retry the fault on PTEs. */
136296c772c2SDavid Hildenbrand 		folio_put(src_folio);
1363d042035eSPeter Xu 		pte_free(dst_mm, pgtable);
1364d042035eSPeter Xu 		spin_unlock(src_ptl);
1365d042035eSPeter Xu 		spin_unlock(dst_ptl);
13668f34f1eaSPeter Xu 		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
1367d042035eSPeter Xu 		return -EAGAIN;
1368d042035eSPeter Xu 	}
136971e3aac0SAndrea Arcangeli 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
13705fc7a5f6SPeter Xu out_zero_page:
1371c4812909SKirill A. Shutemov 	mm_inc_nr_ptes(dst_mm);
13725c7fb56eSDan Williams 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
137371e3aac0SAndrea Arcangeli 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
13748f34f1eaSPeter Xu 	if (!userfaultfd_wp(dst_vma))
13758f34f1eaSPeter Xu 		pmd = pmd_clear_uffd_wp(pmd);
137671e3aac0SAndrea Arcangeli 	pmd = pmd_mkold(pmd_wrprotect(pmd));
137771e3aac0SAndrea Arcangeli 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
137871e3aac0SAndrea Arcangeli 
137971e3aac0SAndrea Arcangeli 	ret = 0;
138071e3aac0SAndrea Arcangeli out_unlock:
1381c4088ebdSKirill A. Shutemov 	spin_unlock(src_ptl);
1382c4088ebdSKirill A. Shutemov 	spin_unlock(dst_ptl);
138371e3aac0SAndrea Arcangeli out:
138471e3aac0SAndrea Arcangeli 	return ret;
138571e3aac0SAndrea Arcangeli }
138671e3aac0SAndrea Arcangeli 
1387a00cc7d9SMatthew Wilcox #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1388a00cc7d9SMatthew Wilcox static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
13895fe653e9SMiaohe Lin 		      pud_t *pud, bool write)
1390a00cc7d9SMatthew Wilcox {
1391a00cc7d9SMatthew Wilcox 	pud_t _pud;
1392a00cc7d9SMatthew Wilcox 
1393a8f97366SKirill A. Shutemov 	_pud = pud_mkyoung(*pud);
13945fe653e9SMiaohe Lin 	if (write)
1395a8f97366SKirill A. Shutemov 		_pud = pud_mkdirty(_pud);
1396a00cc7d9SMatthew Wilcox 	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
13975fe653e9SMiaohe Lin 				  pud, _pud, write))
1398a00cc7d9SMatthew Wilcox 		update_mmu_cache_pud(vma, addr, pud);
1399a00cc7d9SMatthew Wilcox }
1400a00cc7d9SMatthew Wilcox 
1401a00cc7d9SMatthew Wilcox struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1402df06b37fSKeith Busch 		pud_t *pud, int flags, struct dev_pagemap **pgmap)
1403a00cc7d9SMatthew Wilcox {
1404a00cc7d9SMatthew Wilcox 	unsigned long pfn = pud_pfn(*pud);
1405a00cc7d9SMatthew Wilcox 	struct mm_struct *mm = vma->vm_mm;
1406a00cc7d9SMatthew Wilcox 	struct page *page;
14070f089235SLogan Gunthorpe 	int ret;
1408a00cc7d9SMatthew Wilcox 
1409a00cc7d9SMatthew Wilcox 	assert_spin_locked(pud_lockptr(mm, pud));
1410a00cc7d9SMatthew Wilcox 
1411f6f37321SLinus Torvalds 	if (flags & FOLL_WRITE && !pud_write(*pud))
1412a00cc7d9SMatthew Wilcox 		return NULL;
1413a00cc7d9SMatthew Wilcox 
1414a00cc7d9SMatthew Wilcox 	if (pud_present(*pud) && pud_devmap(*pud))
1415a00cc7d9SMatthew Wilcox 		/* pass */;
1416a00cc7d9SMatthew Wilcox 	else
1417a00cc7d9SMatthew Wilcox 		return NULL;
1418a00cc7d9SMatthew Wilcox 
1419a00cc7d9SMatthew Wilcox 	if (flags & FOLL_TOUCH)
14205fe653e9SMiaohe Lin 		touch_pud(vma, addr, pud, flags & FOLL_WRITE);
1421a00cc7d9SMatthew Wilcox 
1422a00cc7d9SMatthew Wilcox 	/*
1423a00cc7d9SMatthew Wilcox 	 * device mapped pages can only be returned if the
1424a00cc7d9SMatthew Wilcox 	 * caller will manage the page reference count.
14253faa52c0SJohn Hubbard 	 *
14263faa52c0SJohn Hubbard 	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
1427a00cc7d9SMatthew Wilcox 	 */
14283faa52c0SJohn Hubbard 	if (!(flags & (FOLL_GET | FOLL_PIN)))
1429a00cc7d9SMatthew Wilcox 		return ERR_PTR(-EEXIST);
1430a00cc7d9SMatthew Wilcox 
1431a00cc7d9SMatthew Wilcox 	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1432df06b37fSKeith Busch 	*pgmap = get_dev_pagemap(pfn, *pgmap);
1433df06b37fSKeith Busch 	if (!*pgmap)
1434a00cc7d9SMatthew Wilcox 		return ERR_PTR(-EFAULT);
1435a00cc7d9SMatthew Wilcox 	page = pfn_to_page(pfn);
14360f089235SLogan Gunthorpe 
14370f089235SLogan Gunthorpe 	ret = try_grab_page(page, flags);
14380f089235SLogan Gunthorpe 	if (ret)
14390f089235SLogan Gunthorpe 		page = ERR_PTR(ret);
1440a00cc7d9SMatthew Wilcox 
1441a00cc7d9SMatthew Wilcox 	return page;
1442a00cc7d9SMatthew Wilcox }
1443a00cc7d9SMatthew Wilcox 
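/*
 * Copy one huge pud at fork time.  Only devmap puds are handled so far: the
 * source entry is write-protected and shared with the child.  Anonymous huge
 * puds are not supported yet (see the TODO below).
 */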
1444a00cc7d9SMatthew Wilcox int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1445a00cc7d9SMatthew Wilcox 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1446a00cc7d9SMatthew Wilcox 		  struct vm_area_struct *vma)
1447a00cc7d9SMatthew Wilcox {
1448a00cc7d9SMatthew Wilcox 	spinlock_t *dst_ptl, *src_ptl;
1449a00cc7d9SMatthew Wilcox 	pud_t pud;
1450a00cc7d9SMatthew Wilcox 	int ret;
1451a00cc7d9SMatthew Wilcox 
1452a00cc7d9SMatthew Wilcox 	dst_ptl = pud_lock(dst_mm, dst_pud);
1453a00cc7d9SMatthew Wilcox 	src_ptl = pud_lockptr(src_mm, src_pud);
1454a00cc7d9SMatthew Wilcox 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1455a00cc7d9SMatthew Wilcox 
1456a00cc7d9SMatthew Wilcox 	ret = -EAGAIN;
1457a00cc7d9SMatthew Wilcox 	pud = *src_pud;
1458a00cc7d9SMatthew Wilcox 	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1459a00cc7d9SMatthew Wilcox 		goto out_unlock;
1460a00cc7d9SMatthew Wilcox 
1461a00cc7d9SMatthew Wilcox 	/*
1462a00cc7d9SMatthew Wilcox 	 * While the page table lock is held, the huge zero pud cannot be
1463a00cc7d9SMatthew Wilcox 	 * split under us: splitting never touches the zero page itself, it
1464a00cc7d9SMatthew Wilcox 	 * only turns the pud into a page table.
1465a00cc7d9SMatthew Wilcox 	 */
1466a00cc7d9SMatthew Wilcox 	if (is_huge_zero_pud(pud)) {
1467a00cc7d9SMatthew Wilcox 		/* No huge zero pud yet */
1468a00cc7d9SMatthew Wilcox 	}
1469a00cc7d9SMatthew Wilcox 
1470fb3d824dSDavid Hildenbrand 	/*
147196c772c2SDavid Hildenbrand 	 * TODO: once we support anonymous pages, use
147296c772c2SDavid Hildenbrand 	 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
1473fb3d824dSDavid Hildenbrand 	 */
1474a00cc7d9SMatthew Wilcox 	pudp_set_wrprotect(src_mm, addr, src_pud);
1475a00cc7d9SMatthew Wilcox 	pud = pud_mkold(pud_wrprotect(pud));
1476a00cc7d9SMatthew Wilcox 	set_pud_at(dst_mm, addr, dst_pud, pud);
1477a00cc7d9SMatthew Wilcox 
1478a00cc7d9SMatthew Wilcox 	ret = 0;
1479a00cc7d9SMatthew Wilcox out_unlock:
1480a00cc7d9SMatthew Wilcox 	spin_unlock(src_ptl);
1481a00cc7d9SMatthew Wilcox 	spin_unlock(dst_ptl);
1482a00cc7d9SMatthew Wilcox 	return ret;
1483a00cc7d9SMatthew Wilcox }
1484a00cc7d9SMatthew Wilcox 
1485a00cc7d9SMatthew Wilcox void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1486a00cc7d9SMatthew Wilcox {
1487a00cc7d9SMatthew Wilcox 	bool write = vmf->flags & FAULT_FLAG_WRITE;
1488a00cc7d9SMatthew Wilcox 
1489a00cc7d9SMatthew Wilcox 	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1490a00cc7d9SMatthew Wilcox 	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1491a00cc7d9SMatthew Wilcox 		goto unlock;
1492a00cc7d9SMatthew Wilcox 
14935fe653e9SMiaohe Lin 	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1494a00cc7d9SMatthew Wilcox unlock:
1495a00cc7d9SMatthew Wilcox 	spin_unlock(vmf->ptl);
1496a00cc7d9SMatthew Wilcox }
1497a00cc7d9SMatthew Wilcox #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1498a00cc7d9SMatthew Wilcox 
14995db4f15cSYang Shi void huge_pmd_set_accessed(struct vm_fault *vmf)
1500a1dd450bSWill Deacon {
150120f664aaSMinchan Kim 	bool write = vmf->flags & FAULT_FLAG_WRITE;
1502a1dd450bSWill Deacon 
150382b0f8c3SJan Kara 	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1504a69e4717SMiaohe Lin 	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1505a1dd450bSWill Deacon 		goto unlock;
1506a1dd450bSWill Deacon 
1507a69e4717SMiaohe Lin 	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1508a1dd450bSWill Deacon 
1509a1dd450bSWill Deacon unlock:
151082b0f8c3SJan Kara 	spin_unlock(vmf->ptl);
1511a1dd450bSWill Deacon }
1512a1dd450bSWill Deacon 
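/*
 * Write-protect fault on a huge pmd.  An exclusively owned anonymous folio
 * (PageAnonExclusive, or a single remaining reference once the swap cache is
 * dropped) is reused in place and the pmd is made writable; the huge zero
 * pmd and shared folios fall back to __split_huge_pmd() so the regular
 * PTE-level copy-on-write path takes over.
 */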
15135db4f15cSYang Shi vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
151471e3aac0SAndrea Arcangeli {
1515c89357e2SDavid Hildenbrand 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
151682b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
15172fad3d14SMatthew Wilcox (Oracle) 	struct folio *folio;
15183917c802SKirill A. Shutemov 	struct page *page;
151982b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
15205db4f15cSYang Shi 	pmd_t orig_pmd = vmf->orig_pmd;
152171e3aac0SAndrea Arcangeli 
152282b0f8c3SJan Kara 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
152381d1b09cSSasha Levin 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
15243917c802SKirill A. Shutemov 
152593b4796dSKirill A. Shutemov 	if (is_huge_zero_pmd(orig_pmd))
15263917c802SKirill A. Shutemov 		goto fallback;
15273917c802SKirill A. Shutemov 
152882b0f8c3SJan Kara 	spin_lock(vmf->ptl);
15293917c802SKirill A. Shutemov 
15303917c802SKirill A. Shutemov 	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
15313917c802SKirill A. Shutemov 		spin_unlock(vmf->ptl);
15323917c802SKirill A. Shutemov 		return 0;
15333917c802SKirill A. Shutemov 	}
153471e3aac0SAndrea Arcangeli 
153571e3aac0SAndrea Arcangeli 	page = pmd_page(orig_pmd);
15362fad3d14SMatthew Wilcox (Oracle) 	folio = page_folio(page);
1537f6004e73SMiaohe Lin 	VM_BUG_ON_PAGE(!PageHead(page), page);
15383917c802SKirill A. Shutemov 
15396c287605SDavid Hildenbrand 	/* Early check when only holding the PT lock. */
15406c287605SDavid Hildenbrand 	if (PageAnonExclusive(page))
15416c287605SDavid Hildenbrand 		goto reuse;
15426c287605SDavid Hildenbrand 
15432fad3d14SMatthew Wilcox (Oracle) 	if (!folio_trylock(folio)) {
15442fad3d14SMatthew Wilcox (Oracle) 		folio_get(folio);
1545ba3c4ce6SHuang Ying 		spin_unlock(vmf->ptl);
15462fad3d14SMatthew Wilcox (Oracle) 		folio_lock(folio);
1547ba3c4ce6SHuang Ying 		spin_lock(vmf->ptl);
1548ba3c4ce6SHuang Ying 		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
15493917c802SKirill A. Shutemov 			spin_unlock(vmf->ptl);
15502fad3d14SMatthew Wilcox (Oracle) 			folio_unlock(folio);
15512fad3d14SMatthew Wilcox (Oracle) 			folio_put(folio);
15523917c802SKirill A. Shutemov 			return 0;
1553ba3c4ce6SHuang Ying 		}
15542fad3d14SMatthew Wilcox (Oracle) 		folio_put(folio);
1555ba3c4ce6SHuang Ying 	}
15563917c802SKirill A. Shutemov 
15576c287605SDavid Hildenbrand 	/* Recheck after temporarily dropping the PT lock. */
15586c287605SDavid Hildenbrand 	if (PageAnonExclusive(page)) {
15592fad3d14SMatthew Wilcox (Oracle) 		folio_unlock(folio);
15606c287605SDavid Hildenbrand 		goto reuse;
15616c287605SDavid Hildenbrand 	}
15626c287605SDavid Hildenbrand 
15633917c802SKirill A. Shutemov 	/*
15642fad3d14SMatthew Wilcox (Oracle) 	 * See do_wp_page(): we can only reuse the folio exclusively if
15652fad3d14SMatthew Wilcox (Oracle) 	 * there are no additional references. Note that we always drain
15661fec6890SMatthew Wilcox (Oracle) 	 * the LRU cache immediately after adding a THP.
15673917c802SKirill A. Shutemov 	 */
15682fad3d14SMatthew Wilcox (Oracle) 	if (folio_ref_count(folio) >
15692fad3d14SMatthew Wilcox (Oracle) 			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
15703bff7e3fSDavid Hildenbrand 		goto unlock_fallback;
15712fad3d14SMatthew Wilcox (Oracle) 	if (folio_test_swapcache(folio))
15722fad3d14SMatthew Wilcox (Oracle) 		folio_free_swap(folio);
15732fad3d14SMatthew Wilcox (Oracle) 	if (folio_ref_count(folio) == 1) {
157471e3aac0SAndrea Arcangeli 		pmd_t entry;
15756c54dc6cSDavid Hildenbrand 
157606968625SDavid Hildenbrand 		folio_move_anon_rmap(folio, vma);
15775ca43289SDavid Hildenbrand 		SetPageAnonExclusive(page);
15782fad3d14SMatthew Wilcox (Oracle) 		folio_unlock(folio);
15796c287605SDavid Hildenbrand reuse:
1580c89357e2SDavid Hildenbrand 		if (unlikely(unshare)) {
1581c89357e2SDavid Hildenbrand 			spin_unlock(vmf->ptl);
1582c89357e2SDavid Hildenbrand 			return 0;
1583c89357e2SDavid Hildenbrand 		}
158471e3aac0SAndrea Arcangeli 		entry = pmd_mkyoung(orig_pmd);
1585f55e1014SLinus Torvalds 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
158682b0f8c3SJan Kara 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
158782b0f8c3SJan Kara 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
15883917c802SKirill A. Shutemov 		spin_unlock(vmf->ptl);
1589cb8d8633SDavid Hildenbrand 		return 0;
159071e3aac0SAndrea Arcangeli 	}
15913917c802SKirill A. Shutemov 
15923bff7e3fSDavid Hildenbrand unlock_fallback:
15932fad3d14SMatthew Wilcox (Oracle) 	folio_unlock(folio);
159482b0f8c3SJan Kara 	spin_unlock(vmf->ptl);
15953917c802SKirill A. Shutemov fallback:
15963917c802SKirill A. Shutemov 	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
15973917c802SKirill A. Shutemov 	return VM_FAULT_FALLBACK;
159871e3aac0SAndrea Arcangeli }
159971e3aac0SAndrea Arcangeli 
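/*
 * Counterpart of can_change_pte_writable() for huge pmds: decide whether a
 * pmd that nominally gains write permission can be mapped writable right
 * away, i.e. no write fault is needed for NUMA hinting, soft-dirty or
 * uffd-wp tracking, and (for private mappings) the page is exclusively
 * owned.
 */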
1600c27f479eSDavid Hildenbrand static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1601c27f479eSDavid Hildenbrand 					   unsigned long addr, pmd_t pmd)
1602c27f479eSDavid Hildenbrand {
1603c27f479eSDavid Hildenbrand 	struct page *page;
1604c27f479eSDavid Hildenbrand 
1605c27f479eSDavid Hildenbrand 	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1606c27f479eSDavid Hildenbrand 		return false;
1607c27f479eSDavid Hildenbrand 
1608c27f479eSDavid Hildenbrand 	/* Don't touch entries that are not even readable (NUMA hinting). */
1609c27f479eSDavid Hildenbrand 	if (pmd_protnone(pmd))
1610c27f479eSDavid Hildenbrand 		return false;
1611c27f479eSDavid Hildenbrand 
1612c27f479eSDavid Hildenbrand 	/* Do we need write faults for softdirty tracking? */
1613c27f479eSDavid Hildenbrand 	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1614c27f479eSDavid Hildenbrand 		return false;
1615c27f479eSDavid Hildenbrand 
1616c27f479eSDavid Hildenbrand 	/* Do we need write faults for uffd-wp tracking? */
1617c27f479eSDavid Hildenbrand 	if (userfaultfd_huge_pmd_wp(vma, pmd))
1618c27f479eSDavid Hildenbrand 		return false;
1619c27f479eSDavid Hildenbrand 
1620c27f479eSDavid Hildenbrand 	if (!(vma->vm_flags & VM_SHARED)) {
1621c27f479eSDavid Hildenbrand 		/* See can_change_pte_writable(). */
1622c27f479eSDavid Hildenbrand 		page = vm_normal_page_pmd(vma, addr, pmd);
1623c27f479eSDavid Hildenbrand 		return page && PageAnon(page) && PageAnonExclusive(page);
1624c27f479eSDavid Hildenbrand 	}
1625c27f479eSDavid Hildenbrand 
1626c27f479eSDavid Hildenbrand 	/* See can_change_pte_writable(). */
1627c27f479eSDavid Hildenbrand 	return pmd_dirty(pmd);
1628c27f479eSDavid Hildenbrand }
1629c27f479eSDavid Hildenbrand 
16305535be30SDavid Hildenbrand /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
16315535be30SDavid Hildenbrand static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
16325535be30SDavid Hildenbrand 					struct vm_area_struct *vma,
16335535be30SDavid Hildenbrand 					unsigned int flags)
16348310d48bSKeno Fischer {
16355535be30SDavid Hildenbrand 	/* If the pmd is writable, we can write to the page. */
16365535be30SDavid Hildenbrand 	if (pmd_write(pmd))
16375535be30SDavid Hildenbrand 		return true;
16385535be30SDavid Hildenbrand 
16395535be30SDavid Hildenbrand 	/* Maybe FOLL_FORCE is set to override it? */
16405535be30SDavid Hildenbrand 	if (!(flags & FOLL_FORCE))
16415535be30SDavid Hildenbrand 		return false;
16425535be30SDavid Hildenbrand 
16435535be30SDavid Hildenbrand 	/* But FOLL_FORCE has no effect on shared mappings */
16445535be30SDavid Hildenbrand 	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
16455535be30SDavid Hildenbrand 		return false;
16465535be30SDavid Hildenbrand 
16475535be30SDavid Hildenbrand 	/* ... or read-only private ones */
16485535be30SDavid Hildenbrand 	if (!(vma->vm_flags & VM_MAYWRITE))
16495535be30SDavid Hildenbrand 		return false;
16505535be30SDavid Hildenbrand 
16515535be30SDavid Hildenbrand 	/* ... or already writable ones that just need to take a write fault */
16525535be30SDavid Hildenbrand 	if (vma->vm_flags & VM_WRITE)
16535535be30SDavid Hildenbrand 		return false;
16545535be30SDavid Hildenbrand 
16555535be30SDavid Hildenbrand 	/*
16565535be30SDavid Hildenbrand 	 * See can_change_pte_writable(): we broke COW and could map the page
16575535be30SDavid Hildenbrand 	 * writable if we have an exclusive anonymous page ...
16585535be30SDavid Hildenbrand 	 */
16595535be30SDavid Hildenbrand 	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
16605535be30SDavid Hildenbrand 		return false;
16615535be30SDavid Hildenbrand 
16625535be30SDavid Hildenbrand 	/* ... and a write-fault isn't required for other reasons. */
16635535be30SDavid Hildenbrand 	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
16645535be30SDavid Hildenbrand 		return false;
16655535be30SDavid Hildenbrand 	return !userfaultfd_huge_pmd_wp(vma, pmd);
16668310d48bSKeno Fischer }
16678310d48bSKeno Fischer 
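/*
 * GUP lookup of a mapped trans-huge pmd; the pmd lock must already be held.
 * Write access is vetted by can_follow_write_pmd(), FOLL_DUMP refuses the
 * huge zero page, and -EMLINK asks the caller to unshare first when a
 * read-only FOLL_PIN would otherwise pin a non-exclusive anonymous page.
 */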
1668b676b293SDavid Rientjes struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
166971e3aac0SAndrea Arcangeli 				   unsigned long addr,
167071e3aac0SAndrea Arcangeli 				   pmd_t *pmd,
167171e3aac0SAndrea Arcangeli 				   unsigned int flags)
167271e3aac0SAndrea Arcangeli {
1673b676b293SDavid Rientjes 	struct mm_struct *mm = vma->vm_mm;
16745535be30SDavid Hildenbrand 	struct page *page;
16750f089235SLogan Gunthorpe 	int ret;
167671e3aac0SAndrea Arcangeli 
1677c4088ebdSKirill A. Shutemov 	assert_spin_locked(pmd_lockptr(mm, pmd));
167871e3aac0SAndrea Arcangeli 
16795535be30SDavid Hildenbrand 	page = pmd_page(*pmd);
16805535be30SDavid Hildenbrand 	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
16815535be30SDavid Hildenbrand 
16825535be30SDavid Hildenbrand 	if ((flags & FOLL_WRITE) &&
16835535be30SDavid Hildenbrand 	    !can_follow_write_pmd(*pmd, page, vma, flags))
16845535be30SDavid Hildenbrand 		return NULL;
168571e3aac0SAndrea Arcangeli 
168685facf25SKirill A. Shutemov 	/* Avoid dumping huge zero page */
168785facf25SKirill A. Shutemov 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
168885facf25SKirill A. Shutemov 		return ERR_PTR(-EFAULT);
168985facf25SKirill A. Shutemov 
1690d74943a2SDavid Hildenbrand 	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
16915535be30SDavid Hildenbrand 		return NULL;
16923faa52c0SJohn Hubbard 
169384209e87SDavid Hildenbrand 	if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
1694a7f22660SDavid Hildenbrand 		return ERR_PTR(-EMLINK);
1695a7f22660SDavid Hildenbrand 
1696b6a2619cSDavid Hildenbrand 	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
1697b6a2619cSDavid Hildenbrand 			!PageAnonExclusive(page), page);
1698b6a2619cSDavid Hildenbrand 
16990f089235SLogan Gunthorpe 	ret = try_grab_page(page, flags);
17000f089235SLogan Gunthorpe 	if (ret)
17010f089235SLogan Gunthorpe 		return ERR_PTR(ret);
17023faa52c0SJohn Hubbard 
17033565fce3SDan Williams 	if (flags & FOLL_TOUCH)
1704a69e4717SMiaohe Lin 		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
17053faa52c0SJohn Hubbard 
170671e3aac0SAndrea Arcangeli 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1707ca120cf6SDan Williams 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
170871e3aac0SAndrea Arcangeli 
170971e3aac0SAndrea Arcangeli 	return page;
171071e3aac0SAndrea Arcangeli }
171171e3aac0SAndrea Arcangeli 
1712d10e63f2SMel Gorman /* NUMA hinting page fault entry point for trans huge pmds */
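/*
 * The fault revalidates the pmd under its lock, picks a target node via
 * numa_migrate_prep() and then calls migrate_misplaced_folio() with the lock
 * dropped; if there is no better node or migration fails, the pmd is simply
 * restored with its original protections (made writable again when allowed).
 */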
17135db4f15cSYang Shi vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
1714d10e63f2SMel Gorman {
171582b0f8c3SJan Kara 	struct vm_area_struct *vma = vmf->vma;
1716c5b5a3ddSYang Shi 	pmd_t oldpmd = vmf->orig_pmd;
1717c5b5a3ddSYang Shi 	pmd_t pmd;
1718667ffc31SKefeng Wang 	struct folio *folio;
171982b0f8c3SJan Kara 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1720667ffc31SKefeng Wang 	int nid = NUMA_NO_NODE;
172133024536SHuang Ying 	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
17226a56ccbcSDavid Hildenbrand 	bool migrated = false, writable = false;
17236688cc05SPeter Zijlstra 	int flags = 0;
1724d10e63f2SMel Gorman 
172582b0f8c3SJan Kara 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1726c5b5a3ddSYang Shi 	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
172782b0f8c3SJan Kara 		spin_unlock(vmf->ptl);
1728de466bd6SMel Gorman 		goto out;
1729de466bd6SMel Gorman 	}
1730de466bd6SMel Gorman 
1731c5b5a3ddSYang Shi 	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
17326a56ccbcSDavid Hildenbrand 
17336a56ccbcSDavid Hildenbrand 	/*
17346a56ccbcSDavid Hildenbrand 	 * Detect now whether the PMD could be writable; this information
17356a56ccbcSDavid Hildenbrand 	 * is only valid while holding the PT lock.
17366a56ccbcSDavid Hildenbrand 	 */
17376a56ccbcSDavid Hildenbrand 	writable = pmd_write(pmd);
17386a56ccbcSDavid Hildenbrand 	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
17396a56ccbcSDavid Hildenbrand 	    can_change_pmd_writable(vma, vmf->address, pmd))
17406a56ccbcSDavid Hildenbrand 		writable = true;
17416a56ccbcSDavid Hildenbrand 
1742667ffc31SKefeng Wang 	folio = vm_normal_folio_pmd(vma, haddr, pmd);
1743667ffc31SKefeng Wang 	if (!folio)
1744c5b5a3ddSYang Shi 		goto out_map;
1745c5b5a3ddSYang Shi 
1746c5b5a3ddSYang Shi 	/* See similar comment in do_numa_page for explanation */
17476a56ccbcSDavid Hildenbrand 	if (!writable)
1748c5b5a3ddSYang Shi 		flags |= TNF_NO_GROUP;
1749c5b5a3ddSYang Shi 
1750667ffc31SKefeng Wang 	nid = folio_nid(folio);
175133024536SHuang Ying 	/*
175233024536SHuang Ying 	 * In memory tiering mode, the cpupid field of a slow-memory page is
175333024536SHuang Ying 	 * reused to record the page access time, so use the default value.
175433024536SHuang Ying 	 */
1755667ffc31SKefeng Wang 	if (node_is_toptier(nid))
1756c4a8d2faSKefeng Wang 		last_cpupid = folio_last_cpupid(folio);
1757cda6d936SKefeng Wang 	target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
1758c5b5a3ddSYang Shi 	if (target_nid == NUMA_NO_NODE) {
1759667ffc31SKefeng Wang 		folio_put(folio);
1760c5b5a3ddSYang Shi 		goto out_map;
1761c5b5a3ddSYang Shi 	}
1762c5b5a3ddSYang Shi 
176382b0f8c3SJan Kara 	spin_unlock(vmf->ptl);
17646a56ccbcSDavid Hildenbrand 	writable = false;
17658b1b436dSPeter Zijlstra 
1766667ffc31SKefeng Wang 	migrated = migrate_misplaced_folio(folio, vma, target_nid);
17676688cc05SPeter Zijlstra 	if (migrated) {
17686688cc05SPeter Zijlstra 		flags |= TNF_MIGRATED;
1769667ffc31SKefeng Wang 		nid = target_nid;
1770c5b5a3ddSYang Shi 	} else {
1771074c2381SMel Gorman 		flags |= TNF_MIGRATE_FAIL;
1772c5b5a3ddSYang Shi 		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1773c5b5a3ddSYang Shi 		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
177482b0f8c3SJan Kara 			spin_unlock(vmf->ptl);
1775c5b5a3ddSYang Shi 			goto out;
1776c5b5a3ddSYang Shi 		}
1777c5b5a3ddSYang Shi 		goto out_map;
1778c5b5a3ddSYang Shi 	}
1779b8916634SMel Gorman 
1780b8916634SMel Gorman out:
1781667ffc31SKefeng Wang 	if (nid != NUMA_NO_NODE)
1782667ffc31SKefeng Wang 		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
17838191acbdSMel Gorman 
1784d10e63f2SMel Gorman 	return 0;
1785c5b5a3ddSYang Shi 
1786c5b5a3ddSYang Shi out_map:
1787c5b5a3ddSYang Shi 	/* Restore the PMD */
1788c5b5a3ddSYang Shi 	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1789c5b5a3ddSYang Shi 	pmd = pmd_mkyoung(pmd);
17906a56ccbcSDavid Hildenbrand 	if (writable)
1791161e393cSRick Edgecombe 		pmd = pmd_mkwrite(pmd, vma);
1792c5b5a3ddSYang Shi 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1793c5b5a3ddSYang Shi 	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1794c5b5a3ddSYang Shi 	spin_unlock(vmf->ptl);
1795c5b5a3ddSYang Shi 	goto out;
1796d10e63f2SMel Gorman }
1797d10e63f2SMel Gorman 
1798319904adSHuang Ying /*
1799319904adSHuang Ying  * Return true if we do MADV_FREE successfully on entire pmd page.
1800319904adSHuang Ying  * Otherwise, return false.
1801319904adSHuang Ying  */
1802319904adSHuang Ying bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1803b8d3c4c3SMinchan Kim 		pmd_t *pmd, unsigned long addr, unsigned long next)
1804b8d3c4c3SMinchan Kim {
1805b8d3c4c3SMinchan Kim 	spinlock_t *ptl;
1806b8d3c4c3SMinchan Kim 	pmd_t orig_pmd;
1807fc986a38SKefeng Wang 	struct folio *folio;
1808b8d3c4c3SMinchan Kim 	struct mm_struct *mm = tlb->mm;
1809319904adSHuang Ying 	bool ret = false;
1810b8d3c4c3SMinchan Kim 
1811ed6a7935SPeter Zijlstra 	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
181207e32661SAneesh Kumar K.V 
1813b6ec57f4SKirill A. Shutemov 	ptl = pmd_trans_huge_lock(pmd, vma);
1814b6ec57f4SKirill A. Shutemov 	if (!ptl)
181525eedabeSLinus Torvalds 		goto out_unlocked;
1816b8d3c4c3SMinchan Kim 
1817b8d3c4c3SMinchan Kim 	orig_pmd = *pmd;
1818319904adSHuang Ying 	if (is_huge_zero_pmd(orig_pmd))
1819b8d3c4c3SMinchan Kim 		goto out;
1820b8d3c4c3SMinchan Kim 
182184c3fc4eSZi Yan 	if (unlikely(!pmd_present(orig_pmd))) {
182284c3fc4eSZi Yan 		VM_BUG_ON(thp_migration_supported() &&
182384c3fc4eSZi Yan 				  !is_pmd_migration_entry(orig_pmd));
182484c3fc4eSZi Yan 		goto out;
182584c3fc4eSZi Yan 	}
182684c3fc4eSZi Yan 
1827fc986a38SKefeng Wang 	folio = pfn_folio(pmd_pfn(orig_pmd));
1828b8d3c4c3SMinchan Kim 	/*
1829fc986a38SKefeng Wang 	 * If other processes map this folio, we can't discard it unless
1830fc986a38SKefeng Wang 	 * they all do MADV_FREE, so skip the folio.
1831b8d3c4c3SMinchan Kim 	 */
183220b18aadSYin Fengwei 	if (folio_estimated_sharers(folio) != 1)
1833b8d3c4c3SMinchan Kim 		goto out;
1834b8d3c4c3SMinchan Kim 
1835fc986a38SKefeng Wang 	if (!folio_trylock(folio))
1836b8d3c4c3SMinchan Kim 		goto out;
1837b8d3c4c3SMinchan Kim 
1838b8d3c4c3SMinchan Kim 	/*
1839b8d3c4c3SMinchan Kim 	 * If the user wants to discard only part of the THP, split it so
1840b8d3c4c3SMinchan Kim 	 * MADV_FREE deactivates just those pages.
1841b8d3c4c3SMinchan Kim 	 */
1842b8d3c4c3SMinchan Kim 	if (next - addr != HPAGE_PMD_SIZE) {
1843fc986a38SKefeng Wang 		folio_get(folio);
1844b8d3c4c3SMinchan Kim 		spin_unlock(ptl);
1845fc986a38SKefeng Wang 		split_folio(folio);
1846fc986a38SKefeng Wang 		folio_unlock(folio);
1847fc986a38SKefeng Wang 		folio_put(folio);
1848b8d3c4c3SMinchan Kim 		goto out_unlocked;
1849b8d3c4c3SMinchan Kim 	}
1850b8d3c4c3SMinchan Kim 
1851fc986a38SKefeng Wang 	if (folio_test_dirty(folio))
1852fc986a38SKefeng Wang 		folio_clear_dirty(folio);
1853fc986a38SKefeng Wang 	folio_unlock(folio);
1854b8d3c4c3SMinchan Kim 
1855b8d3c4c3SMinchan Kim 	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
185658ceeb6bSKirill A. Shutemov 		pmdp_invalidate(vma, addr, pmd);
1857b8d3c4c3SMinchan Kim 		orig_pmd = pmd_mkold(orig_pmd);
1858b8d3c4c3SMinchan Kim 		orig_pmd = pmd_mkclean(orig_pmd);
1859b8d3c4c3SMinchan Kim 
1860b8d3c4c3SMinchan Kim 		set_pmd_at(mm, addr, pmd, orig_pmd);
1861b8d3c4c3SMinchan Kim 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1862b8d3c4c3SMinchan Kim 	}
1863802a3a92SShaohua Li 
18646a6fe9ebSKefeng Wang 	folio_mark_lazyfree(folio);
1865319904adSHuang Ying 	ret = true;
1866b8d3c4c3SMinchan Kim out:
1867b8d3c4c3SMinchan Kim 	spin_unlock(ptl);
1868b8d3c4c3SMinchan Kim out_unlocked:
1869b8d3c4c3SMinchan Kim 	return ret;
1870b8d3c4c3SMinchan Kim }
1871b8d3c4c3SMinchan Kim 
1872953c66c2SAneesh Kumar K.V static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1873953c66c2SAneesh Kumar K.V {
1874953c66c2SAneesh Kumar K.V 	pgtable_t pgtable;
1875953c66c2SAneesh Kumar K.V 
1876953c66c2SAneesh Kumar K.V 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1877953c66c2SAneesh Kumar K.V 	pte_free(mm, pgtable);
1878c4812909SKirill A. Shutemov 	mm_dec_nr_ptes(mm);
1879953c66c2SAneesh Kumar K.V }
1880953c66c2SAneesh Kumar K.V 
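/*
 * Tear down one huge pmd during unmap/exit.  Special (DAX/PFN-mapped) VMAs
 * and the huge zero pmd only need the deposited page table freed; a present
 * THP additionally has its rmap and RSS counters updated and is flushed from
 * the TLB, while a pmd migration entry is dropped without a flush.
 */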
188171e3aac0SAndrea Arcangeli int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1882f21760b1SShaohua Li 		 pmd_t *pmd, unsigned long addr)
188371e3aac0SAndrea Arcangeli {
1884f5c8ad47SDavid Miller 	pmd_t orig_pmd;
1885da146769SKirill A. Shutemov 	spinlock_t *ptl;
1886da146769SKirill A. Shutemov 
1887ed6a7935SPeter Zijlstra 	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
188807e32661SAneesh Kumar K.V 
1889b6ec57f4SKirill A. Shutemov 	ptl = __pmd_trans_huge_lock(pmd, vma);
1890b6ec57f4SKirill A. Shutemov 	if (!ptl)
1891da146769SKirill A. Shutemov 		return 0;
1892a6bf2bb0SAneesh Kumar K.V 	/*
1893a6bf2bb0SAneesh Kumar K.V 	 * Architectures like ppc64 look at the deposited pgtable when
18948809aa2dSAneesh Kumar K.V 	 * calling pmdp_huge_get_and_clear, so only do the
1895a6bf2bb0SAneesh Kumar K.V 	 * pgtable_trans_huge_withdraw after the pmdp related operations
1896a6bf2bb0SAneesh Kumar K.V 	 * have finished.
1897a6bf2bb0SAneesh Kumar K.V 	 */
189893a98695SAneesh Kumar K.V 	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
1899fcbe08d6SMartin Schwidefsky 						tlb->fullmm);
1900e5136e87SRick Edgecombe 	arch_check_zapped_pmd(vma, orig_pmd);
1901f21760b1SShaohua Li 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
19022484ca9bSThomas Hellstrom (VMware) 	if (vma_is_special_huge(vma)) {
19033b6521f5SOliver O'Halloran 		if (arch_needs_pgtable_deposit())
19043b6521f5SOliver O'Halloran 			zap_deposited_table(tlb->mm, pmd);
19054897c765SMatthew Wilcox 		spin_unlock(ptl);
1906da146769SKirill A. Shutemov 	} else if (is_huge_zero_pmd(orig_pmd)) {
1907c14a6eb4SOliver O'Halloran 		zap_deposited_table(tlb->mm, pmd);
1908bf929152SKirill A. Shutemov 		spin_unlock(ptl);
1909479f0abbSKirill A. Shutemov 	} else {
19100103b27aSKefeng Wang 		struct folio *folio = NULL;
1911616b8371SZi Yan 		int flush_needed = 1;
1912616b8371SZi Yan 
1913616b8371SZi Yan 		if (pmd_present(orig_pmd)) {
19140103b27aSKefeng Wang 			struct page *page = pmd_page(orig_pmd);
19150103b27aSKefeng Wang 
19160103b27aSKefeng Wang 			folio = page_folio(page);
19170103b27aSKefeng Wang 			folio_remove_rmap_pmd(folio, page, vma);
1918309381feSSasha Levin 			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1919309381feSSasha Levin 			VM_BUG_ON_PAGE(!PageHead(page), page);
1920616b8371SZi Yan 		} else if (thp_migration_supported()) {
1921616b8371SZi Yan 			swp_entry_t entry;
1922616b8371SZi Yan 
1923616b8371SZi Yan 			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1924616b8371SZi Yan 			entry = pmd_to_swp_entry(orig_pmd);
19250103b27aSKefeng Wang 			folio = pfn_swap_entry_folio(entry);
1926616b8371SZi Yan 			flush_needed = 0;
1927616b8371SZi Yan 		} else
1928616b8371SZi Yan 			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1929616b8371SZi Yan 
19300103b27aSKefeng Wang 		if (folio_test_anon(folio)) {
1931c14a6eb4SOliver O'Halloran 			zap_deposited_table(tlb->mm, pmd);
1932b5072380SKirill A. Shutemov 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1933b5072380SKirill A. Shutemov 		} else {
1934953c66c2SAneesh Kumar K.V 			if (arch_needs_pgtable_deposit())
1935953c66c2SAneesh Kumar K.V 				zap_deposited_table(tlb->mm, pmd);
19366b27cc6cSKefeng Wang 			add_mm_counter(tlb->mm, mm_counter_file(folio),
19370103b27aSKefeng Wang 				       -HPAGE_PMD_NR);
1938b5072380SKirill A. Shutemov 		}
1939616b8371SZi Yan 
1940bf929152SKirill A. Shutemov 		spin_unlock(ptl);
1941616b8371SZi Yan 		if (flush_needed)
19420103b27aSKefeng Wang 			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
1943479f0abbSKirill A. Shutemov 	}
1944da146769SKirill A. Shutemov 	return 1;
194571e3aac0SAndrea Arcangeli }
194671e3aac0SAndrea Arcangeli 
19471dd38b6cSAneesh Kumar K.V #ifndef pmd_move_must_withdraw
19481dd38b6cSAneesh Kumar K.V static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
19491dd38b6cSAneesh Kumar K.V 					 spinlock_t *old_pmd_ptl,
19501dd38b6cSAneesh Kumar K.V 					 struct vm_area_struct *vma)
19511dd38b6cSAneesh Kumar K.V {
19521dd38b6cSAneesh Kumar K.V 	/*
19531dd38b6cSAneesh Kumar K.V 	 * With split pmd locks we also need to move the preallocated
19541dd38b6cSAneesh Kumar K.V 	 * PTE page table if new_pmd is on a different PMD page table.
19551dd38b6cSAneesh Kumar K.V 	 *
19561dd38b6cSAneesh Kumar K.V 	 * We don't deposit and withdraw tables for file pages at all.
19571dd38b6cSAneesh Kumar K.V 	 */
19581dd38b6cSAneesh Kumar K.V 	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
19591dd38b6cSAneesh Kumar K.V }
19601dd38b6cSAneesh Kumar K.V #endif
19611dd38b6cSAneesh Kumar K.V 
1962ab6e3d09SNaoya Horiguchi static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1963ab6e3d09SNaoya Horiguchi {
1964ab6e3d09SNaoya Horiguchi #ifdef CONFIG_MEM_SOFT_DIRTY
1965ab6e3d09SNaoya Horiguchi 	if (unlikely(is_pmd_migration_entry(pmd)))
1966ab6e3d09SNaoya Horiguchi 		pmd = pmd_swp_mksoft_dirty(pmd);
1967ab6e3d09SNaoya Horiguchi 	else if (pmd_present(pmd))
1968ab6e3d09SNaoya Horiguchi 		pmd = pmd_mksoft_dirty(pmd);
1969ab6e3d09SNaoya Horiguchi #endif
1970ab6e3d09SNaoya Horiguchi 	return pmd;
1971ab6e3d09SNaoya Horiguchi }
1972ab6e3d09SNaoya Horiguchi 
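/*
 * Move a whole huge pmd for mremap() when both the old and new addresses are
 * PMD-aligned: the old entry is cleared and re-installed at the new address,
 * the deposited PTE table travels along when pmd_move_must_withdraw() says
 * so, soft-dirty state is preserved and the TLB is flushed for present pmds.
 */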
1973bf8616d5SHugh Dickins bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1974b8aa9d9dSWei Yang 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
197537a1c49aSAndrea Arcangeli {
1976bf929152SKirill A. Shutemov 	spinlock_t *old_ptl, *new_ptl;
197737a1c49aSAndrea Arcangeli 	pmd_t pmd;
197837a1c49aSAndrea Arcangeli 	struct mm_struct *mm = vma->vm_mm;
19795d190420SAaron Lu 	bool force_flush = false;
198037a1c49aSAndrea Arcangeli 
198137a1c49aSAndrea Arcangeli 	/*
198237a1c49aSAndrea Arcangeli 	 * The destination pmd shouldn't be established, free_pgtables()
1983a5be621eSHugh Dickins 	 * should have released it; but move_page_tables() might have already
1984a5be621eSHugh Dickins 	 * inserted a page table, if racing against shmem/file collapse.
198537a1c49aSAndrea Arcangeli 	 */
1986a5be621eSHugh Dickins 	if (!pmd_none(*new_pmd)) {
198737a1c49aSAndrea Arcangeli 		VM_BUG_ON(pmd_trans_huge(*new_pmd));
19884b471e88SKirill A. Shutemov 		return false;
198937a1c49aSAndrea Arcangeli 	}
199037a1c49aSAndrea Arcangeli 
1991bf929152SKirill A. Shutemov 	/*
1992bf929152SKirill A. Shutemov 	 * We don't have to worry about the ordering of src and dst
1993c1e8d7c6SMichel Lespinasse 	 * ptlocks because exclusive mmap_lock prevents deadlock.
1994bf929152SKirill A. Shutemov 	 */
1995b6ec57f4SKirill A. Shutemov 	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1996b6ec57f4SKirill A. Shutemov 	if (old_ptl) {
1997bf929152SKirill A. Shutemov 		new_ptl = pmd_lockptr(mm, new_pmd);
1998bf929152SKirill A. Shutemov 		if (new_ptl != old_ptl)
1999bf929152SKirill A. Shutemov 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
20008809aa2dSAneesh Kumar K.V 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2001eb66ae03SLinus Torvalds 		if (pmd_present(pmd))
2002a2ce2666SAaron Lu 			force_flush = true;
200337a1c49aSAndrea Arcangeli 		VM_BUG_ON(!pmd_none(*new_pmd));
20043592806cSKirill A. Shutemov 
20051dd38b6cSAneesh Kumar K.V 		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2006b3084f4dSAneesh Kumar K.V 			pgtable_t pgtable;
20073592806cSKirill A. Shutemov 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
20083592806cSKirill A. Shutemov 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
20093592806cSKirill A. Shutemov 		}
2010ab6e3d09SNaoya Horiguchi 		pmd = move_soft_dirty_pmd(pmd);
2011ab6e3d09SNaoya Horiguchi 		set_pmd_at(mm, new_addr, new_pmd, pmd);
20125d190420SAaron Lu 		if (force_flush)
20137c38f181SMiaohe Lin 			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2014eb66ae03SLinus Torvalds 		if (new_ptl != old_ptl)
2015eb66ae03SLinus Torvalds 			spin_unlock(new_ptl);
2016bf929152SKirill A. Shutemov 		spin_unlock(old_ptl);
20174b471e88SKirill A. Shutemov 		return true;
201837a1c49aSAndrea Arcangeli 	}
20194b471e88SKirill A. Shutemov 	return false;
202037a1c49aSAndrea Arcangeli }
202137a1c49aSAndrea Arcangeli 
2022f123d74aSMel Gorman /*
2023f123d74aSMel Gorman  * Returns
2024f123d74aSMel Gorman  *  - 0 if PMD could not be locked
2025f0953a1bSIngo Molnar  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2026e346e668SYang Shi  *      or if prot_numa but THP migration is not supported
2027f0953a1bSIngo Molnar  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
2028f123d74aSMel Gorman  */
20294a18419fSNadav Amit int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
20304a18419fSNadav Amit 		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
20314a18419fSNadav Amit 		    unsigned long cp_flags)
2032cd7548abSJohannes Weiner {
2033cd7548abSJohannes Weiner 	struct mm_struct *mm = vma->vm_mm;
2034bf929152SKirill A. Shutemov 	spinlock_t *ptl;
2035c9fe6656SNadav Amit 	pmd_t oldpmd, entry;
203658705444SPeter Xu 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2037292924b2SPeter Xu 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2038292924b2SPeter Xu 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
20396a56ccbcSDavid Hildenbrand 	int ret = 1;
2040cd7548abSJohannes Weiner 
20414a18419fSNadav Amit 	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
20424a18419fSNadav Amit 
2043e346e668SYang Shi 	if (prot_numa && !thp_migration_supported())
2044e346e668SYang Shi 		return 1;
2045e346e668SYang Shi 
2046b6ec57f4SKirill A. Shutemov 	ptl = __pmd_trans_huge_lock(pmd, vma);
20470a85e51dSKirill A. Shutemov 	if (!ptl)
20480a85e51dSKirill A. Shutemov 		return 0;
20490a85e51dSKirill A. Shutemov 
205084c3fc4eSZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
205184c3fc4eSZi Yan 	if (is_swap_pmd(*pmd)) {
205284c3fc4eSZi Yan 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
20535662400aSMatthew Wilcox (Oracle) 		struct folio *folio = pfn_swap_entry_folio(entry);
205424bf08c4SDavid Hildenbrand 		pmd_t newpmd;
205584c3fc4eSZi Yan 
205684c3fc4eSZi Yan 		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
20574dd845b5SAlistair Popple 		if (is_writable_migration_entry(entry)) {
205884c3fc4eSZi Yan 			/*
205984c3fc4eSZi Yan 			 * A protection check is difficult, so
206084c3fc4eSZi Yan 			 * just be safe and disable write access.
206184c3fc4eSZi Yan 			 */
2062d986ba2bSKefeng Wang 			if (folio_test_anon(folio))
20636c287605SDavid Hildenbrand 				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
20646c287605SDavid Hildenbrand 			else
20656c287605SDavid Hildenbrand 				entry = make_readable_migration_entry(swp_offset(entry));
206684c3fc4eSZi Yan 			newpmd = swp_entry_to_pmd(entry);
2067ab6e3d09SNaoya Horiguchi 			if (pmd_swp_soft_dirty(*pmd))
2068ab6e3d09SNaoya Horiguchi 				newpmd = pmd_swp_mksoft_dirty(newpmd);
206924bf08c4SDavid Hildenbrand 		} else {
207024bf08c4SDavid Hildenbrand 			newpmd = *pmd;
207184c3fc4eSZi Yan 		}
207224bf08c4SDavid Hildenbrand 
207324bf08c4SDavid Hildenbrand 		if (uffd_wp)
207424bf08c4SDavid Hildenbrand 			newpmd = pmd_swp_mkuffd_wp(newpmd);
207524bf08c4SDavid Hildenbrand 		else if (uffd_wp_resolve)
207624bf08c4SDavid Hildenbrand 			newpmd = pmd_swp_clear_uffd_wp(newpmd);
207724bf08c4SDavid Hildenbrand 		if (!pmd_same(*pmd, newpmd))
207824bf08c4SDavid Hildenbrand 			set_pmd_at(mm, addr, pmd, newpmd);
207984c3fc4eSZi Yan 		goto unlock;
208084c3fc4eSZi Yan 	}
208184c3fc4eSZi Yan #endif
208284c3fc4eSZi Yan 
2083a1a3a2fcSHuang Ying 	if (prot_numa) {
2084d986ba2bSKefeng Wang 		struct folio *folio;
208533024536SHuang Ying 		bool toptier;
2086e944fd67SMel Gorman 		/*
2087e944fd67SMel Gorman 		 * Avoid trapping faults against the zero page. The read-only
2088e944fd67SMel Gorman 		 * data is likely to be read-cached on the local CPU and
2089e944fd67SMel Gorman 		 * local/remote hits to the zero page are not interesting.
2090e944fd67SMel Gorman 		 */
2091a1a3a2fcSHuang Ying 		if (is_huge_zero_pmd(*pmd))
20920a85e51dSKirill A. Shutemov 			goto unlock;
2093e944fd67SMel Gorman 
2094a1a3a2fcSHuang Ying 		if (pmd_protnone(*pmd))
20950a85e51dSKirill A. Shutemov 			goto unlock;
20960a85e51dSKirill A. Shutemov 
2097d986ba2bSKefeng Wang 		folio = page_folio(pmd_page(*pmd));
2098d986ba2bSKefeng Wang 		toptier = node_is_toptier(folio_nid(folio));
2099a1a3a2fcSHuang Ying 		/*
2100a1a3a2fcSHuang Ying 		 * Skip scanning top tier node if normal numa
2101a1a3a2fcSHuang Ying 		 * balancing is disabled
2102a1a3a2fcSHuang Ying 		 */
2103a1a3a2fcSHuang Ying 		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
210433024536SHuang Ying 		    toptier)
2105a1a3a2fcSHuang Ying 			goto unlock;
210633024536SHuang Ying 
210733024536SHuang Ying 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
210833024536SHuang Ying 		    !toptier)
2109d986ba2bSKefeng Wang 			folio_xchg_access_time(folio,
2110d986ba2bSKefeng Wang 					       jiffies_to_msecs(jiffies));
2111a1a3a2fcSHuang Ying 	}
2112ced10803SKirill A. Shutemov 	/*
21133e4e28c5SMichel Lespinasse 	 * In the prot_numa case, we are under mmap_read_lock(mm). It's
2114ced10803SKirill A. Shutemov 	 * critical not to clear the pmd intermittently, to avoid racing with
21153e4e28c5SMichel Lespinasse 	 * MADV_DONTNEED, which also runs under mmap_read_lock(mm):
2116ced10803SKirill A. Shutemov 	 *
2117ced10803SKirill A. Shutemov 	 *	CPU0:				CPU1:
2118ced10803SKirill A. Shutemov 	 *				change_huge_pmd(prot_numa=1)
2119ced10803SKirill A. Shutemov 	 *				 pmdp_huge_get_and_clear_notify()
2120ced10803SKirill A. Shutemov 	 * madvise_dontneed()
2121ced10803SKirill A. Shutemov 	 *  zap_pmd_range()
2122ced10803SKirill A. Shutemov 	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
2123ced10803SKirill A. Shutemov 	 *   // skip the pmd
2124ced10803SKirill A. Shutemov 	 *				 set_pmd_at();
2125ced10803SKirill A. Shutemov 	 *				 // pmd is re-established
2126ced10803SKirill A. Shutemov 	 *
2127ced10803SKirill A. Shutemov 	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
2128ced10803SKirill A. Shutemov 	 * which may break userspace.
2129ced10803SKirill A. Shutemov 	 *
21304f831457SNadav Amit 	 * pmdp_invalidate_ad() is required to make sure we don't miss
2131ced10803SKirill A. Shutemov 	 * dirty/young flags set by hardware.
2132ced10803SKirill A. Shutemov 	 */
21334f831457SNadav Amit 	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2134ced10803SKirill A. Shutemov 
2135c9fe6656SNadav Amit 	entry = pmd_modify(oldpmd, newprot);
2136f1eb1bacSPeter Xu 	if (uffd_wp)
2137292924b2SPeter Xu 		entry = pmd_mkuffd_wp(entry);
2138f1eb1bacSPeter Xu 	else if (uffd_wp_resolve)
2139292924b2SPeter Xu 		/*
2140292924b2SPeter Xu 		 * Leave the write bit to be handled by the page fault
2141292924b2SPeter Xu 		 * handler, so that things like COW can be handled
2142292924b2SPeter Xu 		 * properly.
2143292924b2SPeter Xu 		 */
2144292924b2SPeter Xu 		entry = pmd_clear_uffd_wp(entry);
2145c27f479eSDavid Hildenbrand 
2146c27f479eSDavid Hildenbrand 	/* See change_pte_range(). */
2147c27f479eSDavid Hildenbrand 	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2148c27f479eSDavid Hildenbrand 	    can_change_pmd_writable(vma, addr, entry))
2149161e393cSRick Edgecombe 		entry = pmd_mkwrite(entry, vma);
2150c27f479eSDavid Hildenbrand 
2151f123d74aSMel Gorman 	ret = HPAGE_PMD_NR;
215256eecdb9SAneesh Kumar K.V 	set_pmd_at(mm, addr, pmd, entry);
21534a18419fSNadav Amit 
2154c9fe6656SNadav Amit 	if (huge_pmd_needs_flush(oldpmd, entry))
21554a18419fSNadav Amit 		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
21560a85e51dSKirill A. Shutemov unlock:
2157bf929152SKirill A. Shutemov 	spin_unlock(ptl);
2158cd7548abSJohannes Weiner 	return ret;
2159cd7548abSJohannes Weiner }
2160cd7548abSJohannes Weiner 
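/*
 * Illustrative sketch, not part of the original file: the caller-side
 * contract of change_huge_pmd() above.  A return of 0 means the pmd is not
 * (or no longer) huge, so the caller must fall back to walking the PTEs;
 * any non-zero return means the huge pmd was handled under its own lock,
 * with HPAGE_PMD_NR indicating that all pages of the mapping were updated.
 * Loosely modelled on change_pmd_range() in mm/mprotect.c, with names and
 * error handling simplified:
 *
 *	ret = change_huge_pmd(tlb, vma, pmd, addr, newprot, cp_flags);
 *	if (ret) {
 *		if (ret == HPAGE_PMD_NR)
 *			pages += HPAGE_PMD_NR;
 *		continue;
 *	}
 *	... otherwise fall through to the PTE loop ...
 */
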
2161adef4406SAndrea Arcangeli #ifdef CONFIG_USERFAULTFD
2162adef4406SAndrea Arcangeli /*
2163867a43a3SLokesh Gidra  * The PT lock for src_pmd and the locks on dst_vma/src_vma (for reading) are
2164adef4406SAndrea Arcangeli  * held by the caller; this function must return after releasing the page_table_lock.
2165adef4406SAndrea Arcangeli  * Just move the page from src_pmd to dst_pmd if possible.
2166adef4406SAndrea Arcangeli  * Returns zero if it succeeded in moving the page, -EAGAIN if the operation
2167adef4406SAndrea Arcangeli  * needs to be repeated by the caller, or other errors in case of failure.
2168adef4406SAndrea Arcangeli  */
2169adef4406SAndrea Arcangeli int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2170adef4406SAndrea Arcangeli 			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2171adef4406SAndrea Arcangeli 			unsigned long dst_addr, unsigned long src_addr)
2172adef4406SAndrea Arcangeli {
2173adef4406SAndrea Arcangeli 	pmd_t _dst_pmd, src_pmdval;
2174adef4406SAndrea Arcangeli 	struct page *src_page;
2175adef4406SAndrea Arcangeli 	struct folio *src_folio;
2176adef4406SAndrea Arcangeli 	struct anon_vma *src_anon_vma;
2177adef4406SAndrea Arcangeli 	spinlock_t *src_ptl, *dst_ptl;
2178adef4406SAndrea Arcangeli 	pgtable_t src_pgtable;
2179adef4406SAndrea Arcangeli 	struct mmu_notifier_range range;
2180adef4406SAndrea Arcangeli 	int err = 0;
2181adef4406SAndrea Arcangeli 
2182adef4406SAndrea Arcangeli 	src_pmdval = *src_pmd;
2183adef4406SAndrea Arcangeli 	src_ptl = pmd_lockptr(mm, src_pmd);
2184adef4406SAndrea Arcangeli 
2185adef4406SAndrea Arcangeli 	lockdep_assert_held(src_ptl);
2186867a43a3SLokesh Gidra 	vma_assert_locked(src_vma);
2187867a43a3SLokesh Gidra 	vma_assert_locked(dst_vma);
2188adef4406SAndrea Arcangeli 
2189adef4406SAndrea Arcangeli 	/* Sanity checks before the operation */
2190adef4406SAndrea Arcangeli 	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2191adef4406SAndrea Arcangeli 	    WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2192adef4406SAndrea Arcangeli 		spin_unlock(src_ptl);
2193adef4406SAndrea Arcangeli 		return -EINVAL;
2194adef4406SAndrea Arcangeli 	}
2195adef4406SAndrea Arcangeli 
2196adef4406SAndrea Arcangeli 	if (!pmd_trans_huge(src_pmdval)) {
2197adef4406SAndrea Arcangeli 		spin_unlock(src_ptl);
2198adef4406SAndrea Arcangeli 		if (is_pmd_migration_entry(src_pmdval)) {
2199adef4406SAndrea Arcangeli 			pmd_migration_entry_wait(mm, &src_pmdval);
2200adef4406SAndrea Arcangeli 			return -EAGAIN;
2201adef4406SAndrea Arcangeli 		}
2202adef4406SAndrea Arcangeli 		return -ENOENT;
2203adef4406SAndrea Arcangeli 	}
2204adef4406SAndrea Arcangeli 
2205adef4406SAndrea Arcangeli 	src_page = pmd_page(src_pmdval);
2206eb1521daSSuren Baghdasaryan 
2207eb1521daSSuren Baghdasaryan 	if (!is_huge_zero_pmd(src_pmdval)) {
2208adef4406SAndrea Arcangeli 		if (unlikely(!PageAnonExclusive(src_page))) {
2209adef4406SAndrea Arcangeli 			spin_unlock(src_ptl);
2210adef4406SAndrea Arcangeli 			return -EBUSY;
2211adef4406SAndrea Arcangeli 		}
2212adef4406SAndrea Arcangeli 
2213adef4406SAndrea Arcangeli 		src_folio = page_folio(src_page);
2214adef4406SAndrea Arcangeli 		folio_get(src_folio);
2215eb1521daSSuren Baghdasaryan 	} else
2216eb1521daSSuren Baghdasaryan 		src_folio = NULL;
2217eb1521daSSuren Baghdasaryan 
2218adef4406SAndrea Arcangeli 	spin_unlock(src_ptl);
2219adef4406SAndrea Arcangeli 
2220adef4406SAndrea Arcangeli 	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2221adef4406SAndrea Arcangeli 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2222adef4406SAndrea Arcangeli 				src_addr + HPAGE_PMD_SIZE);
2223adef4406SAndrea Arcangeli 	mmu_notifier_invalidate_range_start(&range);
2224adef4406SAndrea Arcangeli 
2225eb1521daSSuren Baghdasaryan 	if (src_folio) {
2226adef4406SAndrea Arcangeli 		folio_lock(src_folio);
2227adef4406SAndrea Arcangeli 
2228adef4406SAndrea Arcangeli 		/*
2229adef4406SAndrea Arcangeli 		 * split_huge_page walks the anon_vma chain without the page
2230adef4406SAndrea Arcangeli 		 * lock. Serialize against it with the anon_vma lock, the page
2231adef4406SAndrea Arcangeli 		 * lock is not enough.
2232adef4406SAndrea Arcangeli 		 */
2233adef4406SAndrea Arcangeli 		src_anon_vma = folio_get_anon_vma(src_folio);
2234adef4406SAndrea Arcangeli 		if (!src_anon_vma) {
2235adef4406SAndrea Arcangeli 			err = -EAGAIN;
2236adef4406SAndrea Arcangeli 			goto unlock_folio;
2237adef4406SAndrea Arcangeli 		}
2238adef4406SAndrea Arcangeli 		anon_vma_lock_write(src_anon_vma);
2239eb1521daSSuren Baghdasaryan 	} else
2240eb1521daSSuren Baghdasaryan 		src_anon_vma = NULL;
2241adef4406SAndrea Arcangeli 
2242adef4406SAndrea Arcangeli 	dst_ptl = pmd_lockptr(mm, dst_pmd);
2243adef4406SAndrea Arcangeli 	double_pt_lock(src_ptl, dst_ptl);
2244adef4406SAndrea Arcangeli 	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2245adef4406SAndrea Arcangeli 		     !pmd_same(*dst_pmd, dst_pmdval))) {
2246adef4406SAndrea Arcangeli 		err = -EAGAIN;
2247adef4406SAndrea Arcangeli 		goto unlock_ptls;
2248adef4406SAndrea Arcangeli 	}
2249eb1521daSSuren Baghdasaryan 	if (src_folio) {
2250adef4406SAndrea Arcangeli 		if (folio_maybe_dma_pinned(src_folio) ||
2251adef4406SAndrea Arcangeli 		    !PageAnonExclusive(&src_folio->page)) {
2252adef4406SAndrea Arcangeli 			err = -EBUSY;
2253adef4406SAndrea Arcangeli 			goto unlock_ptls;
2254adef4406SAndrea Arcangeli 		}
2255adef4406SAndrea Arcangeli 
2256adef4406SAndrea Arcangeli 		if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2257adef4406SAndrea Arcangeli 		    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2258adef4406SAndrea Arcangeli 			err = -EBUSY;
2259adef4406SAndrea Arcangeli 			goto unlock_ptls;
2260adef4406SAndrea Arcangeli 		}
2261adef4406SAndrea Arcangeli 
2262adef4406SAndrea Arcangeli 		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2263adef4406SAndrea Arcangeli 		/* Folio got pinned from under us. Put it back and fail the move. */
2264adef4406SAndrea Arcangeli 		if (folio_maybe_dma_pinned(src_folio)) {
2265adef4406SAndrea Arcangeli 			set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2266adef4406SAndrea Arcangeli 			err = -EBUSY;
2267adef4406SAndrea Arcangeli 			goto unlock_ptls;
2268adef4406SAndrea Arcangeli 		}
2269adef4406SAndrea Arcangeli 
2270*c0205eafSLokesh Gidra 		folio_move_anon_rmap(src_folio, dst_vma);
2271*c0205eafSLokesh Gidra 		WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
2272*c0205eafSLokesh Gidra 
2273adef4406SAndrea Arcangeli 		_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
2274adef4406SAndrea Arcangeli 		/* Follow mremap() behavior and treat the entry dirty after the move */
2275adef4406SAndrea Arcangeli 		_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2276eb1521daSSuren Baghdasaryan 	} else {
2277eb1521daSSuren Baghdasaryan 		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2278eb1521daSSuren Baghdasaryan 		_dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
2279eb1521daSSuren Baghdasaryan 	}
2280adef4406SAndrea Arcangeli 	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2281adef4406SAndrea Arcangeli 
2282adef4406SAndrea Arcangeli 	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2283adef4406SAndrea Arcangeli 	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2284adef4406SAndrea Arcangeli unlock_ptls:
2285adef4406SAndrea Arcangeli 	double_pt_unlock(src_ptl, dst_ptl);
2286eb1521daSSuren Baghdasaryan 	if (src_anon_vma) {
2287adef4406SAndrea Arcangeli 		anon_vma_unlock_write(src_anon_vma);
2288adef4406SAndrea Arcangeli 		put_anon_vma(src_anon_vma);
2289eb1521daSSuren Baghdasaryan 	}
2290adef4406SAndrea Arcangeli unlock_folio:
2291adef4406SAndrea Arcangeli 	/* unblock rmap walks */
2292eb1521daSSuren Baghdasaryan 	if (src_folio)
2293adef4406SAndrea Arcangeli 		folio_unlock(src_folio);
2294adef4406SAndrea Arcangeli 	mmu_notifier_invalidate_range_end(&range);
2295eb1521daSSuren Baghdasaryan 	if (src_folio)
2296adef4406SAndrea Arcangeli 		folio_put(src_folio);
2297adef4406SAndrea Arcangeli 	return err;
2298adef4406SAndrea Arcangeli }
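
/*
 * Illustrative sketch, not part of the original file: how a caller might
 * drive move_pages_huge_pmd() given the error contract documented above.
 * The real caller lives in mm/userfaultfd.c and revalidates the pmds under
 * the proper locks before every retry; this hypothetical snippet only shows
 * how the -EAGAIN and -ENOENT cases differ:
 *
 *	err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, dst_pmdval,
 *				  dst_vma, src_vma, dst_addr, src_addr);
 *	if (err == -EAGAIN)
 *		goto retry;		(re-read the pmds and call again)
 *	else if (err == -ENOENT)
 *		goto no_thp;		(no THP mapped here, use the non-huge path)
 */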
2299adef4406SAndrea Arcangeli #endif /* CONFIG_USERFAULTFD */
2300adef4406SAndrea Arcangeli 
2301025c5b24SNaoya Horiguchi /*
23028f19b0c0SHuang Ying  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2303025c5b24SNaoya Horiguchi  *
23048f19b0c0SHuang Ying  * Note that if it returns the page table lock pointer, this routine returns
23058f19b0c0SHuang Ying  * without unlocking the page table lock, so callers must unlock it.
2306025c5b24SNaoya Horiguchi  */
2307b6ec57f4SKirill A. Shutemov spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2308025c5b24SNaoya Horiguchi {
2309b6ec57f4SKirill A. Shutemov 	spinlock_t *ptl;
2310b6ec57f4SKirill A. Shutemov 	ptl = pmd_lock(vma->vm_mm, pmd);
231184c3fc4eSZi Yan 	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
231284c3fc4eSZi Yan 			pmd_devmap(*pmd)))
2313b6ec57f4SKirill A. Shutemov 		return ptl;
2314b6ec57f4SKirill A. Shutemov 	spin_unlock(ptl);
2315b6ec57f4SKirill A. Shutemov 	return NULL;
2316025c5b24SNaoya Horiguchi }
2317025c5b24SNaoya Horiguchi 
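/*
 * Illustrative sketch, not part of the original file: the usage pattern
 * described in the comment above __pmd_trans_huge_lock().  This is the
 * shape followed by change_huge_pmd() earlier in this file; the middle
 * part is a placeholder:
 *
 *	ptl = __pmd_trans_huge_lock(pmd, vma);
 *	if (!ptl)
 *		return 0;
 *	... operate on the now-stable huge (or swap/devmap) pmd ...
 *	spin_unlock(ptl);
 */
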
2318a00cc7d9SMatthew Wilcox /*
2319d965e390SMiaohe Lin  * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2320a00cc7d9SMatthew Wilcox  *
2321d965e390SMiaohe Lin  * Note that if it returns the page table lock pointer, this routine returns
2322d965e390SMiaohe Lin  * without unlocking the page table lock, so callers must unlock it.
2323a00cc7d9SMatthew Wilcox  */
2324a00cc7d9SMatthew Wilcox spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2325a00cc7d9SMatthew Wilcox {
2326a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
2327a00cc7d9SMatthew Wilcox 
2328a00cc7d9SMatthew Wilcox 	ptl = pud_lock(vma->vm_mm, pud);
2329a00cc7d9SMatthew Wilcox 	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2330a00cc7d9SMatthew Wilcox 		return ptl;
2331a00cc7d9SMatthew Wilcox 	spin_unlock(ptl);
2332a00cc7d9SMatthew Wilcox 	return NULL;
2333a00cc7d9SMatthew Wilcox }
2334a00cc7d9SMatthew Wilcox 
2335a00cc7d9SMatthew Wilcox #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2336a00cc7d9SMatthew Wilcox int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2337a00cc7d9SMatthew Wilcox 		 pud_t *pud, unsigned long addr)
2338a00cc7d9SMatthew Wilcox {
2339a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
2340a00cc7d9SMatthew Wilcox 
2341a00cc7d9SMatthew Wilcox 	ptl = __pud_trans_huge_lock(pud, vma);
2342a00cc7d9SMatthew Wilcox 	if (!ptl)
2343a00cc7d9SMatthew Wilcox 		return 0;
234474929079SMiaohe Lin 
2345f32928abSAneesh Kumar K.V 	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2346a00cc7d9SMatthew Wilcox 	tlb_remove_pud_tlb_entry(tlb, pud, addr);
23472484ca9bSThomas Hellstrom (VMware) 	if (vma_is_special_huge(vma)) {
2348a00cc7d9SMatthew Wilcox 		spin_unlock(ptl);
2349a00cc7d9SMatthew Wilcox 		/* No zero page support yet */
2350a00cc7d9SMatthew Wilcox 	} else {
2351a00cc7d9SMatthew Wilcox 		/* No support for anonymous PUD pages yet */
2352a00cc7d9SMatthew Wilcox 		BUG();
2353a00cc7d9SMatthew Wilcox 	}
2354a00cc7d9SMatthew Wilcox 	return 1;
2355a00cc7d9SMatthew Wilcox }
2356a00cc7d9SMatthew Wilcox 
2357a00cc7d9SMatthew Wilcox static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2358a00cc7d9SMatthew Wilcox 		unsigned long haddr)
2359a00cc7d9SMatthew Wilcox {
2360a00cc7d9SMatthew Wilcox 	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2361a00cc7d9SMatthew Wilcox 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2362a00cc7d9SMatthew Wilcox 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2363a00cc7d9SMatthew Wilcox 	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2364a00cc7d9SMatthew Wilcox 
2365ce9311cfSYisheng Xie 	count_vm_event(THP_SPLIT_PUD);
2366a00cc7d9SMatthew Wilcox 
2367ec8832d0SAlistair Popple 	pudp_huge_clear_flush(vma, haddr, pud);
2368a00cc7d9SMatthew Wilcox }
2369a00cc7d9SMatthew Wilcox 
2370a00cc7d9SMatthew Wilcox void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2371a00cc7d9SMatthew Wilcox 		unsigned long address)
2372a00cc7d9SMatthew Wilcox {
2373a00cc7d9SMatthew Wilcox 	spinlock_t *ptl;
2374ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
2375a00cc7d9SMatthew Wilcox 
23767d4a8be0SAlistair Popple 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
23776f4f13e8SJérôme Glisse 				address & HPAGE_PUD_MASK,
2378ac46d4f3SJérôme Glisse 				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2379ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
2380ac46d4f3SJérôme Glisse 	ptl = pud_lock(vma->vm_mm, pud);
2381a00cc7d9SMatthew Wilcox 	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2382a00cc7d9SMatthew Wilcox 		goto out;
2383ac46d4f3SJérôme Glisse 	__split_huge_pud_locked(vma, pud, range.start);
2384a00cc7d9SMatthew Wilcox 
2385a00cc7d9SMatthew Wilcox out:
2386a00cc7d9SMatthew Wilcox 	spin_unlock(ptl);
2387ec8832d0SAlistair Popple 	mmu_notifier_invalidate_range_end(&range);
2388a00cc7d9SMatthew Wilcox }
2389a00cc7d9SMatthew Wilcox #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2390a00cc7d9SMatthew Wilcox 
2391eef1b3baSKirill A. Shutemov static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2392eef1b3baSKirill A. Shutemov 		unsigned long haddr, pmd_t *pmd)
2393eef1b3baSKirill A. Shutemov {
2394eef1b3baSKirill A. Shutemov 	struct mm_struct *mm = vma->vm_mm;
2395eef1b3baSKirill A. Shutemov 	pgtable_t pgtable;
239642b2af2cSDavid Hildenbrand 	pmd_t _pmd, old_pmd;
2397c9c1ee20SHugh Dickins 	unsigned long addr;
2398c9c1ee20SHugh Dickins 	pte_t *pte;
2399eef1b3baSKirill A. Shutemov 	int i;
2400eef1b3baSKirill A. Shutemov 
24010f10851eSJérôme Glisse 	/*
24020f10851eSJérôme Glisse 	 * Leave the pmd empty until the ptes are filled. Note that it is fine
24030f10851eSJérôme Glisse 	 * to delay notification until mmu_notifier_invalidate_range_end(), as
24040f10851eSJérôme Glisse 	 * we are replacing a write-protected zero huge page (pmd) with
24050f10851eSJérôme Glisse 	 * write-protected zero small pages (ptes).
24060f10851eSJérôme Glisse 	 *
2407ee65728eSMike Rapoport 	 * See Documentation/mm/mmu_notifier.rst
24080f10851eSJérôme Glisse 	 */
240942b2af2cSDavid Hildenbrand 	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2410eef1b3baSKirill A. Shutemov 
2411eef1b3baSKirill A. Shutemov 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2412eef1b3baSKirill A. Shutemov 	pmd_populate(mm, &_pmd, pgtable);
2413eef1b3baSKirill A. Shutemov 
2414c9c1ee20SHugh Dickins 	pte = pte_offset_map(&_pmd, haddr);
2415c9c1ee20SHugh Dickins 	VM_BUG_ON(!pte);
2416c9c1ee20SHugh Dickins 	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2417c9c1ee20SHugh Dickins 		pte_t entry;
2418c9c1ee20SHugh Dickins 
2419c9c1ee20SHugh Dickins 		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2420eef1b3baSKirill A. Shutemov 		entry = pte_mkspecial(entry);
242142b2af2cSDavid Hildenbrand 		if (pmd_uffd_wp(old_pmd))
242242b2af2cSDavid Hildenbrand 			entry = pte_mkuffd_wp(entry);
2423c33c7948SRyan Roberts 		VM_BUG_ON(!pte_none(ptep_get(pte)));
2424c9c1ee20SHugh Dickins 		set_pte_at(mm, addr, pte, entry);
2425c9c1ee20SHugh Dickins 		pte++;
2426eef1b3baSKirill A. Shutemov 	}
2427c9c1ee20SHugh Dickins 	pte_unmap(pte - 1);
2428eef1b3baSKirill A. Shutemov 	smp_wmb(); /* make pte visible before pmd */
2429eef1b3baSKirill A. Shutemov 	pmd_populate(mm, pmd, pgtable);
2430eef1b3baSKirill A. Shutemov }
2431eef1b3baSKirill A. Shutemov 
2432eef1b3baSKirill A. Shutemov static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2433ba988280SKirill A. Shutemov 		unsigned long haddr, bool freeze)
2434eef1b3baSKirill A. Shutemov {
2435eef1b3baSKirill A. Shutemov 	struct mm_struct *mm = vma->vm_mm;
243691b2978aSDavid Hildenbrand 	struct folio *folio;
2437eef1b3baSKirill A. Shutemov 	struct page *page;
2438eef1b3baSKirill A. Shutemov 	pgtable_t pgtable;
2439423ac9afSAneesh Kumar K.V 	pmd_t old_pmd, _pmd;
2440292924b2SPeter Xu 	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
24410ccf7f16SPeter Xu 	bool anon_exclusive = false, dirty = false;
24422ac015e2SKirill A. Shutemov 	unsigned long addr;
2443c9c1ee20SHugh Dickins 	pte_t *pte;
2444eef1b3baSKirill A. Shutemov 	int i;
2445eef1b3baSKirill A. Shutemov 
2446eef1b3baSKirill A. Shutemov 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2447eef1b3baSKirill A. Shutemov 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2448eef1b3baSKirill A. Shutemov 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
244984c3fc4eSZi Yan 	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
245084c3fc4eSZi Yan 				&& !pmd_devmap(*pmd));
2451eef1b3baSKirill A. Shutemov 
2452eef1b3baSKirill A. Shutemov 	count_vm_event(THP_SPLIT_PMD);
2453eef1b3baSKirill A. Shutemov 
2454d21b9e57SKirill A. Shutemov 	if (!vma_is_anonymous(vma)) {
2455ec8832d0SAlistair Popple 		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2456953c66c2SAneesh Kumar K.V 		/*
2457953c66c2SAneesh Kumar K.V 		 * We are going to unmap this huge page. So
2458953c66c2SAneesh Kumar K.V 		 * just go ahead and zap it
2459953c66c2SAneesh Kumar K.V 		 */
2460953c66c2SAneesh Kumar K.V 		if (arch_needs_pgtable_deposit())
2461953c66c2SAneesh Kumar K.V 			zap_deposited_table(mm, pmd);
24622484ca9bSThomas Hellstrom (VMware) 		if (vma_is_special_huge(vma))
2463d21b9e57SKirill A. Shutemov 			return;
246499fa8a48SHugh Dickins 		if (unlikely(is_pmd_migration_entry(old_pmd))) {
246599fa8a48SHugh Dickins 			swp_entry_t entry;
246699fa8a48SHugh Dickins 
246799fa8a48SHugh Dickins 			entry = pmd_to_swp_entry(old_pmd);
2468439992ffSKefeng Wang 			folio = pfn_swap_entry_folio(entry);
246999fa8a48SHugh Dickins 		} else {
247099fa8a48SHugh Dickins 			page = pmd_page(old_pmd);
2471a8e61d58SDavid Hildenbrand 			folio = page_folio(page);
2472a8e61d58SDavid Hildenbrand 			if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2473db44c658SDavid Hildenbrand 				folio_mark_dirty(folio);
2474a8e61d58SDavid Hildenbrand 			if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2475a8e61d58SDavid Hildenbrand 				folio_set_referenced(folio);
2476a8e61d58SDavid Hildenbrand 			folio_remove_rmap_pmd(folio, page, vma);
2477a8e61d58SDavid Hildenbrand 			folio_put(folio);
247899fa8a48SHugh Dickins 		}
24796b27cc6cSKefeng Wang 		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2480eef1b3baSKirill A. Shutemov 		return;
248199fa8a48SHugh Dickins 	}
248299fa8a48SHugh Dickins 
24833b77e8c8SHugh Dickins 	if (is_huge_zero_pmd(*pmd)) {
24844645b9feSJérôme Glisse 		/*
24854645b9feSJérôme Glisse 		 * FIXME: Do we want to invalidate secondary mmu by calling
24861af5a810SAlistair Popple 		 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
24871af5a810SAlistair Popple 		 * inside __split_huge_pmd() ?
24884645b9feSJérôme Glisse 		 *
24894645b9feSJérôme Glisse 		 * We are going from a write-protected zero huge page to
24904645b9feSJérôme Glisse 		 * write-protected zero small pages, so it does not seem useful
24914645b9feSJérôme Glisse 		 * to invalidate the secondary mmu at this time.
24924645b9feSJérôme Glisse 		 */
2493eef1b3baSKirill A. Shutemov 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
2494eef1b3baSKirill A. Shutemov 	}
2495eef1b3baSKirill A. Shutemov 
2496423ac9afSAneesh Kumar K.V 	/*
2497423ac9afSAneesh Kumar K.V 	 * Up to this point the pmd is present and huge and userland has full
2498423ac9afSAneesh Kumar K.V 	 * access to the hugepage during the split (which happens in
2499423ac9afSAneesh Kumar K.V 	 * place). If we overwrite the pmd with the not-huge version pointing
2500423ac9afSAneesh Kumar K.V 	 * to the pte here (which of course we could if all CPUs were bug
2501423ac9afSAneesh Kumar K.V 	 * free), userland could trigger a small page size TLB miss on the
2502423ac9afSAneesh Kumar K.V 	 * small sized TLB while the hugepage TLB entry is still established in
2503423ac9afSAneesh Kumar K.V 	 * the huge TLB. Some CPUs don't like that.
250442742d9bSAlexander A. Klimov 	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
250542742d9bSAlexander A. Klimov 	 * 383 on page 105. Intel should be safe but also warns that it's
2506423ac9afSAneesh Kumar K.V 	 * only safe if the permission and cache attributes of the two entries
2507423ac9afSAneesh Kumar K.V 	 * loaded in the two TLB is identical (which should be the case here).
2508423ac9afSAneesh Kumar K.V 	 * But it is generally safer to never allow small and huge TLB entries
2509423ac9afSAneesh Kumar K.V 	 * for the same virtual address to be loaded simultaneously. So instead
2510423ac9afSAneesh Kumar K.V 	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2511423ac9afSAneesh Kumar K.V 	 * current pmd notpresent (atomically because here the pmd_trans_huge
2512423ac9afSAneesh Kumar K.V 	 * must remain set at all times on the pmd until the split is complete
2513423ac9afSAneesh Kumar K.V 	 * for this pmd), then we flush the SMP TLB and finally we write the
2514423ac9afSAneesh Kumar K.V 	 * non-huge version of the pmd entry with pmd_populate.
2515423ac9afSAneesh Kumar K.V 	 */
2516423ac9afSAneesh Kumar K.V 	old_pmd = pmdp_invalidate(vma, haddr, pmd);
2517423ac9afSAneesh Kumar K.V 
2518423ac9afSAneesh Kumar K.V 	pmd_migration = is_pmd_migration_entry(old_pmd);
25192e83ee1dSPeter Xu 	if (unlikely(pmd_migration)) {
252084c3fc4eSZi Yan 		swp_entry_t entry;
252184c3fc4eSZi Yan 
2522423ac9afSAneesh Kumar K.V 		entry = pmd_to_swp_entry(old_pmd);
2523af5cdaf8SAlistair Popple 		page = pfn_swap_entry_to_page(entry);
25244dd845b5SAlistair Popple 		write = is_writable_migration_entry(entry);
25256c287605SDavid Hildenbrand 		if (PageAnon(page))
25266c287605SDavid Hildenbrand 			anon_exclusive = is_readable_exclusive_migration_entry(entry);
25272e346877SPeter Xu 		young = is_migration_entry_young(entry);
25282e346877SPeter Xu 		dirty = is_migration_entry_dirty(entry);
25292e83ee1dSPeter Xu 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
2530f45ec5ffSPeter Xu 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
25312e83ee1dSPeter Xu 	} else {
2532423ac9afSAneesh Kumar K.V 		page = pmd_page(old_pmd);
253391b2978aSDavid Hildenbrand 		folio = page_folio(page);
25340ccf7f16SPeter Xu 		if (pmd_dirty(old_pmd)) {
25350ccf7f16SPeter Xu 			dirty = true;
253691b2978aSDavid Hildenbrand 			folio_set_dirty(folio);
25370ccf7f16SPeter Xu 		}
2538423ac9afSAneesh Kumar K.V 		write = pmd_write(old_pmd);
2539423ac9afSAneesh Kumar K.V 		young = pmd_young(old_pmd);
2540423ac9afSAneesh Kumar K.V 		soft_dirty = pmd_soft_dirty(old_pmd);
2541292924b2SPeter Xu 		uffd_wp = pmd_uffd_wp(old_pmd);
25426c287605SDavid Hildenbrand 
254391b2978aSDavid Hildenbrand 		VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
254491b2978aSDavid Hildenbrand 		VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
25456c287605SDavid Hildenbrand 
25466c287605SDavid Hildenbrand 		/*
25476c287605SDavid Hildenbrand 		 * Without "freeze", we'll simply split the PMD, propagating the
25486c287605SDavid Hildenbrand 		 * PageAnonExclusive() flag for each PTE by setting it for
25496c287605SDavid Hildenbrand 		 * each subpage -- no need to (temporarily) clear.
25506c287605SDavid Hildenbrand 		 *
25516c287605SDavid Hildenbrand 		 * With "freeze" we want to replace mapped pages by
25526c287605SDavid Hildenbrand 		 * migration entries right away. This is only possible if we
25536c287605SDavid Hildenbrand 		 * managed to clear PageAnonExclusive() -- see
25546c287605SDavid Hildenbrand 		 * set_pmd_migration_entry().
25556c287605SDavid Hildenbrand 		 *
25566c287605SDavid Hildenbrand 		 * In case we cannot clear PageAnonExclusive(), split the PMD
25576c287605SDavid Hildenbrand 		 * only and let try_to_migrate_one() fail later.
2558088b8aa5SDavid Hildenbrand 		 *
2559e3b4b137SDavid Hildenbrand 		 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
25606c287605SDavid Hildenbrand 		 */
256191b2978aSDavid Hildenbrand 		anon_exclusive = PageAnonExclusive(page);
2562e3b4b137SDavid Hildenbrand 		if (freeze && anon_exclusive &&
2563e3b4b137SDavid Hildenbrand 		    folio_try_share_anon_rmap_pmd(folio, page))
25646c287605SDavid Hildenbrand 			freeze = false;
256591b2978aSDavid Hildenbrand 		if (!freeze) {
256691b2978aSDavid Hildenbrand 			rmap_t rmap_flags = RMAP_NONE;
256791b2978aSDavid Hildenbrand 
256891b2978aSDavid Hildenbrand 			folio_ref_add(folio, HPAGE_PMD_NR - 1);
256991b2978aSDavid Hildenbrand 			if (anon_exclusive)
257091b2978aSDavid Hildenbrand 				rmap_flags |= RMAP_EXCLUSIVE;
257191b2978aSDavid Hildenbrand 			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
257291b2978aSDavid Hildenbrand 						 vma, haddr, rmap_flags);
257391b2978aSDavid Hildenbrand 		}
25749d84604bSHugh Dickins 	}
2575eef1b3baSKirill A. Shutemov 
2576423ac9afSAneesh Kumar K.V 	/*
2577423ac9afSAneesh Kumar K.V 	 * Withdraw the table only after we mark the pmd entry invalid.
2578423ac9afSAneesh Kumar K.V 	 * This is critical for some architectures (Power).
2579423ac9afSAneesh Kumar K.V 	 */
2580eef1b3baSKirill A. Shutemov 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2581eef1b3baSKirill A. Shutemov 	pmd_populate(mm, &_pmd, pgtable);
2582eef1b3baSKirill A. Shutemov 
2583c9c1ee20SHugh Dickins 	pte = pte_offset_map(&_pmd, haddr);
2584c9c1ee20SHugh Dickins 	VM_BUG_ON(!pte);
25852bdba986SRyan Roberts 
2586eef1b3baSKirill A. Shutemov 	/*
25872bdba986SRyan Roberts 	 * Note that NUMA hinting access restrictions are not transferred to
25882bdba986SRyan Roberts 	 * avoid any possibility of altering permissions across VMAs.
2589eef1b3baSKirill A. Shutemov 	 */
259084c3fc4eSZi Yan 	if (freeze || pmd_migration) {
25912bdba986SRyan Roberts 		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
25922bdba986SRyan Roberts 			pte_t entry;
2593ba988280SKirill A. Shutemov 			swp_entry_t swp_entry;
25942bdba986SRyan Roberts 
25954dd845b5SAlistair Popple 			if (write)
25964dd845b5SAlistair Popple 				swp_entry = make_writable_migration_entry(
25974dd845b5SAlistair Popple 							page_to_pfn(page + i));
25986c287605SDavid Hildenbrand 			else if (anon_exclusive)
25996c287605SDavid Hildenbrand 				swp_entry = make_readable_exclusive_migration_entry(
26006c287605SDavid Hildenbrand 							page_to_pfn(page + i));
26014dd845b5SAlistair Popple 			else
26024dd845b5SAlistair Popple 				swp_entry = make_readable_migration_entry(
26034dd845b5SAlistair Popple 							page_to_pfn(page + i));
26042e346877SPeter Xu 			if (young)
26052e346877SPeter Xu 				swp_entry = make_migration_entry_young(swp_entry);
26062e346877SPeter Xu 			if (dirty)
26072e346877SPeter Xu 				swp_entry = make_migration_entry_dirty(swp_entry);
2608ba988280SKirill A. Shutemov 			entry = swp_entry_to_pte(swp_entry);
2609804dd150SAndrea Arcangeli 			if (soft_dirty)
2610804dd150SAndrea Arcangeli 				entry = pte_swp_mksoft_dirty(entry);
2611f45ec5ffSPeter Xu 			if (uffd_wp)
2612f45ec5ffSPeter Xu 				entry = pte_swp_mkuffd_wp(entry);
26132bdba986SRyan Roberts 
26142bdba986SRyan Roberts 			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
26152bdba986SRyan Roberts 			set_pte_at(mm, addr, pte + i, entry);
26162bdba986SRyan Roberts 		}
2617ba988280SKirill A. Shutemov 	} else {
26182bdba986SRyan Roberts 		pte_t entry;
26192bdba986SRyan Roberts 
26202bdba986SRyan Roberts 		entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
26211462c52eSDavid Hildenbrand 		if (write)
2622161e393cSRick Edgecombe 			entry = pte_mkwrite(entry, vma);
2623eef1b3baSKirill A. Shutemov 		if (!young)
2624eef1b3baSKirill A. Shutemov 			entry = pte_mkold(entry);
2625e833bc50SPeter Xu 		/* NOTE: this may set soft-dirty too on some archs */
2626e833bc50SPeter Xu 		if (dirty)
2627e833bc50SPeter Xu 			entry = pte_mkdirty(entry);
2628804dd150SAndrea Arcangeli 		if (soft_dirty)
2629804dd150SAndrea Arcangeli 			entry = pte_mksoft_dirty(entry);
2630292924b2SPeter Xu 		if (uffd_wp)
2631292924b2SPeter Xu 			entry = pte_mkuffd_wp(entry);
26322bdba986SRyan Roberts 
26332bdba986SRyan Roberts 		for (i = 0; i < HPAGE_PMD_NR; i++)
26342bdba986SRyan Roberts 			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
26352bdba986SRyan Roberts 
26362bdba986SRyan Roberts 		set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
2637ba988280SKirill A. Shutemov 	}
26382bdba986SRyan Roberts 	pte_unmap(pte);
2639eef1b3baSKirill A. Shutemov 
2640cb67f428SHugh Dickins 	if (!pmd_migration)
2641a8e61d58SDavid Hildenbrand 		folio_remove_rmap_pmd(folio, page, vma);
264296d82debSHugh Dickins 	if (freeze)
264396d82debSHugh Dickins 		put_page(page);
2644eef1b3baSKirill A. Shutemov 
2645eef1b3baSKirill A. Shutemov 	smp_wmb(); /* make pte visible before pmd */
2646eef1b3baSKirill A. Shutemov 	pmd_populate(mm, pmd, pgtable);
2647eef1b3baSKirill A. Shutemov }
2648eef1b3baSKirill A. Shutemov 
2649eef1b3baSKirill A. Shutemov void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2650af28a988SMatthew Wilcox (Oracle) 		unsigned long address, bool freeze, struct folio *folio)
2651eef1b3baSKirill A. Shutemov {
2652eef1b3baSKirill A. Shutemov 	spinlock_t *ptl;
2653ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
2654eef1b3baSKirill A. Shutemov 
26557d4a8be0SAlistair Popple 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
26566f4f13e8SJérôme Glisse 				address & HPAGE_PMD_MASK,
2657ac46d4f3SJérôme Glisse 				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2658ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
2659ac46d4f3SJérôme Glisse 	ptl = pmd_lock(vma->vm_mm, pmd);
266033f4751eSNaoya Horiguchi 
266133f4751eSNaoya Horiguchi 	/*
2662af28a988SMatthew Wilcox (Oracle) 	 * If the caller asks to set up a migration entry, we need a folio to
2663af28a988SMatthew Wilcox (Oracle) 	 * check the pmd against. Otherwise we can end up replacing the wrong folio.
266433f4751eSNaoya Horiguchi 	 */
2665af28a988SMatthew Wilcox (Oracle) 	VM_BUG_ON(freeze && !folio);
266683a8441fSMatthew Wilcox (Oracle) 	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
266733f4751eSNaoya Horiguchi 
26687f760917SDavid Hildenbrand 	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
266983a8441fSMatthew Wilcox (Oracle) 	    is_pmd_migration_entry(*pmd)) {
2670cea33328SMiaohe Lin 		/*
2671cea33328SMiaohe Lin 		 * It's safe to call pmd_page when folio is set because it's
2672cea33328SMiaohe Lin 		 * guaranteed that pmd is present.
2673cea33328SMiaohe Lin 		 */
267483a8441fSMatthew Wilcox (Oracle) 		if (folio && folio != page_folio(pmd_page(*pmd)))
267583a8441fSMatthew Wilcox (Oracle) 			goto out;
2676ac46d4f3SJérôme Glisse 		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
267783a8441fSMatthew Wilcox (Oracle) 	}
26787f760917SDavid Hildenbrand 
2679e90309c9SKirill A. Shutemov out:
2680eef1b3baSKirill A. Shutemov 	spin_unlock(ptl);
2681ec8832d0SAlistair Popple 	mmu_notifier_invalidate_range_end(&range);
2682eef1b3baSKirill A. Shutemov }
2683eef1b3baSKirill A. Shutemov 
2684fec89c10SKirill A. Shutemov void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2685af28a988SMatthew Wilcox (Oracle) 		bool freeze, struct folio *folio)
268694fcc585SAndrea Arcangeli {
268750722804SZach O'Keefe 	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
268894fcc585SAndrea Arcangeli 
268950722804SZach O'Keefe 	if (!pmd)
2690f72e7dcdSHugh Dickins 		return;
2691f72e7dcdSHugh Dickins 
2692af28a988SMatthew Wilcox (Oracle) 	__split_huge_pmd(vma, pmd, address, freeze, folio);
269394fcc585SAndrea Arcangeli }
269494fcc585SAndrea Arcangeli 
269571f9e58eSMiaohe Lin static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
269671f9e58eSMiaohe Lin {
269771f9e58eSMiaohe Lin 	/*
269871f9e58eSMiaohe Lin 	 * If the new address isn't hpage aligned and it could previously
269971f9e58eSMiaohe Lin 	 * contain a hugepage: check if we need to split a huge pmd.
270071f9e58eSMiaohe Lin 	 */
270171f9e58eSMiaohe Lin 	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
270271f9e58eSMiaohe Lin 	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
270371f9e58eSMiaohe Lin 			 ALIGN(address, HPAGE_PMD_SIZE)))
270471f9e58eSMiaohe Lin 		split_huge_pmd_address(vma, address, false, NULL);
270571f9e58eSMiaohe Lin }
270671f9e58eSMiaohe Lin 
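/*
 * Worked example for the check above (illustrative only, assuming the
 * 2 MiB HPAGE_PMD_SIZE of x86-64): for address 0x201000, ALIGN_DOWN()
 * yields 0x200000 and ALIGN() yields 0x400000.  If the vma still covers
 * [0x200000, 0x400000), a huge pmd mapping that range would straddle the
 * new boundary, so it is split.  An already aligned address such as
 * 0x400000 is skipped by the IS_ALIGNED() test.
 */
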
2707e1b9996bSKirill A. Shutemov void vma_adjust_trans_huge(struct vm_area_struct *vma,
270894fcc585SAndrea Arcangeli 			     unsigned long start,
270994fcc585SAndrea Arcangeli 			     unsigned long end,
271094fcc585SAndrea Arcangeli 			     long adjust_next)
271194fcc585SAndrea Arcangeli {
271271f9e58eSMiaohe Lin 	/* Check if we need to split start first. */
271371f9e58eSMiaohe Lin 	split_huge_pmd_if_needed(vma, start);
271471f9e58eSMiaohe Lin 
271571f9e58eSMiaohe Lin 	/* Check if we need to split end next. */
271671f9e58eSMiaohe Lin 	split_huge_pmd_if_needed(vma, end);
271794fcc585SAndrea Arcangeli 
271894fcc585SAndrea Arcangeli 	/*
271968540502SMatthew Wilcox (Oracle) 	 * If we're also updating the next vma vm_start,
272071f9e58eSMiaohe Lin 	 * check if we need to split it.
272194fcc585SAndrea Arcangeli 	 */
272294fcc585SAndrea Arcangeli 	if (adjust_next > 0) {
272368540502SMatthew Wilcox (Oracle) 		struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
272494fcc585SAndrea Arcangeli 		unsigned long nstart = next->vm_start;
2725f9d86a60SWei Yang 		nstart += adjust_next;
272671f9e58eSMiaohe Lin 		split_huge_pmd_if_needed(next, nstart);
272794fcc585SAndrea Arcangeli 	}
272894fcc585SAndrea Arcangeli }
2729e9b61f19SKirill A. Shutemov 
2730684555aaSMatthew Wilcox (Oracle) static void unmap_folio(struct folio *folio)
2731e9b61f19SKirill A. Shutemov {
2732319a624eSZi Yan 	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
2733319a624eSZi Yan 		TTU_BATCH_FLUSH;
2734e9b61f19SKirill A. Shutemov 
2735684555aaSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2736e9b61f19SKirill A. Shutemov 
2737319a624eSZi Yan 	if (folio_test_pmd_mappable(folio))
2738319a624eSZi Yan 		ttu_flags |= TTU_SPLIT_HUGE_PMD;
2739319a624eSZi Yan 
2740a98a2f0cSAlistair Popple 	/*
2741a98a2f0cSAlistair Popple 	 * Anon pages need migration entries to preserve them, but file
2742a98a2f0cSAlistair Popple 	 * pages can simply be left unmapped, then faulted back on demand.
2743a98a2f0cSAlistair Popple 	 * If that is ever changed (perhaps for mlock), update remap_page().
2744a98a2f0cSAlistair Popple 	 */
27454b8554c5SMatthew Wilcox (Oracle) 	if (folio_test_anon(folio))
27464b8554c5SMatthew Wilcox (Oracle) 		try_to_migrate(folio, ttu_flags);
2747a98a2f0cSAlistair Popple 	else
2748869f7ee6SMatthew Wilcox (Oracle) 		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
27493027c6f8SBaolin Wang 
27503027c6f8SBaolin Wang 	try_to_unmap_flush();
2751bd56086fSKirill A. Shutemov }
2752bd56086fSKirill A. Shutemov 
27534eecb8b9SMatthew Wilcox (Oracle) static void remap_page(struct folio *folio, unsigned long nr)
2754e9b61f19SKirill A. Shutemov {
27554eecb8b9SMatthew Wilcox (Oracle) 	int i = 0;
2756ab02c252SHugh Dickins 
2757684555aaSMatthew Wilcox (Oracle) 	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
27584eecb8b9SMatthew Wilcox (Oracle) 	if (!folio_test_anon(folio))
2759ab02c252SHugh Dickins 		return;
27604eecb8b9SMatthew Wilcox (Oracle) 	for (;;) {
27614eecb8b9SMatthew Wilcox (Oracle) 		remove_migration_ptes(folio, folio, true);
27624eecb8b9SMatthew Wilcox (Oracle) 		i += folio_nr_pages(folio);
27634eecb8b9SMatthew Wilcox (Oracle) 		if (i >= nr)
27644eecb8b9SMatthew Wilcox (Oracle) 			break;
27654eecb8b9SMatthew Wilcox (Oracle) 		folio = folio_next(folio);
2766e9b61f19SKirill A. Shutemov 	}
2767ace71a19SKirill A. Shutemov }
2768e9b61f19SKirill A. Shutemov 
276994866635SAlex Shi static void lru_add_page_tail(struct page *head, struct page *tail,
277088dcb9a3SAlex Shi 		struct lruvec *lruvec, struct list_head *list)
277188dcb9a3SAlex Shi {
277294866635SAlex Shi 	VM_BUG_ON_PAGE(!PageHead(head), head);
277394866635SAlex Shi 	VM_BUG_ON_PAGE(PageLRU(tail), head);
27746168d0daSAlex Shi 	lockdep_assert_held(&lruvec->lru_lock);
277588dcb9a3SAlex Shi 
27766dbb5741SAlex Shi 	if (list) {
277788dcb9a3SAlex Shi 		/* page reclaim is reclaiming a huge page */
27786dbb5741SAlex Shi 		VM_WARN_ON(PageLRU(head));
277994866635SAlex Shi 		get_page(tail);
278094866635SAlex Shi 		list_add_tail(&tail->lru, list);
278188dcb9a3SAlex Shi 	} else {
27826dbb5741SAlex Shi 		/* head is still on lru (and we have it frozen) */
27836dbb5741SAlex Shi 		VM_WARN_ON(!PageLRU(head));
278407ca7606SHugh Dickins 		if (PageUnevictable(tail))
278507ca7606SHugh Dickins 			tail->mlock_count = 0;
278607ca7606SHugh Dickins 		else
27876dbb5741SAlex Shi 			list_add_tail(&tail->lru, &head->lru);
278807ca7606SHugh Dickins 		SetPageLRU(tail);
278988dcb9a3SAlex Shi 	}
279088dcb9a3SAlex Shi }
279188dcb9a3SAlex Shi 
279207e09c48SDavid Hildenbrand static void __split_huge_page_tail(struct folio *folio, int tail,
2793c010d47fSZi Yan 		struct lruvec *lruvec, struct list_head *list,
2794c010d47fSZi Yan 		unsigned int new_order)
2795e9b61f19SKirill A. Shutemov {
279607e09c48SDavid Hildenbrand 	struct page *head = &folio->page;
2797e9b61f19SKirill A. Shutemov 	struct page *page_tail = head + tail;
279807e09c48SDavid Hildenbrand 	/*
279907e09c48SDavid Hildenbrand 	 * Careful: new_folio is not a "real" folio before we cleared PageTail.
280007e09c48SDavid Hildenbrand 	 * Don't pass it around before clear_compound_head().
280107e09c48SDavid Hildenbrand 	 */
280207e09c48SDavid Hildenbrand 	struct folio *new_folio = (struct folio *)page_tail;
2803e9b61f19SKirill A. Shutemov 
28048df651c7SKirill A. Shutemov 	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2805e9b61f19SKirill A. Shutemov 
2806e9b61f19SKirill A. Shutemov 	/*
2807605ca5edSKonstantin Khlebnikov 	 * Clone page flags before unfreezing refcount.
2808605ca5edSKonstantin Khlebnikov 	 *
2809605ca5edSKonstantin Khlebnikov 	 * After a successful get_page_unless_zero(), flag changes might follow,
28108958b249SHaitao Shi 	 * for example lock_page(), which sets PG_waiters.
28116c287605SDavid Hildenbrand 	 *
28126c287605SDavid Hildenbrand 	 * Note that for mapped sub-pages of an anonymous THP,
2813684555aaSMatthew Wilcox (Oracle) 	 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
28146c287605SDavid Hildenbrand 	 * the migration entry instead from where remap_page() will restore it.
28156c287605SDavid Hildenbrand 	 * We can still have PG_anon_exclusive set on effectively unmapped and
28166c287605SDavid Hildenbrand 	 * unreferenced sub-pages of an anonymous THP: we can simply drop
28176c287605SDavid Hildenbrand 	 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2818e9b61f19SKirill A. Shutemov 	 */
2819e9b61f19SKirill A. Shutemov 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2820e9b61f19SKirill A. Shutemov 	page_tail->flags |= (head->flags &
2821e9b61f19SKirill A. Shutemov 			((1L << PG_referenced) |
2822e9b61f19SKirill A. Shutemov 			 (1L << PG_swapbacked) |
282338d8b4e6SHuang Ying 			 (1L << PG_swapcache) |
2824e9b61f19SKirill A. Shutemov 			 (1L << PG_mlocked) |
2825e9b61f19SKirill A. Shutemov 			 (1L << PG_uptodate) |
2826e9b61f19SKirill A. Shutemov 			 (1L << PG_active) |
28271899ad18SJohannes Weiner 			 (1L << PG_workingset) |
2828e9b61f19SKirill A. Shutemov 			 (1L << PG_locked) |
2829b8d3c4c3SMinchan Kim 			 (1L << PG_unevictable) |
2830b0284cd2SCatalin Marinas #ifdef CONFIG_ARCH_USES_PG_ARCH_X
283172e6afa0SCatalin Marinas 			 (1L << PG_arch_2) |
2832ef6458b1SPeter Collingbourne 			 (1L << PG_arch_3) |
283372e6afa0SCatalin Marinas #endif
2834ec1c86b2SYu Zhao 			 (1L << PG_dirty) |
2835ec1c86b2SYu Zhao 			 LRU_GEN_MASK | LRU_REFS_MASK));
2836e9b61f19SKirill A. Shutemov 
2837cb67f428SHugh Dickins 	/* ->mapping in first and second tail page is replaced by other uses */
2838173d9d9fSHugh Dickins 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2839173d9d9fSHugh Dickins 			page_tail);
2840173d9d9fSHugh Dickins 	page_tail->mapping = head->mapping;
2841173d9d9fSHugh Dickins 	page_tail->index = head->index + tail;
284271e2d666SMel Gorman 
284371e2d666SMel Gorman 	/*
2844cfeed8ffSDavid Hildenbrand 	 * page->private should not be set in tail pages. Fix up and warn once
2845cfeed8ffSDavid Hildenbrand 	 * if private is unexpectedly set.
284671e2d666SMel Gorman 	 */
2847cfeed8ffSDavid Hildenbrand 	if (unlikely(page_tail->private)) {
2848cfeed8ffSDavid Hildenbrand 		VM_WARN_ON_ONCE_PAGE(true, page_tail);
2849b653db77SMatthew Wilcox (Oracle) 		page_tail->private = 0;
285071e2d666SMel Gorman 	}
285107e09c48SDavid Hildenbrand 	if (folio_test_swapcache(folio))
285207e09c48SDavid Hildenbrand 		new_folio->swap.val = folio->swap.val + tail;
2853173d9d9fSHugh Dickins 
2854605ca5edSKonstantin Khlebnikov 	/* Page flags must be visible before we make the page non-compound. */
2855e9b61f19SKirill A. Shutemov 	smp_wmb();
2856e9b61f19SKirill A. Shutemov 
2857605ca5edSKonstantin Khlebnikov 	/*
2858605ca5edSKonstantin Khlebnikov 	 * Clear PageTail before unfreezing page refcount.
2859605ca5edSKonstantin Khlebnikov 	 *
2860605ca5edSKonstantin Khlebnikov 	 * After a successful get_page_unless_zero(), a put_page() might follow,
2861605ca5edSKonstantin Khlebnikov 	 * which needs a correct compound_head().
2862605ca5edSKonstantin Khlebnikov 	 */
2863e9b61f19SKirill A. Shutemov 	clear_compound_head(page_tail);
2864c010d47fSZi Yan 	if (new_order) {
2865c010d47fSZi Yan 		prep_compound_page(page_tail, new_order);
2866c010d47fSZi Yan 		folio_prep_large_rmappable(new_folio);
2867c010d47fSZi Yan 	}
2868e9b61f19SKirill A. Shutemov 
2869605ca5edSKonstantin Khlebnikov 	/* Finally unfreeze refcount. Additional reference from page cache. */
2870c010d47fSZi Yan 	page_ref_unfreeze(page_tail,
2871c010d47fSZi Yan 		1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
2872c010d47fSZi Yan 			     folio_nr_pages(new_folio) : 0));
2873605ca5edSKonstantin Khlebnikov 
2874b7542769SKefeng Wang 	if (folio_test_young(folio))
2875b7542769SKefeng Wang 		folio_set_young(new_folio);
2876b7542769SKefeng Wang 	if (folio_test_idle(folio))
2877b7542769SKefeng Wang 		folio_set_idle(new_folio);
2878e9b61f19SKirill A. Shutemov 
2879c8253011SKefeng Wang 	folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
288094723aafSMichal Hocko 
288194723aafSMichal Hocko 	/*
288294723aafSMichal Hocko 	 * Always add to the tail because some iterators expect new
288394723aafSMichal Hocko 	 * pages to show up after the currently processed elements - e.g.
288494723aafSMichal Hocko 	 * migrate_pages.
288594723aafSMichal Hocko 	 */
2886e9b61f19SKirill A. Shutemov 	lru_add_page_tail(head, page_tail, lruvec, list);
2887e9b61f19SKirill A. Shutemov }
2888e9b61f19SKirill A. Shutemov 
2889baa355fdSKirill A. Shutemov static void __split_huge_page(struct page *page, struct list_head *list,
2890c010d47fSZi Yan 		pgoff_t end, unsigned int new_order)
2891e9b61f19SKirill A. Shutemov {
2892e809c3feSMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
2893e809c3feSMatthew Wilcox (Oracle) 	struct page *head = &folio->page;
2894e9b61f19SKirill A. Shutemov 	struct lruvec *lruvec;
28954101196bSMatthew Wilcox (Oracle) 	struct address_space *swap_cache = NULL;
28964101196bSMatthew Wilcox (Oracle) 	unsigned long offset = 0;
2897509f0069SHugh Dickins 	int i, nr_dropped = 0;
2898c010d47fSZi Yan 	unsigned int new_nr = 1 << new_order;
2899502003bbSZi Yan 	int order = folio_order(folio);
2900502003bbSZi Yan 	unsigned int nr = 1 << order;
2901e9b61f19SKirill A. Shutemov 
2902e9b61f19SKirill A. Shutemov 	/* complete the memcg work before adding pages to the LRU */
2903c010d47fSZi Yan 	split_page_memcg(head, order, new_order);
2904e9b61f19SKirill A. Shutemov 
290507e09c48SDavid Hildenbrand 	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
290607e09c48SDavid Hildenbrand 		offset = swp_offset(folio->swap);
290707e09c48SDavid Hildenbrand 		swap_cache = swap_address_space(folio->swap);
29084101196bSMatthew Wilcox (Oracle) 		xa_lock(&swap_cache->i_pages);
29094101196bSMatthew Wilcox (Oracle) 	}
29104101196bSMatthew Wilcox (Oracle) 
2911f0953a1bSIngo Molnar 	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
2912e809c3feSMatthew Wilcox (Oracle) 	lruvec = folio_lruvec_lock(folio);
2913b6769834SAlex Shi 
2914eac96c3eSYang Shi 	ClearPageHasHWPoisoned(head);
2915eac96c3eSYang Shi 
2916c010d47fSZi Yan 	for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
2917c010d47fSZi Yan 		__split_huge_page_tail(folio, i, lruvec, list, new_order);
2918d144bf62SHugh Dickins 		/* Some pages can be beyond EOF: drop them from page cache */
2919baa355fdSKirill A. Shutemov 		if (head[i].index >= end) {
2920fb5c2029SMatthew Wilcox (Oracle) 			struct folio *tail = page_folio(head + i);
2921fb5c2029SMatthew Wilcox (Oracle) 
2922435a7554SMatthew Wilcox (Oracle) 			if (shmem_mapping(folio->mapping))
2923509f0069SHugh Dickins 				nr_dropped++;
2924fb5c2029SMatthew Wilcox (Oracle) 			else if (folio_test_clear_dirty(tail))
2925fb5c2029SMatthew Wilcox (Oracle) 				folio_account_cleaned(tail,
2926fb5c2029SMatthew Wilcox (Oracle) 					inode_to_wb(folio->mapping->host));
2927fb5c2029SMatthew Wilcox (Oracle) 			__filemap_remove_folio(tail, NULL);
2928fb5c2029SMatthew Wilcox (Oracle) 			folio_put(tail);
29294101196bSMatthew Wilcox (Oracle) 		} else if (!PageAnon(page)) {
2930435a7554SMatthew Wilcox (Oracle) 			__xa_store(&folio->mapping->i_pages, head[i].index,
29314101196bSMatthew Wilcox (Oracle) 					head + i, 0);
29324101196bSMatthew Wilcox (Oracle) 		} else if (swap_cache) {
29334101196bSMatthew Wilcox (Oracle) 			__xa_store(&swap_cache->i_pages, offset + i,
29344101196bSMatthew Wilcox (Oracle) 					head + i, 0);
2935baa355fdSKirill A. Shutemov 		}
2936baa355fdSKirill A. Shutemov 	}
2937e9b61f19SKirill A. Shutemov 
2938c010d47fSZi Yan 	if (!new_order)
2939e9b61f19SKirill A. Shutemov 		ClearPageCompound(head);
2940c010d47fSZi Yan 	else {
2941c010d47fSZi Yan 		struct folio *new_folio = (struct folio *)head;
2942c010d47fSZi Yan 
2943c010d47fSZi Yan 		folio_set_order(new_folio, new_order);
2944c010d47fSZi Yan 	}
29456168d0daSAlex Shi 	unlock_page_lruvec(lruvec);
2946b6769834SAlex Shi 	/* Caller disabled irqs, so they are still disabled here */
2947f7da677bSVlastimil Babka 
2948c010d47fSZi Yan 	split_page_owner(head, order, new_order);
2949f7da677bSVlastimil Babka 
2950baa355fdSKirill A. Shutemov 	/* See comment in __split_huge_page_tail() */
2951435a7554SMatthew Wilcox (Oracle) 	if (folio_test_anon(folio)) {
2952aa5dc07fSMatthew Wilcox 		/* Additional pin to swap cache */
2953435a7554SMatthew Wilcox (Oracle) 		if (folio_test_swapcache(folio)) {
2954435a7554SMatthew Wilcox (Oracle) 			folio_ref_add(folio, 1 + new_nr);
29554101196bSMatthew Wilcox (Oracle) 			xa_unlock(&swap_cache->i_pages);
29564101196bSMatthew Wilcox (Oracle) 		} else {
2957435a7554SMatthew Wilcox (Oracle) 			folio_ref_inc(folio);
29584101196bSMatthew Wilcox (Oracle) 		}
2959baa355fdSKirill A. Shutemov 	} else {
2960aa5dc07fSMatthew Wilcox 		/* Additional pin to page cache */
2961435a7554SMatthew Wilcox (Oracle) 		folio_ref_add(folio, 1 + new_nr);
2962435a7554SMatthew Wilcox (Oracle) 		xa_unlock(&folio->mapping->i_pages);
2963baa355fdSKirill A. Shutemov 	}
2964b6769834SAlex Shi 	local_irq_enable();
2965e9b61f19SKirill A. Shutemov 
2966509f0069SHugh Dickins 	if (nr_dropped)
2967435a7554SMatthew Wilcox (Oracle) 		shmem_uncharge(folio->mapping->host, nr_dropped);
29684eecb8b9SMatthew Wilcox (Oracle) 	remap_page(folio, nr);
2969e9b61f19SKirill A. Shutemov 
297007e09c48SDavid Hildenbrand 	if (folio_test_swapcache(folio))
297107e09c48SDavid Hildenbrand 		split_swap_cluster(folio->swap);
2972c4f9c701SHuang Ying 
2973c010d47fSZi Yan 	/*
2974c010d47fSZi Yan 	 * Set page to its compound_head when splitting to non-order-0 pages, so
2975c010d47fSZi Yan 	 * we can skip unlocking it below, since PG_locked is transferred to
2976c010d47fSZi Yan 	 * the compound_head of the page and the caller will unlock it.
2977c010d47fSZi Yan 	 */
2978c010d47fSZi Yan 	if (new_order)
2979c010d47fSZi Yan 		page = compound_head(page);
2980c010d47fSZi Yan 
2981c010d47fSZi Yan 	for (i = 0; i < nr; i += new_nr) {
2982e9b61f19SKirill A. Shutemov 		struct page *subpage = head + i;
2983435a7554SMatthew Wilcox (Oracle) 		struct folio *new_folio = page_folio(subpage);
2984e9b61f19SKirill A. Shutemov 		if (subpage == page)
2985e9b61f19SKirill A. Shutemov 			continue;
2986435a7554SMatthew Wilcox (Oracle) 		folio_unlock(new_folio);
2987e9b61f19SKirill A. Shutemov 
2988e9b61f19SKirill A. Shutemov 		/*
2989e9b61f19SKirill A. Shutemov 		 * Subpages may be freed if there wasn't any mapping left,
2990e9b61f19SKirill A. Shutemov 		 * e.g. if add_to_swap() is running on an LRU page that
2991e9b61f19SKirill A. Shutemov 		 * had its mapping zapped. Freeing these pages requires
2992e9b61f19SKirill A. Shutemov 		 * taking the lru_lock, so we do the put_page of the
2993e9b61f19SKirill A. Shutemov 		 * tail pages after the split is complete.
2994e9b61f19SKirill A. Shutemov 		 */
29950b175468SMiaohe Lin 		free_page_and_swap_cache(subpage);
2996e9b61f19SKirill A. Shutemov 	}
2997e9b61f19SKirill A. Shutemov }
2998e9b61f19SKirill A. Shutemov 
2999b8f593cdSHuang Ying /* Racy check whether the huge page can be split */
3000d4b4084aSMatthew Wilcox (Oracle) bool can_split_folio(struct folio *folio, int *pextra_pins)
3001b8f593cdSHuang Ying {
3002b8f593cdSHuang Ying 	int extra_pins;
3003b8f593cdSHuang Ying 
3004aa5dc07fSMatthew Wilcox 	/* Additional pins from page cache */
3005d4b4084aSMatthew Wilcox (Oracle) 	if (folio_test_anon(folio))
3006d4b4084aSMatthew Wilcox (Oracle) 		extra_pins = folio_test_swapcache(folio) ?
3007d4b4084aSMatthew Wilcox (Oracle) 				folio_nr_pages(folio) : 0;
3008b8f593cdSHuang Ying 	else
3009d4b4084aSMatthew Wilcox (Oracle) 		extra_pins = folio_nr_pages(folio);
3010b8f593cdSHuang Ying 	if (pextra_pins)
3011b8f593cdSHuang Ying 		*pextra_pins = extra_pins;
3012d4b4084aSMatthew Wilcox (Oracle) 	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
3013b8f593cdSHuang Ying }
3014b8f593cdSHuang Ying 
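/*
 * Worked example for the check above (illustrative only): an anonymous
 * PMD-sized folio that sits in the swap cache and is mapped once is
 * expected to satisfy
 *
 *	folio_ref_count() == folio_mapcount() + HPAGE_PMD_NR + 1
 *
 * i.e. one reference per swap cache slot plus the single pin held by the
 * caller of the split.  Any extra reference (typically a GUP pin) breaks
 * the equality, can_split_folio() returns false and the split is refused.
 */
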
30156d0a07edSAndrea Arcangeli /*
3016c010d47fSZi Yan  * This function splits a huge page into pages of order @new_order. @page can
3017c010d47fSZi Yan  * point to any subpage of the huge page to split. The split doesn't change
3018c010d47fSZi Yan  * the position of @page.
3019c010d47fSZi Yan  *
3020c010d47fSZi Yan  * NOTE: order-1 anonymous folio is not supported because _deferred_list,
3021c010d47fSZi Yan  * which is used by partially mapped folios, is stored in subpage 2 and an
3022c010d47fSZi Yan  * order-1 folio only has subpages 0 and 1. File-backed order-1 folios are OK,
3023c010d47fSZi Yan  * since they do not use _deferred_list.
3024e9b61f19SKirill A. Shutemov  *
3025e9b61f19SKirill A. Shutemov  * The caller must hold the only pin on the @page; otherwise the split fails with -EBUSY.
3026e9b61f19SKirill A. Shutemov  * The huge page must be locked.
3027e9b61f19SKirill A. Shutemov  *
3028e9b61f19SKirill A. Shutemov  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3029e9b61f19SKirill A. Shutemov  *
3030c010d47fSZi Yan  * Pages in new_order will inherit mapping, flags, and so on from the hugepage.
3031e9b61f19SKirill A. Shutemov  *
3032c010d47fSZi Yan  * The GUP pin and PG_locked are transferred to @page or to the compound page
3033c010d47fSZi Yan  * @page belongs to. The remaining subpages can be freed if they are not mapped.
3034e9b61f19SKirill A. Shutemov  *
3035e9b61f19SKirill A. Shutemov  * Returns 0 if the hugepage is split successfully.
3036e9b61f19SKirill A. Shutemov  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
3037e9b61f19SKirill A. Shutemov  * us.
3038e9b61f19SKirill A. Shutemov  */
3039c010d47fSZi Yan int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
3040c010d47fSZi Yan 				     unsigned int new_order)
3041e9b61f19SKirill A. Shutemov {
30424eecb8b9SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
3043f8baa6beSMatthew Wilcox (Oracle) 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3044c010d47fSZi Yan 	/* reset xarray order to new order after split */
3045c010d47fSZi Yan 	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
3046baa355fdSKirill A. Shutemov 	struct anon_vma *anon_vma = NULL;
3047baa355fdSKirill A. Shutemov 	struct address_space *mapping = NULL;
3048504e070dSYang Shi 	int extra_pins, ret;
3049006d3ff2SHugh Dickins 	pgoff_t end;
3050478d134eSXu Yu 	bool is_hzp;
3051e9b61f19SKirill A. Shutemov 
30523e9a13daSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
30533e9a13daSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3054e9b61f19SKirill A. Shutemov 
30551412ecb3SZi Yan 	if (new_order >= folio_order(folio))
30561412ecb3SZi Yan 		return -EINVAL;
30571412ecb3SZi Yan 
3058c010d47fSZi Yan 	/* Cannot split anonymous THP to order-1 */
3059c010d47fSZi Yan 	if (new_order == 1 && folio_test_anon(folio)) {
3060c010d47fSZi Yan 		VM_WARN_ONCE(1, "Cannot split to order-1 folio");
3061c010d47fSZi Yan 		return -EINVAL;
3062c010d47fSZi Yan 	}
3063c010d47fSZi Yan 
3064c010d47fSZi Yan 	if (new_order) {
3065c010d47fSZi Yan 		/* Only swapping a whole PMD-mapped folio is supported */
3066c010d47fSZi Yan 		if (folio_test_swapcache(folio))
3067c010d47fSZi Yan 			return -EINVAL;
3068c010d47fSZi Yan 		/* Split shmem folio to non-zero order not supported */
3069c010d47fSZi Yan 		if (shmem_mapping(folio->mapping)) {
3070c010d47fSZi Yan 			VM_WARN_ONCE(1,
3071c010d47fSZi Yan 				"Cannot split shmem folio to non-0 order");
3072c010d47fSZi Yan 			return -EINVAL;
3073c010d47fSZi Yan 		}
3074c010d47fSZi Yan 		/* No split if the file system does not support large folio */
3075c010d47fSZi Yan 		if (!mapping_large_folio_support(folio->mapping)) {
3076c010d47fSZi Yan 			VM_WARN_ONCE(1,
3077c010d47fSZi Yan 				"Cannot split file folio to non-0 order");
3078c010d47fSZi Yan 			return -EINVAL;
3079c010d47fSZi Yan 		}
3080c010d47fSZi Yan 	}
3081c010d47fSZi Yan 
30833e9a13daSMatthew Wilcox (Oracle) 	is_hzp = is_huge_zero_page(&folio->page);
30844737edbbSNaoya Horiguchi 	if (is_hzp) {
30854737edbbSNaoya Horiguchi 		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3086478d134eSXu Yu 		return -EBUSY;
30874737edbbSNaoya Horiguchi 	}
3088478d134eSXu Yu 
30893e9a13daSMatthew Wilcox (Oracle) 	if (folio_test_writeback(folio))
309059807685SHuang Ying 		return -EBUSY;
309159807685SHuang Ying 
30923e9a13daSMatthew Wilcox (Oracle) 	if (folio_test_anon(folio)) {
3093e9b61f19SKirill A. Shutemov 		/*
3094c1e8d7c6SMichel Lespinasse 		 * The caller does not necessarily hold an mmap_lock that would
3095baa355fdSKirill A. Shutemov 		 * prevent the anon_vma from disappearing, so we first take a
3096baa355fdSKirill A. Shutemov 		 * reference to it and then lock the anon_vma for write. This
30972f031c6fSMatthew Wilcox (Oracle) 		 * is similar to folio_lock_anon_vma_read except the write lock
3098baa355fdSKirill A. Shutemov 		 * is taken to serialise against parallel split or collapse
3099baa355fdSKirill A. Shutemov 		 * operations.
3100e9b61f19SKirill A. Shutemov 		 */
310129eea9b5SMatthew Wilcox (Oracle) 		anon_vma = folio_get_anon_vma(folio);
3102e9b61f19SKirill A. Shutemov 		if (!anon_vma) {
3103e9b61f19SKirill A. Shutemov 			ret = -EBUSY;
3104e9b61f19SKirill A. Shutemov 			goto out;
3105e9b61f19SKirill A. Shutemov 		}
3106006d3ff2SHugh Dickins 		end = -1;
3107baa355fdSKirill A. Shutemov 		mapping = NULL;
3108e9b61f19SKirill A. Shutemov 		anon_vma_lock_write(anon_vma);
3109baa355fdSKirill A. Shutemov 	} else {
31106a3edd29SYin Fengwei 		gfp_t gfp;
31116a3edd29SYin Fengwei 
31123e9a13daSMatthew Wilcox (Oracle) 		mapping = folio->mapping;
3113baa355fdSKirill A. Shutemov 
3114baa355fdSKirill A. Shutemov 		/* Truncated? */
3115baa355fdSKirill A. Shutemov 		if (!mapping) {
3116baa355fdSKirill A. Shutemov 			ret = -EBUSY;
3117baa355fdSKirill A. Shutemov 			goto out;
3118baa355fdSKirill A. Shutemov 		}
3119baa355fdSKirill A. Shutemov 
31206a3edd29SYin Fengwei 		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
31216a3edd29SYin Fengwei 							GFP_RECLAIM_MASK);
31226a3edd29SYin Fengwei 
31230201ebf2SDavid Howells 		if (!filemap_release_folio(folio, gfp)) {
31246a3edd29SYin Fengwei 			ret = -EBUSY;
31256a3edd29SYin Fengwei 			goto out;
31266a3edd29SYin Fengwei 		}
31276a3edd29SYin Fengwei 
31283e9a13daSMatthew Wilcox (Oracle) 		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
31296b24ca4aSMatthew Wilcox (Oracle) 		if (xas_error(&xas)) {
31306b24ca4aSMatthew Wilcox (Oracle) 			ret = xas_error(&xas);
31316b24ca4aSMatthew Wilcox (Oracle) 			goto out;
31326b24ca4aSMatthew Wilcox (Oracle) 		}
31336b24ca4aSMatthew Wilcox (Oracle) 
3134baa355fdSKirill A. Shutemov 		anon_vma = NULL;
3135baa355fdSKirill A. Shutemov 		i_mmap_lock_read(mapping);
3136006d3ff2SHugh Dickins 
3137006d3ff2SHugh Dickins 		/*
3138006d3ff2SHugh Dickins 		 * __split_huge_page() may need to trim off pages beyond EOF:
3139006d3ff2SHugh Dickins 		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3140006d3ff2SHugh Dickins 		 * which cannot be nested inside the page tree lock. So note
3141006d3ff2SHugh Dickins 		 * end now: i_size itself may be changed at any moment, but
31423e9a13daSMatthew Wilcox (Oracle) 		 * folio lock is good enough to serialize the trimming.
3143006d3ff2SHugh Dickins 		 */
3144006d3ff2SHugh Dickins 		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3145d144bf62SHugh Dickins 		if (shmem_mapping(mapping))
3146d144bf62SHugh Dickins 			end = shmem_fallocend(mapping->host, end);
3147baa355fdSKirill A. Shutemov 	}
3148e9b61f19SKirill A. Shutemov 
3149e9b61f19SKirill A. Shutemov 	/*
3150684555aaSMatthew Wilcox (Oracle) 	 * Racy check whether we can split the folio, before unmap_folio()
3151e9b61f19SKirill A. Shutemov 	 * splits the PMDs.
3152e9b61f19SKirill A. Shutemov 	 */
3153d4b4084aSMatthew Wilcox (Oracle) 	if (!can_split_folio(folio, &extra_pins)) {
3154fd4a7ac3SBaolin Wang 		ret = -EAGAIN;
3155e9b61f19SKirill A. Shutemov 		goto out_unlock;
3156e9b61f19SKirill A. Shutemov 	}
3157e9b61f19SKirill A. Shutemov 
3158684555aaSMatthew Wilcox (Oracle) 	unmap_folio(folio);
3159e9b61f19SKirill A. Shutemov 
3160b6769834SAlex Shi 	/* block interrupt reentry in xa_lock and spinlock */
3161b6769834SAlex Shi 	local_irq_disable();
3162baa355fdSKirill A. Shutemov 	if (mapping) {
3163baa355fdSKirill A. Shutemov 		/*
31643e9a13daSMatthew Wilcox (Oracle) 		 * Check if the folio is present in the page cache.
31653e9a13daSMatthew Wilcox (Oracle) 		 * We assume all tail pages are present too, if the folio is there.
3166baa355fdSKirill A. Shutemov 		 */
31676b24ca4aSMatthew Wilcox (Oracle) 		xas_lock(&xas);
31686b24ca4aSMatthew Wilcox (Oracle) 		xas_reset(&xas);
31693e9a13daSMatthew Wilcox (Oracle) 		if (xas_load(&xas) != folio)
3170baa355fdSKirill A. Shutemov 			goto fail;
3171baa355fdSKirill A. Shutemov 	}
3172baa355fdSKirill A. Shutemov 
31730139aa7bSJoonsoo Kim 	/* Prevent deferred_split_scan() touching ->_refcount */
3174364c1eebSYang Shi 	spin_lock(&ds_queue->split_queue_lock);
31753e9a13daSMatthew Wilcox (Oracle) 	if (folio_ref_freeze(folio, 1 + extra_pins)) {
31768897277aSMatthew Wilcox (Oracle) 		if (folio_order(folio) > 1 &&
31778897277aSMatthew Wilcox (Oracle) 		    !list_empty(&folio->_deferred_list)) {
3178364c1eebSYang Shi 			ds_queue->split_queue_len--;
3179c010d47fSZi Yan 			/*
3180c010d47fSZi Yan 			 * Reinitialize the folio's _deferred_list after removing
3181c010d47fSZi Yan 			 * it from the split_queue, otherwise a subsequent
3182c010d47fSZi Yan 			 * split will see list corruption when checking
3183c010d47fSZi Yan 			 * _deferred_list.
3184c010d47fSZi Yan 			 */
3185c010d47fSZi Yan 			list_del_init(&folio->_deferred_list);
31869a982250SKirill A. Shutemov 		}
3187afb97172SWei Yang 		spin_unlock(&ds_queue->split_queue_lock);
318806d3eff6SKirill A. Shutemov 		if (mapping) {
31893e9a13daSMatthew Wilcox (Oracle) 			int nr = folio_nr_pages(folio);
3190bf9eceadSMuchun Song 
31913e9a13daSMatthew Wilcox (Oracle) 			xas_split(&xas, folio, folio_order(folio));
3192c010d47fSZi Yan 			if (folio_test_pmd_mappable(folio) &&
3193c010d47fSZi Yan 			    new_order < HPAGE_PMD_ORDER) {
31943e9a13daSMatthew Wilcox (Oracle) 				if (folio_test_swapbacked(folio)) {
3195a48d5bdcSStefan Roesch 					__lruvec_stat_mod_folio(folio,
3196a48d5bdcSStefan Roesch 							NR_SHMEM_THPS, -nr);
31971ca7554dSMarek Szyprowski 				} else {
3198a48d5bdcSStefan Roesch 					__lruvec_stat_mod_folio(folio,
3199a48d5bdcSStefan Roesch 							NR_FILE_THPS, -nr);
32001ca7554dSMarek Szyprowski 					filemap_nr_thps_dec(mapping);
32011ca7554dSMarek Szyprowski 				}
320206d3eff6SKirill A. Shutemov 			}
3203a48d5bdcSStefan Roesch 		}
320406d3eff6SKirill A. Shutemov 
3205c010d47fSZi Yan 		__split_huge_page(page, list, end, new_order);
3206e9b61f19SKirill A. Shutemov 		ret = 0;
3207baa355fdSKirill A. Shutemov 	} else {
3208364c1eebSYang Shi 		spin_unlock(&ds_queue->split_queue_lock);
3209504e070dSYang Shi fail:
3210504e070dSYang Shi 		if (mapping)
32116b24ca4aSMatthew Wilcox (Oracle) 			xas_unlock(&xas);
3212b6769834SAlex Shi 		local_irq_enable();
32134eecb8b9SMatthew Wilcox (Oracle) 		remap_page(folio, folio_nr_pages(folio));
3214fd4a7ac3SBaolin Wang 		ret = -EAGAIN;
3215e9b61f19SKirill A. Shutemov 	}
3216e9b61f19SKirill A. Shutemov 
3217e9b61f19SKirill A. Shutemov out_unlock:
3218baa355fdSKirill A. Shutemov 	if (anon_vma) {
3219e9b61f19SKirill A. Shutemov 		anon_vma_unlock_write(anon_vma);
3220e9b61f19SKirill A. Shutemov 		put_anon_vma(anon_vma);
3221baa355fdSKirill A. Shutemov 	}
3222baa355fdSKirill A. Shutemov 	if (mapping)
3223baa355fdSKirill A. Shutemov 		i_mmap_unlock_read(mapping);
3224e9b61f19SKirill A. Shutemov out:
322569a37a8bSMatthew Wilcox (Oracle) 	xas_destroy(&xas);
3226e9b61f19SKirill A. Shutemov 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3227e9b61f19SKirill A. Shutemov 	return ret;
3228e9b61f19SKirill A. Shutemov }
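
/*
 * Illustrative only (not part of the original source): a minimal sketch of
 * the calling contract documented above, assuming the caller already holds
 * a reference on a large, non-huge-zero folio:
 *
 *	if (folio_trylock(folio)) {
 *		err = split_huge_page_to_list_to_order(&folio->page, NULL, 0);
 *		folio_unlock(folio);
 *	}
 *	folio_put(folio);
 *
 * On success the pin and PG_locked stay with @page's folio; the remaining
 * folios go to the LRU (or to @list when one is passed). split_folio() and
 * split_folio_to_order(), used later in this file, are thin wrappers around
 * this function.
 */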
32299a982250SKirill A. Shutemov 
32308dc4a8f1SMatthew Wilcox (Oracle) void folio_undo_large_rmappable(struct folio *folio)
32319a982250SKirill A. Shutemov {
32328dc4a8f1SMatthew Wilcox (Oracle) 	struct deferred_split *ds_queue;
32339a982250SKirill A. Shutemov 	unsigned long flags;
32349a982250SKirill A. Shutemov 
32358897277aSMatthew Wilcox (Oracle) 	if (folio_order(folio) <= 1)
32368897277aSMatthew Wilcox (Oracle) 		return;
32378897277aSMatthew Wilcox (Oracle) 
3238deedad80SYin Fengwei 	/*
3239deedad80SYin Fengwei 	 * At this point, no one is trying to add the folio to the
3240deedad80SYin Fengwei 	 * deferred list. If the folio is not on it, it's safe to check
3241deedad80SYin Fengwei 	 * without acquiring the split_queue_lock.
3242deedad80SYin Fengwei 	 */
32438dc4a8f1SMatthew Wilcox (Oracle) 	if (data_race(list_empty(&folio->_deferred_list)))
32448dc4a8f1SMatthew Wilcox (Oracle) 		return;
32458dc4a8f1SMatthew Wilcox (Oracle) 
32468dc4a8f1SMatthew Wilcox (Oracle) 	ds_queue = get_deferred_split_queue(folio);
3247364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
32488991de90SMatthew Wilcox (Oracle) 	if (!list_empty(&folio->_deferred_list)) {
3249364c1eebSYang Shi 		ds_queue->split_queue_len--;
32509bcef597SBaolin Wang 		list_del_init(&folio->_deferred_list);
32519a982250SKirill A. Shutemov 	}
3252364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3253deedad80SYin Fengwei }
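
/*
 * Illustrative only: a sketch of the freeing-side contract under which
 * folio_undo_large_rmappable() is assumed to be called, i.e. by whoever
 * releases the last reference to a large rmappable folio:
 *
 *	if (folio_test_large(folio) && folio_test_large_rmappable(folio))
 *		folio_undo_large_rmappable(folio);
 *
 * so that a folio still queued on a deferred split list is unlinked before
 * its memory is returned to the page allocator.
 */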
32549a982250SKirill A. Shutemov 
3255f158ed61SMatthew Wilcox (Oracle) void deferred_split_folio(struct folio *folio)
32569a982250SKirill A. Shutemov {
3257f8baa6beSMatthew Wilcox (Oracle) 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
325887eaceb3SYang Shi #ifdef CONFIG_MEMCG
32598991de90SMatthew Wilcox (Oracle) 	struct mem_cgroup *memcg = folio_memcg(folio);
326087eaceb3SYang Shi #endif
32619a982250SKirill A. Shutemov 	unsigned long flags;
32629a982250SKirill A. Shutemov 
32638897277aSMatthew Wilcox (Oracle) 	/*
32648897277aSMatthew Wilcox (Oracle) 	 * Order 1 folios have no space for a deferred list, but we also
32658897277aSMatthew Wilcox (Oracle) 	 * won't waste much memory by not adding them to the deferred list.
32668897277aSMatthew Wilcox (Oracle) 	 */
32678897277aSMatthew Wilcox (Oracle) 	if (folio_order(folio) <= 1)
32688897277aSMatthew Wilcox (Oracle) 		return;
32699a982250SKirill A. Shutemov 
327087eaceb3SYang Shi 	/*
327187eaceb3SYang Shi 	 * The try_to_unmap() in the page reclaim path might reach here too;
327287eaceb3SYang Shi 	 * that could race with us and corrupt the deferred split queue.
32738991de90SMatthew Wilcox (Oracle) 	 * Also, if page reclaim is already handling the same folio, there is
327487eaceb3SYang Shi 	 * no need to handle it again in the shrinker.
327587eaceb3SYang Shi 	 *
32768991de90SMatthew Wilcox (Oracle) 	 * Check the swapcache flag to determine whether the folio is being
32778991de90SMatthew Wilcox (Oracle) 	 * handled by page reclaim, since THP swap adds the folio to the
327887eaceb3SYang Shi 	 * swap cache before calling try_to_unmap().
327987eaceb3SYang Shi 	 */
32808991de90SMatthew Wilcox (Oracle) 	if (folio_test_swapcache(folio))
328187eaceb3SYang Shi 		return;
328287eaceb3SYang Shi 
32838991de90SMatthew Wilcox (Oracle) 	if (!list_empty(&folio->_deferred_list))
32849a982250SKirill A. Shutemov 		return;
32859a982250SKirill A. Shutemov 
3286364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
32878991de90SMatthew Wilcox (Oracle) 	if (list_empty(&folio->_deferred_list)) {
3288f9719a03SKirill A. Shutemov 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
32898991de90SMatthew Wilcox (Oracle) 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
3290364c1eebSYang Shi 		ds_queue->split_queue_len++;
329187eaceb3SYang Shi #ifdef CONFIG_MEMCG
329287eaceb3SYang Shi 		if (memcg)
32938991de90SMatthew Wilcox (Oracle) 			set_shrinker_bit(memcg, folio_nid(folio),
329454d91729SQi Zheng 					 deferred_split_shrinker->id);
329587eaceb3SYang Shi #endif
32969a982250SKirill A. Shutemov 	}
3297364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
32989a982250SKirill A. Shutemov }
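
/*
 * Illustrative only: a sketch of the expected caller pattern, assuming rmap
 * code that has just removed some (but not all) mappings of a large folio;
 * "partially_mapped" stands in for whatever condition the caller computes:
 *
 *	if (folio_test_large(folio) && partially_mapped)
 *		deferred_split_folio(folio);
 *
 * The folio is then split lazily by deferred_split_scan() under memory
 * pressure rather than synchronously in the unmap path.
 */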
32999a982250SKirill A. Shutemov 
33009a982250SKirill A. Shutemov static unsigned long deferred_split_count(struct shrinker *shrink,
33019a982250SKirill A. Shutemov 		struct shrink_control *sc)
33029a982250SKirill A. Shutemov {
3303a3d0a918SKirill A. Shutemov 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
3304364c1eebSYang Shi 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
330587eaceb3SYang Shi 
330687eaceb3SYang Shi #ifdef CONFIG_MEMCG
330787eaceb3SYang Shi 	if (sc->memcg)
330887eaceb3SYang Shi 		ds_queue = &sc->memcg->deferred_split_queue;
330987eaceb3SYang Shi #endif
3310364c1eebSYang Shi 	return READ_ONCE(ds_queue->split_queue_len);
33119a982250SKirill A. Shutemov }
33129a982250SKirill A. Shutemov 
33139a982250SKirill A. Shutemov static unsigned long deferred_split_scan(struct shrinker *shrink,
33149a982250SKirill A. Shutemov 		struct shrink_control *sc)
33159a982250SKirill A. Shutemov {
3316a3d0a918SKirill A. Shutemov 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
3317364c1eebSYang Shi 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
33189a982250SKirill A. Shutemov 	unsigned long flags;
33194375a553SMatthew Wilcox (Oracle) 	LIST_HEAD(list);
33204375a553SMatthew Wilcox (Oracle) 	struct folio *folio, *next;
33219a982250SKirill A. Shutemov 	int split = 0;
33229a982250SKirill A. Shutemov 
332387eaceb3SYang Shi #ifdef CONFIG_MEMCG
332487eaceb3SYang Shi 	if (sc->memcg)
332587eaceb3SYang Shi 		ds_queue = &sc->memcg->deferred_split_queue;
332687eaceb3SYang Shi #endif
332787eaceb3SYang Shi 
3328364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
33299a982250SKirill A. Shutemov 	/* Take a reference on each folio so it cannot be freed under us */
33304375a553SMatthew Wilcox (Oracle) 	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
33314375a553SMatthew Wilcox (Oracle) 							_deferred_list) {
33324375a553SMatthew Wilcox (Oracle) 		if (folio_try_get(folio)) {
33334375a553SMatthew Wilcox (Oracle) 			list_move(&folio->_deferred_list, &list);
3334e3ae1953SKirill A. Shutemov 		} else {
33354375a553SMatthew Wilcox (Oracle) 			/* We lost the race with folio_put() */
33364375a553SMatthew Wilcox (Oracle) 			list_del_init(&folio->_deferred_list);
3337364c1eebSYang Shi 			ds_queue->split_queue_len--;
33389a982250SKirill A. Shutemov 		}
3339e3ae1953SKirill A. Shutemov 		if (!--sc->nr_to_scan)
3340e3ae1953SKirill A. Shutemov 			break;
33419a982250SKirill A. Shutemov 	}
3342364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
33439a982250SKirill A. Shutemov 
33444375a553SMatthew Wilcox (Oracle) 	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
33454375a553SMatthew Wilcox (Oracle) 		if (!folio_trylock(folio))
3346fa41b900SKirill A. Shutemov 			goto next;
33479a982250SKirill A. Shutemov 		/* split_folio() removes the folio from the list on success */
33484375a553SMatthew Wilcox (Oracle) 		if (!split_folio(folio))
33499a982250SKirill A. Shutemov 			split++;
33504375a553SMatthew Wilcox (Oracle) 		folio_unlock(folio);
3351fa41b900SKirill A. Shutemov next:
33524375a553SMatthew Wilcox (Oracle) 		folio_put(folio);
33539a982250SKirill A. Shutemov 	}
33549a982250SKirill A. Shutemov 
3355364c1eebSYang Shi 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3356364c1eebSYang Shi 	list_splice_tail(&list, &ds_queue->split_queue);
3357364c1eebSYang Shi 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
33589a982250SKirill A. Shutemov 
3359cb8d68ecSKirill A. Shutemov 	/*
3360cb8d68ecSKirill A. Shutemov 	 * Stop the shrinker if we didn't split any folio, but the queue is
3361cb8d68ecSKirill A. Shutemov 	 * empty. This can happen if the folios were freed under us.
3362cb8d68ecSKirill A. Shutemov 	 */
3363364c1eebSYang Shi 	if (!split && list_empty(&ds_queue->split_queue))
3364cb8d68ecSKirill A. Shutemov 		return SHRINK_STOP;
3365cb8d68ecSKirill A. Shutemov 	return split;
33669a982250SKirill A. Shutemov }
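
/*
 * Illustrative only: a sketch of how this count/scan pair is typically wired
 * up with the shrinker API (the real registration lives elsewhere in this
 * file; the exact flags here are an assumption):
 *
 *	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
 *						 SHRINKER_MEMCG_AWARE,
 *						 "thp-deferred_split");
 *	if (!deferred_split_shrinker)
 *		return -ENOMEM;
 *	deferred_split_shrinker->count_objects = deferred_split_count;
 *	deferred_split_shrinker->scan_objects = deferred_split_scan;
 *	shrinker_register(deferred_split_shrinker);
 */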
33679a982250SKirill A. Shutemov 
336849071d43SKirill A. Shutemov #ifdef CONFIG_DEBUG_FS
3369fa6c0231SZi Yan static void split_huge_pages_all(void)
337049071d43SKirill A. Shutemov {
337149071d43SKirill A. Shutemov 	struct zone *zone;
337249071d43SKirill A. Shutemov 	struct page *page;
3373630e7c5eSKefeng Wang 	struct folio *folio;
337449071d43SKirill A. Shutemov 	unsigned long pfn, max_zone_pfn;
337549071d43SKirill A. Shutemov 	unsigned long total = 0, split = 0;
337649071d43SKirill A. Shutemov 
3377fa6c0231SZi Yan 	pr_debug("Split all THPs\n");
3378a17206daSMiaohe Lin 	for_each_zone(zone) {
3379a17206daSMiaohe Lin 		if (!managed_zone(zone))
3380a17206daSMiaohe Lin 			continue;
338149071d43SKirill A. Shutemov 		max_zone_pfn = zone_end_pfn(zone);
338249071d43SKirill A. Shutemov 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3383a17206daSMiaohe Lin 			int nr_pages;
338449071d43SKirill A. Shutemov 
33852b7aa91bSNaoya Horiguchi 			page = pfn_to_online_page(pfn);
3386630e7c5eSKefeng Wang 			if (!page || PageTail(page))
3387630e7c5eSKefeng Wang 				continue;
3388630e7c5eSKefeng Wang 			folio = page_folio(page);
3389630e7c5eSKefeng Wang 			if (!folio_try_get(folio))
339049071d43SKirill A. Shutemov 				continue;
339149071d43SKirill A. Shutemov 
3392630e7c5eSKefeng Wang 			if (unlikely(page_folio(page) != folio))
339349071d43SKirill A. Shutemov 				goto next;
339449071d43SKirill A. Shutemov 
3395630e7c5eSKefeng Wang 			if (zone != folio_zone(folio))
3396630e7c5eSKefeng Wang 				goto next;
3397630e7c5eSKefeng Wang 
3398630e7c5eSKefeng Wang 			if (!folio_test_large(folio)
3399630e7c5eSKefeng Wang 				|| folio_test_hugetlb(folio)
3400630e7c5eSKefeng Wang 				|| !folio_test_lru(folio))
340149071d43SKirill A. Shutemov 				goto next;
340249071d43SKirill A. Shutemov 
340349071d43SKirill A. Shutemov 			total++;
3404630e7c5eSKefeng Wang 			folio_lock(folio);
3405630e7c5eSKefeng Wang 			nr_pages = folio_nr_pages(folio);
3406630e7c5eSKefeng Wang 			if (!split_folio(folio))
340749071d43SKirill A. Shutemov 				split++;
3408a17206daSMiaohe Lin 			pfn += nr_pages - 1;
3409630e7c5eSKefeng Wang 			folio_unlock(folio);
341049071d43SKirill A. Shutemov next:
3411630e7c5eSKefeng Wang 			folio_put(folio);
3412fa6c0231SZi Yan 			cond_resched();
341349071d43SKirill A. Shutemov 		}
341449071d43SKirill A. Shutemov 	}
341549071d43SKirill A. Shutemov 
3416fa6c0231SZi Yan 	pr_debug("%lu of %lu THP split\n", split, total);
341749071d43SKirill A. Shutemov }
3418fa6c0231SZi Yan 
3419fa6c0231SZi Yan static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
3420fa6c0231SZi Yan {
3421fa6c0231SZi Yan 	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
3422fa6c0231SZi Yan 		    is_vm_hugetlb_page(vma);
3423fa6c0231SZi Yan }
3424fa6c0231SZi Yan 
3425fa6c0231SZi Yan static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
3426fc4d1823SZi Yan 				unsigned long vaddr_end, unsigned int new_order)
3427fa6c0231SZi Yan {
3428fa6c0231SZi Yan 	int ret = 0;
3429fa6c0231SZi Yan 	struct task_struct *task;
3430fa6c0231SZi Yan 	struct mm_struct *mm;
3431fa6c0231SZi Yan 	unsigned long total = 0, split = 0;
3432fa6c0231SZi Yan 	unsigned long addr;
3433fa6c0231SZi Yan 
3434fa6c0231SZi Yan 	vaddr_start &= PAGE_MASK;
3435fa6c0231SZi Yan 	vaddr_end &= PAGE_MASK;
3436fa6c0231SZi Yan 
3437fa6c0231SZi Yan 	/* Find the task_struct from pid */
3438fa6c0231SZi Yan 	rcu_read_lock();
3439fa6c0231SZi Yan 	task = find_task_by_vpid(pid);
3440fa6c0231SZi Yan 	if (!task) {
3441fa6c0231SZi Yan 		rcu_read_unlock();
3442fa6c0231SZi Yan 		ret = -ESRCH;
3443fa6c0231SZi Yan 		goto out;
3444fa6c0231SZi Yan 	}
3445fa6c0231SZi Yan 	get_task_struct(task);
3446fa6c0231SZi Yan 	rcu_read_unlock();
3447fa6c0231SZi Yan 
3448fa6c0231SZi Yan 	/* Find the mm_struct */
3449fa6c0231SZi Yan 	mm = get_task_mm(task);
3450fa6c0231SZi Yan 	put_task_struct(task);
3451fa6c0231SZi Yan 
3452fa6c0231SZi Yan 	if (!mm) {
3453fa6c0231SZi Yan 		ret = -EINVAL;
3454fa6c0231SZi Yan 		goto out;
3455fa6c0231SZi Yan 	}
3456fa6c0231SZi Yan 
3457fa6c0231SZi Yan 	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3458fa6c0231SZi Yan 		 pid, vaddr_start, vaddr_end);
3459fa6c0231SZi Yan 
3460fa6c0231SZi Yan 	mmap_read_lock(mm);
3461fa6c0231SZi Yan 	/*
3462fa6c0231SZi Yan 	 * always increase addr by PAGE_SIZE, since we could have a PTE page
3463fa6c0231SZi Yan 	 * table filled with PTE-mapped THPs, each of which is distinct.
3464fa6c0231SZi Yan 	 */
3465fa6c0231SZi Yan 	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
346674ba2b38SMiaohe Lin 		struct vm_area_struct *vma = vma_lookup(mm, addr);
3467fa6c0231SZi Yan 		struct page *page;
3468a644b0abSMatthew Wilcox (Oracle) 		struct folio *folio;
3469fa6c0231SZi Yan 
347074ba2b38SMiaohe Lin 		if (!vma)
3471fa6c0231SZi Yan 			break;
3472fa6c0231SZi Yan 
3473fa6c0231SZi Yan 		/* skip special VMA and hugetlb VMA */
3474fa6c0231SZi Yan 		if (vma_not_suitable_for_thp_split(vma)) {
3475fa6c0231SZi Yan 			addr = vma->vm_end;
3476fa6c0231SZi Yan 			continue;
3477fa6c0231SZi Yan 		}
3478fa6c0231SZi Yan 
3479fa6c0231SZi Yan 		/* FOLL_DUMP to ignore special (like zero) pages */
348087d2762eSMiaohe Lin 		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
3481fa6c0231SZi Yan 
3482f7091ed6SHaiyue Wang 		if (IS_ERR_OR_NULL(page))
3483fa6c0231SZi Yan 			continue;
3484fa6c0231SZi Yan 
3485a644b0abSMatthew Wilcox (Oracle) 		folio = page_folio(page);
3486a644b0abSMatthew Wilcox (Oracle) 		if (!is_transparent_hugepage(folio))
3487fa6c0231SZi Yan 			goto next;
3488fa6c0231SZi Yan 
34892394aef6SZi Yan 		if (new_order >= folio_order(folio))
34902394aef6SZi Yan 			goto next;
34912394aef6SZi Yan 
3492fa6c0231SZi Yan 		total++;
3493fc4d1823SZi Yan 		/*
3494fc4d1823SZi Yan 		 * For folios with private data, split_huge_page_to_list_to_order()
3495fc4d1823SZi Yan 		 * will try to drop it before the split and then check whether the
3496fc4d1823SZi Yan 		 * folio can be split. So skip the check here.
3497fc4d1823SZi Yan 		 */
3498fc4d1823SZi Yan 		if (!folio_test_private(folio) &&
3499fc4d1823SZi Yan 		    !can_split_folio(folio, NULL))
3500fa6c0231SZi Yan 			goto next;
3501fa6c0231SZi Yan 
3502a644b0abSMatthew Wilcox (Oracle) 		if (!folio_trylock(folio))
3503fa6c0231SZi Yan 			goto next;
3504fa6c0231SZi Yan 
3505fc4d1823SZi Yan 		if (!split_folio_to_order(folio, new_order))
3506fa6c0231SZi Yan 			split++;
3507fa6c0231SZi Yan 
3508a644b0abSMatthew Wilcox (Oracle) 		folio_unlock(folio);
3509fa6c0231SZi Yan next:
3510a644b0abSMatthew Wilcox (Oracle) 		folio_put(folio);
3511fa6c0231SZi Yan 		cond_resched();
3512fa6c0231SZi Yan 	}
3513fa6c0231SZi Yan 	mmap_read_unlock(mm);
3514fa6c0231SZi Yan 	mmput(mm);
3515fa6c0231SZi Yan 
3516fa6c0231SZi Yan 	pr_debug("%lu of %lu THP split\n", split, total);
3517fa6c0231SZi Yan 
3518fa6c0231SZi Yan out:
3519fa6c0231SZi Yan 	return ret;
3520fa6c0231SZi Yan }
3521fa6c0231SZi Yan 
3522fbe37501SZi Yan static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
3523fc4d1823SZi Yan 				pgoff_t off_end, unsigned int new_order)
3524fbe37501SZi Yan {
3525fbe37501SZi Yan 	struct filename *file;
3526fbe37501SZi Yan 	struct file *candidate;
3527fbe37501SZi Yan 	struct address_space *mapping;
3528fbe37501SZi Yan 	int ret = -EINVAL;
3529fbe37501SZi Yan 	pgoff_t index;
3530fbe37501SZi Yan 	int nr_pages = 1;
3531fbe37501SZi Yan 	unsigned long total = 0, split = 0;
3532fbe37501SZi Yan 
3533fbe37501SZi Yan 	file = getname_kernel(file_path);
3534fbe37501SZi Yan 	if (IS_ERR(file))
3535fbe37501SZi Yan 		return ret;
3536fbe37501SZi Yan 
3537fbe37501SZi Yan 	candidate = file_open_name(file, O_RDONLY, 0);
3538fbe37501SZi Yan 	if (IS_ERR(candidate))
3539fbe37501SZi Yan 		goto out;
3540fbe37501SZi Yan 
3541fbe37501SZi Yan 	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3542fbe37501SZi Yan 		 file_path, off_start, off_end);
3543fbe37501SZi Yan 
3544fbe37501SZi Yan 	mapping = candidate->f_mapping;
3545fbe37501SZi Yan 
3546fbe37501SZi Yan 	for (index = off_start; index < off_end; index += nr_pages) {
35471fb130b2SChristoph Hellwig 		struct folio *folio = filemap_get_folio(mapping, index);
3548fbe37501SZi Yan 
3549fbe37501SZi Yan 		nr_pages = 1;
355066dabbb6SChristoph Hellwig 		if (IS_ERR(folio))
3551fbe37501SZi Yan 			continue;
3552fbe37501SZi Yan 
35539ee2c086SMatthew Wilcox (Oracle) 		if (!folio_test_large(folio))
3554fbe37501SZi Yan 			goto next;
3555fbe37501SZi Yan 
3556fbe37501SZi Yan 		total++;
35579ee2c086SMatthew Wilcox (Oracle) 		nr_pages = folio_nr_pages(folio);
3558fbe37501SZi Yan 
35592394aef6SZi Yan 		if (new_order >= folio_order(folio))
35602394aef6SZi Yan 			goto next;
35612394aef6SZi Yan 
35629ee2c086SMatthew Wilcox (Oracle) 		if (!folio_trylock(folio))
3563fbe37501SZi Yan 			goto next;
3564fbe37501SZi Yan 
3565fc4d1823SZi Yan 		if (!split_folio_to_order(folio, new_order))
3566fbe37501SZi Yan 			split++;
3567fbe37501SZi Yan 
35689ee2c086SMatthew Wilcox (Oracle) 		folio_unlock(folio);
3569fbe37501SZi Yan next:
35709ee2c086SMatthew Wilcox (Oracle) 		folio_put(folio);
3571fbe37501SZi Yan 		cond_resched();
3572fbe37501SZi Yan 	}
3573fbe37501SZi Yan 
3574fbe37501SZi Yan 	filp_close(candidate, NULL);
3575fbe37501SZi Yan 	ret = 0;
3576fbe37501SZi Yan 
3577fbe37501SZi Yan 	pr_debug("%lu of %lu file-backed THP split\n", split, total);
3578fbe37501SZi Yan out:
3579fbe37501SZi Yan 	putname(file);
3580fbe37501SZi Yan 	return ret;
3581fbe37501SZi Yan }
3582fbe37501SZi Yan 
3583fa6c0231SZi Yan #define MAX_INPUT_BUF_SZ 255
3584fa6c0231SZi Yan 
3585fa6c0231SZi Yan static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
3586fa6c0231SZi Yan 				size_t count, loff_t *ppops)
3587fa6c0231SZi Yan {
3588fa6c0231SZi Yan 	static DEFINE_MUTEX(split_debug_mutex);
3589fa6c0231SZi Yan 	ssize_t ret;
3590fc4d1823SZi Yan 	/*
3591fc4d1823SZi Yan 	 * hold pid, start_vaddr, end_vaddr, new_order or
3592fc4d1823SZi Yan 	 * file_path, off_start, off_end, new_order
3593fc4d1823SZi Yan 	 */
3594fbe37501SZi Yan 	char input_buf[MAX_INPUT_BUF_SZ];
3595fa6c0231SZi Yan 	int pid;
3596fa6c0231SZi Yan 	unsigned long vaddr_start, vaddr_end;
3597fc4d1823SZi Yan 	unsigned int new_order = 0;
3598fa6c0231SZi Yan 
3599fa6c0231SZi Yan 	ret = mutex_lock_interruptible(&split_debug_mutex);
3600fa6c0231SZi Yan 	if (ret)
3601fa6c0231SZi Yan 		return ret;
3602fa6c0231SZi Yan 
3603fa6c0231SZi Yan 	ret = -EFAULT;
3604fa6c0231SZi Yan 
3605fa6c0231SZi Yan 	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
3606fa6c0231SZi Yan 	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3607fa6c0231SZi Yan 		goto out;
3608fa6c0231SZi Yan 
3609fa6c0231SZi Yan 	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
3610fbe37501SZi Yan 
3611fbe37501SZi Yan 	if (input_buf[0] == '/') {
3612fbe37501SZi Yan 		char *tok;
3613fbe37501SZi Yan 		char *buf = input_buf;
3614fbe37501SZi Yan 		char file_path[MAX_INPUT_BUF_SZ];
3615fbe37501SZi Yan 		pgoff_t off_start = 0, off_end = 0;
3616fbe37501SZi Yan 		size_t input_len = strlen(input_buf);
3617fbe37501SZi Yan 
3618fbe37501SZi Yan 		tok = strsep(&buf, ",");
3619fbe37501SZi Yan 		if (tok) {
36201212e00cSMatthew Wilcox (Oracle) 			strcpy(file_path, tok);
3621fbe37501SZi Yan 		} else {
3622fbe37501SZi Yan 			ret = -EINVAL;
3623fbe37501SZi Yan 			goto out;
3624fbe37501SZi Yan 		}
3625fbe37501SZi Yan 
3626fc4d1823SZi Yan 		ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
3627fc4d1823SZi Yan 		if (ret != 2 && ret != 3) {
3628fbe37501SZi Yan 			ret = -EINVAL;
3629fbe37501SZi Yan 			goto out;
3630fbe37501SZi Yan 		}
3631fc4d1823SZi Yan 		ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
3632fbe37501SZi Yan 		if (!ret)
3633fbe37501SZi Yan 			ret = input_len;
3634fbe37501SZi Yan 
3635fbe37501SZi Yan 		goto out;
3636fbe37501SZi Yan 	}
3637fbe37501SZi Yan 
3638fc4d1823SZi Yan 	ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
3639fa6c0231SZi Yan 	if (ret == 1 && pid == 1) {
3640fa6c0231SZi Yan 		split_huge_pages_all();
3641fa6c0231SZi Yan 		ret = strlen(input_buf);
3642fa6c0231SZi Yan 		goto out;
3643fc4d1823SZi Yan 	} else if (ret != 3 && ret != 4) {
3644fa6c0231SZi Yan 		ret = -EINVAL;
3645fa6c0231SZi Yan 		goto out;
3646fa6c0231SZi Yan 	}
3647fa6c0231SZi Yan 
3648fc4d1823SZi Yan 	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
3649fa6c0231SZi Yan 	if (!ret)
3650fa6c0231SZi Yan 		ret = strlen(input_buf);
3651fa6c0231SZi Yan out:
3652fa6c0231SZi Yan 	mutex_unlock(&split_debug_mutex);
3653fa6c0231SZi Yan 	return ret;
3655fa6c0231SZi Yan }
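
/*
 * Illustrative only: the input formats accepted by the write handler above.
 * The debugfs file is created with a NULL parent, so it typically shows up
 * as /sys/kernel/debug/split_huge_pages. Addresses and offsets are parsed
 * as hex with a "0x" prefix, per the sscanf() formats above; <new_order> is
 * optional and defaults to 0.
 *
 *	echo 1 > split_huge_pages
 *		split every THP in the system (split_huge_pages_all())
 *	echo "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>]" > split_huge_pages
 *		split THPs mapped into that range of <pid>'s address space
 *	echo "/path/to/file,0x<off_start>,0x<off_end>[,<new_order>]" > split_huge_pages
 *		split file-backed THPs in that page-offset range of the file
 */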
3656fa6c0231SZi Yan 
3657fa6c0231SZi Yan static const struct file_operations split_huge_pages_fops = {
3658fa6c0231SZi Yan 	.owner	 = THIS_MODULE,
3659fa6c0231SZi Yan 	.write	 = split_huge_pages_write,
3660fa6c0231SZi Yan 	.llseek  = no_llseek,
3661fa6c0231SZi Yan };
366249071d43SKirill A. Shutemov 
366349071d43SKirill A. Shutemov static int __init split_huge_pages_debugfs(void)
366449071d43SKirill A. Shutemov {
3665d9f7979cSGreg Kroah-Hartman 	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
366649071d43SKirill A. Shutemov 			    &split_huge_pages_fops);
366749071d43SKirill A. Shutemov 	return 0;
366849071d43SKirill A. Shutemov }
366949071d43SKirill A. Shutemov late_initcall(split_huge_pages_debugfs);
367049071d43SKirill A. Shutemov #endif
3671616b8371SZi Yan 
3672616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
36737f5abe60SDavid Hildenbrand int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3674616b8371SZi Yan 		struct page *page)
3675616b8371SZi Yan {
3676a8e61d58SDavid Hildenbrand 	struct folio *folio = page_folio(page);
3677616b8371SZi Yan 	struct vm_area_struct *vma = pvmw->vma;
3678616b8371SZi Yan 	struct mm_struct *mm = vma->vm_mm;
3679616b8371SZi Yan 	unsigned long address = pvmw->address;
36806c287605SDavid Hildenbrand 	bool anon_exclusive;
3681616b8371SZi Yan 	pmd_t pmdval;
3682616b8371SZi Yan 	swp_entry_t entry;
3683ab6e3d09SNaoya Horiguchi 	pmd_t pmdswp;
3684616b8371SZi Yan 
3685616b8371SZi Yan 	if (!(pvmw->pmd && !pvmw->pte))
36867f5abe60SDavid Hildenbrand 		return 0;
3687616b8371SZi Yan 
3688616b8371SZi Yan 	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
36898a8683adSHuang Ying 	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
36906c287605SDavid Hildenbrand 
3691e3b4b137SDavid Hildenbrand 	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
3692a8e61d58SDavid Hildenbrand 	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
3693e3b4b137SDavid Hildenbrand 	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
36946c287605SDavid Hildenbrand 		set_pmd_at(mm, address, pvmw->pmd, pmdval);
36957f5abe60SDavid Hildenbrand 		return -EBUSY;
36966c287605SDavid Hildenbrand 	}
36976c287605SDavid Hildenbrand 
3698616b8371SZi Yan 	if (pmd_dirty(pmdval))
3699db44c658SDavid Hildenbrand 		folio_mark_dirty(folio);
37004dd845b5SAlistair Popple 	if (pmd_write(pmdval))
37014dd845b5SAlistair Popple 		entry = make_writable_migration_entry(page_to_pfn(page));
37026c287605SDavid Hildenbrand 	else if (anon_exclusive)
37036c287605SDavid Hildenbrand 		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
37044dd845b5SAlistair Popple 	else
37054dd845b5SAlistair Popple 		entry = make_readable_migration_entry(page_to_pfn(page));
37062e346877SPeter Xu 	if (pmd_young(pmdval))
37072e346877SPeter Xu 		entry = make_migration_entry_young(entry);
37082e346877SPeter Xu 	if (pmd_dirty(pmdval))
37092e346877SPeter Xu 		entry = make_migration_entry_dirty(entry);
3710ab6e3d09SNaoya Horiguchi 	pmdswp = swp_entry_to_pmd(entry);
3711ab6e3d09SNaoya Horiguchi 	if (pmd_soft_dirty(pmdval))
3712ab6e3d09SNaoya Horiguchi 		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
371324bf08c4SDavid Hildenbrand 	if (pmd_uffd_wp(pmdval))
371424bf08c4SDavid Hildenbrand 		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
3715ab6e3d09SNaoya Horiguchi 	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3716a8e61d58SDavid Hildenbrand 	folio_remove_rmap_pmd(folio, page, vma);
3717a8e61d58SDavid Hildenbrand 	folio_put(folio);
3718283fd6feSAnshuman Khandual 	trace_set_migration_pmd(address, pmd_val(pmdswp));
37197f5abe60SDavid Hildenbrand 
37207f5abe60SDavid Hildenbrand 	return 0;
3721616b8371SZi Yan }
3722616b8371SZi Yan 
3723616b8371SZi Yan void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3724616b8371SZi Yan {
372514d85a6eSDavid Hildenbrand 	struct folio *folio = page_folio(new);
3726616b8371SZi Yan 	struct vm_area_struct *vma = pvmw->vma;
3727616b8371SZi Yan 	struct mm_struct *mm = vma->vm_mm;
3728616b8371SZi Yan 	unsigned long address = pvmw->address;
37294fba8f2aSMiaohe Lin 	unsigned long haddr = address & HPAGE_PMD_MASK;
3730616b8371SZi Yan 	pmd_t pmde;
3731616b8371SZi Yan 	swp_entry_t entry;
3732616b8371SZi Yan 
3733616b8371SZi Yan 	if (!(pvmw->pmd && !pvmw->pte))
3734616b8371SZi Yan 		return;
3735616b8371SZi Yan 
3736616b8371SZi Yan 	entry = pmd_to_swp_entry(*pvmw->pmd);
373714d85a6eSDavid Hildenbrand 	folio_get(folio);
37382e346877SPeter Xu 	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3739ab6e3d09SNaoya Horiguchi 	if (pmd_swp_soft_dirty(*pvmw->pmd))
3740ab6e3d09SNaoya Horiguchi 		pmde = pmd_mksoft_dirty(pmde);
37413c811f78SDavid Hildenbrand 	if (is_writable_migration_entry(entry))
3742161e393cSRick Edgecombe 		pmde = pmd_mkwrite(pmde, vma);
37438f34f1eaSPeter Xu 	if (pmd_swp_uffd_wp(*pvmw->pmd))
3744f1eb1bacSPeter Xu 		pmde = pmd_mkuffd_wp(pmde);
37452e346877SPeter Xu 	if (!is_migration_entry_young(entry))
37462e346877SPeter Xu 		pmde = pmd_mkold(pmde);
37472e346877SPeter Xu 	/* NOTE: pmd_mkdirty() may also set the soft-dirty bit on some archs */
374814d85a6eSDavid Hildenbrand 	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
37492e346877SPeter Xu 		pmde = pmd_mkdirty(pmde);
3750616b8371SZi Yan 
375114d85a6eSDavid Hildenbrand 	if (folio_test_anon(folio)) {
3752395db7b1SDavid Hildenbrand 		rmap_t rmap_flags = RMAP_NONE;
37536c287605SDavid Hildenbrand 
37546c287605SDavid Hildenbrand 		if (!is_readable_migration_entry(entry))
37556c287605SDavid Hildenbrand 			rmap_flags |= RMAP_EXCLUSIVE;
37566c287605SDavid Hildenbrand 
3757395db7b1SDavid Hildenbrand 		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
37586c287605SDavid Hildenbrand 	} else {
375914d85a6eSDavid Hildenbrand 		folio_add_file_rmap_pmd(folio, new, vma);
37606c287605SDavid Hildenbrand 	}
376114d85a6eSDavid Hildenbrand 	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
37624fba8f2aSMiaohe Lin 	set_pmd_at(mm, haddr, pvmw->pmd, pmde);
37635cbcf225SMuchun Song 
37645cbcf225SMuchun Song 	/* No need to invalidate - it was non-present before */
3765616b8371SZi Yan 	update_mmu_cache_pmd(vma, address, pvmw->pmd);
3766283fd6feSAnshuman Khandual 	trace_remove_migration_pmd(address, pmd_val(pmde));
3767616b8371SZi Yan }
3768616b8371SZi Yan #endif
3769