xref: /linux/mm/migrate.c (revision 96f97c438f61ddba94117dcd1a1eb0aaafa22309)
// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>
#include <linux/random.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>

#include <asm/tlbflush.h>

#include <trace/events/migrate.h>

#include "internal.h"

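/*
 * isolate_movable_page() - try to isolate a non-LRU movable page for
 * migration.
 *
 * Grab a reference on @page, check that it really is a movable (non-slab,
 * __PageMovable) page, lock it and ask the owning driver to isolate it via
 * its movable_operations::isolate_page callback.
 *
 * Returns 0 with PG_isolated set and the reference held on success, or
 * -EBUSY (with any temporary reference dropped) if the page cannot be
 * isolated.
 */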
int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
	const struct movable_operations *mops;

	/*
	 * Avoid burning cycles with pages that are still under __free_pages(),
	 * or that just got freed under us.
	 *
	 * In case we 'win' a race for a movable page being freed under us and
	 * raise its refcount, preventing __free_pages() from doing its job,
	 * the put_page() at the end of this block will take care of
	 * releasing this page, thus avoiding a nasty leak.
	 */
	if (unlikely(!get_page_unless_zero(page)))
		goto out;

	if (unlikely(PageSlab(page)))
		goto out_putpage;
	/* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
	smp_rmb();
	/*
	 * Check movable flag before taking the page lock because
	 * we use non-atomic bitops on newly allocated page flags so
	 * unconditionally grabbing the lock ruins page's owner side.
	 */
	if (unlikely(!__PageMovable(page)))
		goto out_putpage;
	/* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
	smp_rmb();
	if (unlikely(PageSlab(page)))
		goto out_putpage;

	/*
	 * As movable pages are not isolated from LRU lists, concurrent
	 * compaction threads can race against page migration functions
	 * as well as against a page being released.
	 *
	 * In order to avoid an already isolated movable page being
	 * (wrongly) re-isolated while it is under migration, and to avoid
	 * attempting to isolate pages that are being released, let's be
	 * sure we hold the page lock before proceeding with the movable
	 * page isolation steps.
	 */
	if (unlikely(!trylock_page(page)))
		goto out_putpage;

	if (!PageMovable(page) || PageIsolated(page))
		goto out_no_isolated;

	mops = page_movable_ops(page);
	VM_BUG_ON_PAGE(!mops, page);

	if (!mops->isolate_page(page, mode))
		goto out_no_isolated;

	/* Driver shouldn't use PG_isolated bit of page->flags */
	WARN_ON_ONCE(PageIsolated(page));
	SetPageIsolated(page);
	unlock_page(page);

	return 0;

out_no_isolated:
	unlock_page(page);
out_putpage:
	put_page(page);
out:
	return -EBUSY;
}

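/*
 * putback_movable_page() - undo the isolation of a non-LRU movable page.
 *
 * Hand the page back to its driver via the movable_operations::putback_page
 * callback and clear PG_isolated. Called with the page locked.
 */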
static void putback_movable_page(struct page *page)
{
	const struct movable_operations *mops = page_movable_ops(page);

	mops->putback_page(page);
	ClearPageIsolated(page);
}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
 * and isolate_hugetlb().
 */
void putback_movable_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;

	list_for_each_entry_safe(page, page2, l, lru) {
		if (unlikely(PageHuge(page))) {
			putback_active_hugepage(page);
			continue;
		}
		list_del(&page->lru);
		/*
		 * We isolated non-lru movable page so here we can use
		 * __PageMovable because LRU page's mapping cannot have
		 * PAGE_MAPPING_MOVABLE.
		 */
		if (unlikely(__PageMovable(page))) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);
			lock_page(page);
			if (PageMovable(page))
				putback_movable_page(page);
			else
				ClearPageIsolated(page);
			unlock_page(page);
			put_page(page);
		} else {
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -thp_nr_pages(page));
			putback_lru_page(page);
		}
	}
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *old)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);

	while (page_vma_mapped_walk(&pvmw)) {
		rmap_t rmap_flags = RMAP_NONE;
		pte_t pte;
		swp_entry_t entry;
		struct page *new;
		unsigned long idx = 0;

		/* pgoff is invalid for ksm pages, but they are never large */
		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
		new = folio_page(folio, idx);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
					!folio_test_pmd_mappable(folio), folio);
			remove_migration_pmd(&pvmw, new);
			continue;
		}
#endif

		folio_get(folio);
		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
		if (pte_swp_soft_dirty(*pvmw.pte))
			pte = pte_mksoft_dirty(pte);

		/*
		 * Recheck VMA as permissions can change since migration started
		 */
		entry = pte_to_swp_entry(*pvmw.pte);
		if (!is_migration_entry_young(entry))
			pte = pte_mkold(pte);
		if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
			pte = pte_mkdirty(pte);
		if (is_writable_migration_entry(entry))
			pte = maybe_mkwrite(pte, vma);
		else if (pte_swp_uffd_wp(*pvmw.pte))
			pte = pte_mkuffd_wp(pte);

		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		if (unlikely(is_device_private_page(new))) {
			if (pte_write(pte))
				entry = make_writable_device_private_entry(
							page_to_pfn(new));
			else
				entry = make_readable_device_private_entry(
							page_to_pfn(new));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*pvmw.pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*pvmw.pte))
				pte = pte_swp_mkuffd_wp(pte);
		}

#ifdef CONFIG_HUGETLB_PAGE
		if (folio_test_hugetlb(folio)) {
			unsigned int shift = huge_page_shift(hstate_vma(vma));

			pte = pte_mkhuge(pte);
			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (folio_test_anon(folio))
				hugepage_add_anon_rmap(new, vma, pvmw.address,
						       rmap_flags);
			else
				page_dup_file_rmap(new, true);
			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		} else
#endif
		{
			if (folio_test_anon(folio))
				page_add_anon_rmap(new, vma, pvmw.address,
						   rmap_flags);
			else
				page_add_file_rmap(new, vma, false);
			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		}
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();

		trace_remove_migration_pte(pvmw.address, pte_val(pte),
					   compound_order(new));

		/* No need to invalidate - it was non-present before */
		update_mmu_cache(vma, pvmw.address, pvmw.pte);
	}

	return true;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
{
	struct rmap_walk_control rwc = {
		.rmap_one = remove_migration_pte,
		.arg = src,
	};

	if (locked)
		rmap_walk_locked(dst, &rwc);
	else
		rmap_walk(dst, &rwc);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
				spinlock_t *ptl)
{
	pte_t pte;
	swp_entry_t entry;

	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	migration_entry_wait_on_locked(entry, ptep, ptl);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

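/*
 * Convenience wrapper: look up the pte and page table lock for @address and
 * hand them to __migration_entry_wait(), which sleeps until any migration
 * entry installed there has been removed.
 */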
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	spinlock_t *ptl = pte_lockptr(mm, pmd);
	pte_t *ptep = pte_offset_map(pmd, address);
	__migration_entry_wait(mm, ptep, ptl);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The vma read lock must be held upon entry. Holding that lock prevents either
 * the pte or the ptl from being freed.
 *
 * This function will release the vma lock before returning.
 */
void __migration_entry_wait_huge(struct vm_area_struct *vma,
				 pte_t *ptep, spinlock_t *ptl)
{
	pte_t pte;

	hugetlb_vma_assert_locked(vma);
	spin_lock(ptl);
	pte = huge_ptep_get(ptep);

	if (unlikely(!is_hugetlb_entry_migration(pte))) {
		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
	} else {
		/*
		 * If migration entry existed, safe to release vma lock
		 * here because the pgtable page won't be freed without the
		 * pgtable lock released.  See comment right above pgtable
		 * lock release in migration_entry_wait_on_locked().
		 */
		hugetlb_vma_unlock_read(vma);
		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
	}
}

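/*
 * Like migration_entry_wait(), but for a hugetlb pte: look up the huge pte
 * lock for @pte and defer to __migration_entry_wait_huge(). The hugetlb vma
 * read lock must be held by the caller and is released before returning.
 */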
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);

	__migration_entry_wait_huge(vma, pte, ptl);
}
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
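/*
 * Wait for a PMD-level migration entry: if *pmd is a migration entry, sleep
 * under the pmd lock until it has been removed, so that the faulting thread
 * can retry with a present pmd.
 */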
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!is_pmd_migration_entry(*pmd))
		goto unlock;
	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
	return;
unlock:
	spin_unlock(ptl);
}
#endif

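/*
 * The number of references a folio is expected to have while it is safe to
 * migrate it: one for the isolating caller, plus one per page from the page
 * cache when there is a mapping, plus one more if fs-private data is
 * attached.
 */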
static int folio_expected_refs(struct address_space *mapping,
		struct folio *folio)
{
	int refs = 1;
	if (!mapping)
		return refs;

	refs += folio_nr_pages(folio);
	if (folio_test_private(folio))
		refs++;

	return refs;
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int extra_count)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
	struct zone *oldzone, *newzone;
	int dirty;
	int expected_count = folio_expected_refs(mapping, folio) + extra_count;
	long nr = folio_nr_pages(folio);

	if (!mapping) {
		/* Anonymous page without mapping */
		if (folio_ref_count(folio) != expected_count)
			return -EAGAIN;

		/* No turning back from here */
		newfolio->index = folio->index;
		newfolio->mapping = folio->mapping;
		if (folio_test_swapbacked(folio))
			__folio_set_swapbacked(newfolio);

		return MIGRATEPAGE_SUCCESS;
	}

	oldzone = folio_zone(folio);
	newzone = folio_zone(newfolio);

	xas_lock_irq(&xas);
	if (!folio_ref_freeze(folio, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the folio:
	 * no turning back from here.
	 */
	newfolio->index = folio->index;
	newfolio->mapping = folio->mapping;
	folio_ref_add(newfolio, nr); /* add cache reference */
	if (folio_test_swapbacked(folio)) {
		__folio_set_swapbacked(newfolio);
		if (folio_test_swapcache(folio)) {
			folio_set_swapcache(newfolio);
			newfolio->private = folio_get_private(folio);
		}
	} else {
		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	}

	/* Move dirty while page refs frozen and newpage not yet exposed */
	dirty = folio_test_dirty(folio);
	if (dirty) {
		folio_clear_dirty(folio);
		folio_set_dirty(newfolio);
	}

	xas_store(&xas, newfolio);

	/*
	 * Drop cache reference from old page by unfreezing
	 * to one less reference.
	 * We know this isn't the last reference.
	 */
	folio_ref_unfreeze(folio, expected_count - nr);

	xas_unlock(&xas);
	/* Leave irq disabled to prevent preemption while updating stats */

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
	 * are mapped to swap space.
	 */
	if (newzone != oldzone) {
		struct lruvec *old_lruvec, *new_lruvec;
		struct mem_cgroup *memcg;

		memcg = folio_memcg(folio);
		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
		}
#ifdef CONFIG_SWAP
		if (folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
		}
#endif
		if (dirty && mapping_can_writeback(mapping)) {
			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
		}
	}
	local_irq_enable();

	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(folio_migrate_mapping);
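
/*
 * Note: the migrate_folio implementations later in this file
 * (migrate_folio_extra(), __buffer_migrate_folio(), filemap_migrate_folio())
 * all go through folio_migrate_mapping() and only transfer private data and
 * contents once it has returned MIGRATEPAGE_SUCCESS.
 */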

/*
 * The expected number of remaining references is the same as that
 * of folio_migrate_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct folio *dst, struct folio *src)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(src));
	int expected_count;

	xas_lock_irq(&xas);
	expected_count = 2 + folio_has_private(src);
	if (!folio_ref_freeze(src, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	dst->index = src->index;
	dst->mapping = src->mapping;

	folio_get(dst);

	xas_store(&xas, dst);

	folio_ref_unfreeze(src, expected_count - 1);

	xas_unlock_irq(&xas);

	return MIGRATEPAGE_SUCCESS;
}

/*
 * Copy the flags and some other ancillary information
 */
void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
	int cpupid;

	if (folio_test_error(folio))
		folio_set_error(newfolio);
	if (folio_test_referenced(folio))
		folio_set_referenced(newfolio);
	if (folio_test_uptodate(folio))
		folio_mark_uptodate(newfolio);
	if (folio_test_clear_active(folio)) {
		VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
		folio_set_active(newfolio);
	} else if (folio_test_clear_unevictable(folio))
		folio_set_unevictable(newfolio);
	if (folio_test_workingset(folio))
		folio_set_workingset(newfolio);
	if (folio_test_checked(folio))
		folio_set_checked(newfolio);
	/*
	 * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
	 * migration entries. We can still have PG_anon_exclusive set on the
	 * effectively unmapped and unreferenced first sub-page of an
	 * anonymous THP: we can simply copy it here via PG_mappedtodisk.
	 */
	if (folio_test_mappedtodisk(folio))
		folio_set_mappedtodisk(newfolio);

	/* Move dirty on pages not done by folio_migrate_mapping() */
	if (folio_test_dirty(folio))
		folio_set_dirty(newfolio);

	if (folio_test_young(folio))
		folio_set_young(newfolio);
	if (folio_test_idle(folio))
		folio_set_idle(newfolio);

	/*
	 * Copy NUMA information to the new page, to prevent over-eager
	 * future migrations of this same page.
	 */
	cpupid = page_cpupid_xchg_last(&folio->page, -1);
	/*
	 * In memory tiering mode, when migrating between a slow and a fast
	 * memory node, reset cpupid, because it is used to record the page
	 * access time on the slow memory node.
	 */
	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
		bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
		bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));

		if (f_toptier != t_toptier)
			cpupid = -1;
	}
	page_cpupid_xchg_last(&newfolio->page, cpupid);

	folio_migrate_ksm(newfolio, folio);
	/*
	 * Please do not reorder this without considering how mm/ksm.c's
	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
	 */
	if (folio_test_swapcache(folio))
		folio_clear_swapcache(folio);
	folio_clear_private(folio);

	/* page->private contains hugetlb specific flags */
	if (!folio_test_hugetlb(folio))
		folio->private = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (folio_test_writeback(newfolio))
		folio_end_writeback(newfolio);

	/*
	 * PG_readahead shares the same bit with PG_reclaim.  The
	 * folio_end_writeback() above may clear PG_readahead mistakenly,
	 * so set the bit after that.
	 */
	if (folio_test_readahead(folio))
		folio_set_readahead(newfolio);

	folio_copy_owner(newfolio, folio);

	if (!folio_test_hugetlb(folio))
		mem_cgroup_migrate(folio, newfolio);
}
EXPORT_SYMBOL(folio_migrate_flags);

void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
	folio_copy(newfolio, folio);
	folio_migrate_flags(newfolio, folio);
}
EXPORT_SYMBOL(folio_migrate_copy);

/************************************************************
 *                    Migration functions
 ***********************************************************/

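/*
 * Like migrate_folio() below, but tolerate @extra_count additional references
 * on @src beyond those accounted for by folio_expected_refs().
 */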
int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
		struct folio *src, enum migrate_mode mode, int extra_count)
{
	int rc;

	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */

	rc = folio_migrate_mapping(mapping, dst, src, extra_count);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	if (mode != MIGRATE_SYNC_NO_COPY)
		folio_migrate_copy(dst, src);
	else
		folio_migrate_flags(dst, src);
	return MIGRATEPAGE_SUCCESS;
}

/**
 * migrate_folio() - Simple folio migration.
 * @mapping: The address_space containing the folio.
 * @dst: The folio to migrate the data to.
 * @src: The folio containing the current data.
 * @mode: How to migrate the page.
 *
 * Common logic to directly migrate a single LRU folio suitable for
 * folios that do not use PagePrivate/PagePrivate2.
 *
 * Folios are locked upon entry and exit.
 */
int migrate_folio(struct address_space *mapping, struct folio *dst,
		struct folio *src, enum migrate_mode mode)
{
	return migrate_folio_extra(mapping, dst, src, mode, 0);
}
EXPORT_SYMBOL(migrate_folio);
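
/*
 * Illustrative use (a sketch, not code from this file): an address_space
 * whose folios never carry private data can typically enable migration by
 * pointing its address_space_operations at migrate_folio(), e.g.:
 *
 *	static const struct address_space_operations example_aops = {
 *		.migrate_folio	= migrate_folio,
 *	};
 *
 * "example_aops" is a hypothetical name used only for illustration.
 */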

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{
	struct buffer_head *bh = head;

	/* Simple case, sync compaction */
	if (mode != MIGRATE_ASYNC) {
		do {
			lock_buffer(bh);
			bh = bh->b_this_page;

		} while (bh != head);

		return true;
	}

	/* async case, we cannot block on lock_buffer so use trylock_buffer */
	do {
		if (!trylock_buffer(bh)) {
			/*
			 * We failed to lock the buffer and cannot stall in
			 * async migration. Release the taken locks
			 */
			struct buffer_head *failed_bh = bh;
			bh = head;
			while (bh != failed_bh) {
				unlock_buffer(bh);
				bh = bh->b_this_page;
			}
			return false;
		}

		bh = bh->b_this_page;
	} while (bh != head);
	return true;
}

static int __buffer_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode,
		bool check_refs)
{
	struct buffer_head *bh, *head;
	int rc;
	int expected_count;

	head = folio_buffers(src);
	if (!head)
		return migrate_folio(mapping, dst, src, mode);

	/* Check that the folio does not have extra refs before we do more work */
	expected_count = folio_expected_refs(mapping, src);
	if (folio_ref_count(src) != expected_count)
		return -EAGAIN;

	if (!buffer_migrate_lock_buffers(head, mode))
		return -EAGAIN;

	if (check_refs) {
		bool busy;
		bool invalidated = false;

recheck_buffers:
		busy = false;
		spin_lock(&mapping->private_lock);
		bh = head;
		do {
			if (atomic_read(&bh->b_count)) {
				busy = true;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (busy) {
			if (invalidated) {
				rc = -EAGAIN;
				goto unlock_buffers;
			}
			spin_unlock(&mapping->private_lock);
			invalidate_bh_lrus();
			invalidated = true;
			goto recheck_buffers;
		}
	}

	rc = folio_migrate_mapping(mapping, dst, src, 0);
	if (rc != MIGRATEPAGE_SUCCESS)
		goto unlock_buffers;

	folio_attach_private(dst, folio_detach_private(src));

	bh = head;
	do {
		set_bh_page(bh, &dst->page, bh_offset(bh));
		bh = bh->b_this_page;
	} while (bh != head);

	if (mode != MIGRATE_SYNC_NO_COPY)
		folio_migrate_copy(dst, src);
	else
		folio_migrate_flags(dst, src);

	rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
	if (check_refs)
		spin_unlock(&mapping->private_lock);
	bh = head;
	do {
		unlock_buffer(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	return rc;
}

/**
 * buffer_migrate_folio() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * This function can only be used if the underlying filesystem guarantees
 * that no other references to @src exist. For example attached buffer
 * heads are accessed only under the folio lock.  If your filesystem cannot
 * provide this guarantee, buffer_migrate_folio_norefs() may be more
 * appropriate.
 *
 * Return: 0 on success or a negative errno on failure.
 */
int buffer_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	return __buffer_migrate_folio(mapping, dst, src, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_folio);

/**
 * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * Like buffer_migrate_folio() except that this variant is more careful
 * and checks that there are also no buffer head references. This function
 * is the right one for mappings where buffer heads are directly looked
 * up and referenced (such as block device mappings).
 *
 * Return: 0 on success or a negative errno on failure.
 */
int buffer_migrate_folio_norefs(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
#endif

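/*
 * Migration function for mappings that keep fs-private state in
 * folio->private but need no buffer-head handling: move the page cache
 * entry, transfer the private data, then copy the contents and flags.
 */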
int filemap_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	int ret;

	ret = folio_migrate_mapping(mapping, dst, src, 0);
	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

	if (folio_get_private(src))
		folio_attach_private(dst, folio_detach_private(src));

	if (mode != MIGRATE_SYNC_NO_COPY)
		folio_migrate_copy(dst, src);
	else
		folio_migrate_flags(dst, src);
	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL_GPL(filemap_migrate_folio);

/*
 * Writeback a folio to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct folio *folio)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!folio_clear_dirty_for_io(folio))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty folio may imply that the underlying filesystem has
	 * the folio on some queue. So the folio must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * folio state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(folio, folio, false);

	rc = mapping->a_ops->writepage(&folio->page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		folio_lock(folio);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	if (folio_test_dirty(src)) {
		/* Only writeback folios in full synchronous migration */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			return -EBUSY;
		}
		return writeout(mapping, src);
	}

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (folio_test_private(src) &&
	    !filemap_release_folio(src, GFP_KERNEL))
		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

	return migrate_folio(mapping, dst, src, mode);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_folio(struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	int rc = -EAGAIN;
	bool is_lru = !__PageMovable(&src->page);

	VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
	VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);

	if (likely(is_lru)) {
		struct address_space *mapping = folio_mapping(src);

		if (!mapping)
			rc = migrate_folio(mapping, dst, src, mode);
		else if (mapping->a_ops->migrate_folio)
			/*
			 * Most folios have a mapping and most filesystems
			 * provide a migrate_folio callback. Anonymous folios
			 * are part of swap space which also has its own
			 * migrate_folio callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migrate_folio(mapping, dst, src,
								mode);
		else
			rc = fallback_migrate_folio(mapping, dst, src, mode);
	} else {
		const struct movable_operations *mops;

		/*
		 * In the case of a non-LRU page, it could have been released
		 * after the isolation step. In that case, we shouldn't try
		 * migration.
		 */
		VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
		if (!folio_test_movable(src)) {
			rc = MIGRATEPAGE_SUCCESS;
			folio_clear_isolated(src);
			goto out;
		}

		mops = page_movable_ops(&src->page);
		rc = mops->migrate_page(&dst->page, &src->page, mode);
		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				!folio_test_isolated(src));
	}

	/*
	 * When successful, old pagecache src->mapping must be cleared before
	 * src is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__PageMovable(&src->page)) {
			VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 */
			folio_clear_isolated(src);
		}

		/*
		 * An anonymous or movable src->mapping will be cleared by
		 * free_pages_prepare(), so don't reset it here; that keeps
		 * type checks such as PageAnon working.
		 */
		if (!folio_mapping_flags(src))
			src->mapping = NULL;

		if (likely(!folio_is_zone_device(dst)))
			flush_dcache_folio(dst);
	}
out:
	return rc;
}

1029682a71a1SMatthew Wilcox (Oracle) static int __unmap_and_move(struct folio *src, struct folio *dst,
10309c620e2bSHugh Dickins 				int force, enum migrate_mode mode)
1031e24f0b8fSChristoph Lameter {
10320dabec93SMinchan Kim 	int rc = -EAGAIN;
1033213ecb31SBaolin Wang 	bool page_was_mapped = false;
10343f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
1035682a71a1SMatthew Wilcox (Oracle) 	bool is_lru = !__PageMovable(&src->page);
103695a402c3SChristoph Lameter 
1037682a71a1SMatthew Wilcox (Oracle) 	if (!folio_trylock(src)) {
1038a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
10390dabec93SMinchan Kim 			goto out;
10403e7d3449SMel Gorman 
10413e7d3449SMel Gorman 		/*
10423e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
10433e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
10443e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
10453e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
10463e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
1047d4388340SMatthew Wilcox (Oracle) 		 * mpage_readahead). If an allocation happens for the
10483e7d3449SMel Gorman 		 * second or third page, the process can end up locking
10493e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
10503e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
10513e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
10523e7d3449SMel Gorman 		 * altogether.
10533e7d3449SMel Gorman 		 */
10543e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
10550dabec93SMinchan Kim 			goto out;
10563e7d3449SMel Gorman 
1057682a71a1SMatthew Wilcox (Oracle) 		folio_lock(src);
1058e24f0b8fSChristoph Lameter 	}
1059e24f0b8fSChristoph Lameter 
1060682a71a1SMatthew Wilcox (Oracle) 	if (folio_test_writeback(src)) {
106111bc82d6SAndrea Arcangeli 		/*
1062fed5b64aSJianguo Wu 		 * Only in the case of a full synchronous migration is it
1063a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
1064a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
1065a6bc32b8SMel Gorman 		 * the overhead of stalling is too high.
106611bc82d6SAndrea Arcangeli 		 */
10672916ecc0SJérôme Glisse 		switch (mode) {
10682916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
10692916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
10702916ecc0SJérôme Glisse 			break;
10712916ecc0SJérôme Glisse 		default:
107211bc82d6SAndrea Arcangeli 			rc = -EBUSY;
10730a31bc97SJohannes Weiner 			goto out_unlock;
107411bc82d6SAndrea Arcangeli 		}
107511bc82d6SAndrea Arcangeli 		if (!force)
10760a31bc97SJohannes Weiner 			goto out_unlock;
1077682a71a1SMatthew Wilcox (Oracle) 		folio_wait_writeback(src);
1078e24f0b8fSChristoph Lameter 	}
107903f15c86SHugh Dickins 
1080e24f0b8fSChristoph Lameter 	/*
1081682a71a1SMatthew Wilcox (Oracle) 	 * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
1082682a71a1SMatthew Wilcox (Oracle) 	 * we cannot notice that anon_vma is freed while we migrate a page.
10831ce82b69SHugh Dickins 	 * This get_anon_vma() delays freeing anon_vma pointer until the end
1084dc386d4dSKAMEZAWA Hiroyuki 	 * of migration. File cache pages are no problem because of page_lock():
1085989f89c5SKAMEZAWA Hiroyuki 	 * filesystems may use writepage() or lock_page() during migration, so
1086989f89c5SKAMEZAWA Hiroyuki 	 * only anonymous pages need special care here.
10873fe2011fSMel Gorman 	 *
108829eea9b5SMatthew Wilcox (Oracle) 	 * Only folio_get_anon_vma() understands the subtleties of
108903f15c86SHugh Dickins 	 * getting a hold on an anon_vma from outside one of its mms.
109003f15c86SHugh Dickins 	 * But if we cannot get anon_vma, then we won't need it anyway,
109103f15c86SHugh Dickins 	 * because that implies that the anon page is no longer mapped
109203f15c86SHugh Dickins 	 * (and cannot be remapped so long as we hold the page lock).
10933fe2011fSMel Gorman 	 */
1094682a71a1SMatthew Wilcox (Oracle) 	if (folio_test_anon(src) && !folio_test_ksm(src))
109529eea9b5SMatthew Wilcox (Oracle) 		anon_vma = folio_get_anon_vma(src);
109662e1c553SShaohua Li 
10977db7671fSHugh Dickins 	/*
10987db7671fSHugh Dickins 	 * Block others from accessing the new page when we get around to
10997db7671fSHugh Dickins 	 * establishing additional references. We are usually the only one
1100682a71a1SMatthew Wilcox (Oracle) 	 * holding a reference to dst at this point. We used to have a BUG
1101682a71a1SMatthew Wilcox (Oracle) 	 * here if folio_trylock(dst) fails, but would like to allow for
1102682a71a1SMatthew Wilcox (Oracle) 	 * cases where there might be a race with the previous use of dst.
11037db7671fSHugh Dickins 	 * This is much like races on refcount of oldpage: just don't BUG().
11047db7671fSHugh Dickins 	 */
1105682a71a1SMatthew Wilcox (Oracle) 	if (unlikely(!folio_trylock(dst)))
11067db7671fSHugh Dickins 		goto out_unlock;
11077db7671fSHugh Dickins 
1108bda807d4SMinchan Kim 	if (unlikely(!is_lru)) {
1109682a71a1SMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1110bda807d4SMinchan Kim 		goto out_unlock_both;
1111bda807d4SMinchan Kim 	}
1112bda807d4SMinchan Kim 
1113dc386d4dSKAMEZAWA Hiroyuki 	/*
111462e1c553SShaohua Li 	 * Corner case handling:
111562e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
111662e1c553SShaohua Li 	 * and treated as swapcache, but it has no rmap yet.
1117682a71a1SMatthew Wilcox (Oracle) 	 * Calling try_to_unmap() against a src->mapping==NULL page will
111862e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
1119d12b8951SYang Shi 	 * 2. An orphaned page (see truncate_cleanup_page) might have
112062e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
112162e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
112262e1c553SShaohua Li 	 * invisible to the VM, so the page cannot be migrated.  So try to
112362e1c553SShaohua Li 	 * free the metadata so that the page can be freed.
1124dc386d4dSKAMEZAWA Hiroyuki 	 */
1125682a71a1SMatthew Wilcox (Oracle) 	if (!src->mapping) {
1126682a71a1SMatthew Wilcox (Oracle) 		if (folio_test_private(src)) {
1127682a71a1SMatthew Wilcox (Oracle) 			try_to_free_buffers(src);
11287db7671fSHugh Dickins 			goto out_unlock_both;
112962e1c553SShaohua Li 		}
1130682a71a1SMatthew Wilcox (Oracle) 	} else if (folio_mapped(src)) {
11317db7671fSHugh Dickins 		/* Establish migration ptes */
1132682a71a1SMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_anon(src) &&
1133682a71a1SMatthew Wilcox (Oracle) 			       !folio_test_ksm(src) && !anon_vma, src);
1134682a71a1SMatthew Wilcox (Oracle) 		try_to_migrate(src, 0);
1135213ecb31SBaolin Wang 		page_was_mapped = true;
11362ebba6b7SHugh Dickins 	}
1137dc386d4dSKAMEZAWA Hiroyuki 
1138682a71a1SMatthew Wilcox (Oracle) 	if (!folio_mapped(src))
1139682a71a1SMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1140e24f0b8fSChristoph Lameter 
1141c3096e67SHugh Dickins 	/*
1142682a71a1SMatthew Wilcox (Oracle) 	 * When successful, push dst to LRU immediately: so that if it
1143c3096e67SHugh Dickins 	 * turns out to be an mlocked page, remove_migration_ptes() will
1144682a71a1SMatthew Wilcox (Oracle) 	 * automatically build up the correct dst->mlock_count for it.
1145c3096e67SHugh Dickins 	 *
1146c3096e67SHugh Dickins 	 * We would like to do something similar for the old page, when
1147c3096e67SHugh Dickins 	 * unsuccessful, and other cases when a page has been temporarily
1148c3096e67SHugh Dickins 	 * isolated from the unevictable LRU: but this case is the easiest.
1149c3096e67SHugh Dickins 	 */
1150c3096e67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1151682a71a1SMatthew Wilcox (Oracle) 		folio_add_lru(dst);
11525c3f9a67SHugh Dickins 		if (page_was_mapped)
1153c3096e67SHugh Dickins 			lru_add_drain();
1154c3096e67SHugh Dickins 	}
1155c3096e67SHugh Dickins 
11565c3f9a67SHugh Dickins 	if (page_was_mapped)
1157682a71a1SMatthew Wilcox (Oracle) 		remove_migration_ptes(src,
1158682a71a1SMatthew Wilcox (Oracle) 			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
11593f6c8272SMel Gorman 
11607db7671fSHugh Dickins out_unlock_both:
1161682a71a1SMatthew Wilcox (Oracle) 	folio_unlock(dst);
11627db7671fSHugh Dickins out_unlock:
11633f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
116476545066SRik van Riel 	if (anon_vma)
11659e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
1166682a71a1SMatthew Wilcox (Oracle) 	folio_unlock(src);
11670dabec93SMinchan Kim out:
1168c6c919ebSMinchan Kim 	/*
1169682a71a1SMatthew Wilcox (Oracle) 	 * If migration is successful, drop our reference to dst,
1170c6c919ebSMinchan Kim 	 * which will not free the page because the new page owner holds
1171c3096e67SHugh Dickins 	 * its own reference.
1172c6c919ebSMinchan Kim 	 */
1173c3096e67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS)
1174682a71a1SMatthew Wilcox (Oracle) 		folio_put(dst);
1175c6c919ebSMinchan Kim 
11760dabec93SMinchan Kim 	return rc;
11770dabec93SMinchan Kim }
117895a402c3SChristoph Lameter 
11790dabec93SMinchan Kim /*
118049f51859SHuang Ying  * Obtain the lock on the folio, remove all ptes and migrate the folio
118149f51859SHuang Ying  * to the newly allocated folio in dst.
11820dabec93SMinchan Kim  */
11836ec4476aSLinus Torvalds static int unmap_and_move(new_page_t get_new_page,
1184ef2a5153SGeert Uytterhoeven 				   free_page_t put_new_page,
118549f51859SHuang Ying 				   unsigned long private, struct folio *src,
1186add05cecSNaoya Horiguchi 				   int force, enum migrate_mode mode,
1187dd4ae78aSYang Shi 				   enum migrate_reason reason,
1188dd4ae78aSYang Shi 				   struct list_head *ret)
11890dabec93SMinchan Kim {
119049f51859SHuang Ying 	struct folio *dst;
11912def7424SHugh Dickins 	int rc = MIGRATEPAGE_SUCCESS;
119274d4a579SYang Shi 	struct page *newpage = NULL;
11930dabec93SMinchan Kim 
119449f51859SHuang Ying 	if (!thp_migration_supported() && folio_test_transhuge(src))
1195d532e2e5SYang Shi 		return -ENOSYS;
119694723aafSMichal Hocko 
119749f51859SHuang Ying 	if (folio_ref_count(src) == 1) {
119849f51859SHuang Ying 		/* Folio was freed from under us. So we are done. */
119949f51859SHuang Ying 		folio_clear_active(src);
120049f51859SHuang Ying 		folio_clear_unevictable(src);
1201160088b3SMiaohe Lin 		/* free_pages_prepare() will clear PG_isolated. */
12020dabec93SMinchan Kim 		goto out;
12030dabec93SMinchan Kim 	}
12040dabec93SMinchan Kim 
120549f51859SHuang Ying 	newpage = get_new_page(&src->page, private);
120674d4a579SYang Shi 	if (!newpage)
120774d4a579SYang Shi 		return -ENOMEM;
1208682a71a1SMatthew Wilcox (Oracle) 	dst = page_folio(newpage);
120974d4a579SYang Shi 
12104c74b65fSYang Li 	dst->private = NULL;
1211682a71a1SMatthew Wilcox (Oracle) 	rc = __unmap_and_move(src, dst, force, mode);
1212c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS)
121349f51859SHuang Ying 		set_page_owner_migrate_reason(&dst->page, reason);
1214bf6bddf1SRafael Aquini 
12150dabec93SMinchan Kim out:
1216e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
1217aaa994b3SChristoph Lameter 		/*
121849f51859SHuang Ying 		 * A folio that has been migrated has all references
121949f51859SHuang Ying 		 * removed and will be freed. A folio that has not been
1220c23a0c99SRalph Campbell 		 * migrated will have kept its references and be restored.
1221aaa994b3SChristoph Lameter 		 */
122249f51859SHuang Ying 		list_del(&src->lru);
1223e24f0b8fSChristoph Lameter 	}
122468711a74SDavid Rientjes 
122595a402c3SChristoph Lameter 	/*
1226c6c919ebSMinchan Kim 	 * If migration is successful, release the reference grabbed during
122749f51859SHuang Ying 	 * isolation. Otherwise, restore the folio to the right list unless
1228c6c919ebSMinchan Kim 	 * we want to retry.
122995a402c3SChristoph Lameter 	 */
1230c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1231dd4ae78aSYang Shi 		/*
123249f51859SHuang Ying 		 * Compaction can also migrate non-LRU folios, which are
1233dd4ae78aSYang Shi 		 * not accounted to NR_ISOLATED_*. They can be recognized
123449f51859SHuang Ying 		 * via __folio_test_movable().
1235dd4ae78aSYang Shi 		 */
123649f51859SHuang Ying 		if (likely(!__folio_test_movable(src)))
123749f51859SHuang Ying 			mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
123849f51859SHuang Ying 					folio_is_file_lru(src), -folio_nr_pages(src));
1239dd4ae78aSYang Shi 
124079f5f8faSOscar Salvador 		if (reason != MR_MEMORY_FAILURE)
1241c6c919ebSMinchan Kim 			/*
124249f51859SHuang Ying 			 * We release the folio in page_handle_poison.
1243c6c919ebSMinchan Kim 			 */
124449f51859SHuang Ying 			folio_put(src);
1245c6c919ebSMinchan Kim 	} else {
1246dd4ae78aSYang Shi 		if (rc != -EAGAIN)
124749f51859SHuang Ying 			list_add_tail(&src->lru, ret);
1248bda807d4SMinchan Kim 
1249cf4b769aSHugh Dickins 		if (put_new_page)
125049f51859SHuang Ying 			put_new_page(&dst->page, private);
1251c6c919ebSMinchan Kim 		else
125249f51859SHuang Ying 			folio_put(dst);
1253c6c919ebSMinchan Kim 	}
125468711a74SDavid Rientjes 
1255e24f0b8fSChristoph Lameter 	return rc;
1256e24f0b8fSChristoph Lameter }
1257b20a3503SChristoph Lameter 
1258e24f0b8fSChristoph Lameter /*
1259290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
1260290408d4SNaoya Horiguchi  *
1261290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
1262290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepages.
1263290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
1264290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
1265290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
1266290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
1267290408d4SNaoya Horiguchi  * under direct I/O, the reference count of the head page is 512 and a bit more.)
1268290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
1269290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
1270290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
1271290408d4SNaoya Horiguchi  *
1272290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on a page under migration,
1273290408d4SNaoya Horiguchi  * because then the pte is replaced with a migration swap entry and the direct
1274290408d4SNaoya Horiguchi  * I/O code will wait in the page fault for migration to complete.
1275290408d4SNaoya Horiguchi  */
1276290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
127768711a74SDavid Rientjes 				free_page_t put_new_page, unsigned long private,
127868711a74SDavid Rientjes 				struct page *hpage, int force,
1279dd4ae78aSYang Shi 				enum migrate_mode mode, int reason,
1280dd4ae78aSYang Shi 				struct list_head *ret)
1281290408d4SNaoya Horiguchi {
12824eecb8b9SMatthew Wilcox (Oracle) 	struct folio *dst, *src = page_folio(hpage);
12832def7424SHugh Dickins 	int rc = -EAGAIN;
12842ebba6b7SHugh Dickins 	int page_was_mapped = 0;
128532665f2bSJoonsoo Kim 	struct page *new_hpage;
1286290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
1287c0d0381aSMike Kravetz 	struct address_space *mapping = NULL;
1288290408d4SNaoya Horiguchi 
128983467efbSNaoya Horiguchi 	/*
12907ed2c31dSAnshuman Khandual 	 * Migratability of hugepages depends on architectures and their size.
129183467efbSNaoya Horiguchi 	 * This check is necessary because some callers of hugepage migration
129283467efbSNaoya Horiguchi 	 * like soft offline and memory hotremove don't walk through page
129383467efbSNaoya Horiguchi 	 * tables or check whether the hugepage is pmd-based or not before
129483467efbSNaoya Horiguchi 	 * kicking migration.
129583467efbSNaoya Horiguchi 	 */
1296577be05cSHuang Ying 	if (!hugepage_migration_supported(page_hstate(hpage)))
129783467efbSNaoya Horiguchi 		return -ENOSYS;
129883467efbSNaoya Horiguchi 
1299c33db292SMatthew Wilcox (Oracle) 	if (folio_ref_count(src) == 1) {
130071a64f61SMuchun Song 		/* page was freed from under us. So we are done. */
130171a64f61SMuchun Song 		putback_active_hugepage(hpage);
130271a64f61SMuchun Song 		return MIGRATEPAGE_SUCCESS;
130371a64f61SMuchun Song 	}
130471a64f61SMuchun Song 
1305666feb21SMichal Hocko 	new_hpage = get_new_page(hpage, private);
1306290408d4SNaoya Horiguchi 	if (!new_hpage)
1307290408d4SNaoya Horiguchi 		return -ENOMEM;
13084eecb8b9SMatthew Wilcox (Oracle) 	dst = page_folio(new_hpage);
1309290408d4SNaoya Horiguchi 
1310c33db292SMatthew Wilcox (Oracle) 	if (!folio_trylock(src)) {
13112916ecc0SJérôme Glisse 		if (!force)
1312290408d4SNaoya Horiguchi 			goto out;
13132916ecc0SJérôme Glisse 		switch (mode) {
13142916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
13152916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
13162916ecc0SJérôme Glisse 			break;
13172916ecc0SJérôme Glisse 		default:
13182916ecc0SJérôme Glisse 			goto out;
13192916ecc0SJérôme Glisse 		}
1320c33db292SMatthew Wilcox (Oracle) 		folio_lock(src);
1321290408d4SNaoya Horiguchi 	}
1322290408d4SNaoya Horiguchi 
1323cb6acd01SMike Kravetz 	/*
1324cb6acd01SMike Kravetz 	 * Check for pages which are in the process of being freed.  Without
1325c33db292SMatthew Wilcox (Oracle) 	 * folio_mapping() set, the hugetlbfs-specific move page routine will not
1326cb6acd01SMike Kravetz 	 * be called and we could leak usage counts for subpools.
1327cb6acd01SMike Kravetz 	 */
1328345c62d1SSidhartha Kumar 	if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
1329cb6acd01SMike Kravetz 		rc = -EBUSY;
1330cb6acd01SMike Kravetz 		goto out_unlock;
1331cb6acd01SMike Kravetz 	}
1332cb6acd01SMike Kravetz 
1333c33db292SMatthew Wilcox (Oracle) 	if (folio_test_anon(src))
133429eea9b5SMatthew Wilcox (Oracle) 		anon_vma = folio_get_anon_vma(src);
1335290408d4SNaoya Horiguchi 
1336c33db292SMatthew Wilcox (Oracle) 	if (unlikely(!folio_trylock(dst)))
13377db7671fSHugh Dickins 		goto put_anon;
13387db7671fSHugh Dickins 
1339c33db292SMatthew Wilcox (Oracle) 	if (folio_mapped(src)) {
1340a98a2f0cSAlistair Popple 		enum ttu_flags ttu = 0;
1341336bf30eSMike Kravetz 
1342c33db292SMatthew Wilcox (Oracle) 		if (!folio_test_anon(src)) {
1343c0d0381aSMike Kravetz 			/*
1344336bf30eSMike Kravetz 			 * In shared mappings, try_to_unmap could potentially
1345336bf30eSMike Kravetz 			 * call huge_pmd_unshare.  Because of this, take
1346336bf30eSMike Kravetz 			 * semaphore in write mode here and set TTU_RMAP_LOCKED
1347336bf30eSMike Kravetz 			 * to let lower levels know we have taken the lock.
1348c0d0381aSMike Kravetz 			 */
1349c0d0381aSMike Kravetz 			mapping = hugetlb_page_mapping_lock_write(hpage);
1350c0d0381aSMike Kravetz 			if (unlikely(!mapping))
1351c0d0381aSMike Kravetz 				goto unlock_put_anon;
1352c0d0381aSMike Kravetz 
13535202978bSMiaohe Lin 			ttu = TTU_RMAP_LOCKED;
1354336bf30eSMike Kravetz 		}
1355336bf30eSMike Kravetz 
13564b8554c5SMatthew Wilcox (Oracle) 		try_to_migrate(src, ttu);
13572ebba6b7SHugh Dickins 		page_was_mapped = 1;
1358336bf30eSMike Kravetz 
13595202978bSMiaohe Lin 		if (ttu & TTU_RMAP_LOCKED)
1360336bf30eSMike Kravetz 			i_mmap_unlock_write(mapping);
13612ebba6b7SHugh Dickins 	}
1362290408d4SNaoya Horiguchi 
1363c33db292SMatthew Wilcox (Oracle) 	if (!folio_mapped(src))
1364e7e3ffebSMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1365290408d4SNaoya Horiguchi 
1366336bf30eSMike Kravetz 	if (page_was_mapped)
13674eecb8b9SMatthew Wilcox (Oracle) 		remove_migration_ptes(src,
13684eecb8b9SMatthew Wilcox (Oracle) 			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
1369290408d4SNaoya Horiguchi 
1370c0d0381aSMike Kravetz unlock_put_anon:
1371c33db292SMatthew Wilcox (Oracle) 	folio_unlock(dst);
13727db7671fSHugh Dickins 
13737db7671fSHugh Dickins put_anon:
1374fd4a4663SHugh Dickins 	if (anon_vma)
13759e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
13768e6ac7faSAneesh Kumar K.V 
13772def7424SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1378345c62d1SSidhartha Kumar 		move_hugetlb_state(src, dst, reason);
13792def7424SHugh Dickins 		put_new_page = NULL;
13802def7424SHugh Dickins 	}
13818e6ac7faSAneesh Kumar K.V 
1382cb6acd01SMike Kravetz out_unlock:
1383c33db292SMatthew Wilcox (Oracle) 	folio_unlock(src);
138409761333SHillf Danton out:
1385dd4ae78aSYang Shi 	if (rc == MIGRATEPAGE_SUCCESS)
1386b8ec1ceeSNaoya Horiguchi 		putback_active_hugepage(hpage);
1387a04840c6SMiaohe Lin 	else if (rc != -EAGAIN)
1388c33db292SMatthew Wilcox (Oracle) 		list_move_tail(&src->lru, ret);
138968711a74SDavid Rientjes 
139068711a74SDavid Rientjes 	/*
139168711a74SDavid Rientjes 	 * If migration was not successful and there's a freeing callback, use
139268711a74SDavid Rientjes 	 * it.  Otherwise, put_page() will drop the reference grabbed during
139368711a74SDavid Rientjes 	 * isolation.
139468711a74SDavid Rientjes 	 */
13952def7424SHugh Dickins 	if (put_new_page)
139668711a74SDavid Rientjes 		put_new_page(new_hpage, private);
139768711a74SDavid Rientjes 	else
13983aaa76e1SNaoya Horiguchi 		putback_active_hugepage(new_hpage);
139968711a74SDavid Rientjes 
1400290408d4SNaoya Horiguchi 	return rc;
1401290408d4SNaoya Horiguchi }
1402290408d4SNaoya Horiguchi 
1403eaec4e63SHuang Ying static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
1404d532e2e5SYang Shi {
14059c62ff00SHuang Ying 	int rc;
1406d532e2e5SYang Shi 
1407eaec4e63SHuang Ying 	folio_lock(folio);
1408eaec4e63SHuang Ying 	rc = split_folio_to_list(folio, split_folios);
1409eaec4e63SHuang Ying 	folio_unlock(folio);
1410e6fa8a79SHuang Ying 	if (!rc)
1411eaec4e63SHuang Ying 		list_move_tail(&folio->lru, split_folios);
1412d532e2e5SYang Shi 
1413d532e2e5SYang Shi 	return rc;
1414d532e2e5SYang Shi }
1415d532e2e5SYang Shi 
1416290408d4SNaoya Horiguchi /*
1417eaec4e63SHuang Ying  * migrate_pages - migrate the folios specified in a list to the free folios
1418c73e5c9cSSrivatsa S. Bhat  *		   supplied as the target for the page migration
1419e24f0b8fSChristoph Lameter  *
1420eaec4e63SHuang Ying  * @from:		The list of folios to be migrated.
1421eaec4e63SHuang Ying  * @get_new_page:	The function used to allocate free folios to be used
1422eaec4e63SHuang Ying  *			as the target of the folio migration.
1423eaec4e63SHuang Ying  * @put_new_page:	The function used to free target folios if migration
142468711a74SDavid Rientjes  *			fails, or NULL if no special handling is necessary.
1425c73e5c9cSSrivatsa S. Bhat  * @private:		Private data to be passed on to get_new_page()
1426c73e5c9cSSrivatsa S. Bhat  * @mode:		The migration mode that specifies the constraints for
1427eaec4e63SHuang Ying  *			folio migration, if any.
1428eaec4e63SHuang Ying  * @reason:		The reason for folio migration.
1429eaec4e63SHuang Ying  * @ret_succeeded:	Set to the number of folios migrated successfully if
14305ac95884SYang Shi  *			the caller passes a non-NULL pointer.
1431e24f0b8fSChristoph Lameter  *
1432eaec4e63SHuang Ying  * The function returns after 10 attempts or when no folios are movable any
1433eaec4e63SHuang Ying  * more because the list has become empty or no retryable folios exist any more.
1434eaec4e63SHuang Ying  * It is the caller's responsibility to call putback_movable_pages() to return
1435dd4ae78aSYang Shi  * the folios to the LRU or free list, but only if the return value is non-zero.
1436e24f0b8fSChristoph Lameter  *
1437eaec4e63SHuang Ying  * Returns the number of {normal folios, large folios, hugetlb folios} that
1438eaec4e63SHuang Ying  * were not migrated, or an error code. A large folio that is split is counted
1439eaec4e63SHuang Ying  * as one non-migrated large folio, no matter how many of its split folios are
1440eaec4e63SHuang Ying  * migrated successfully.
1441e24f0b8fSChristoph Lameter  */
14429c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page,
144368711a74SDavid Rientjes 		free_page_t put_new_page, unsigned long private,
14445ac95884SYang Shi 		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1445e24f0b8fSChristoph Lameter {
1446e24f0b8fSChristoph Lameter 	int retry = 1;
1447eaec4e63SHuang Ying 	int large_retry = 1;
14481a5bae25SAnshuman Khandual 	int thp_retry = 1;
1449e24f0b8fSChristoph Lameter 	int nr_failed = 0;
1450b5bade97SBaolin Wang 	int nr_failed_pages = 0;
1451077309bcSHuang Ying 	int nr_retry_pages = 0;
14525647bc29SMel Gorman 	int nr_succeeded = 0;
14531a5bae25SAnshuman Khandual 	int nr_thp_succeeded = 0;
1454eaec4e63SHuang Ying 	int nr_large_failed = 0;
14551a5bae25SAnshuman Khandual 	int nr_thp_failed = 0;
14561a5bae25SAnshuman Khandual 	int nr_thp_split = 0;
1457e24f0b8fSChristoph Lameter 	int pass = 0;
1458eaec4e63SHuang Ying 	bool is_large = false;
14591a5bae25SAnshuman Khandual 	bool is_thp = false;
1460eaec4e63SHuang Ying 	struct folio *folio, *folio2;
1461eaec4e63SHuang Ying 	int rc, nr_pages;
1462eaec4e63SHuang Ying 	LIST_HEAD(ret_folios);
1463eaec4e63SHuang Ying 	LIST_HEAD(split_folios);
1464b0b515bfSYang Shi 	bool nosplit = (reason == MR_NUMA_MISPLACED);
1465eaec4e63SHuang Ying 	bool no_split_folio_counting = false;
14662d1db3b1SChristoph Lameter 
14677bc1aec5SLiam Mark 	trace_mm_migrate_pages_start(mode, reason);
14687bc1aec5SLiam Mark 
1469eaec4e63SHuang Ying split_folio_migration:
1470eaec4e63SHuang Ying 	for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
1471e24f0b8fSChristoph Lameter 		retry = 0;
1472eaec4e63SHuang Ying 		large_retry = 0;
14731a5bae25SAnshuman Khandual 		thp_retry = 0;
1474077309bcSHuang Ying 		nr_retry_pages = 0;
1475e24f0b8fSChristoph Lameter 
1476eaec4e63SHuang Ying 		list_for_each_entry_safe(folio, folio2, from, lru) {
14771a5bae25SAnshuman Khandual 			/*
1478eaec4e63SHuang Ying 			 * Large folio statistics are based on the source large
1479eaec4e63SHuang Ying 			 * folio. Capture required information that might get
1480eaec4e63SHuang Ying 			 * lost during migration.
14811a5bae25SAnshuman Khandual 			 */
1482eaec4e63SHuang Ying 			is_large = folio_test_large(folio) && !folio_test_hugetlb(folio);
1483eaec4e63SHuang Ying 			is_thp = is_large && folio_test_pmd_mappable(folio);
1484eaec4e63SHuang Ying 			nr_pages = folio_nr_pages(folio);
1485e24f0b8fSChristoph Lameter 			cond_resched();
1486e24f0b8fSChristoph Lameter 
1487eaec4e63SHuang Ying 			if (folio_test_hugetlb(folio))
148831caf665SNaoya Horiguchi 				rc = unmap_and_move_huge_page(get_new_page,
1489eaec4e63SHuang Ying 						put_new_page, private,
1490eaec4e63SHuang Ying 						&folio->page, pass > 2, mode,
1491eaec4e63SHuang Ying 						reason,
1492eaec4e63SHuang Ying 						&ret_folios);
149331caf665SNaoya Horiguchi 			else
149468711a74SDavid Rientjes 				rc = unmap_and_move(get_new_page, put_new_page,
1495eaec4e63SHuang Ying 						private, folio, pass > 2, mode,
1496eaec4e63SHuang Ying 						reason, &ret_folios);
1497dd4ae78aSYang Shi 			/*
1498dd4ae78aSYang Shi 			 * The rules are:
1499eaec4e63SHuang Ying 			 *	Success: non hugetlb folio will be freed, hugetlb
1500eaec4e63SHuang Ying 			 *		 folio will be put back
1501dd4ae78aSYang Shi 			 *	-EAGAIN: stay on the from list
1502dd4ae78aSYang Shi 			 *	-ENOMEM: stay on the from list
1503577be05cSHuang Ying 			 *	-ENOSYS: stay on the from list
1504eaec4e63SHuang Ying 			 *	Other errno: put on ret_folios list then splice to
1505dd4ae78aSYang Shi 			 *		     from list
1506dd4ae78aSYang Shi 			 */
1507e24f0b8fSChristoph Lameter 			switch(rc) {
150894723aafSMichal Hocko 			/*
1509eaec4e63SHuang Ying 			 * Large folio migration might be unsupported or
1510eaec4e63SHuang Ying 			 * the allocation could've failed so we should retry
1511eaec4e63SHuang Ying 			 * on the same folio with the large folio split
1512eaec4e63SHuang Ying 			 * to normal folios.
151394723aafSMichal Hocko 			 *
1514eaec4e63SHuang Ying 			 * Split folios are put in split_folios, and
1515e6fa8a79SHuang Ying 			 * we will migrate them after the rest of the
1516e6fa8a79SHuang Ying 			 * list is processed.
151794723aafSMichal Hocko 			 */
1518d532e2e5SYang Shi 			case -ENOSYS:
1519eaec4e63SHuang Ying 				/* Large folio migration is unsupported */
1520eaec4e63SHuang Ying 				if (is_large) {
1521eaec4e63SHuang Ying 					nr_large_failed++;
1522eaec4e63SHuang Ying 					nr_thp_failed += is_thp;
1523eaec4e63SHuang Ying 					if (!try_split_folio(folio, &split_folios)) {
1524eaec4e63SHuang Ying 						nr_thp_split += is_thp;
1525e6fa8a79SHuang Ying 						break;
1526d532e2e5SYang Shi 					}
1527f430893bSMiaohe Lin 				/* Hugetlb migration is unsupported */
1528eaec4e63SHuang Ying 				} else if (!no_split_folio_counting) {
1529f430893bSMiaohe Lin 					nr_failed++;
1530d532e2e5SYang Shi 				}
1531d532e2e5SYang Shi 
1532eaec4e63SHuang Ying 				nr_failed_pages += nr_pages;
1533eaec4e63SHuang Ying 				list_move_tail(&folio->lru, &ret_folios);
1534d532e2e5SYang Shi 				break;
1535d532e2e5SYang Shi 			case -ENOMEM:
1536d532e2e5SYang Shi 				/*
1537d532e2e5SYang Shi 				 * When memory is low, don't bother to try to migrate
1538eaec4e63SHuang Ying 				 * other folios, just exit.
1539d532e2e5SYang Shi 				 */
1540eaec4e63SHuang Ying 				if (is_large) {
1541eaec4e63SHuang Ying 					nr_large_failed++;
1542eaec4e63SHuang Ying 					nr_thp_failed += is_thp;
1543eaec4e63SHuang Ying 					/* Large folio NUMA faulting doesn't split to retry. */
1544fd4a7ac3SBaolin Wang 					if (!nosplit) {
1545eaec4e63SHuang Ying 						int ret = try_split_folio(folio, &split_folios);
1546fd4a7ac3SBaolin Wang 
1547fd4a7ac3SBaolin Wang 						if (!ret) {
1548eaec4e63SHuang Ying 							nr_thp_split += is_thp;
1549e6fa8a79SHuang Ying 							break;
1550fd4a7ac3SBaolin Wang 						} else if (reason == MR_LONGTERM_PIN &&
1551fd4a7ac3SBaolin Wang 							   ret == -EAGAIN) {
1552fd4a7ac3SBaolin Wang 							/*
1553eaec4e63SHuang Ying 							 * Try again to split large folio to
1554eaec4e63SHuang Ying 							 * mitigate the failure of longterm pinning.
1555fd4a7ac3SBaolin Wang 							 */
1556eaec4e63SHuang Ying 							large_retry++;
1557eaec4e63SHuang Ying 							thp_retry += is_thp;
1558eaec4e63SHuang Ying 							nr_retry_pages += nr_pages;
155994723aafSMichal Hocko 							break;
156094723aafSMichal Hocko 						}
156194723aafSMichal Hocko 					}
1562eaec4e63SHuang Ying 				} else if (!no_split_folio_counting) {
1563f430893bSMiaohe Lin 					nr_failed++;
15641a5bae25SAnshuman Khandual 				}
1565b5bade97SBaolin Wang 
1566eaec4e63SHuang Ying 				nr_failed_pages += nr_pages + nr_retry_pages;
156769a041ffSMiaohe Lin 				/*
1568eaec4e63SHuang Ying 				 * There might be some split folios of fail-to-migrate large
1569eaec4e63SHuang Ying 				 * folios left in the split_folios list. Move them back to the
157069a041ffSMiaohe Lin 				 * migration list so that they can be put back to the right list
1571eaec4e63SHuang Ying 				 * by the caller; otherwise the folio refcnt will be leaked.
157269a041ffSMiaohe Lin 				 */
1573eaec4e63SHuang Ying 				list_splice_init(&split_folios, from);
1574fbed53b4SHuang Ying 				/* nr_failed isn't updated for not used */
1575eaec4e63SHuang Ying 				nr_large_failed += large_retry;
157669a041ffSMiaohe Lin 				nr_thp_failed += thp_retry;
157795a402c3SChristoph Lameter 				goto out;
1578e24f0b8fSChristoph Lameter 			case -EAGAIN:
1579eaec4e63SHuang Ying 				if (is_large) {
1580eaec4e63SHuang Ying 					large_retry++;
1581eaec4e63SHuang Ying 					thp_retry += is_thp;
1582eaec4e63SHuang Ying 				} else if (!no_split_folio_counting) {
1583b20a3503SChristoph Lameter 					retry++;
1584eaec4e63SHuang Ying 				}
1585eaec4e63SHuang Ying 				nr_retry_pages += nr_pages;
1586e24f0b8fSChristoph Lameter 				break;
158778bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
1588eaec4e63SHuang Ying 				nr_succeeded += nr_pages;
1589eaec4e63SHuang Ying 				nr_thp_succeeded += is_thp;
15901a5bae25SAnshuman Khandual 				break;
1591e24f0b8fSChristoph Lameter 			default:
1592354a3363SNaoya Horiguchi 				/*
1593d532e2e5SYang Shi 				 * Permanent failure (-EBUSY, etc.):
1594eaec4e63SHuang Ying 				 * unlike -EAGAIN case, the failed folio is
1595eaec4e63SHuang Ying 				 * removed from migration folio list and not
1596354a3363SNaoya Horiguchi 				 * retried in the next outer loop.
1597354a3363SNaoya Horiguchi 				 */
1598eaec4e63SHuang Ying 				if (is_large) {
1599eaec4e63SHuang Ying 					nr_large_failed++;
1600eaec4e63SHuang Ying 					nr_thp_failed += is_thp;
1601eaec4e63SHuang Ying 				} else if (!no_split_folio_counting) {
1602b20a3503SChristoph Lameter 					nr_failed++;
1603eaec4e63SHuang Ying 				}
1604f430893bSMiaohe Lin 
1605eaec4e63SHuang Ying 				nr_failed_pages += nr_pages;
1606e24f0b8fSChristoph Lameter 				break;
1607b20a3503SChristoph Lameter 			}
1608b20a3503SChristoph Lameter 		}
1609e24f0b8fSChristoph Lameter 	}
1610b5bade97SBaolin Wang 	nr_failed += retry;
1611eaec4e63SHuang Ying 	nr_large_failed += large_retry;
16121a5bae25SAnshuman Khandual 	nr_thp_failed += thp_retry;
1613077309bcSHuang Ying 	nr_failed_pages += nr_retry_pages;
1614b5bade97SBaolin Wang 	/*
1615eaec4e63SHuang Ying 	 * Try to migrate split folios of fail-to-migrate large folios, with no
1616eaec4e63SHuang Ying 	 * nr_failed counting in this round, since all split folios of a
1617eaec4e63SHuang Ying 	 * large folio are counted as 1 failure in the first round.
1618b5bade97SBaolin Wang 	 */
1619eaec4e63SHuang Ying 	if (!list_empty(&split_folios)) {
1620b5bade97SBaolin Wang 		/*
1621eaec4e63SHuang Ying 		 * Move non-migrated folios (after 10 retries) to ret_folios
1622b5bade97SBaolin Wang 		 * to avoid migrating them again.
1623b5bade97SBaolin Wang 		 */
1624eaec4e63SHuang Ying 		list_splice_init(from, &ret_folios);
1625eaec4e63SHuang Ying 		list_splice_init(&split_folios, from);
1626eaec4e63SHuang Ying 		no_split_folio_counting = true;
1627b5bade97SBaolin Wang 		retry = 1;
1628eaec4e63SHuang Ying 		goto split_folio_migration;
1629b5bade97SBaolin Wang 	}
1630b5bade97SBaolin Wang 
1631eaec4e63SHuang Ying 	rc = nr_failed + nr_large_failed;
163295a402c3SChristoph Lameter out:
1633dd4ae78aSYang Shi 	/*
1634eaec4e63SHuang Ying 	 * Put the permanent-failure folios back on the migration list; they
1635dd4ae78aSYang Shi 	 * will be put back to the right list by the caller.
1636dd4ae78aSYang Shi 	 */
1637eaec4e63SHuang Ying 	list_splice(&ret_folios, from);
1638dd4ae78aSYang Shi 
163903e5f82eSBaolin Wang 	/*
1640eaec4e63SHuang Ying 	 * Return 0 in case all split folios of fail-to-migrate large folios
1641eaec4e63SHuang Ying 	 * are migrated successfully.
164203e5f82eSBaolin Wang 	 */
164303e5f82eSBaolin Wang 	if (list_empty(from))
164403e5f82eSBaolin Wang 		rc = 0;
164503e5f82eSBaolin Wang 
16465647bc29SMel Gorman 	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1647b5bade97SBaolin Wang 	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
16481a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
16491a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
16501a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1651b5bade97SBaolin Wang 	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
16521a5bae25SAnshuman Khandual 			       nr_thp_failed, nr_thp_split, mode, reason);
16537b2a2d4aSMel Gorman 
16545ac95884SYang Shi 	if (ret_succeeded)
16555ac95884SYang Shi 		*ret_succeeded = nr_succeeded;
16565ac95884SYang Shi 
165795a402c3SChristoph Lameter 	return rc;
1658b20a3503SChristoph Lameter }
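
/*
 * A minimal sketch of the get_new_page/put_new_page contract documented
 * above, under the assumption that only order-0 LRU pages are on the list.
 * The demo_* names are hypothetical; in-tree callers normally pass
 * alloc_migration_target() together with a struct migration_target_control
 * instead (see do_move_pages_to_node() below).
 */
#if 0	/* illustrative sketch only, not built */
static struct page *demo_new_page(struct page *page, unsigned long private)
{
	int nid = (int)private;

	/* Allocate the replacement page on the requested node. */
	return alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

static void demo_put_page(struct page *page, unsigned long private)
{
	/* Only called for new pages that ended up unused. */
	put_page(page);
}

static int demo_migrate_list_to_node(struct list_head *pages, int nid)
{
	unsigned int nr_succeeded = 0;
	int err;

	err = migrate_pages(pages, demo_new_page, demo_put_page,
			    (unsigned long)nid, MIGRATE_SYNC, MR_SYSCALL,
			    &nr_succeeded);
	if (err)
		putback_movable_pages(pages);
	return err;
}
#endif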
1659b20a3503SChristoph Lameter 
166019fc7bedSJoonsoo Kim struct page *alloc_migration_target(struct page *page, unsigned long private)
1661b4b38223SJoonsoo Kim {
1662ffe06786SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
166319fc7bedSJoonsoo Kim 	struct migration_target_control *mtc;
166419fc7bedSJoonsoo Kim 	gfp_t gfp_mask;
1665b4b38223SJoonsoo Kim 	unsigned int order = 0;
1666ffe06786SMatthew Wilcox (Oracle) 	struct folio *new_folio = NULL;
166719fc7bedSJoonsoo Kim 	int nid;
166819fc7bedSJoonsoo Kim 	int zidx;
166919fc7bedSJoonsoo Kim 
167019fc7bedSJoonsoo Kim 	mtc = (struct migration_target_control *)private;
167119fc7bedSJoonsoo Kim 	gfp_mask = mtc->gfp_mask;
167219fc7bedSJoonsoo Kim 	nid = mtc->nid;
167319fc7bedSJoonsoo Kim 	if (nid == NUMA_NO_NODE)
1674ffe06786SMatthew Wilcox (Oracle) 		nid = folio_nid(folio);
1675b4b38223SJoonsoo Kim 
1676ffe06786SMatthew Wilcox (Oracle) 	if (folio_test_hugetlb(folio)) {
1677e51da3a9SSidhartha Kumar 		struct hstate *h = folio_hstate(folio);
1678d92bbc27SJoonsoo Kim 
167919fc7bedSJoonsoo Kim 		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
168019fc7bedSJoonsoo Kim 		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1681d92bbc27SJoonsoo Kim 	}
1682b4b38223SJoonsoo Kim 
1683ffe06786SMatthew Wilcox (Oracle) 	if (folio_test_large(folio)) {
16849933a0c8SJoonsoo Kim 		/*
16859933a0c8SJoonsoo Kim 		 * clear __GFP_RECLAIM to make the migration callback
16869933a0c8SJoonsoo Kim 		 * consistent with regular THP allocations.
16879933a0c8SJoonsoo Kim 		 */
16889933a0c8SJoonsoo Kim 		gfp_mask &= ~__GFP_RECLAIM;
1689b4b38223SJoonsoo Kim 		gfp_mask |= GFP_TRANSHUGE;
1690ffe06786SMatthew Wilcox (Oracle) 		order = folio_order(folio);
1691b4b38223SJoonsoo Kim 	}
1692ffe06786SMatthew Wilcox (Oracle) 	zidx = zone_idx(folio_zone(folio));
169319fc7bedSJoonsoo Kim 	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1694b4b38223SJoonsoo Kim 		gfp_mask |= __GFP_HIGHMEM;
1695b4b38223SJoonsoo Kim 
1696ffe06786SMatthew Wilcox (Oracle) 	new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
1697b4b38223SJoonsoo Kim 
1698ffe06786SMatthew Wilcox (Oracle) 	return &new_folio->page;
1699b4b38223SJoonsoo Kim }
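
/*
 * The private argument above is interpreted as a struct migration_target_control.
 * A hedged sketch of a caller that prefers one node but allows fallback nodes
 * via nmask; the demo_* name is hypothetical and the setup is loosely modelled
 * on the memory-hotplug caller:
 */
#if 0	/* illustrative sketch only, not built */
static int demo_migrate_away(struct list_head *pages, int preferred_nid,
			     nodemask_t *allowed)
{
	struct migration_target_control mtc = {
		.nid = preferred_nid,			/* preferred node */
		.nmask = allowed,			/* fallback nodes, may be NULL */
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	return migrate_pages(pages, alloc_migration_target, NULL,
			     (unsigned long)&mtc, MIGRATE_SYNC,
			     MR_MEMORY_HOTPLUG, NULL);
}
#endif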
1700b4b38223SJoonsoo Kim 
1701742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1702742755a1SChristoph Lameter 
1703a49bd4d7SMichal Hocko static int store_status(int __user *status, int start, int value, int nr)
1704742755a1SChristoph Lameter {
1705a49bd4d7SMichal Hocko 	while (nr-- > 0) {
1706a49bd4d7SMichal Hocko 		if (put_user(value, status + start))
1707a49bd4d7SMichal Hocko 			return -EFAULT;
1708a49bd4d7SMichal Hocko 		start++;
1709a49bd4d7SMichal Hocko 	}
1710742755a1SChristoph Lameter 
1711a49bd4d7SMichal Hocko 	return 0;
1712a49bd4d7SMichal Hocko }
1713742755a1SChristoph Lameter 
1714a49bd4d7SMichal Hocko static int do_move_pages_to_node(struct mm_struct *mm,
1715a49bd4d7SMichal Hocko 		struct list_head *pagelist, int node)
1716a49bd4d7SMichal Hocko {
1717a49bd4d7SMichal Hocko 	int err;
1718a0976311SJoonsoo Kim 	struct migration_target_control mtc = {
1719a0976311SJoonsoo Kim 		.nid = node,
1720a0976311SJoonsoo Kim 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1721a0976311SJoonsoo Kim 	};
1722742755a1SChristoph Lameter 
1723a0976311SJoonsoo Kim 	err = migrate_pages(pagelist, alloc_migration_target, NULL,
17245ac95884SYang Shi 		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1725a49bd4d7SMichal Hocko 	if (err)
1726a49bd4d7SMichal Hocko 		putback_movable_pages(pagelist);
1727a49bd4d7SMichal Hocko 	return err;
1728742755a1SChristoph Lameter }
1729742755a1SChristoph Lameter 
1730742755a1SChristoph Lameter /*
1731a49bd4d7SMichal Hocko  * Resolves the given address to a struct page, isolates it from the LRU and
1732a49bd4d7SMichal Hocko  * puts it on the given pagelist.
1733e0153fc2SYang Shi  * Returns:
1734e0153fc2SYang Shi  *     errno - if the page cannot be found/isolated
1735e0153fc2SYang Shi  *     0 - when it doesn't have to be migrated because it is already on the
1736e0153fc2SYang Shi  *         target node
1737e0153fc2SYang Shi  *     1 - when it has been queued
1738742755a1SChristoph Lameter  */
1739a49bd4d7SMichal Hocko static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1740a49bd4d7SMichal Hocko 		int node, struct list_head *pagelist, bool migrate_all)
1741742755a1SChristoph Lameter {
1742742755a1SChristoph Lameter 	struct vm_area_struct *vma;
1743742755a1SChristoph Lameter 	struct page *page;
1744a49bd4d7SMichal Hocko 	int err;
1745742755a1SChristoph Lameter 
1746d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
1747742755a1SChristoph Lameter 	err = -EFAULT;
1748cb1c37b1SMiaohe Lin 	vma = vma_lookup(mm, addr);
1749cb1c37b1SMiaohe Lin 	if (!vma || !vma_migratable(vma))
1750a49bd4d7SMichal Hocko 		goto out;
1751742755a1SChristoph Lameter 
1752d899844eSKirill A. Shutemov 	/* FOLL_DUMP to ignore special (like zero) pages */
175387d2762eSMiaohe Lin 	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
175489f5b7daSLinus Torvalds 
175589f5b7daSLinus Torvalds 	err = PTR_ERR(page);
175689f5b7daSLinus Torvalds 	if (IS_ERR(page))
1757a49bd4d7SMichal Hocko 		goto out;
175889f5b7daSLinus Torvalds 
1759742755a1SChristoph Lameter 	err = -ENOENT;
1760f7091ed6SHaiyue Wang 	if (!page)
1761a49bd4d7SMichal Hocko 		goto out;
1762742755a1SChristoph Lameter 
1763f7091ed6SHaiyue Wang 	if (is_zone_device_page(page))
1764f7091ed6SHaiyue Wang 		goto out_putpage;
1765f7091ed6SHaiyue Wang 
1766a49bd4d7SMichal Hocko 	err = 0;
1767a49bd4d7SMichal Hocko 	if (page_to_nid(page) == node)
1768a49bd4d7SMichal Hocko 		goto out_putpage;
1769742755a1SChristoph Lameter 
1770742755a1SChristoph Lameter 	err = -EACCES;
1771a49bd4d7SMichal Hocko 	if (page_mapcount(page) > 1 && !migrate_all)
1772a49bd4d7SMichal Hocko 		goto out_putpage;
1773742755a1SChristoph Lameter 
1774e632a938SNaoya Horiguchi 	if (PageHuge(page)) {
1775e8db67ebSNaoya Horiguchi 		if (PageHead(page)) {
17767ce82f4cSMiaohe Lin 			err = isolate_hugetlb(page, pagelist);
17777ce82f4cSMiaohe Lin 			if (!err)
1778e0153fc2SYang Shi 				err = 1;
1779e8db67ebSNaoya Horiguchi 		}
1780a49bd4d7SMichal Hocko 	} else {
1781a49bd4d7SMichal Hocko 		struct page *head;
1782e632a938SNaoya Horiguchi 
1783e8db67ebSNaoya Horiguchi 		head = compound_head(page);
1784e8db67ebSNaoya Horiguchi 		err = isolate_lru_page(head);
1785a49bd4d7SMichal Hocko 		if (err)
1786a49bd4d7SMichal Hocko 			goto out_putpage;
1787a49bd4d7SMichal Hocko 
1788e0153fc2SYang Shi 		err = 1;
1789a49bd4d7SMichal Hocko 		list_add_tail(&head->lru, pagelist);
1790e8db67ebSNaoya Horiguchi 		mod_node_page_state(page_pgdat(head),
17919de4f22aSHuang Ying 			NR_ISOLATED_ANON + page_is_file_lru(head),
17926c357848SMatthew Wilcox (Oracle) 			thp_nr_pages(head));
17936d9c285aSKOSAKI Motohiro 	}
1794a49bd4d7SMichal Hocko out_putpage:
1795742755a1SChristoph Lameter 	/*
1796742755a1SChristoph Lameter 	 * Either remove the duplicate refcount from
1797742755a1SChristoph Lameter 	 * isolate_lru_page() or drop the page ref if it was
1798742755a1SChristoph Lameter 	 * not isolated.
1799742755a1SChristoph Lameter 	 */
1800742755a1SChristoph Lameter 	put_page(page);
1801a49bd4d7SMichal Hocko out:
1802d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
1803742755a1SChristoph Lameter 	return err;
1804742755a1SChristoph Lameter }
1805742755a1SChristoph Lameter 
18067ca8783aSWei Yang static int move_pages_and_store_status(struct mm_struct *mm, int node,
18077ca8783aSWei Yang 		struct list_head *pagelist, int __user *status,
18087ca8783aSWei Yang 		int start, int i, unsigned long nr_pages)
18097ca8783aSWei Yang {
18107ca8783aSWei Yang 	int err;
18117ca8783aSWei Yang 
18125d7ae891SWei Yang 	if (list_empty(pagelist))
18135d7ae891SWei Yang 		return 0;
18145d7ae891SWei Yang 
18157ca8783aSWei Yang 	err = do_move_pages_to_node(mm, pagelist, node);
18167ca8783aSWei Yang 	if (err) {
18177ca8783aSWei Yang 		/*
18187ca8783aSWei Yang 		 * A positive err means the number of pages that
18197ca8783aSWei Yang 		 * failed to migrate.  Since we are going to
18207ca8783aSWei Yang 		 * abort and return the number of non-migrated
1821ab9dd4f8SLong Li 		 * pages, we need to include the rest of the
18227ca8783aSWei Yang 		 * nr_pages that have not been attempted as
18237ca8783aSWei Yang 		 * well.
18247ca8783aSWei Yang 		 */
18257ca8783aSWei Yang 		if (err > 0)
1826a7504ed1SHuang Ying 			err += nr_pages - i;
18277ca8783aSWei Yang 		return err;
18287ca8783aSWei Yang 	}
18297ca8783aSWei Yang 	return store_status(status, start, node, i - start);
18307ca8783aSWei Yang }
18317ca8783aSWei Yang 
1832742755a1SChristoph Lameter /*
18335e9a0f02SBrice Goglin  * Migrate an array of page addresses to the requested nodes and fill
18345e9a0f02SBrice Goglin  * in the corresponding array of status values.
18355e9a0f02SBrice Goglin  */
18363268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
18375e9a0f02SBrice Goglin 			 unsigned long nr_pages,
18385e9a0f02SBrice Goglin 			 const void __user * __user *pages,
18395e9a0f02SBrice Goglin 			 const int __user *nodes,
18405e9a0f02SBrice Goglin 			 int __user *status, int flags)
18415e9a0f02SBrice Goglin {
1842a49bd4d7SMichal Hocko 	int current_node = NUMA_NO_NODE;
1843a49bd4d7SMichal Hocko 	LIST_HEAD(pagelist);
1844a49bd4d7SMichal Hocko 	int start, i;
1845a49bd4d7SMichal Hocko 	int err = 0, err1;
184635282a2dSBrice Goglin 
1847361a2a22SMinchan Kim 	lru_cache_disable();
184835282a2dSBrice Goglin 
1849a49bd4d7SMichal Hocko 	for (i = start = 0; i < nr_pages; i++) {
18505e9a0f02SBrice Goglin 		const void __user *p;
1851a49bd4d7SMichal Hocko 		unsigned long addr;
18525e9a0f02SBrice Goglin 		int node;
18535e9a0f02SBrice Goglin 
18543140a227SBrice Goglin 		err = -EFAULT;
1855a49bd4d7SMichal Hocko 		if (get_user(p, pages + i))
1856a49bd4d7SMichal Hocko 			goto out_flush;
1857a49bd4d7SMichal Hocko 		if (get_user(node, nodes + i))
1858a49bd4d7SMichal Hocko 			goto out_flush;
1859057d3389SAndrey Konovalov 		addr = (unsigned long)untagged_addr(p);
18605e9a0f02SBrice Goglin 
18615e9a0f02SBrice Goglin 		err = -ENODEV;
18626f5a55f1SLinus Torvalds 		if (node < 0 || node >= MAX_NUMNODES)
1863a49bd4d7SMichal Hocko 			goto out_flush;
1864389162c2SLai Jiangshan 		if (!node_state(node, N_MEMORY))
1865a49bd4d7SMichal Hocko 			goto out_flush;
18665e9a0f02SBrice Goglin 
18675e9a0f02SBrice Goglin 		err = -EACCES;
18685e9a0f02SBrice Goglin 		if (!node_isset(node, task_nodes))
1869a49bd4d7SMichal Hocko 			goto out_flush;
18705e9a0f02SBrice Goglin 
1871a49bd4d7SMichal Hocko 		if (current_node == NUMA_NO_NODE) {
1872a49bd4d7SMichal Hocko 			current_node = node;
1873a49bd4d7SMichal Hocko 			start = i;
1874a49bd4d7SMichal Hocko 		} else if (node != current_node) {
18757ca8783aSWei Yang 			err = move_pages_and_store_status(mm, current_node,
18767ca8783aSWei Yang 					&pagelist, status, start, i, nr_pages);
1877a49bd4d7SMichal Hocko 			if (err)
1878a49bd4d7SMichal Hocko 				goto out;
1879a49bd4d7SMichal Hocko 			start = i;
1880a49bd4d7SMichal Hocko 			current_node = node;
18815e9a0f02SBrice Goglin 		}
18825e9a0f02SBrice Goglin 
1883a49bd4d7SMichal Hocko 		/*
1884a49bd4d7SMichal Hocko 		 * Errors in the page lookup or isolation are not fatal and we simply
1885a49bd4d7SMichal Hocko 		 * report them via status
1886a49bd4d7SMichal Hocko 		 */
1887a49bd4d7SMichal Hocko 		err = add_page_for_migration(mm, addr, current_node,
1888a49bd4d7SMichal Hocko 				&pagelist, flags & MPOL_MF_MOVE_ALL);
1889e0153fc2SYang Shi 
1890d08221a0SWei Yang 		if (err > 0) {
1891e0153fc2SYang Shi 			/* The page is successfully queued for migration */
1892e0153fc2SYang Shi 			continue;
1893e0153fc2SYang Shi 		}
18943140a227SBrice Goglin 
1895d08221a0SWei Yang 		/*
189665462462SJohn Hubbard 		 * The move_pages() man page does not have an -EEXIST choice, so
189765462462SJohn Hubbard 		 * use -EFAULT instead.
189865462462SJohn Hubbard 		 */
189965462462SJohn Hubbard 		if (err == -EEXIST)
190065462462SJohn Hubbard 			err = -EFAULT;
190165462462SJohn Hubbard 
190265462462SJohn Hubbard 		/*
1903d08221a0SWei Yang 		 * If the page is already on the target node (!err), store the
1904d08221a0SWei Yang 		 * node, otherwise, store the err.
1905d08221a0SWei Yang 		 */
1906d08221a0SWei Yang 		err = store_status(status, i, err ? : current_node, 1);
1907a49bd4d7SMichal Hocko 		if (err)
1908a49bd4d7SMichal Hocko 			goto out_flush;
19093140a227SBrice Goglin 
19107ca8783aSWei Yang 		err = move_pages_and_store_status(mm, current_node, &pagelist,
19117ca8783aSWei Yang 				status, start, i, nr_pages);
1912a7504ed1SHuang Ying 		if (err) {
1913a7504ed1SHuang Ying 			/* We have accounted for page i */
1914a7504ed1SHuang Ying 			if (err > 0)
1915a7504ed1SHuang Ying 				err--;
1916a49bd4d7SMichal Hocko 			goto out;
1917a7504ed1SHuang Ying 		}
1918a49bd4d7SMichal Hocko 		current_node = NUMA_NO_NODE;
19193140a227SBrice Goglin 	}
1920a49bd4d7SMichal Hocko out_flush:
1921a49bd4d7SMichal Hocko 	/* Make sure we do not overwrite the existing error */
19227ca8783aSWei Yang 	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
19237ca8783aSWei Yang 				status, start, i, nr_pages);
1924dfe9aa23SWei Yang 	if (err >= 0)
1925a49bd4d7SMichal Hocko 		err = err1;
19265e9a0f02SBrice Goglin out:
1927361a2a22SMinchan Kim 	lru_cache_enable();
19285e9a0f02SBrice Goglin 	return err;
19295e9a0f02SBrice Goglin }
19305e9a0f02SBrice Goglin 
19315e9a0f02SBrice Goglin /*
19322f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status values.
1933742755a1SChristoph Lameter  */
193480bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
193580bba129SBrice Goglin 				const void __user **pages, int *status)
1936742755a1SChristoph Lameter {
19372f007e74SBrice Goglin 	unsigned long i;
1938742755a1SChristoph Lameter 
1939d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
19402f007e74SBrice Goglin 
19412f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
194280bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
19432f007e74SBrice Goglin 		struct vm_area_struct *vma;
19442f007e74SBrice Goglin 		struct page *page;
1945c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
19462f007e74SBrice Goglin 
1947059b8b48SLiam Howlett 		vma = vma_lookup(mm, addr);
1948059b8b48SLiam Howlett 		if (!vma)
1949742755a1SChristoph Lameter 			goto set_status;
1950742755a1SChristoph Lameter 
1951d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
195216fd6b31SBaolin Wang 		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
195389f5b7daSLinus Torvalds 
195489f5b7daSLinus Torvalds 		err = PTR_ERR(page);
195589f5b7daSLinus Torvalds 		if (IS_ERR(page))
195689f5b7daSLinus Torvalds 			goto set_status;
195789f5b7daSLinus Torvalds 
1958f7091ed6SHaiyue Wang 		err = -ENOENT;
1959f7091ed6SHaiyue Wang 		if (!page)
1960f7091ed6SHaiyue Wang 			goto set_status;
1961f7091ed6SHaiyue Wang 
1962f7091ed6SHaiyue Wang 		if (!is_zone_device_page(page))
19634cd61484SMiaohe Lin 			err = page_to_nid(page);
1964f7091ed6SHaiyue Wang 
19654cd61484SMiaohe Lin 		put_page(page);
1966742755a1SChristoph Lameter set_status:
196780bba129SBrice Goglin 		*status = err;
196880bba129SBrice Goglin 
196980bba129SBrice Goglin 		pages++;
197080bba129SBrice Goglin 		status++;
197180bba129SBrice Goglin 	}
197280bba129SBrice Goglin 
1973d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
197480bba129SBrice Goglin }
197580bba129SBrice Goglin 
19765b1b561bSArnd Bergmann static int get_compat_pages_array(const void __user *chunk_pages[],
19775b1b561bSArnd Bergmann 				  const void __user * __user *pages,
19785b1b561bSArnd Bergmann 				  unsigned long chunk_nr)
19795b1b561bSArnd Bergmann {
19805b1b561bSArnd Bergmann 	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
19815b1b561bSArnd Bergmann 	compat_uptr_t p;
19825b1b561bSArnd Bergmann 	int i;
19835b1b561bSArnd Bergmann 
19845b1b561bSArnd Bergmann 	for (i = 0; i < chunk_nr; i++) {
19855b1b561bSArnd Bergmann 		if (get_user(p, pages32 + i))
19865b1b561bSArnd Bergmann 			return -EFAULT;
19875b1b561bSArnd Bergmann 		chunk_pages[i] = compat_ptr(p);
19885b1b561bSArnd Bergmann 	}
19895b1b561bSArnd Bergmann 
19905b1b561bSArnd Bergmann 	return 0;
19915b1b561bSArnd Bergmann }
19925b1b561bSArnd Bergmann 
199380bba129SBrice Goglin /*
199480bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
199580bba129SBrice Goglin  * a user array of status values.
199680bba129SBrice Goglin  */
199780bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
199880bba129SBrice Goglin 			 const void __user * __user *pages,
199980bba129SBrice Goglin 			 int __user *status)
200080bba129SBrice Goglin {
20013eefb826SMiaohe Lin #define DO_PAGES_STAT_CHUNK_NR 16UL
200280bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
200380bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
200480bba129SBrice Goglin 
200587b8d1adSH. Peter Anvin 	while (nr_pages) {
20063eefb826SMiaohe Lin 		unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
200787b8d1adSH. Peter Anvin 
20085b1b561bSArnd Bergmann 		if (in_compat_syscall()) {
20095b1b561bSArnd Bergmann 			if (get_compat_pages_array(chunk_pages, pages,
20105b1b561bSArnd Bergmann 						   chunk_nr))
201187b8d1adSH. Peter Anvin 				break;
20125b1b561bSArnd Bergmann 		} else {
20135b1b561bSArnd Bergmann 			if (copy_from_user(chunk_pages, pages,
20145b1b561bSArnd Bergmann 				      chunk_nr * sizeof(*chunk_pages)))
20155b1b561bSArnd Bergmann 				break;
20165b1b561bSArnd Bergmann 		}
201780bba129SBrice Goglin 
201880bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
201980bba129SBrice Goglin 
202087b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
202187b8d1adSH. Peter Anvin 			break;
2022742755a1SChristoph Lameter 
202387b8d1adSH. Peter Anvin 		pages += chunk_nr;
202487b8d1adSH. Peter Anvin 		status += chunk_nr;
202587b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
202687b8d1adSH. Peter Anvin 	}
202787b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
2028742755a1SChristoph Lameter }
2029742755a1SChristoph Lameter 
20304dc200ceSMiaohe Lin static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
20314dc200ceSMiaohe Lin {
20324dc200ceSMiaohe Lin 	struct task_struct *task;
20334dc200ceSMiaohe Lin 	struct mm_struct *mm;
20344dc200ceSMiaohe Lin 
20354dc200ceSMiaohe Lin 	/*
20364dc200ceSMiaohe Lin 	 * There is no need to check whether the current process has the right
20374dc200ceSMiaohe Lin 	 * to modify the specified process when they are the same.
20384dc200ceSMiaohe Lin 	 */
20394dc200ceSMiaohe Lin 	if (!pid) {
20404dc200ceSMiaohe Lin 		mmget(current->mm);
20414dc200ceSMiaohe Lin 		*mem_nodes = cpuset_mems_allowed(current);
20424dc200ceSMiaohe Lin 		return current->mm;
20434dc200ceSMiaohe Lin 	}
20444dc200ceSMiaohe Lin 
20454dc200ceSMiaohe Lin 	/* Find the mm_struct */
20464dc200ceSMiaohe Lin 	rcu_read_lock();
20474dc200ceSMiaohe Lin 	task = find_task_by_vpid(pid);
20484dc200ceSMiaohe Lin 	if (!task) {
20494dc200ceSMiaohe Lin 		rcu_read_unlock();
20504dc200ceSMiaohe Lin 		return ERR_PTR(-ESRCH);
20514dc200ceSMiaohe Lin 	}
20524dc200ceSMiaohe Lin 	get_task_struct(task);
20534dc200ceSMiaohe Lin 
20544dc200ceSMiaohe Lin 	/*
20554dc200ceSMiaohe Lin 	 * Check if this process has the right to modify the specified
20564dc200ceSMiaohe Lin 	 * process. Use the regular "ptrace_may_access()" checks.
20574dc200ceSMiaohe Lin 	 */
20584dc200ceSMiaohe Lin 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
20594dc200ceSMiaohe Lin 		rcu_read_unlock();
20604dc200ceSMiaohe Lin 		mm = ERR_PTR(-EPERM);
20614dc200ceSMiaohe Lin 		goto out;
20624dc200ceSMiaohe Lin 	}
20634dc200ceSMiaohe Lin 	rcu_read_unlock();
20644dc200ceSMiaohe Lin 
20654dc200ceSMiaohe Lin 	mm = ERR_PTR(security_task_movememory(task));
20664dc200ceSMiaohe Lin 	if (IS_ERR(mm))
20674dc200ceSMiaohe Lin 		goto out;
20684dc200ceSMiaohe Lin 	*mem_nodes = cpuset_mems_allowed(task);
20694dc200ceSMiaohe Lin 	mm = get_task_mm(task);
20704dc200ceSMiaohe Lin out:
20714dc200ceSMiaohe Lin 	put_task_struct(task);
20724dc200ceSMiaohe Lin 	if (!mm)
20734dc200ceSMiaohe Lin 		mm = ERR_PTR(-EINVAL);
20744dc200ceSMiaohe Lin 	return mm;
20754dc200ceSMiaohe Lin }
20764dc200ceSMiaohe Lin 
2077742755a1SChristoph Lameter /*
2078742755a1SChristoph Lameter  * Move a list of pages in the address space of the target process (the
2079742755a1SChristoph Lameter  * currently executing process when pid is 0).
2080742755a1SChristoph Lameter  */
20817addf443SDominik Brodowski static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
20827addf443SDominik Brodowski 			     const void __user * __user *pages,
20837addf443SDominik Brodowski 			     const int __user *nodes,
20847addf443SDominik Brodowski 			     int __user *status, int flags)
2085742755a1SChristoph Lameter {
2086742755a1SChristoph Lameter 	struct mm_struct *mm;
20875e9a0f02SBrice Goglin 	int err;
20883268c63eSChristoph Lameter 	nodemask_t task_nodes;
2089742755a1SChristoph Lameter 
2090742755a1SChristoph Lameter 	/* Check flags */
2091742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2092742755a1SChristoph Lameter 		return -EINVAL;
2093742755a1SChristoph Lameter 
2094742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2095742755a1SChristoph Lameter 		return -EPERM;
2096742755a1SChristoph Lameter 
20974dc200ceSMiaohe Lin 	mm = find_mm_struct(pid, &task_nodes);
20984dc200ceSMiaohe Lin 	if (IS_ERR(mm))
20994dc200ceSMiaohe Lin 		return PTR_ERR(mm);
21006e8b09eaSSasha Levin 
21013268c63eSChristoph Lameter 	if (nodes)
21023268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
21033268c63eSChristoph Lameter 				    nodes, status, flags);
21043268c63eSChristoph Lameter 	else
21055e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
21063268c63eSChristoph Lameter 
21073268c63eSChristoph Lameter 	mmput(mm);
21083268c63eSChristoph Lameter 	return err;
2109742755a1SChristoph Lameter }
2110742755a1SChristoph Lameter 
21117addf443SDominik Brodowski SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
21127addf443SDominik Brodowski 		const void __user * __user *, pages,
21137addf443SDominik Brodowski 		const int __user *, nodes,
21147addf443SDominik Brodowski 		int __user *, status, int, flags)
21157addf443SDominik Brodowski {
21167addf443SDominik Brodowski 	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
21177addf443SDominik Brodowski }
21187addf443SDominik Brodowski 
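/*
 * Illustrative userspace sketch (not part of the kernel sources): using the
 * libnuma wrapper from <numaif.h>, a process could ask for one of its own
 * pages to be placed on node 1 roughly as follows; the buffer name and the
 * node number are made up for the example.
 *
 *	void *pages[1] = { buf };		// page-aligned user address
 *	int nodes[1]   = { 1 };			// desired destination node
 *	int status[1];
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 * A pid of 0 targets the calling process; rc < 0 reports a syscall-level
 * failure, while each status[] slot ends up holding the page's node or a
 * negative errno for that particular page.
 */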
21197039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING
21207039e1dbSPeter Zijlstra /*
21217039e1dbSPeter Zijlstra  * Returns true if this is a safe migration target node for misplaced NUMA
2122bc53008eSWei Yang  * pages. Currently it only checks the watermarks, which is crude.
21237039e1dbSPeter Zijlstra  */
21247039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
21253abef4e6SMel Gorman 				   unsigned long nr_migrate_pages)
21267039e1dbSPeter Zijlstra {
21277039e1dbSPeter Zijlstra 	int z;
2128599d0c95SMel Gorman 
21297039e1dbSPeter Zijlstra 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
21307039e1dbSPeter Zijlstra 		struct zone *zone = pgdat->node_zones + z;
21317039e1dbSPeter Zijlstra 
2132bc53008eSWei Yang 		if (!managed_zone(zone))
21337039e1dbSPeter Zijlstra 			continue;
21347039e1dbSPeter Zijlstra 
21357039e1dbSPeter Zijlstra 		/* Avoid waking kswapd by allocating nr_migrate_pages pages. */
21367039e1dbSPeter Zijlstra 		if (!zone_watermark_ok(zone, 0,
21377039e1dbSPeter Zijlstra 				       high_wmark_pages(zone) +
21387039e1dbSPeter Zijlstra 				       nr_migrate_pages,
2139bfe9d006SHuang Ying 				       ZONE_MOVABLE, 0))
21407039e1dbSPeter Zijlstra 			continue;
21417039e1dbSPeter Zijlstra 		return true;
21427039e1dbSPeter Zijlstra 	}
21437039e1dbSPeter Zijlstra 	return false;
21447039e1dbSPeter Zijlstra }
21457039e1dbSPeter Zijlstra 
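/*
 * Allocation callback for NUMA-balancing migrations: the replacement folio
 * must come from the target node (__GFP_THISNODE), and the gfp flags avoid
 * direct reclaim and allocation warnings so that a failed migration stays
 * cheap instead of stalling the faulting task.
 */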
21467039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page,
2147666feb21SMichal Hocko 					   unsigned long data)
21487039e1dbSPeter Zijlstra {
21497039e1dbSPeter Zijlstra 	int nid = (int) data;
2150c185e494SMatthew Wilcox (Oracle) 	int order = compound_order(page);
2151c185e494SMatthew Wilcox (Oracle) 	gfp_t gfp = __GFP_THISNODE;
2152c185e494SMatthew Wilcox (Oracle) 	struct folio *new;
21537039e1dbSPeter Zijlstra 
2154c185e494SMatthew Wilcox (Oracle) 	if (order > 0)
2155c185e494SMatthew Wilcox (Oracle) 		gfp |= GFP_TRANSHUGE_LIGHT;
2156c185e494SMatthew Wilcox (Oracle) 	else {
2157c185e494SMatthew Wilcox (Oracle) 		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2158c185e494SMatthew Wilcox (Oracle) 			__GFP_NOWARN;
2159c185e494SMatthew Wilcox (Oracle) 		gfp &= ~__GFP_RECLAIM;
21607039e1dbSPeter Zijlstra 	}
2161c185e494SMatthew Wilcox (Oracle) 	new = __folio_alloc_node(gfp, order, nid);
21627039e1dbSPeter Zijlstra 
2163c185e494SMatthew Wilcox (Oracle) 	return &new->page;
2164c5b5a3ddSYang Shi }
2165c5b5a3ddSYang Shi 
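/*
 * Try to isolate a misplaced page for NUMA migration.  Returns 1 if the page
 * was isolated, in which case the caller's page reference is dropped here and
 * replaced by the isolation reference; returns 0 if the page is unsuitable or
 * the target node is too full, leaving the caller's reference untouched.
 */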
21661c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
2167b32967ffSMel Gorman {
21682b9b624fSBaolin Wang 	int nr_pages = thp_nr_pages(page);
2169c574bbe9SHuang Ying 	int order = compound_order(page);
2170b32967ffSMel Gorman 
2171c574bbe9SHuang Ying 	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
21723abef4e6SMel Gorman 
2173662aeea7SYang Shi 	/* Do not migrate THP mapped by multiple processes */
2174662aeea7SYang Shi 	if (PageTransHuge(page) && total_mapcount(page) > 1)
2175662aeea7SYang Shi 		return 0;
2176662aeea7SYang Shi 
2177b32967ffSMel Gorman 	/* Avoid migrating to a node that is nearly full */
2178c574bbe9SHuang Ying 	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2179c574bbe9SHuang Ying 		int z;
2180c574bbe9SHuang Ying 
2181c574bbe9SHuang Ying 		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2182340ef390SHugh Dickins 			return 0;
2183c574bbe9SHuang Ying 		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2184bc53008eSWei Yang 			if (managed_zone(pgdat->node_zones + z))
2185c574bbe9SHuang Ying 				break;
2186c574bbe9SHuang Ying 		}
2187c574bbe9SHuang Ying 		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
2188c574bbe9SHuang Ying 		return 0;
2189c574bbe9SHuang Ying 	}
2190b32967ffSMel Gorman 
2191340ef390SHugh Dickins 	if (isolate_lru_page(page))
2192340ef390SHugh Dickins 		return 0;
2193340ef390SHugh Dickins 
2194b75454e1SMiaohe Lin 	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
21952b9b624fSBaolin Wang 			    nr_pages);
2196b32967ffSMel Gorman 
2197b32967ffSMel Gorman 	/*
2198340ef390SHugh Dickins 	 * Isolating the page has taken another reference, so the
2199340ef390SHugh Dickins 	 * caller's reference can be safely dropped without the page
2200340ef390SHugh Dickins 	 * disappearing underneath us during migration.
2201b32967ffSMel Gorman 	 */
2202b32967ffSMel Gorman 	put_page(page);
2203340ef390SHugh Dickins 	return 1;
2204b32967ffSMel Gorman }
2205b32967ffSMel Gorman 
2206a8f60772SMel Gorman /*
22077039e1dbSPeter Zijlstra  * Attempt to migrate a misplaced page to the specified destination
22087039e1dbSPeter Zijlstra  * node. Caller is expected to have an elevated reference count on
22097039e1dbSPeter Zijlstra  * the page that will be dropped by this function before returning.
22107039e1dbSPeter Zijlstra  */
22111bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
22121bc115d8SMel Gorman 			   int node)
22137039e1dbSPeter Zijlstra {
2214a8f60772SMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
2215340ef390SHugh Dickins 	int isolated;
2216b32967ffSMel Gorman 	int nr_remaining;
2217e39bb6beSHuang Ying 	unsigned int nr_succeeded;
22187039e1dbSPeter Zijlstra 	LIST_HEAD(migratepages);
2219b5916c02SAneesh Kumar K.V 	int nr_pages = thp_nr_pages(page);
2220c5b5a3ddSYang Shi 
2221c5b5a3ddSYang Shi 	/*
22221bc115d8SMel Gorman 	 * Don't migrate file pages that are mapped in multiple processes
22231bc115d8SMel Gorman 	 * with execute permissions, as they are probably shared libraries.
22247039e1dbSPeter Zijlstra 	 */
22257ee820eeSMiaohe Lin 	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
22267ee820eeSMiaohe Lin 	    (vma->vm_flags & VM_EXEC))
22277039e1dbSPeter Zijlstra 		goto out;
22287039e1dbSPeter Zijlstra 
2229a8f60772SMel Gorman 	/*
223009a913a7SMel Gorman 	 * Also do not migrate dirty pages: not all filesystems can move dirty
223109a913a7SMel Gorman 	 * pages in MIGRATE_ASYNC mode, so attempting it is a waste of cycles.
223209a913a7SMel Gorman 	 */
22339de4f22aSHuang Ying 	if (page_is_file_lru(page) && PageDirty(page))
223409a913a7SMel Gorman 		goto out;
223509a913a7SMel Gorman 
2236b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
2237b32967ffSMel Gorman 	if (!isolated)
22387039e1dbSPeter Zijlstra 		goto out;
22397039e1dbSPeter Zijlstra 
22407039e1dbSPeter Zijlstra 	list_add(&page->lru, &migratepages);
2241c185e494SMatthew Wilcox (Oracle) 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
2242c185e494SMatthew Wilcox (Oracle) 				     NULL, node, MIGRATE_ASYNC,
2243c185e494SMatthew Wilcox (Oracle) 				     MR_NUMA_MISPLACED, &nr_succeeded);
22447039e1dbSPeter Zijlstra 	if (nr_remaining) {
224559c82b70SJoonsoo Kim 		if (!list_empty(&migratepages)) {
224659c82b70SJoonsoo Kim 			list_del(&page->lru);
2247c5fc5c3aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2248c5fc5c3aSYang Shi 					page_is_file_lru(page), -nr_pages);
224959c82b70SJoonsoo Kim 			putback_lru_page(page);
225059c82b70SJoonsoo Kim 		}
22517039e1dbSPeter Zijlstra 		isolated = 0;
2252e39bb6beSHuang Ying 	}
2253e39bb6beSHuang Ying 	if (nr_succeeded) {
2254e39bb6beSHuang Ying 		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2255e39bb6beSHuang Ying 		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
2256e39bb6beSHuang Ying 			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
2257e39bb6beSHuang Ying 					    nr_succeeded);
2258e39bb6beSHuang Ying 	}
22597039e1dbSPeter Zijlstra 	BUG_ON(!list_empty(&migratepages));
22607039e1dbSPeter Zijlstra 	return isolated;
2261340ef390SHugh Dickins 
2262340ef390SHugh Dickins out:
2263340ef390SHugh Dickins 	put_page(page);
2264340ef390SHugh Dickins 	return 0;
22657039e1dbSPeter Zijlstra }
2266220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */
22677d6e2d96SOscar Salvador #endif /* CONFIG_NUMA */
2268