xref: /linux/mm/migrate.c (revision c33db29231ad242b0c381c60b1603f5e1dec7e46)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
2b20a3503SChristoph Lameter /*
314e0f9bcSHugh Dickins  * Memory Migration functionality - linux/mm/migrate.c
4b20a3503SChristoph Lameter  *
5b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
6b20a3503SChristoph Lameter  *
7b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
8b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
9b20a3503SChristoph Lameter  *
10b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
11b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
12b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
13cde53535SChristoph Lameter  * Christoph Lameter
14b20a3503SChristoph Lameter  */
15b20a3503SChristoph Lameter 
16b20a3503SChristoph Lameter #include <linux/migrate.h>
17b95f1b31SPaul Gortmaker #include <linux/export.h>
18b20a3503SChristoph Lameter #include <linux/swap.h>
190697212aSChristoph Lameter #include <linux/swapops.h>
20b20a3503SChristoph Lameter #include <linux/pagemap.h>
21e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
22b20a3503SChristoph Lameter #include <linux/mm_inline.h>
23b488893aSPavel Emelyanov #include <linux/nsproxy.h>
24b20a3503SChristoph Lameter #include <linux/pagevec.h>
25e9995ef9SHugh Dickins #include <linux/ksm.h>
26b20a3503SChristoph Lameter #include <linux/rmap.h>
27b20a3503SChristoph Lameter #include <linux/topology.h>
28b20a3503SChristoph Lameter #include <linux/cpu.h>
29b20a3503SChristoph Lameter #include <linux/cpuset.h>
3004e62a29SChristoph Lameter #include <linux/writeback.h>
31742755a1SChristoph Lameter #include <linux/mempolicy.h>
32742755a1SChristoph Lameter #include <linux/vmalloc.h>
3386c3a764SDavid Quigley #include <linux/security.h>
3442cb14b1SHugh Dickins #include <linux/backing-dev.h>
35bda807d4SMinchan Kim #include <linux/compaction.h>
364f5ca265SAdrian Bunk #include <linux/syscalls.h>
377addf443SDominik Brodowski #include <linux/compat.h>
38290408d4SNaoya Horiguchi #include <linux/hugetlb.h>
398e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
405a0e3ad6STejun Heo #include <linux/gfp.h>
41df6ad698SJérôme Glisse #include <linux/pfn_t.h>
42a5430ddaSJérôme Glisse #include <linux/memremap.h>
438315ada7SJérôme Glisse #include <linux/userfaultfd_k.h>
44bf6bddf1SRafael Aquini #include <linux/balloon_compaction.h>
4533c3fc71SVladimir Davydov #include <linux/page_idle.h>
46d435edcaSVlastimil Babka #include <linux/page_owner.h>
476e84f315SIngo Molnar #include <linux/sched/mm.h>
48197e7e52SLinus Torvalds #include <linux/ptrace.h>
4934290e2cSRalph Campbell #include <linux/oom.h>
50884a6e5dSDave Hansen #include <linux/memory.h>
51ac16ec83SBaolin Wang #include <linux/random.h>
52c574bbe9SHuang Ying #include <linux/sched/sysctl.h>
53467b171aSAneesh Kumar K.V #include <linux/memory-tiers.h>
54b20a3503SChristoph Lameter 
550d1836c3SMichal Nazarewicz #include <asm/tlbflush.h>
560d1836c3SMichal Nazarewicz 
577b2a2d4aSMel Gorman #include <trace/events/migrate.h>
587b2a2d4aSMel Gorman 
59b20a3503SChristoph Lameter #include "internal.h"
60b20a3503SChristoph Lameter 
619e5bcd61SYisheng Xie int isolate_movable_page(struct page *page, isolate_mode_t mode)
62bda807d4SMinchan Kim {
6368f2736aSMatthew Wilcox (Oracle) 	const struct movable_operations *mops;
64bda807d4SMinchan Kim 
65bda807d4SMinchan Kim 	/*
66bda807d4SMinchan Kim 	 * Avoid burning cycles on pages that are still under __free_pages(),
67bda807d4SMinchan Kim 	 * or that just got freed under us.
68bda807d4SMinchan Kim 	 *
69bda807d4SMinchan Kim 	 * In case we 'win' a race for a movable page being freed under us and
70bda807d4SMinchan Kim 	 * raise its refcount preventing __free_pages() from doing its job
71bda807d4SMinchan Kim 	 * raise its refcount, preventing __free_pages() from doing its job,
72bda807d4SMinchan Kim 	 * the put_page() at the end of this function will take care of
73bda807d4SMinchan Kim 	 * releasing this page, thus avoiding a nasty leak.
74bda807d4SMinchan Kim 	if (unlikely(!get_page_unless_zero(page)))
75bda807d4SMinchan Kim 		goto out;
76bda807d4SMinchan Kim 
77bda807d4SMinchan Kim 	/*
78bda807d4SMinchan Kim 	 * Check __PageMovable() before taking the page lock: the page's owner
79bda807d4SMinchan Kim 	 * assumes that nobody touches the PG_locked bit of a newly allocated
808bb4e7a2SWei Yang 	 * page, so unconditionally grabbing the lock would break that assumption.
81bda807d4SMinchan Kim 	 */
82bda807d4SMinchan Kim 	if (unlikely(!__PageMovable(page)))
83bda807d4SMinchan Kim 		goto out_putpage;
84bda807d4SMinchan Kim 	/*
85bda807d4SMinchan Kim 	 * As movable pages are not isolated from LRU lists, concurrent
86bda807d4SMinchan Kim 	 * compaction threads can race against page migration functions
87bda807d4SMinchan Kim 	 * as well as against a page being released.
88bda807d4SMinchan Kim 	 *
89bda807d4SMinchan Kim 	 * In order to avoid having an already isolated movable page
90bda807d4SMinchan Kim 	 * being (wrongly) re-isolated while it is under migration,
91bda807d4SMinchan Kim 	 * or to avoid attempting to isolate pages being released,
92bda807d4SMinchan Kim 	 * let's be sure we have the page lock
93bda807d4SMinchan Kim 	 * before proceeding with the movable page isolation steps.
94bda807d4SMinchan Kim 	 */
95bda807d4SMinchan Kim 	if (unlikely(!trylock_page(page)))
96bda807d4SMinchan Kim 		goto out_putpage;
97bda807d4SMinchan Kim 
98bda807d4SMinchan Kim 	if (!PageMovable(page) || PageIsolated(page))
99bda807d4SMinchan Kim 		goto out_no_isolated;
100bda807d4SMinchan Kim 
10168f2736aSMatthew Wilcox (Oracle) 	mops = page_movable_ops(page);
10268f2736aSMatthew Wilcox (Oracle) 	VM_BUG_ON_PAGE(!mops, page);
103bda807d4SMinchan Kim 
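	/*
	 * Hand the actual isolation off to the driver: mops->isolate_page()
	 * returns false if the driver cannot (or will not) isolate this page
	 * right now, in which case we back out without setting PG_isolated.
	 */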
10468f2736aSMatthew Wilcox (Oracle) 	if (!mops->isolate_page(page, mode))
105bda807d4SMinchan Kim 		goto out_no_isolated;
106bda807d4SMinchan Kim 
107bda807d4SMinchan Kim 	/* Driver shouldn't use PG_isolated bit of page->flags */
108bda807d4SMinchan Kim 	WARN_ON_ONCE(PageIsolated(page));
109356ea386Sandrew.yang 	SetPageIsolated(page);
110bda807d4SMinchan Kim 	unlock_page(page);
111bda807d4SMinchan Kim 
1129e5bcd61SYisheng Xie 	return 0;
113bda807d4SMinchan Kim 
114bda807d4SMinchan Kim out_no_isolated:
115bda807d4SMinchan Kim 	unlock_page(page);
116bda807d4SMinchan Kim out_putpage:
117bda807d4SMinchan Kim 	put_page(page);
118bda807d4SMinchan Kim out:
1199e5bcd61SYisheng Xie 	return -EBUSY;
120bda807d4SMinchan Kim }
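
/*
 * Illustrative sketch (not part of this file): a driver whose pages should
 * be handled by the non-LRU migration path above supplies the three
 * movable_operations callbacks used here, roughly:
 *
 *	static const struct movable_operations example_mops = {
 *		.isolate_page	= example_isolate,
 *		.migrate_page	= example_migrate,
 *		.putback_page	= example_putback,
 *	};
 *
 * "example_*" are hypothetical names used only for illustration; zsmalloc
 * and the balloon compaction code are in-tree users.
 */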
121bda807d4SMinchan Kim 
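/*
 * Undo a successful isolate_movable_page().  The caller must hold the page
 * lock and the page must still carry a movable mapping.
 */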
122606a6f71SMiaohe Lin static void putback_movable_page(struct page *page)
123bda807d4SMinchan Kim {
12468f2736aSMatthew Wilcox (Oracle) 	const struct movable_operations *mops = page_movable_ops(page);
125bda807d4SMinchan Kim 
12668f2736aSMatthew Wilcox (Oracle) 	mops->putback_page(page);
127356ea386Sandrew.yang 	ClearPageIsolated(page);
128bda807d4SMinchan Kim }
129bda807d4SMinchan Kim 
130b20a3503SChristoph Lameter /*
1315733c7d1SRafael Aquini  * Put previously isolated pages back onto the appropriate lists
1325733c7d1SRafael Aquini  * from where they were once taken off for compaction/migration.
1335733c7d1SRafael Aquini  *
13459c82b70SJoonsoo Kim  * This function shall be used whenever the isolated pageset has been
13559c82b70SJoonsoo Kim  * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
1367ce82f4cSMiaohe Lin  * and isolate_hugetlb().
1375733c7d1SRafael Aquini  */
1385733c7d1SRafael Aquini void putback_movable_pages(struct list_head *l)
1395733c7d1SRafael Aquini {
1405733c7d1SRafael Aquini 	struct page *page;
1415733c7d1SRafael Aquini 	struct page *page2;
1425733c7d1SRafael Aquini 
1435733c7d1SRafael Aquini 	list_for_each_entry_safe(page, page2, l, lru) {
14431caf665SNaoya Horiguchi 		if (unlikely(PageHuge(page))) {
14531caf665SNaoya Horiguchi 			putback_active_hugepage(page);
14631caf665SNaoya Horiguchi 			continue;
14731caf665SNaoya Horiguchi 		}
1485733c7d1SRafael Aquini 		list_del(&page->lru);
149bda807d4SMinchan Kim 		/*
150bda807d4SMinchan Kim 		 * We isolated a non-LRU movable page, so here we can use
151bda807d4SMinchan Kim 		 * __PageMovable() because an LRU page's mapping cannot have
152bda807d4SMinchan Kim 		 * PAGE_MAPPING_MOVABLE set.
153bda807d4SMinchan Kim 		 */
154b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(page))) {
155bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
156bda807d4SMinchan Kim 			lock_page(page);
157bda807d4SMinchan Kim 			if (PageMovable(page))
158bda807d4SMinchan Kim 				putback_movable_page(page);
159bf6bddf1SRafael Aquini 			else
160356ea386Sandrew.yang 				ClearPageIsolated(page);
161bda807d4SMinchan Kim 			unlock_page(page);
162bda807d4SMinchan Kim 			put_page(page);
163bda807d4SMinchan Kim 		} else {
164e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1656c357848SMatthew Wilcox (Oracle) 					page_is_file_lru(page), -thp_nr_pages(page));
166fc280fe8SRabin Vincent 			putback_lru_page(page);
167b20a3503SChristoph Lameter 		}
168b20a3503SChristoph Lameter 	}
169bda807d4SMinchan Kim }
170b20a3503SChristoph Lameter 
1710697212aSChristoph Lameter /*
1720697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
1730697212aSChristoph Lameter  */
1742f031c6fSMatthew Wilcox (Oracle) static bool remove_migration_pte(struct folio *folio,
1752f031c6fSMatthew Wilcox (Oracle) 		struct vm_area_struct *vma, unsigned long addr, void *old)
1760697212aSChristoph Lameter {
1774eecb8b9SMatthew Wilcox (Oracle) 	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
1780697212aSChristoph Lameter 
1793fe87967SKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
1806c287605SDavid Hildenbrand 		rmap_t rmap_flags = RMAP_NONE;
1810697212aSChristoph Lameter 		pte_t pte;
1820697212aSChristoph Lameter 		swp_entry_t entry;
1834eecb8b9SMatthew Wilcox (Oracle) 		struct page *new;
1844eecb8b9SMatthew Wilcox (Oracle) 		unsigned long idx = 0;
1850697212aSChristoph Lameter 
1864eecb8b9SMatthew Wilcox (Oracle) 		/* pgoff is invalid for ksm pages, but they are never large */
1874eecb8b9SMatthew Wilcox (Oracle) 		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
1884eecb8b9SMatthew Wilcox (Oracle) 			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
1894eecb8b9SMatthew Wilcox (Oracle) 		new = folio_page(folio, idx);
1900697212aSChristoph Lameter 
191616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
192616b8371SZi Yan 		/* PMD-mapped THP migration entry */
193616b8371SZi Yan 		if (!pvmw.pte) {
1944eecb8b9SMatthew Wilcox (Oracle) 			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
1954eecb8b9SMatthew Wilcox (Oracle) 					!folio_test_pmd_mappable(folio), folio);
196616b8371SZi Yan 			remove_migration_pmd(&pvmw, new);
197616b8371SZi Yan 			continue;
198616b8371SZi Yan 		}
199616b8371SZi Yan #endif
200616b8371SZi Yan 
2014eecb8b9SMatthew Wilcox (Oracle) 		folio_get(folio);
2022e346877SPeter Xu 		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
2033fe87967SKirill A. Shutemov 		if (pte_swp_soft_dirty(*pvmw.pte))
204c3d16e16SCyrill Gorcunov 			pte = pte_mksoft_dirty(pte);
205d3cb8bf6SMel Gorman 
2063fe87967SKirill A. Shutemov 		/*
2073fe87967SKirill A. Shutemov 		 * Recheck VMA as permissions can change since migration started
2083fe87967SKirill A. Shutemov 		 */
2093fe87967SKirill A. Shutemov 		entry = pte_to_swp_entry(*pvmw.pte);
2102e346877SPeter Xu 		if (!is_migration_entry_young(entry))
2112e346877SPeter Xu 			pte = pte_mkold(pte);
2122e346877SPeter Xu 		if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
2132e346877SPeter Xu 			pte = pte_mkdirty(pte);
2144dd845b5SAlistair Popple 		if (is_writable_migration_entry(entry))
215d3cb8bf6SMel Gorman 			pte = maybe_mkwrite(pte, vma);
216f45ec5ffSPeter Xu 		else if (pte_swp_uffd_wp(*pvmw.pte))
217f45ec5ffSPeter Xu 			pte = pte_mkuffd_wp(pte);
218d3cb8bf6SMel Gorman 
2196c287605SDavid Hildenbrand 		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
2206c287605SDavid Hildenbrand 			rmap_flags |= RMAP_EXCLUSIVE;
2216c287605SDavid Hildenbrand 
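		/*
		 * Device private pages live in device memory and are never
		 * mapped with an ordinary present pte; install a device
		 * private swap entry instead so that CPU accesses keep
		 * faulting into the driver.
		 */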
2226128763fSRalph Campbell 		if (unlikely(is_device_private_page(new))) {
2234dd845b5SAlistair Popple 			if (pte_write(pte))
2244dd845b5SAlistair Popple 				entry = make_writable_device_private_entry(
2254dd845b5SAlistair Popple 							page_to_pfn(new));
2264dd845b5SAlistair Popple 			else
2274dd845b5SAlistair Popple 				entry = make_readable_device_private_entry(
2284dd845b5SAlistair Popple 							page_to_pfn(new));
229a5430ddaSJérôme Glisse 			pte = swp_entry_to_pte(entry);
2303d321bf8SRalph Campbell 			if (pte_swp_soft_dirty(*pvmw.pte))
2313d321bf8SRalph Campbell 				pte = pte_swp_mksoft_dirty(pte);
232f45ec5ffSPeter Xu 			if (pte_swp_uffd_wp(*pvmw.pte))
233ebdf8321SAlistair Popple 				pte = pte_swp_mkuffd_wp(pte);
234df6ad698SJérôme Glisse 		}
235a5430ddaSJérôme Glisse 
2363ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE
2374eecb8b9SMatthew Wilcox (Oracle) 		if (folio_test_hugetlb(folio)) {
23879c1c594SChristophe Leroy 			unsigned int shift = huge_page_shift(hstate_vma(vma));
23979c1c594SChristophe Leroy 
240290408d4SNaoya Horiguchi 			pte = pte_mkhuge(pte);
24179c1c594SChristophe Leroy 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
2424eecb8b9SMatthew Wilcox (Oracle) 			if (folio_test_anon(folio))
24328c5209dSDavid Hildenbrand 				hugepage_add_anon_rmap(new, vma, pvmw.address,
2446c287605SDavid Hildenbrand 						       rmap_flags);
245290408d4SNaoya Horiguchi 			else
246fb3d824dSDavid Hildenbrand 				page_dup_file_rmap(new, true);
2471eba86c0SPasha Tatashin 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
248383321abSAneesh Kumar K.V 		} else
249383321abSAneesh Kumar K.V #endif
250383321abSAneesh Kumar K.V 		{
2514eecb8b9SMatthew Wilcox (Oracle) 			if (folio_test_anon(folio))
252f1e2db12SDavid Hildenbrand 				page_add_anon_rmap(new, vma, pvmw.address,
2536c287605SDavid Hildenbrand 						   rmap_flags);
25404e62a29SChristoph Lameter 			else
255cea86fe2SHugh Dickins 				page_add_file_rmap(new, vma, false);
2561eba86c0SPasha Tatashin 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
257383321abSAneesh Kumar K.V 		}
258b7435507SHugh Dickins 		if (vma->vm_flags & VM_LOCKED)
259adb11e78SSebastian Andrzej Siewior 			mlock_page_drain_local();
260e125fe40SKirill A. Shutemov 
2614cc79b33SAnshuman Khandual 		trace_remove_migration_pte(pvmw.address, pte_val(pte),
2624cc79b33SAnshuman Khandual 					   compound_order(new));
2634cc79b33SAnshuman Khandual 
26404e62a29SChristoph Lameter 		/* No need to invalidate - it was non-present before */
2653fe87967SKirill A. Shutemov 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
2663fe87967SKirill A. Shutemov 	}
2673fe87967SKirill A. Shutemov 
268e4b82222SMinchan Kim 	return true;
2690697212aSChristoph Lameter }
2700697212aSChristoph Lameter 
2710697212aSChristoph Lameter /*
27204e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
27304e62a29SChristoph Lameter  * references to the indicated page.
27404e62a29SChristoph Lameter  */
2754eecb8b9SMatthew Wilcox (Oracle) void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
27604e62a29SChristoph Lameter {
277051ac83aSJoonsoo Kim 	struct rmap_walk_control rwc = {
278051ac83aSJoonsoo Kim 		.rmap_one = remove_migration_pte,
2794eecb8b9SMatthew Wilcox (Oracle) 		.arg = src,
280051ac83aSJoonsoo Kim 	};
281051ac83aSJoonsoo Kim 
282e388466dSKirill A. Shutemov 	if (locked)
2832f031c6fSMatthew Wilcox (Oracle) 		rmap_walk_locked(dst, &rwc);
284e388466dSKirill A. Shutemov 	else
2852f031c6fSMatthew Wilcox (Oracle) 		rmap_walk(dst, &rwc);
28604e62a29SChristoph Lameter }
28704e62a29SChristoph Lameter 
28804e62a29SChristoph Lameter /*
2890697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
2900697212aSChristoph Lameter  * get to the page and wait until migration is finished.
2910697212aSChristoph Lameter  * When we return from this function the fault will be retried.
2920697212aSChristoph Lameter  */
293e66f17ffSNaoya Horiguchi void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
29430dad309SNaoya Horiguchi 				spinlock_t *ptl)
2950697212aSChristoph Lameter {
29630dad309SNaoya Horiguchi 	pte_t pte;
2970697212aSChristoph Lameter 	swp_entry_t entry;
2980697212aSChristoph Lameter 
29930dad309SNaoya Horiguchi 	spin_lock(ptl);
3000697212aSChristoph Lameter 	pte = *ptep;
3010697212aSChristoph Lameter 	if (!is_swap_pte(pte))
3020697212aSChristoph Lameter 		goto out;
3030697212aSChristoph Lameter 
3040697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
3050697212aSChristoph Lameter 	if (!is_migration_entry(entry))
3060697212aSChristoph Lameter 		goto out;
3070697212aSChristoph Lameter 
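	/*
	 * migration_entry_wait_on_locked() drops the pte lock itself and
	 * sleeps until the folio under migration is unlocked, which is why
	 * this path returns without calling pte_unmap_unlock().
	 */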
308ffa65753SAlistair Popple 	migration_entry_wait_on_locked(entry, ptep, ptl);
3090697212aSChristoph Lameter 	return;
3100697212aSChristoph Lameter out:
3110697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3120697212aSChristoph Lameter }
3130697212aSChristoph Lameter 
31430dad309SNaoya Horiguchi void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
31530dad309SNaoya Horiguchi 				unsigned long address)
31630dad309SNaoya Horiguchi {
31730dad309SNaoya Horiguchi 	spinlock_t *ptl = pte_lockptr(mm, pmd);
31830dad309SNaoya Horiguchi 	pte_t *ptep = pte_offset_map(pmd, address);
31930dad309SNaoya Horiguchi 	__migration_entry_wait(mm, ptep, ptl);
32030dad309SNaoya Horiguchi }
32130dad309SNaoya Horiguchi 
322ad1ac596SMiaohe Lin #ifdef CONFIG_HUGETLB_PAGE
323ad1ac596SMiaohe Lin void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
32430dad309SNaoya Horiguchi {
325ad1ac596SMiaohe Lin 	pte_t pte;
326ad1ac596SMiaohe Lin 
327ad1ac596SMiaohe Lin 	spin_lock(ptl);
328ad1ac596SMiaohe Lin 	pte = huge_ptep_get(ptep);
329ad1ac596SMiaohe Lin 
330ad1ac596SMiaohe Lin 	if (unlikely(!is_hugetlb_entry_migration(pte)))
331ad1ac596SMiaohe Lin 		spin_unlock(ptl);
332ad1ac596SMiaohe Lin 	else
333ad1ac596SMiaohe Lin 		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
33430dad309SNaoya Horiguchi }
33530dad309SNaoya Horiguchi 
336ad1ac596SMiaohe Lin void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
337ad1ac596SMiaohe Lin {
338ad1ac596SMiaohe Lin 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
339ad1ac596SMiaohe Lin 
340ad1ac596SMiaohe Lin 	__migration_entry_wait_huge(pte, ptl);
341ad1ac596SMiaohe Lin }
342ad1ac596SMiaohe Lin #endif
343ad1ac596SMiaohe Lin 
344616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
345616b8371SZi Yan void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
346616b8371SZi Yan {
347616b8371SZi Yan 	spinlock_t *ptl;
348616b8371SZi Yan 
349616b8371SZi Yan 	ptl = pmd_lock(mm, pmd);
350616b8371SZi Yan 	if (!is_pmd_migration_entry(*pmd))
351616b8371SZi Yan 		goto unlock;
352ffa65753SAlistair Popple 	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
353616b8371SZi Yan 	return;
354616b8371SZi Yan unlock:
355616b8371SZi Yan 	spin_unlock(ptl);
356616b8371SZi Yan }
357616b8371SZi Yan #endif
358616b8371SZi Yan 
359108ca835SMatthew Wilcox (Oracle) static int folio_expected_refs(struct address_space *mapping,
360108ca835SMatthew Wilcox (Oracle) 		struct folio *folio)
3610b3901b3SJan Kara {
362108ca835SMatthew Wilcox (Oracle) 	int refs = 1;
363108ca835SMatthew Wilcox (Oracle) 	if (!mapping)
364108ca835SMatthew Wilcox (Oracle) 		return refs;
3650b3901b3SJan Kara 
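	/*
	 * The page cache holds one reference per subpage, plus one more if
	 * private data (e.g. buffer heads) is attached.  A single-page
	 * pagecache folio with buffers therefore expects 1 + 1 + 1 = 3
	 * references, matching the comment below.
	 */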
366108ca835SMatthew Wilcox (Oracle) 	refs += folio_nr_pages(folio);
367108ca835SMatthew Wilcox (Oracle) 	if (folio_test_private(folio))
368108ca835SMatthew Wilcox (Oracle) 		refs++;
369108ca835SMatthew Wilcox (Oracle) 
370108ca835SMatthew Wilcox (Oracle) 	return refs;
3710b3901b3SJan Kara }
3720b3901b3SJan Kara 
373b20a3503SChristoph Lameter /*
374c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
3755b5c7120SChristoph Lameter  *
3765b5c7120SChristoph Lameter  * The number of remaining references must be:
3775b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
3785b5c7120SChristoph Lameter  * 2 for pages with a mapping
379266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
380b20a3503SChristoph Lameter  */
3813417013eSMatthew Wilcox (Oracle) int folio_migrate_mapping(struct address_space *mapping,
3823417013eSMatthew Wilcox (Oracle) 		struct folio *newfolio, struct folio *folio, int extra_count)
383b20a3503SChristoph Lameter {
3843417013eSMatthew Wilcox (Oracle) 	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
38542cb14b1SHugh Dickins 	struct zone *oldzone, *newzone;
38642cb14b1SHugh Dickins 	int dirty;
387108ca835SMatthew Wilcox (Oracle) 	int expected_count = folio_expected_refs(mapping, folio) + extra_count;
3883417013eSMatthew Wilcox (Oracle) 	long nr = folio_nr_pages(folio);
3898763cb45SJérôme Glisse 
3906c5240aeSChristoph Lameter 	if (!mapping) {
3910e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
3923417013eSMatthew Wilcox (Oracle) 		if (folio_ref_count(folio) != expected_count)
3936c5240aeSChristoph Lameter 			return -EAGAIN;
394cf4b769aSHugh Dickins 
395cf4b769aSHugh Dickins 		/* No turning back from here */
3963417013eSMatthew Wilcox (Oracle) 		newfolio->index = folio->index;
3973417013eSMatthew Wilcox (Oracle) 		newfolio->mapping = folio->mapping;
3983417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapbacked(folio))
3993417013eSMatthew Wilcox (Oracle) 			__folio_set_swapbacked(newfolio);
400cf4b769aSHugh Dickins 
40178bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
4026c5240aeSChristoph Lameter 	}
4036c5240aeSChristoph Lameter 
4043417013eSMatthew Wilcox (Oracle) 	oldzone = folio_zone(folio);
4053417013eSMatthew Wilcox (Oracle) 	newzone = folio_zone(newfolio);
40642cb14b1SHugh Dickins 
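	/*
	 * Freeze the folio's refcount at the expected value while holding
	 * the i_pages lock.  If someone else holds a transient extra
	 * reference, the freeze fails and the caller will retry the
	 * migration later.
	 */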
40789eb946aSMatthew Wilcox 	xas_lock_irq(&xas);
4083417013eSMatthew Wilcox (Oracle) 	if (!folio_ref_freeze(folio, expected_count)) {
40989eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
410e286781dSNick Piggin 		return -EAGAIN;
411e286781dSNick Piggin 	}
412e286781dSNick Piggin 
413b20a3503SChristoph Lameter 	/*
4143417013eSMatthew Wilcox (Oracle) 	 * Now we know that no one else is looking at the folio:
415cf4b769aSHugh Dickins 	 * no turning back from here.
416b20a3503SChristoph Lameter 	 */
4173417013eSMatthew Wilcox (Oracle) 	newfolio->index = folio->index;
4183417013eSMatthew Wilcox (Oracle) 	newfolio->mapping = folio->mapping;
4193417013eSMatthew Wilcox (Oracle) 	folio_ref_add(newfolio, nr); /* add cache reference */
4203417013eSMatthew Wilcox (Oracle) 	if (folio_test_swapbacked(folio)) {
4213417013eSMatthew Wilcox (Oracle) 		__folio_set_swapbacked(newfolio);
4223417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapcache(folio)) {
4233417013eSMatthew Wilcox (Oracle) 			folio_set_swapcache(newfolio);
4243417013eSMatthew Wilcox (Oracle) 			newfolio->private = folio_get_private(folio);
425b20a3503SChristoph Lameter 		}
4266326fec1SNicholas Piggin 	} else {
4273417013eSMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
4286326fec1SNicholas Piggin 	}
429b20a3503SChristoph Lameter 
43042cb14b1SHugh Dickins 	/* Move dirty while page refs frozen and newpage not yet exposed */
4313417013eSMatthew Wilcox (Oracle) 	dirty = folio_test_dirty(folio);
43242cb14b1SHugh Dickins 	if (dirty) {
4333417013eSMatthew Wilcox (Oracle) 		folio_clear_dirty(folio);
4343417013eSMatthew Wilcox (Oracle) 		folio_set_dirty(newfolio);
43542cb14b1SHugh Dickins 	}
43642cb14b1SHugh Dickins 
4373417013eSMatthew Wilcox (Oracle) 	xas_store(&xas, newfolio);
4387cf9c2c7SNick Piggin 
4397cf9c2c7SNick Piggin 	/*
440937a94c9SJacobo Giralt 	 * Drop cache reference from old page by unfreezing
441937a94c9SJacobo Giralt 	 * to one less reference.
4427cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
4437cf9c2c7SNick Piggin 	 */
4443417013eSMatthew Wilcox (Oracle) 	folio_ref_unfreeze(folio, expected_count - nr);
4457cf9c2c7SNick Piggin 
44689eb946aSMatthew Wilcox 	xas_unlock(&xas);
44742cb14b1SHugh Dickins 	/* Leave irq disabled to prevent preemption while updating stats */
44842cb14b1SHugh Dickins 
4490e8c7d0fSChristoph Lameter 	/*
4500e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
4510e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
4520e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
4530e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
4540e8c7d0fSChristoph Lameter 	 *
4550e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
4564b9d0fabSMel Gorman 	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
4570e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
4580e8c7d0fSChristoph Lameter 	 */
45942cb14b1SHugh Dickins 	if (newzone != oldzone) {
4600d1c2072SJohannes Weiner 		struct lruvec *old_lruvec, *new_lruvec;
4610d1c2072SJohannes Weiner 		struct mem_cgroup *memcg;
4620d1c2072SJohannes Weiner 
4633417013eSMatthew Wilcox (Oracle) 		memcg = folio_memcg(folio);
4640d1c2072SJohannes Weiner 		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
4650d1c2072SJohannes Weiner 		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
4660d1c2072SJohannes Weiner 
4675c447d27SShakeel Butt 		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
4685c447d27SShakeel Butt 		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
4693417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
4705c447d27SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
4715c447d27SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
4724b02108aSKOSAKI Motohiro 		}
473b6038942SShakeel Butt #ifdef CONFIG_SWAP
4743417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapcache(folio)) {
475b6038942SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
476b6038942SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
477b6038942SShakeel Butt 		}
478b6038942SShakeel Butt #endif
479f56753acSChristoph Hellwig 		if (dirty && mapping_can_writeback(mapping)) {
4805c447d27SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
4815c447d27SShakeel Butt 			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
4825c447d27SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
4835c447d27SShakeel Butt 			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
48442cb14b1SHugh Dickins 		}
48542cb14b1SHugh Dickins 	}
48642cb14b1SHugh Dickins 	local_irq_enable();
487b20a3503SChristoph Lameter 
48878bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
489b20a3503SChristoph Lameter }
4903417013eSMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_migrate_mapping);
491b20a3503SChristoph Lameter 
492b20a3503SChristoph Lameter /*
493290408d4SNaoya Horiguchi  * The expected number of remaining references is the same as that
4943417013eSMatthew Wilcox (Oracle)  * of folio_migrate_mapping().
495290408d4SNaoya Horiguchi  */
496290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping,
497b890ec2aSMatthew Wilcox (Oracle) 				   struct folio *dst, struct folio *src)
498290408d4SNaoya Horiguchi {
499b890ec2aSMatthew Wilcox (Oracle) 	XA_STATE(xas, &mapping->i_pages, folio_index(src));
500290408d4SNaoya Horiguchi 	int expected_count;
501290408d4SNaoya Horiguchi 
50289eb946aSMatthew Wilcox 	xas_lock_irq(&xas);
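	/*
	 * One ref held by the page cache plus one by the migration caller,
	 * plus one more if private data is attached.
	 */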
503b890ec2aSMatthew Wilcox (Oracle) 	expected_count = 2 + folio_has_private(src);
504b890ec2aSMatthew Wilcox (Oracle) 	if (!folio_ref_freeze(src, expected_count)) {
50589eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
506290408d4SNaoya Horiguchi 		return -EAGAIN;
507290408d4SNaoya Horiguchi 	}
508290408d4SNaoya Horiguchi 
509b890ec2aSMatthew Wilcox (Oracle) 	dst->index = src->index;
510b890ec2aSMatthew Wilcox (Oracle) 	dst->mapping = src->mapping;
5116a93ca8fSJohannes Weiner 
512b890ec2aSMatthew Wilcox (Oracle) 	folio_get(dst);
513290408d4SNaoya Horiguchi 
514b890ec2aSMatthew Wilcox (Oracle) 	xas_store(&xas, dst);
515290408d4SNaoya Horiguchi 
516b890ec2aSMatthew Wilcox (Oracle) 	folio_ref_unfreeze(src, expected_count - 1);
517290408d4SNaoya Horiguchi 
51889eb946aSMatthew Wilcox 	xas_unlock_irq(&xas);
5196a93ca8fSJohannes Weiner 
52078bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
521290408d4SNaoya Horiguchi }
522290408d4SNaoya Horiguchi 
523290408d4SNaoya Horiguchi /*
52419138349SMatthew Wilcox (Oracle)  * Copy the flags and some other ancillary information
525b20a3503SChristoph Lameter  */
52619138349SMatthew Wilcox (Oracle) void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
527b20a3503SChristoph Lameter {
5287851a45cSRik van Riel 	int cpupid;
5297851a45cSRik van Riel 
53019138349SMatthew Wilcox (Oracle) 	if (folio_test_error(folio))
53119138349SMatthew Wilcox (Oracle) 		folio_set_error(newfolio);
53219138349SMatthew Wilcox (Oracle) 	if (folio_test_referenced(folio))
53319138349SMatthew Wilcox (Oracle) 		folio_set_referenced(newfolio);
53419138349SMatthew Wilcox (Oracle) 	if (folio_test_uptodate(folio))
53519138349SMatthew Wilcox (Oracle) 		folio_mark_uptodate(newfolio);
53619138349SMatthew Wilcox (Oracle) 	if (folio_test_clear_active(folio)) {
53719138349SMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
53819138349SMatthew Wilcox (Oracle) 		folio_set_active(newfolio);
53919138349SMatthew Wilcox (Oracle) 	} else if (folio_test_clear_unevictable(folio))
54019138349SMatthew Wilcox (Oracle) 		folio_set_unevictable(newfolio);
54119138349SMatthew Wilcox (Oracle) 	if (folio_test_workingset(folio))
54219138349SMatthew Wilcox (Oracle) 		folio_set_workingset(newfolio);
54319138349SMatthew Wilcox (Oracle) 	if (folio_test_checked(folio))
54419138349SMatthew Wilcox (Oracle) 		folio_set_checked(newfolio);
5456c287605SDavid Hildenbrand 	/*
5466c287605SDavid Hildenbrand 	 * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
5476c287605SDavid Hildenbrand 	 * migration entries. We can still have PG_anon_exclusive set on the
5486c287605SDavid Hildenbrand 	 * effectively unmapped and unreferenced first sub-page of an
5496c287605SDavid Hildenbrand 	 * anonymous THP: we can simply copy it here via PG_mappedtodisk.
5506c287605SDavid Hildenbrand 	 */
55119138349SMatthew Wilcox (Oracle) 	if (folio_test_mappedtodisk(folio))
55219138349SMatthew Wilcox (Oracle) 		folio_set_mappedtodisk(newfolio);
553b20a3503SChristoph Lameter 
5543417013eSMatthew Wilcox (Oracle) 	/* Move dirty on pages not done by folio_migrate_mapping() */
55519138349SMatthew Wilcox (Oracle) 	if (folio_test_dirty(folio))
55619138349SMatthew Wilcox (Oracle) 		folio_set_dirty(newfolio);
557b20a3503SChristoph Lameter 
55819138349SMatthew Wilcox (Oracle) 	if (folio_test_young(folio))
55919138349SMatthew Wilcox (Oracle) 		folio_set_young(newfolio);
56019138349SMatthew Wilcox (Oracle) 	if (folio_test_idle(folio))
56119138349SMatthew Wilcox (Oracle) 		folio_set_idle(newfolio);
56233c3fc71SVladimir Davydov 
5637851a45cSRik van Riel 	/*
5647851a45cSRik van Riel 	 * Copy NUMA information to the new page, to prevent over-eager
5657851a45cSRik van Riel 	 * future migrations of this same page.
5667851a45cSRik van Riel 	 */
56719138349SMatthew Wilcox (Oracle) 	cpupid = page_cpupid_xchg_last(&folio->page, -1);
56833024536SHuang Ying 	/*
56933024536SHuang Ying 	 * In memory tiering mode, when migrating between a slow and a fast
57033024536SHuang Ying 	 * memory node, reset cpupid, because it is used to record the
57133024536SHuang Ying 	 * page access time on the slow memory node.
57233024536SHuang Ying 	 */
57333024536SHuang Ying 	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
57433024536SHuang Ying 		bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
57533024536SHuang Ying 		bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
57633024536SHuang Ying 
57733024536SHuang Ying 		if (f_toptier != t_toptier)
57833024536SHuang Ying 			cpupid = -1;
57933024536SHuang Ying 	}
58019138349SMatthew Wilcox (Oracle) 	page_cpupid_xchg_last(&newfolio->page, cpupid);
5817851a45cSRik van Riel 
58219138349SMatthew Wilcox (Oracle) 	folio_migrate_ksm(newfolio, folio);
583c8d6553bSHugh Dickins 	/*
584c8d6553bSHugh Dickins 	 * Please do not reorder this without considering how mm/ksm.c's
585c8d6553bSHugh Dickins 	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
586c8d6553bSHugh Dickins 	 */
58719138349SMatthew Wilcox (Oracle) 	if (folio_test_swapcache(folio))
58819138349SMatthew Wilcox (Oracle) 		folio_clear_swapcache(folio);
58919138349SMatthew Wilcox (Oracle) 	folio_clear_private(folio);
590ad2fa371SMuchun Song 
591ad2fa371SMuchun Song 	/* page->private contains hugetlb specific flags */
59219138349SMatthew Wilcox (Oracle) 	if (!folio_test_hugetlb(folio))
59319138349SMatthew Wilcox (Oracle) 		folio->private = NULL;
594b20a3503SChristoph Lameter 
595b20a3503SChristoph Lameter 	/*
596b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
597b20a3503SChristoph Lameter 	 * wake them up.
598b20a3503SChristoph Lameter 	 */
59919138349SMatthew Wilcox (Oracle) 	if (folio_test_writeback(newfolio))
60019138349SMatthew Wilcox (Oracle) 		folio_end_writeback(newfolio);
601d435edcaSVlastimil Babka 
6026aeff241SYang Shi 	/*
6036aeff241SYang Shi 	 * PG_readahead shares the same bit with PG_reclaim.  The above
6046aeff241SYang Shi 	 * folio_end_writeback() may clear PG_readahead mistakenly, so set the
6056aeff241SYang Shi 	 * bit after that.
6066aeff241SYang Shi 	 */
60719138349SMatthew Wilcox (Oracle) 	if (folio_test_readahead(folio))
60819138349SMatthew Wilcox (Oracle) 		folio_set_readahead(newfolio);
6096aeff241SYang Shi 
61019138349SMatthew Wilcox (Oracle) 	folio_copy_owner(newfolio, folio);
61174485cf2SJohannes Weiner 
61219138349SMatthew Wilcox (Oracle) 	if (!folio_test_hugetlb(folio))
613d21bba2bSMatthew Wilcox (Oracle) 		mem_cgroup_migrate(folio, newfolio);
614b20a3503SChristoph Lameter }
61519138349SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_migrate_flags);
6162916ecc0SJérôme Glisse 
617715cbfd6SMatthew Wilcox (Oracle) void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
6182916ecc0SJérôme Glisse {
619715cbfd6SMatthew Wilcox (Oracle) 	folio_copy(newfolio, folio);
620715cbfd6SMatthew Wilcox (Oracle) 	folio_migrate_flags(newfolio, folio);
6212916ecc0SJérôme Glisse }
622715cbfd6SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_migrate_copy);
623b20a3503SChristoph Lameter 
6241d8b85ccSChristoph Lameter /************************************************************
6251d8b85ccSChristoph Lameter  *                    Migration functions
6261d8b85ccSChristoph Lameter  ***********************************************************/
6271d8b85ccSChristoph Lameter 
62854184650SMatthew Wilcox (Oracle) /**
62954184650SMatthew Wilcox (Oracle)  * migrate_folio() - Simple folio migration.
63054184650SMatthew Wilcox (Oracle)  * @mapping: The address_space containing the folio.
63154184650SMatthew Wilcox (Oracle)  * @dst: The folio to migrate the data to.
63254184650SMatthew Wilcox (Oracle)  * @src: The folio containing the current data.
63354184650SMatthew Wilcox (Oracle)  * @mode: How to migrate the page.
634b20a3503SChristoph Lameter  *
63554184650SMatthew Wilcox (Oracle)  * Common logic to directly migrate a single LRU folio suitable for
63654184650SMatthew Wilcox (Oracle)  * folios that do not use PagePrivate/PagePrivate2.
63754184650SMatthew Wilcox (Oracle)  *
63854184650SMatthew Wilcox (Oracle)  * Folios are locked upon entry and exit.
639b20a3503SChristoph Lameter  */
64054184650SMatthew Wilcox (Oracle) int migrate_folio(struct address_space *mapping, struct folio *dst,
64154184650SMatthew Wilcox (Oracle) 		struct folio *src, enum migrate_mode mode)
642b20a3503SChristoph Lameter {
643b20a3503SChristoph Lameter 	int rc;
644b20a3503SChristoph Lameter 
64554184650SMatthew Wilcox (Oracle) 	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
646b20a3503SChristoph Lameter 
64754184650SMatthew Wilcox (Oracle) 	rc = folio_migrate_mapping(mapping, dst, src, 0);
648b20a3503SChristoph Lameter 
64978bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
650b20a3503SChristoph Lameter 		return rc;
651b20a3503SChristoph Lameter 
6522916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
65354184650SMatthew Wilcox (Oracle) 		folio_migrate_copy(dst, src);
6542916ecc0SJérôme Glisse 	else
65554184650SMatthew Wilcox (Oracle) 		folio_migrate_flags(dst, src);
65678bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
657b20a3503SChristoph Lameter }
65854184650SMatthew Wilcox (Oracle) EXPORT_SYMBOL(migrate_folio);
659b20a3503SChristoph Lameter 
6609361401eSDavid Howells #ifdef CONFIG_BLOCK
66184ade7c1SJan Kara /* Returns true if all buffers are successfully locked */
66284ade7c1SJan Kara static bool buffer_migrate_lock_buffers(struct buffer_head *head,
66384ade7c1SJan Kara 							enum migrate_mode mode)
66484ade7c1SJan Kara {
66584ade7c1SJan Kara 	struct buffer_head *bh = head;
66684ade7c1SJan Kara 
66784ade7c1SJan Kara 	/* Simple case, sync compaction */
66884ade7c1SJan Kara 	if (mode != MIGRATE_ASYNC) {
66984ade7c1SJan Kara 		do {
67084ade7c1SJan Kara 			lock_buffer(bh);
67184ade7c1SJan Kara 			bh = bh->b_this_page;
67284ade7c1SJan Kara 
67384ade7c1SJan Kara 		} while (bh != head);
67484ade7c1SJan Kara 
67584ade7c1SJan Kara 		return true;
67684ade7c1SJan Kara 	}
67784ade7c1SJan Kara 
67884ade7c1SJan Kara 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
67984ade7c1SJan Kara 	do {
68084ade7c1SJan Kara 		if (!trylock_buffer(bh)) {
68184ade7c1SJan Kara 			/*
68284ade7c1SJan Kara 			 * We failed to lock the buffer and cannot stall in
68384ade7c1SJan Kara 			 * async migration. Release the taken locks
68384ade7c1SJan Kara 			 * async migration. Release the locks taken so far.
68584ade7c1SJan Kara 			struct buffer_head *failed_bh = bh;
68684ade7c1SJan Kara 			bh = head;
68784ade7c1SJan Kara 			while (bh != failed_bh) {
68884ade7c1SJan Kara 				unlock_buffer(bh);
68984ade7c1SJan Kara 				bh = bh->b_this_page;
69084ade7c1SJan Kara 			}
69184ade7c1SJan Kara 			return false;
69284ade7c1SJan Kara 		}
69384ade7c1SJan Kara 
69484ade7c1SJan Kara 		bh = bh->b_this_page;
69584ade7c1SJan Kara 	} while (bh != head);
69684ade7c1SJan Kara 	return true;
69784ade7c1SJan Kara }
69884ade7c1SJan Kara 
69967235182SMatthew Wilcox (Oracle) static int __buffer_migrate_folio(struct address_space *mapping,
70067235182SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode,
70189cb0888SJan Kara 		bool check_refs)
7021d8b85ccSChristoph Lameter {
7031d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
7041d8b85ccSChristoph Lameter 	int rc;
705cc4f11e6SJan Kara 	int expected_count;
7061d8b85ccSChristoph Lameter 
70767235182SMatthew Wilcox (Oracle) 	head = folio_buffers(src);
70867235182SMatthew Wilcox (Oracle) 	if (!head)
70954184650SMatthew Wilcox (Oracle) 		return migrate_folio(mapping, dst, src, mode);
7101d8b85ccSChristoph Lameter 
711cc4f11e6SJan Kara 	/* Check that the folio has no extra refs before we do more work */
712108ca835SMatthew Wilcox (Oracle) 	expected_count = folio_expected_refs(mapping, src);
71367235182SMatthew Wilcox (Oracle) 	if (folio_ref_count(src) != expected_count)
714cc4f11e6SJan Kara 		return -EAGAIN;
715cc4f11e6SJan Kara 
716cc4f11e6SJan Kara 	if (!buffer_migrate_lock_buffers(head, mode))
717cc4f11e6SJan Kara 		return -EAGAIN;
7181d8b85ccSChristoph Lameter 
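	/*
	 * For the "norefs" variant, additionally verify that no buffer head
	 * is referenced.  A stray reference is often just a per-CPU buffer
	 * head LRU entry, so invalidate those LRUs once and re-check before
	 * giving up with -EAGAIN.
	 */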
71989cb0888SJan Kara 	if (check_refs) {
72089cb0888SJan Kara 		bool busy;
72189cb0888SJan Kara 		bool invalidated = false;
72289cb0888SJan Kara 
72389cb0888SJan Kara recheck_buffers:
72489cb0888SJan Kara 		busy = false;
72589cb0888SJan Kara 		spin_lock(&mapping->private_lock);
72689cb0888SJan Kara 		bh = head;
72789cb0888SJan Kara 		do {
72889cb0888SJan Kara 			if (atomic_read(&bh->b_count)) {
72989cb0888SJan Kara 				busy = true;
73089cb0888SJan Kara 				break;
73189cb0888SJan Kara 			}
73289cb0888SJan Kara 			bh = bh->b_this_page;
73389cb0888SJan Kara 		} while (bh != head);
73489cb0888SJan Kara 		if (busy) {
73589cb0888SJan Kara 			if (invalidated) {
73689cb0888SJan Kara 				rc = -EAGAIN;
73789cb0888SJan Kara 				goto unlock_buffers;
73889cb0888SJan Kara 			}
739ebdf4de5SJan Kara 			spin_unlock(&mapping->private_lock);
74089cb0888SJan Kara 			invalidate_bh_lrus();
74189cb0888SJan Kara 			invalidated = true;
74289cb0888SJan Kara 			goto recheck_buffers;
74389cb0888SJan Kara 		}
74489cb0888SJan Kara 	}
74589cb0888SJan Kara 
74667235182SMatthew Wilcox (Oracle) 	rc = folio_migrate_mapping(mapping, dst, src, 0);
74778bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
748cc4f11e6SJan Kara 		goto unlock_buffers;
7491d8b85ccSChristoph Lameter 
75067235182SMatthew Wilcox (Oracle) 	folio_attach_private(dst, folio_detach_private(src));
7511d8b85ccSChristoph Lameter 
7521d8b85ccSChristoph Lameter 	bh = head;
7531d8b85ccSChristoph Lameter 	do {
75467235182SMatthew Wilcox (Oracle) 		set_bh_page(bh, &dst->page, bh_offset(bh));
7551d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7561d8b85ccSChristoph Lameter 	} while (bh != head);
7571d8b85ccSChristoph Lameter 
7582916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
75967235182SMatthew Wilcox (Oracle) 		folio_migrate_copy(dst, src);
7602916ecc0SJérôme Glisse 	else
76167235182SMatthew Wilcox (Oracle) 		folio_migrate_flags(dst, src);
7621d8b85ccSChristoph Lameter 
763cc4f11e6SJan Kara 	rc = MIGRATEPAGE_SUCCESS;
764cc4f11e6SJan Kara unlock_buffers:
765ebdf4de5SJan Kara 	if (check_refs)
766ebdf4de5SJan Kara 		spin_unlock(&mapping->private_lock);
7671d8b85ccSChristoph Lameter 	bh = head;
7681d8b85ccSChristoph Lameter 	do {
7691d8b85ccSChristoph Lameter 		unlock_buffer(bh);
7701d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7711d8b85ccSChristoph Lameter 	} while (bh != head);
7721d8b85ccSChristoph Lameter 
773cc4f11e6SJan Kara 	return rc;
7741d8b85ccSChristoph Lameter }
77589cb0888SJan Kara 
77667235182SMatthew Wilcox (Oracle) /**
77767235182SMatthew Wilcox (Oracle)  * buffer_migrate_folio() - Migration function for folios with buffers.
77867235182SMatthew Wilcox (Oracle)  * @mapping: The address space containing @src.
77967235182SMatthew Wilcox (Oracle)  * @dst: The folio to migrate to.
78067235182SMatthew Wilcox (Oracle)  * @src: The folio to migrate from.
78167235182SMatthew Wilcox (Oracle)  * @mode: How to migrate the folio.
78267235182SMatthew Wilcox (Oracle)  *
78367235182SMatthew Wilcox (Oracle)  * This function can only be used if the underlying filesystem guarantees
78467235182SMatthew Wilcox (Oracle)  * that no other references to @src exist. For example attached buffer
78567235182SMatthew Wilcox (Oracle)  * heads are accessed only under the folio lock.  If your filesystem cannot
78667235182SMatthew Wilcox (Oracle)  * provide this guarantee, buffer_migrate_folio_norefs() may be more
78767235182SMatthew Wilcox (Oracle)  * appropriate.
78867235182SMatthew Wilcox (Oracle)  *
78967235182SMatthew Wilcox (Oracle)  * Return: 0 on success or a negative errno on failure.
79089cb0888SJan Kara  */
79167235182SMatthew Wilcox (Oracle) int buffer_migrate_folio(struct address_space *mapping,
79267235182SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
79389cb0888SJan Kara {
79467235182SMatthew Wilcox (Oracle) 	return __buffer_migrate_folio(mapping, dst, src, mode, false);
79589cb0888SJan Kara }
79667235182SMatthew Wilcox (Oracle) EXPORT_SYMBOL(buffer_migrate_folio);
79789cb0888SJan Kara 
79867235182SMatthew Wilcox (Oracle) /**
79967235182SMatthew Wilcox (Oracle)  * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
80067235182SMatthew Wilcox (Oracle)  * @mapping: The address space containing @src.
80167235182SMatthew Wilcox (Oracle)  * @dst: The folio to migrate to.
80267235182SMatthew Wilcox (Oracle)  * @src: The folio to migrate from.
80367235182SMatthew Wilcox (Oracle)  * @mode: How to migrate the folio.
80467235182SMatthew Wilcox (Oracle)  *
80567235182SMatthew Wilcox (Oracle)  * Like buffer_migrate_folio() except that this variant is more careful
80667235182SMatthew Wilcox (Oracle)  * and checks that there are also no buffer head references. This function
80767235182SMatthew Wilcox (Oracle)  * is the right one for mappings where buffer heads are directly looked
80867235182SMatthew Wilcox (Oracle)  * up and referenced (such as block device mappings).
80967235182SMatthew Wilcox (Oracle)  *
81067235182SMatthew Wilcox (Oracle)  * Return: 0 on success or a negative errno on failure.
81189cb0888SJan Kara  */
81267235182SMatthew Wilcox (Oracle) int buffer_migrate_folio_norefs(struct address_space *mapping,
81367235182SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
81489cb0888SJan Kara {
81567235182SMatthew Wilcox (Oracle) 	return __buffer_migrate_folio(mapping, dst, src, mode, true);
81689cb0888SJan Kara }
8179361401eSDavid Howells #endif
8181d8b85ccSChristoph Lameter 
8192ec810d5SMatthew Wilcox (Oracle) int filemap_migrate_folio(struct address_space *mapping,
8202ec810d5SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
8212ec810d5SMatthew Wilcox (Oracle) {
8222ec810d5SMatthew Wilcox (Oracle) 	int ret;
8232ec810d5SMatthew Wilcox (Oracle) 
8242ec810d5SMatthew Wilcox (Oracle) 	ret = folio_migrate_mapping(mapping, dst, src, 0);
8252ec810d5SMatthew Wilcox (Oracle) 	if (ret != MIGRATEPAGE_SUCCESS)
8262ec810d5SMatthew Wilcox (Oracle) 		return ret;
8272ec810d5SMatthew Wilcox (Oracle) 
8282ec810d5SMatthew Wilcox (Oracle) 	if (folio_get_private(src))
8292ec810d5SMatthew Wilcox (Oracle) 		folio_attach_private(dst, folio_detach_private(src));
8302ec810d5SMatthew Wilcox (Oracle) 
8312ec810d5SMatthew Wilcox (Oracle) 	if (mode != MIGRATE_SYNC_NO_COPY)
8322ec810d5SMatthew Wilcox (Oracle) 		folio_migrate_copy(dst, src);
8332ec810d5SMatthew Wilcox (Oracle) 	else
8342ec810d5SMatthew Wilcox (Oracle) 		folio_migrate_flags(dst, src);
8352ec810d5SMatthew Wilcox (Oracle) 	return MIGRATEPAGE_SUCCESS;
8362ec810d5SMatthew Wilcox (Oracle) }
8372ec810d5SMatthew Wilcox (Oracle) EXPORT_SYMBOL_GPL(filemap_migrate_folio);
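
/*
 * Illustrative sketch (not from this file): filesystems typically hook one
 * of the helpers above into their address_space_operations, e.g.
 *
 *	static const struct address_space_operations example_aops = {
 *		.migrate_folio	= filemap_migrate_folio,
 *	};
 *
 * "example_aops" is a hypothetical name used only for illustration; block
 * device mappings would use buffer_migrate_folio_norefs() instead.
 */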
8382ec810d5SMatthew Wilcox (Oracle) 
83904e62a29SChristoph Lameter /*
8402be7fa10SMatthew Wilcox (Oracle)  * Writeback a folio to clean the dirty state
84104e62a29SChristoph Lameter  */
8422be7fa10SMatthew Wilcox (Oracle) static int writeout(struct address_space *mapping, struct folio *folio)
84304e62a29SChristoph Lameter {
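	/*
	 * Write back just this folio, without waiting for completion
	 * (WB_SYNC_NONE); for_reclaim tells the filesystem that this
	 * request comes from a reclaim-like context.
	 */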
84404e62a29SChristoph Lameter 	struct writeback_control wbc = {
84504e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
84604e62a29SChristoph Lameter 		.nr_to_write = 1,
84704e62a29SChristoph Lameter 		.range_start = 0,
84804e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
84904e62a29SChristoph Lameter 		.for_reclaim = 1
85004e62a29SChristoph Lameter 	};
85104e62a29SChristoph Lameter 	int rc;
85204e62a29SChristoph Lameter 
85304e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
85404e62a29SChristoph Lameter 		/* No write method for the address space */
85504e62a29SChristoph Lameter 		return -EINVAL;
85604e62a29SChristoph Lameter 
8572be7fa10SMatthew Wilcox (Oracle) 	if (!folio_clear_dirty_for_io(folio))
85804e62a29SChristoph Lameter 		/* Someone else already triggered a write */
85904e62a29SChristoph Lameter 		return -EAGAIN;
86004e62a29SChristoph Lameter 
86104e62a29SChristoph Lameter 	/*
8622be7fa10SMatthew Wilcox (Oracle) 	 * A dirty folio may imply that the underlying filesystem has
8632be7fa10SMatthew Wilcox (Oracle) 	 * the folio on some queue. So the folio must be clean for
8642be7fa10SMatthew Wilcox (Oracle) 	 * migration. Writeout may mean we lose the lock and the
8652be7fa10SMatthew Wilcox (Oracle) 	 * folio state is no longer what we checked for earlier.
86604e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
86704e62a29SChristoph Lameter 	 * be successful.
86804e62a29SChristoph Lameter 	 */
8694eecb8b9SMatthew Wilcox (Oracle) 	remove_migration_ptes(folio, folio, false);
87004e62a29SChristoph Lameter 
8712be7fa10SMatthew Wilcox (Oracle) 	rc = mapping->a_ops->writepage(&folio->page, &wbc);
87204e62a29SChristoph Lameter 
87304e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
87404e62a29SChristoph Lameter 		/* unlocked. Relock */
8752be7fa10SMatthew Wilcox (Oracle) 		folio_lock(folio);
87604e62a29SChristoph Lameter 
877bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
87804e62a29SChristoph Lameter }
87904e62a29SChristoph Lameter 
88004e62a29SChristoph Lameter /*
88104e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
88204e62a29SChristoph Lameter  */
8838faa8ef5SMatthew Wilcox (Oracle) static int fallback_migrate_folio(struct address_space *mapping,
8848faa8ef5SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
8858351a6e4SChristoph Lameter {
8868faa8ef5SMatthew Wilcox (Oracle) 	if (folio_test_dirty(src)) {
8878faa8ef5SMatthew Wilcox (Oracle) 		/* Only writeback folios in full synchronous migration */
8882916ecc0SJérôme Glisse 		switch (mode) {
8892916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
8902916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
8912916ecc0SJérôme Glisse 			break;
8922916ecc0SJérôme Glisse 		default:
893b969c4abSMel Gorman 			return -EBUSY;
8942916ecc0SJérôme Glisse 		}
8952be7fa10SMatthew Wilcox (Oracle) 		return writeout(mapping, src);
896b969c4abSMel Gorman 	}
8978351a6e4SChristoph Lameter 
8988351a6e4SChristoph Lameter 	/*
8998351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
9008351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
9018351a6e4SChristoph Lameter 	 */
9028faa8ef5SMatthew Wilcox (Oracle) 	if (folio_test_private(src) &&
9038faa8ef5SMatthew Wilcox (Oracle) 	    !filemap_release_folio(src, GFP_KERNEL))
904806031bbSMel Gorman 		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
9058351a6e4SChristoph Lameter 
90654184650SMatthew Wilcox (Oracle) 	return migrate_folio(mapping, dst, src, mode);
9078351a6e4SChristoph Lameter }
9088351a6e4SChristoph Lameter 
9091d8b85ccSChristoph Lameter /*
910e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page
911e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
912b20a3503SChristoph Lameter  *
913e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
914e24f0b8fSChristoph Lameter  * is successful.
915894bc310SLee Schermerhorn  *
916894bc310SLee Schermerhorn  * Return value:
917894bc310SLee Schermerhorn  *   < 0 - error code
91878bd5209SRafael Aquini  *  MIGRATEPAGE_SUCCESS - success
919b20a3503SChristoph Lameter  */
920e7e3ffebSMatthew Wilcox (Oracle) static int move_to_new_folio(struct folio *dst, struct folio *src,
9215c3f9a67SHugh Dickins 				enum migrate_mode mode)
922b20a3503SChristoph Lameter {
923bda807d4SMinchan Kim 	int rc = -EAGAIN;
924e7e3ffebSMatthew Wilcox (Oracle) 	bool is_lru = !__PageMovable(&src->page);
925b20a3503SChristoph Lameter 
926e7e3ffebSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
927e7e3ffebSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
928b20a3503SChristoph Lameter 
929bda807d4SMinchan Kim 	if (likely(is_lru)) {
93068f2736aSMatthew Wilcox (Oracle) 		struct address_space *mapping = folio_mapping(src);
93168f2736aSMatthew Wilcox (Oracle) 
932b20a3503SChristoph Lameter 		if (!mapping)
93354184650SMatthew Wilcox (Oracle) 			rc = migrate_folio(mapping, dst, src, mode);
9345490da4fSMatthew Wilcox (Oracle) 		else if (mapping->a_ops->migrate_folio)
935b20a3503SChristoph Lameter 			/*
9365490da4fSMatthew Wilcox (Oracle) 			 * Most folios have a mapping and most filesystems
9375490da4fSMatthew Wilcox (Oracle) 			 * provide a migrate_folio callback. Anonymous folios
938bda807d4SMinchan Kim 			 * are part of swap space which also has its own
9395490da4fSMatthew Wilcox (Oracle) 			 * migrate_folio callback. This is the most common path
940bda807d4SMinchan Kim 			 * for page migration.
941b20a3503SChristoph Lameter 			 */
9425490da4fSMatthew Wilcox (Oracle) 			rc = mapping->a_ops->migrate_folio(mapping, dst, src,
9435490da4fSMatthew Wilcox (Oracle) 								mode);
9448351a6e4SChristoph Lameter 		else
9458faa8ef5SMatthew Wilcox (Oracle) 			rc = fallback_migrate_folio(mapping, dst, src, mode);
946bda807d4SMinchan Kim 	} else {
94768f2736aSMatthew Wilcox (Oracle) 		const struct movable_operations *mops;
94868f2736aSMatthew Wilcox (Oracle) 
949bda807d4SMinchan Kim 		/*
950bda807d4SMinchan Kim 		 * In the case of a non-LRU page, it could have been released
951bda807d4SMinchan Kim 		 * after the isolation step. In that case, we shouldn't try migration.
952bda807d4SMinchan Kim 		 */
953e7e3ffebSMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
954e7e3ffebSMatthew Wilcox (Oracle) 		if (!folio_test_movable(src)) {
955bda807d4SMinchan Kim 			rc = MIGRATEPAGE_SUCCESS;
956e7e3ffebSMatthew Wilcox (Oracle) 			folio_clear_isolated(src);
957bda807d4SMinchan Kim 			goto out;
958bda807d4SMinchan Kim 		}
959bda807d4SMinchan Kim 
96068f2736aSMatthew Wilcox (Oracle) 		mops = page_movable_ops(&src->page);
96168f2736aSMatthew Wilcox (Oracle) 		rc = mops->migrate_page(&dst->page, &src->page, mode);
962bda807d4SMinchan Kim 		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
963e7e3ffebSMatthew Wilcox (Oracle) 				!folio_test_isolated(src));
964bda807d4SMinchan Kim 	}
965b20a3503SChristoph Lameter 
9665c3f9a67SHugh Dickins 	/*
967e7e3ffebSMatthew Wilcox (Oracle) 	 * When successful, old pagecache src->mapping must be cleared before
968e7e3ffebSMatthew Wilcox (Oracle) 	 * src is freed; but stats require that PageAnon be left as PageAnon.
9695c3f9a67SHugh Dickins 	 */
9705c3f9a67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
971e7e3ffebSMatthew Wilcox (Oracle) 		if (__PageMovable(&src->page)) {
972e7e3ffebSMatthew Wilcox (Oracle) 			VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
973bda807d4SMinchan Kim 
974bda807d4SMinchan Kim 			/*
975bda807d4SMinchan Kim 			 * We clear PG_movable under page_lock so any compactor
976bda807d4SMinchan Kim 			 * cannot try to migrate this page.
977bda807d4SMinchan Kim 			 */
978e7e3ffebSMatthew Wilcox (Oracle) 			folio_clear_isolated(src);
979bda807d4SMinchan Kim 		}
980bda807d4SMinchan Kim 
981bda807d4SMinchan Kim 		/*
982e7e3ffebSMatthew Wilcox (Oracle) 		 * Anonymous and movable src->mapping will be cleared by
983bda807d4SMinchan Kim 		 * free_pages_prepare(), so don't reset it here; that keeps
984bda807d4SMinchan Kim 		 * type checks such as PageAnon working.
985bda807d4SMinchan Kim 		 */
986e7e3ffebSMatthew Wilcox (Oracle) 		if (!folio_mapping_flags(src))
987e7e3ffebSMatthew Wilcox (Oracle) 			src->mapping = NULL;
988d2b2c6ddSLars Persson 
989e7e3ffebSMatthew Wilcox (Oracle) 		if (likely(!folio_is_zone_device(dst)))
990e7e3ffebSMatthew Wilcox (Oracle) 			flush_dcache_folio(dst);
9913fe2011fSMel Gorman 	}
992bda807d4SMinchan Kim out:
993e24f0b8fSChristoph Lameter 	return rc;
994e24f0b8fSChristoph Lameter }
995e24f0b8fSChristoph Lameter 
996682a71a1SMatthew Wilcox (Oracle) static int __unmap_and_move(struct folio *src, struct folio *dst,
9979c620e2bSHugh Dickins 				int force, enum migrate_mode mode)
998e24f0b8fSChristoph Lameter {
9990dabec93SMinchan Kim 	int rc = -EAGAIN;
1000213ecb31SBaolin Wang 	bool page_was_mapped = false;
10013f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
1002682a71a1SMatthew Wilcox (Oracle) 	bool is_lru = !__PageMovable(&src->page);
100395a402c3SChristoph Lameter 
1004682a71a1SMatthew Wilcox (Oracle) 	if (!folio_trylock(src)) {
1005a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
10060dabec93SMinchan Kim 			goto out;
10073e7d3449SMel Gorman 
10083e7d3449SMel Gorman 		/*
10093e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
10103e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
10113e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
10123e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
10133e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
1014d4388340SMatthew Wilcox (Oracle) 		 * mpage_readahead). If an allocation happens for the
10153e7d3449SMel Gorman 		 * second or third page, the process can end up locking
10163e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
10173e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
10183e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
10193e7d3449SMel Gorman 		 * altogether.
10203e7d3449SMel Gorman 		 */
10213e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
10220dabec93SMinchan Kim 			goto out;
10233e7d3449SMel Gorman 
1024682a71a1SMatthew Wilcox (Oracle) 		folio_lock(src);
1025e24f0b8fSChristoph Lameter 	}
1026e24f0b8fSChristoph Lameter 
1027682a71a1SMatthew Wilcox (Oracle) 	if (folio_test_writeback(src)) {
102811bc82d6SAndrea Arcangeli 		/*
1029fed5b64aSJianguo Wu 		 * Only in the case of a full synchronous migration is it
1030a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
1031a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
1032a6bc32b8SMel Gorman 		 * the overhead of stalling is too much
103311bc82d6SAndrea Arcangeli 		 */
10342916ecc0SJérôme Glisse 		switch (mode) {
10352916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
10362916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
10372916ecc0SJérôme Glisse 			break;
10382916ecc0SJérôme Glisse 		default:
103911bc82d6SAndrea Arcangeli 			rc = -EBUSY;
10400a31bc97SJohannes Weiner 			goto out_unlock;
104111bc82d6SAndrea Arcangeli 		}
104211bc82d6SAndrea Arcangeli 		if (!force)
10430a31bc97SJohannes Weiner 			goto out_unlock;
1044682a71a1SMatthew Wilcox (Oracle) 		folio_wait_writeback(src);
1045e24f0b8fSChristoph Lameter 	}
104603f15c86SHugh Dickins 
1047e24f0b8fSChristoph Lameter 	/*
1048682a71a1SMatthew Wilcox (Oracle) 	 * try_to_migrate() below will drop src->mapcount to 0, after which the
1049682a71a1SMatthew Wilcox (Oracle) 	 * anon_vma could be freed while we are still migrating the page,
10501ce82b69SHugh Dickins 	 * without us noticing.  Taking a reference on the anon_vma here delays
1051dc386d4dSKAMEZAWA Hiroyuki 	 * freeing it until the end of migration.  File cache pages are no
1052989f89c5SKAMEZAWA Hiroyuki 	 * problem because they are protected by page_lock(); migration may use
1053989f89c5SKAMEZAWA Hiroyuki 	 * write_page() or lock_page() on them, so only anon pages need care here.
10543fe2011fSMel Gorman 	 *
105503f15c86SHugh Dickins 	 * Only page_get_anon_vma() understands the subtleties of
105603f15c86SHugh Dickins 	 * getting a hold on an anon_vma from outside one of its mms.
105703f15c86SHugh Dickins 	 * But if we cannot get anon_vma, then we won't need it anyway,
105803f15c86SHugh Dickins 	 * because that implies that the anon page is no longer mapped
105903f15c86SHugh Dickins 	 * (and cannot be remapped so long as we hold the page lock).
10603fe2011fSMel Gorman 	 */
1061682a71a1SMatthew Wilcox (Oracle) 	if (folio_test_anon(src) && !folio_test_ksm(src))
1062682a71a1SMatthew Wilcox (Oracle) 		anon_vma = page_get_anon_vma(&src->page);
106362e1c553SShaohua Li 
10647db7671fSHugh Dickins 	/*
10657db7671fSHugh Dickins 	 * Block others from accessing the new page when we get around to
10667db7671fSHugh Dickins 	 * establishing additional references. We are usually the only one
1067682a71a1SMatthew Wilcox (Oracle) 	 * holding a reference to dst at this point. We used to have a BUG
1068682a71a1SMatthew Wilcox (Oracle) 	 * here if folio_trylock(dst) fails, but would like to allow for
1069682a71a1SMatthew Wilcox (Oracle) 	 * cases where there might be a race with the previous use of dst.
10707db7671fSHugh Dickins 	 * This is much like races on refcount of oldpage: just don't BUG().
10717db7671fSHugh Dickins 	 */
1072682a71a1SMatthew Wilcox (Oracle) 	if (unlikely(!folio_trylock(dst)))
10737db7671fSHugh Dickins 		goto out_unlock;
10747db7671fSHugh Dickins 
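	/*
	 * Non-LRU movable folios are handled entirely by their driver's
	 * movable_operations via move_to_new_folio(); they need none of
	 * the rmap/anon_vma handling below.
	 */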
1075bda807d4SMinchan Kim 	if (unlikely(!is_lru)) {
1076682a71a1SMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1077bda807d4SMinchan Kim 		goto out_unlock_both;
1078bda807d4SMinchan Kim 	}
1079bda807d4SMinchan Kim 
1080dc386d4dSKAMEZAWA Hiroyuki 	/*
108162e1c553SShaohua Li 	 * Corner case handling:
108262e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
108362e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
1084682a71a1SMatthew Wilcox (Oracle) 	 * Calling try_to_unmap() against a src->mapping==NULL page will
108562e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
1086d12b8951SYang Shi 	 * 2. An orphaned page (see truncate_cleanup_page) might have
108762e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
108862e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
108962e1c553SShaohua Li 	 * invisible to the VM, so the page cannot be migrated.  So try to
109062e1c553SShaohua Li 	 * free the metadata, so the page can be freed.
1091dc386d4dSKAMEZAWA Hiroyuki 	 */
1092682a71a1SMatthew Wilcox (Oracle) 	if (!src->mapping) {
1093682a71a1SMatthew Wilcox (Oracle) 		if (folio_test_private(src)) {
1094682a71a1SMatthew Wilcox (Oracle) 			try_to_free_buffers(src);
10957db7671fSHugh Dickins 			goto out_unlock_both;
109662e1c553SShaohua Li 		}
1097682a71a1SMatthew Wilcox (Oracle) 	} else if (folio_mapped(src)) {
10987db7671fSHugh Dickins 		/* Establish migration ptes */
1099682a71a1SMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_anon(src) &&
1100682a71a1SMatthew Wilcox (Oracle) 			       !folio_test_ksm(src) && !anon_vma, src);
1101682a71a1SMatthew Wilcox (Oracle) 		try_to_migrate(src, 0);
1102213ecb31SBaolin Wang 		page_was_mapped = true;
11032ebba6b7SHugh Dickins 	}
1104dc386d4dSKAMEZAWA Hiroyuki 
1105682a71a1SMatthew Wilcox (Oracle) 	if (!folio_mapped(src))
1106682a71a1SMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1107e24f0b8fSChristoph Lameter 
1108c3096e67SHugh Dickins 	/*
1109682a71a1SMatthew Wilcox (Oracle) 	 * When successful, push dst to LRU immediately: so that if it
1110c3096e67SHugh Dickins 	 * turns out to be an mlocked page, remove_migration_ptes() will
1111682a71a1SMatthew Wilcox (Oracle) 	 * automatically build up the correct dst->mlock_count for it.
1112c3096e67SHugh Dickins 	 *
1113c3096e67SHugh Dickins 	 * We would like to do something similar for the old page, when
1114c3096e67SHugh Dickins 	 * unsuccessful, and other cases when a page has been temporarily
1115c3096e67SHugh Dickins 	 * isolated from the unevictable LRU: but this case is the easiest.
1116c3096e67SHugh Dickins 	 */
1117c3096e67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1118682a71a1SMatthew Wilcox (Oracle) 		folio_add_lru(dst);
11195c3f9a67SHugh Dickins 		if (page_was_mapped)
1120c3096e67SHugh Dickins 			lru_add_drain();
1121c3096e67SHugh Dickins 	}
1122c3096e67SHugh Dickins 
11235c3f9a67SHugh Dickins 	if (page_was_mapped)
1124682a71a1SMatthew Wilcox (Oracle) 		remove_migration_ptes(src,
1125682a71a1SMatthew Wilcox (Oracle) 			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
11263f6c8272SMel Gorman 
11277db7671fSHugh Dickins out_unlock_both:
1128682a71a1SMatthew Wilcox (Oracle) 	folio_unlock(dst);
11297db7671fSHugh Dickins out_unlock:
11303f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
113176545066SRik van Riel 	if (anon_vma)
11329e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
1133682a71a1SMatthew Wilcox (Oracle) 	folio_unlock(src);
11340dabec93SMinchan Kim out:
1135c6c919ebSMinchan Kim 	/*
1136682a71a1SMatthew Wilcox (Oracle) 	 * If migration is successful, decrease the refcount of dst,
1137c6c919ebSMinchan Kim 	 * which will not free the page because the new page owner has
1138c3096e67SHugh Dickins 	 * increased the refcount.
1139c6c919ebSMinchan Kim 	 */
1140c3096e67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS)
1141682a71a1SMatthew Wilcox (Oracle) 		folio_put(dst);
1142c6c919ebSMinchan Kim 
11430dabec93SMinchan Kim 	return rc;
11440dabec93SMinchan Kim }
114595a402c3SChristoph Lameter 
11460dabec93SMinchan Kim /*
11470dabec93SMinchan Kim  * Obtain the lock on page, remove all ptes and migrate the page
11480dabec93SMinchan Kim  * to the newly allocated page in newpage.
11490dabec93SMinchan Kim  */
11506ec4476aSLinus Torvalds static int unmap_and_move(new_page_t get_new_page,
1151ef2a5153SGeert Uytterhoeven 				   free_page_t put_new_page,
1152ef2a5153SGeert Uytterhoeven 				   unsigned long private, struct page *page,
1153add05cecSNaoya Horiguchi 				   int force, enum migrate_mode mode,
1154dd4ae78aSYang Shi 				   enum migrate_reason reason,
1155dd4ae78aSYang Shi 				   struct list_head *ret)
11560dabec93SMinchan Kim {
1157682a71a1SMatthew Wilcox (Oracle) 	struct folio *dst, *src = page_folio(page);
11582def7424SHugh Dickins 	int rc = MIGRATEPAGE_SUCCESS;
115974d4a579SYang Shi 	struct page *newpage = NULL;
11600dabec93SMinchan Kim 
116194723aafSMichal Hocko 	if (!thp_migration_supported() && PageTransHuge(page))
1162d532e2e5SYang Shi 		return -ENOSYS;
116394723aafSMichal Hocko 
11640dabec93SMinchan Kim 	if (page_count(page) == 1) {
1165160088b3SMiaohe Lin 		/* Page was freed from under us. So we are done. */
1166c6c919ebSMinchan Kim 		ClearPageActive(page);
1167c6c919ebSMinchan Kim 		ClearPageUnevictable(page);
1168160088b3SMiaohe Lin 		/* free_pages_prepare() will clear PG_isolated. */
11690dabec93SMinchan Kim 		goto out;
11700dabec93SMinchan Kim 	}
11710dabec93SMinchan Kim 
117274d4a579SYang Shi 	newpage = get_new_page(page, private);
117374d4a579SYang Shi 	if (!newpage)
117474d4a579SYang Shi 		return -ENOMEM;
1175682a71a1SMatthew Wilcox (Oracle) 	dst = page_folio(newpage);
117674d4a579SYang Shi 
1177b653db77SMatthew Wilcox (Oracle) 	newpage->private = 0;
1178682a71a1SMatthew Wilcox (Oracle) 	rc = __unmap_and_move(src, dst, force, mode);
1179c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS)
11807cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(newpage, reason);
1181bf6bddf1SRafael Aquini 
11820dabec93SMinchan Kim out:
1183e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
1184aaa994b3SChristoph Lameter 		/*
1185aaa994b3SChristoph Lameter 		 * A page that has been migrated has all references
1186aaa994b3SChristoph Lameter 		 * removed and will be freed. A page that has not been
1187c23a0c99SRalph Campbell 		 * migrated will have kept its references and be restored.
1188aaa994b3SChristoph Lameter 		 */
1189aaa994b3SChristoph Lameter 		list_del(&page->lru);
1190e24f0b8fSChristoph Lameter 	}
119168711a74SDavid Rientjes 
119295a402c3SChristoph Lameter 	/*
1193c6c919ebSMinchan Kim 	 * If migration was successful, release the reference grabbed during
1194c6c919ebSMinchan Kim 	 * isolation. Otherwise, restore the page to the right list unless
1195c6c919ebSMinchan Kim 	 * we want to retry.
119695a402c3SChristoph Lameter 	 */
1197c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1198dd4ae78aSYang Shi 		/*
1199dd4ae78aSYang Shi 		 * Compaction can also migrate non-LRU pages, which are
1200dd4ae78aSYang Shi 		 * not accounted to NR_ISOLATED_*. They can be recognized
1201dd4ae78aSYang Shi 		 * via __PageMovable().
1202dd4ae78aSYang Shi 		 */
1203dd4ae78aSYang Shi 		if (likely(!__PageMovable(page)))
1204dd4ae78aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1205dd4ae78aSYang Shi 					page_is_file_lru(page), -thp_nr_pages(page));
1206dd4ae78aSYang Shi 
120779f5f8faSOscar Salvador 		if (reason != MR_MEMORY_FAILURE)
1208c6c919ebSMinchan Kim 			/*
120979f5f8faSOscar Salvador 			 * For MR_MEMORY_FAILURE, the page is released in page_handle_poison() instead.
1210c6c919ebSMinchan Kim 			 */
121179f5f8faSOscar Salvador 			put_page(page);
1212c6c919ebSMinchan Kim 	} else {
1213dd4ae78aSYang Shi 		if (rc != -EAGAIN)
1214dd4ae78aSYang Shi 			list_add_tail(&page->lru, ret);
1215bda807d4SMinchan Kim 
1216cf4b769aSHugh Dickins 		if (put_new_page)
121768711a74SDavid Rientjes 			put_new_page(newpage, private);
1218c6c919ebSMinchan Kim 		else
1219d6d86c0aSKonstantin Khlebnikov 			put_page(newpage);
1220c6c919ebSMinchan Kim 	}
122168711a74SDavid Rientjes 
1222e24f0b8fSChristoph Lameter 	return rc;
1223e24f0b8fSChristoph Lameter }
1224b20a3503SChristoph Lameter 
1225e24f0b8fSChristoph Lameter /*
1226290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
1227290408d4SNaoya Horiguchi  *
1228290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
1229290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepages.
1230290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
1231290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
1232290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
1233290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
1234290408d4SNaoya Horiguchi  * under direct I/O, the reference count of the head page is 512 and a bit more.)
1235290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
1236290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
1237290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
1238290408d4SNaoya Horiguchi  *
1239290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on the page under migration,
1240290408d4SNaoya Horiguchi  * because then pte is replaced with migration swap entry and direct I/O code
1241290408d4SNaoya Horiguchi  * will wait in the page fault for migration to complete.
1242290408d4SNaoya Horiguchi  */
1243290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
124468711a74SDavid Rientjes 				free_page_t put_new_page, unsigned long private,
124568711a74SDavid Rientjes 				struct page *hpage, int force,
1246dd4ae78aSYang Shi 				enum migrate_mode mode, int reason,
1247dd4ae78aSYang Shi 				struct list_head *ret)
1248290408d4SNaoya Horiguchi {
12494eecb8b9SMatthew Wilcox (Oracle) 	struct folio *dst, *src = page_folio(hpage);
12502def7424SHugh Dickins 	int rc = -EAGAIN;
12512ebba6b7SHugh Dickins 	int page_was_mapped = 0;
125232665f2bSJoonsoo Kim 	struct page *new_hpage;
1253290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
1254c0d0381aSMike Kravetz 	struct address_space *mapping = NULL;
1255290408d4SNaoya Horiguchi 
125683467efbSNaoya Horiguchi 	/*
12577ed2c31dSAnshuman Khandual 	 * Migratability of hugepages depends on the architecture and the hugepage size.
125883467efbSNaoya Horiguchi 	 * This check is necessary because some callers of hugepage migration
125983467efbSNaoya Horiguchi 	 * like soft offline and memory hotremove don't walk through page
126083467efbSNaoya Horiguchi 	 * tables or check whether the hugepage is pmd-based or not before
126183467efbSNaoya Horiguchi 	 * kicking migration.
126283467efbSNaoya Horiguchi 	 */
1263577be05cSHuang Ying 	if (!hugepage_migration_supported(page_hstate(hpage)))
126483467efbSNaoya Horiguchi 		return -ENOSYS;
126583467efbSNaoya Horiguchi 
1266*c33db292SMatthew Wilcox (Oracle) 	if (folio_ref_count(src) == 1) {
126771a64f61SMuchun Song 		/* page was freed from under us. So we are done. */
126871a64f61SMuchun Song 		putback_active_hugepage(hpage);
126971a64f61SMuchun Song 		return MIGRATEPAGE_SUCCESS;
127071a64f61SMuchun Song 	}
127171a64f61SMuchun Song 
1272666feb21SMichal Hocko 	new_hpage = get_new_page(hpage, private);
1273290408d4SNaoya Horiguchi 	if (!new_hpage)
1274290408d4SNaoya Horiguchi 		return -ENOMEM;
12754eecb8b9SMatthew Wilcox (Oracle) 	dst = page_folio(new_hpage);
1276290408d4SNaoya Horiguchi 
1277*c33db292SMatthew Wilcox (Oracle) 	if (!folio_trylock(src)) {
12782916ecc0SJérôme Glisse 		if (!force)
1279290408d4SNaoya Horiguchi 			goto out;
12802916ecc0SJérôme Glisse 		switch (mode) {
12812916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
12822916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
12832916ecc0SJérôme Glisse 			break;
12842916ecc0SJérôme Glisse 		default:
12852916ecc0SJérôme Glisse 			goto out;
12862916ecc0SJérôme Glisse 		}
1287*c33db292SMatthew Wilcox (Oracle) 		folio_lock(src);
1288290408d4SNaoya Horiguchi 	}
1289290408d4SNaoya Horiguchi 
1290cb6acd01SMike Kravetz 	/*
1291cb6acd01SMike Kravetz 	 * Check for pages which are in the process of being freed.  Without
1292*c33db292SMatthew Wilcox (Oracle) 	 * folio_mapping() set, the hugetlbfs-specific move-page routine will not
1293cb6acd01SMike Kravetz 	 * be called and we could leak usage counts for subpools.
1294cb6acd01SMike Kravetz 	 */
1295*c33db292SMatthew Wilcox (Oracle) 	if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) {
1296cb6acd01SMike Kravetz 		rc = -EBUSY;
1297cb6acd01SMike Kravetz 		goto out_unlock;
1298cb6acd01SMike Kravetz 	}
1299cb6acd01SMike Kravetz 
1300*c33db292SMatthew Wilcox (Oracle) 	if (folio_test_anon(src))
1301*c33db292SMatthew Wilcox (Oracle) 		anon_vma = page_get_anon_vma(&src->page);
1302290408d4SNaoya Horiguchi 
1303*c33db292SMatthew Wilcox (Oracle) 	if (unlikely(!folio_trylock(dst)))
13047db7671fSHugh Dickins 		goto put_anon;
13057db7671fSHugh Dickins 
1306*c33db292SMatthew Wilcox (Oracle) 	if (folio_mapped(src)) {
1307a98a2f0cSAlistair Popple 		enum ttu_flags ttu = 0;
1308336bf30eSMike Kravetz 
1309*c33db292SMatthew Wilcox (Oracle) 		if (!folio_test_anon(src)) {
1310c0d0381aSMike Kravetz 			/*
1311336bf30eSMike Kravetz 			 * In shared mappings, try_to_unmap could potentially
1312336bf30eSMike Kravetz 			 * call huge_pmd_unshare.  Because of this, take
1313336bf30eSMike Kravetz 			 * semaphore in write mode here and set TTU_RMAP_LOCKED
1314336bf30eSMike Kravetz 			 * to let lower levels know we have taken the lock.
1315c0d0381aSMike Kravetz 			 */
1316c0d0381aSMike Kravetz 			mapping = hugetlb_page_mapping_lock_write(hpage);
1317c0d0381aSMike Kravetz 			if (unlikely(!mapping))
1318c0d0381aSMike Kravetz 				goto unlock_put_anon;
1319c0d0381aSMike Kravetz 
13205202978bSMiaohe Lin 			ttu = TTU_RMAP_LOCKED;
1321336bf30eSMike Kravetz 		}
1322336bf30eSMike Kravetz 
13234b8554c5SMatthew Wilcox (Oracle) 		try_to_migrate(src, ttu);
13242ebba6b7SHugh Dickins 		page_was_mapped = 1;
1325336bf30eSMike Kravetz 
13265202978bSMiaohe Lin 		if (ttu & TTU_RMAP_LOCKED)
1327336bf30eSMike Kravetz 			i_mmap_unlock_write(mapping);
13282ebba6b7SHugh Dickins 	}
1329290408d4SNaoya Horiguchi 
1330*c33db292SMatthew Wilcox (Oracle) 	if (!folio_mapped(src))
1331e7e3ffebSMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1332290408d4SNaoya Horiguchi 
1333336bf30eSMike Kravetz 	if (page_was_mapped)
13344eecb8b9SMatthew Wilcox (Oracle) 		remove_migration_ptes(src,
13354eecb8b9SMatthew Wilcox (Oracle) 			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
1336290408d4SNaoya Horiguchi 
1337c0d0381aSMike Kravetz unlock_put_anon:
1338*c33db292SMatthew Wilcox (Oracle) 	folio_unlock(dst);
13397db7671fSHugh Dickins 
13407db7671fSHugh Dickins put_anon:
1341fd4a4663SHugh Dickins 	if (anon_vma)
13429e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
13438e6ac7faSAneesh Kumar K.V 
13442def7424SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1345ab5ac90aSMichal Hocko 		move_hugetlb_state(hpage, new_hpage, reason);
13462def7424SHugh Dickins 		put_new_page = NULL;
13472def7424SHugh Dickins 	}
13488e6ac7faSAneesh Kumar K.V 
1349cb6acd01SMike Kravetz out_unlock:
1350*c33db292SMatthew Wilcox (Oracle) 	folio_unlock(src);
135109761333SHillf Danton out:
1352dd4ae78aSYang Shi 	if (rc == MIGRATEPAGE_SUCCESS)
1353b8ec1ceeSNaoya Horiguchi 		putback_active_hugepage(hpage);
1354a04840c6SMiaohe Lin 	else if (rc != -EAGAIN)
1355*c33db292SMatthew Wilcox (Oracle) 		list_move_tail(&src->lru, ret);
135668711a74SDavid Rientjes 
135768711a74SDavid Rientjes 	/*
135868711a74SDavid Rientjes 	 * If migration was not successful and there's a freeing callback,
135968711a74SDavid Rientjes 	 * use it.  Otherwise, putback_active_hugepage() will drop the
136068711a74SDavid Rientjes 	 * reference taken when the new page was allocated.
136168711a74SDavid Rientjes 	 */
13622def7424SHugh Dickins 	if (put_new_page)
136368711a74SDavid Rientjes 		put_new_page(new_hpage, private);
136468711a74SDavid Rientjes 	else
13653aaa76e1SNaoya Horiguchi 		putback_active_hugepage(new_hpage);
136668711a74SDavid Rientjes 
1367290408d4SNaoya Horiguchi 	return rc;
1368290408d4SNaoya Horiguchi }
1369290408d4SNaoya Horiguchi 
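/*
 * Split a THP and queue the resulting base pages on @split_pages so that the
 * caller can retry migrating them individually.  Returns 0 on success or the
 * error from split_huge_page_to_list().
 */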
13709c62ff00SHuang Ying static inline int try_split_thp(struct page *page, struct list_head *split_pages)
1371d532e2e5SYang Shi {
13729c62ff00SHuang Ying 	int rc;
1373d532e2e5SYang Shi 
1374d532e2e5SYang Shi 	lock_page(page);
13759c62ff00SHuang Ying 	rc = split_huge_page_to_list(page, split_pages);
1376d532e2e5SYang Shi 	unlock_page(page);
1377e6fa8a79SHuang Ying 	if (!rc)
1378e6fa8a79SHuang Ying 		list_move_tail(&page->lru, split_pages);
1379d532e2e5SYang Shi 
1380d532e2e5SYang Shi 	return rc;
1381d532e2e5SYang Shi }
1382d532e2e5SYang Shi 
1383290408d4SNaoya Horiguchi /*
1384c73e5c9cSSrivatsa S. Bhat  * migrate_pages - migrate the pages specified in a list, to the free pages
1385c73e5c9cSSrivatsa S. Bhat  *		   supplied as the target for the page migration
1386e24f0b8fSChristoph Lameter  *
1387c73e5c9cSSrivatsa S. Bhat  * @from:		The list of pages to be migrated.
1388c73e5c9cSSrivatsa S. Bhat  * @get_new_page:	The function used to allocate free pages to be used
1389c73e5c9cSSrivatsa S. Bhat  *			as the target of the page migration.
139068711a74SDavid Rientjes  * @put_new_page:	The function used to free target pages if migration
139168711a74SDavid Rientjes  *			fails, or NULL if no special handling is necessary.
1392c73e5c9cSSrivatsa S. Bhat  * @private:		Private data to be passed on to get_new_page()
1393c73e5c9cSSrivatsa S. Bhat  * @mode:		The migration mode that specifies the constraints for
1394c73e5c9cSSrivatsa S. Bhat  *			page migration, if any.
1395c73e5c9cSSrivatsa S. Bhat  * @reason:		The reason for page migration.
1396b5bade97SBaolin Wang  * @ret_succeeded:	Set to the number of normal pages migrated successfully if
13975ac95884SYang Shi  *			the caller passes a non-NULL pointer.
1398e24f0b8fSChristoph Lameter  *
1399c73e5c9cSSrivatsa S. Bhat  * The function returns after 10 attempts or if no pages are movable any more
1400c73e5c9cSSrivatsa S. Bhat  * because the list has become empty or no retryable pages remain.
1401dd4ae78aSYang Shi  * It is the caller's responsibility to call putback_movable_pages() to return
1402dd4ae78aSYang Shi  * pages to the LRU or free list, but only if ret != 0.
1403e24f0b8fSChristoph Lameter  *
14045d39a7ebSBaolin Wang  * Returns the number of {normal pages, THPs, hugetlb pages} that were not
14055d39a7ebSBaolin Wang  * migrated, or an error code. Each THP split is counted as one non-migrated
14065d39a7ebSBaolin Wang  * THP, no matter how many subpages of the THP are migrated successfully.
1407e24f0b8fSChristoph Lameter  */
14089c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page,
140968711a74SDavid Rientjes 		free_page_t put_new_page, unsigned long private,
14105ac95884SYang Shi 		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1411e24f0b8fSChristoph Lameter {
1412e24f0b8fSChristoph Lameter 	int retry = 1;
14131a5bae25SAnshuman Khandual 	int thp_retry = 1;
1414e24f0b8fSChristoph Lameter 	int nr_failed = 0;
1415b5bade97SBaolin Wang 	int nr_failed_pages = 0;
1416077309bcSHuang Ying 	int nr_retry_pages = 0;
14175647bc29SMel Gorman 	int nr_succeeded = 0;
14181a5bae25SAnshuman Khandual 	int nr_thp_succeeded = 0;
14191a5bae25SAnshuman Khandual 	int nr_thp_failed = 0;
14201a5bae25SAnshuman Khandual 	int nr_thp_split = 0;
1421e24f0b8fSChristoph Lameter 	int pass = 0;
14221a5bae25SAnshuman Khandual 	bool is_thp = false;
1423e24f0b8fSChristoph Lameter 	struct page *page;
1424e24f0b8fSChristoph Lameter 	struct page *page2;
14251a5bae25SAnshuman Khandual 	int rc, nr_subpages;
1426dd4ae78aSYang Shi 	LIST_HEAD(ret_pages);
1427b5bade97SBaolin Wang 	LIST_HEAD(thp_split_pages);
1428b0b515bfSYang Shi 	bool nosplit = (reason == MR_NUMA_MISPLACED);
1429b5bade97SBaolin Wang 	bool no_subpage_counting = false;
14302d1db3b1SChristoph Lameter 
14317bc1aec5SLiam Mark 	trace_mm_migrate_pages_start(mode, reason);
14327bc1aec5SLiam Mark 
1433b5bade97SBaolin Wang thp_subpage_migration:
14341a5bae25SAnshuman Khandual 	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
1435e24f0b8fSChristoph Lameter 		retry = 0;
14361a5bae25SAnshuman Khandual 		thp_retry = 0;
1437077309bcSHuang Ying 		nr_retry_pages = 0;
1438e24f0b8fSChristoph Lameter 
1439e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
14401a5bae25SAnshuman Khandual 			/*
14411a5bae25SAnshuman Khandual 			 * THP statistics are based on the source huge page.
14421a5bae25SAnshuman Khandual 			 * Capture required information that might get lost
14431a5bae25SAnshuman Khandual 			 * during migration.
14441a5bae25SAnshuman Khandual 			 */
14456c5c7b9fSZi Yan 			is_thp = PageTransHuge(page) && !PageHuge(page);
14465d39a7ebSBaolin Wang 			nr_subpages = compound_nr(page);
1447e24f0b8fSChristoph Lameter 			cond_resched();
1448e24f0b8fSChristoph Lameter 
144931caf665SNaoya Horiguchi 			if (PageHuge(page))
145031caf665SNaoya Horiguchi 				rc = unmap_and_move_huge_page(get_new_page,
145168711a74SDavid Rientjes 						put_new_page, private, page,
1452dd4ae78aSYang Shi 						pass > 2, mode, reason,
1453dd4ae78aSYang Shi 						&ret_pages);
145431caf665SNaoya Horiguchi 			else
145568711a74SDavid Rientjes 				rc = unmap_and_move(get_new_page, put_new_page,
1456add05cecSNaoya Horiguchi 						private, page, pass > 2, mode,
1457dd4ae78aSYang Shi 						reason, &ret_pages);
1458dd4ae78aSYang Shi 			/*
1459dd4ae78aSYang Shi 			 * The rules are:
1460dd4ae78aSYang Shi 			 *	Success: non hugetlb page will be freed, hugetlb
1461dd4ae78aSYang Shi 			 *		 page will be put back
1462dd4ae78aSYang Shi 			 *	-EAGAIN: stay on the from list
1463dd4ae78aSYang Shi 			 *	-ENOMEM: stay on the from list
1464577be05cSHuang Ying 			 *	-ENOSYS: stay on the from list
1465dd4ae78aSYang Shi 			 *	Other errno: put on ret_pages list then splice to
1466dd4ae78aSYang Shi 			 *		     from list
1467dd4ae78aSYang Shi 			 */
1468e24f0b8fSChristoph Lameter 			switch(rc) {
146994723aafSMichal Hocko 			/*
147094723aafSMichal Hocko 			 * THP migration might be unsupported or the
147194723aafSMichal Hocko 			 * allocation could have failed, so we should
147294723aafSMichal Hocko 			 * retry the same page after splitting the THP
147394723aafSMichal Hocko 			 * into base pages.
147494723aafSMichal Hocko 			 *
1475e6fa8a79SHuang Ying 			 * Sub-pages are put in thp_split_pages, and
1476e6fa8a79SHuang Ying 			 * we will migrate them after the rest of the
1477e6fa8a79SHuang Ying 			 * list is processed.
147894723aafSMichal Hocko 			 */
1479d532e2e5SYang Shi 			case -ENOSYS:
1480d532e2e5SYang Shi 				/* THP migration is unsupported */
14816c5c7b9fSZi Yan 				if (is_thp) {
1482b5bade97SBaolin Wang 					nr_thp_failed++;
14839c62ff00SHuang Ying 					if (!try_split_thp(page, &thp_split_pages)) {
1484d532e2e5SYang Shi 						nr_thp_split++;
1485e6fa8a79SHuang Ying 						break;
1486d532e2e5SYang Shi 					}
1487f430893bSMiaohe Lin 				/* Hugetlb migration is unsupported */
1488f430893bSMiaohe Lin 				} else if (!no_subpage_counting) {
1489f430893bSMiaohe Lin 					nr_failed++;
1490d532e2e5SYang Shi 				}
1491d532e2e5SYang Shi 
14925d39a7ebSBaolin Wang 				nr_failed_pages += nr_subpages;
1493577be05cSHuang Ying 				list_move_tail(&page->lru, &ret_pages);
1494d532e2e5SYang Shi 				break;
1495d532e2e5SYang Shi 			case -ENOMEM:
1496d532e2e5SYang Shi 				/*
1497d532e2e5SYang Shi 				 * When memory is low, don't bother to try to migrate
1498d532e2e5SYang Shi 				 * other pages, just exit.
1499d532e2e5SYang Shi 				 */
1500fbed53b4SHuang Ying 				if (is_thp) {
1501b5bade97SBaolin Wang 					nr_thp_failed++;
1502fbed53b4SHuang Ying 					/* THP NUMA faulting doesn't split THP to retry. */
1503fbed53b4SHuang Ying 					if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
15041a5bae25SAnshuman Khandual 						nr_thp_split++;
1505e6fa8a79SHuang Ying 						break;
150694723aafSMichal Hocko 					}
1507f430893bSMiaohe Lin 				} else if (!no_subpage_counting) {
1508f430893bSMiaohe Lin 					nr_failed++;
15091a5bae25SAnshuman Khandual 				}
1510b5bade97SBaolin Wang 
1511077309bcSHuang Ying 				nr_failed_pages += nr_subpages + nr_retry_pages;
151269a041ffSMiaohe Lin 				/*
151369a041ffSMiaohe Lin 				 * There might be some subpages of fail-to-migrate THPs
151469a041ffSMiaohe Lin 				 * left in the thp_split_pages list. Move them back to the
151569a041ffSMiaohe Lin 				 * migration list so that they can be put back on the right
151669a041ffSMiaohe Lin 				 * list by the caller; otherwise the page refcount is leaked.
151769a041ffSMiaohe Lin 				 */
151869a041ffSMiaohe Lin 				list_splice_init(&thp_split_pages, from);
1519fbed53b4SHuang Ying 				/* nr_failed isn't updated for not used */
152069a041ffSMiaohe Lin 				nr_thp_failed += thp_retry;
152195a402c3SChristoph Lameter 				goto out;
1522e24f0b8fSChristoph Lameter 			case -EAGAIN:
1523f430893bSMiaohe Lin 				if (is_thp)
15241a5bae25SAnshuman Khandual 					thp_retry++;
15257047b5a4SBaolin Wang 				else if (!no_subpage_counting)
1526b20a3503SChristoph Lameter 					retry++;
1527077309bcSHuang Ying 				nr_retry_pages += nr_subpages;
1528e24f0b8fSChristoph Lameter 				break;
152978bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
15305d39a7ebSBaolin Wang 				nr_succeeded += nr_subpages;
1531f430893bSMiaohe Lin 				if (is_thp)
15321a5bae25SAnshuman Khandual 					nr_thp_succeeded++;
15331a5bae25SAnshuman Khandual 				break;
1534e24f0b8fSChristoph Lameter 			default:
1535354a3363SNaoya Horiguchi 				/*
1536d532e2e5SYang Shi 				 * Permanent failure (-EBUSY, etc.):
1537354a3363SNaoya Horiguchi 				 * unlike the -EAGAIN case, the failed page is
1538354a3363SNaoya Horiguchi 				 * removed from the migration page list and not
1539354a3363SNaoya Horiguchi 				 * retried in the next outer loop.
1540354a3363SNaoya Horiguchi 				 */
1541f430893bSMiaohe Lin 				if (is_thp)
15421a5bae25SAnshuman Khandual 					nr_thp_failed++;
1543f430893bSMiaohe Lin 				else if (!no_subpage_counting)
1544b20a3503SChristoph Lameter 					nr_failed++;
1545f430893bSMiaohe Lin 
15465d39a7ebSBaolin Wang 				nr_failed_pages += nr_subpages;
1547e24f0b8fSChristoph Lameter 				break;
1548b20a3503SChristoph Lameter 			}
1549b20a3503SChristoph Lameter 		}
1550e24f0b8fSChristoph Lameter 	}
1551b5bade97SBaolin Wang 	nr_failed += retry;
15521a5bae25SAnshuman Khandual 	nr_thp_failed += thp_retry;
1553077309bcSHuang Ying 	nr_failed_pages += nr_retry_pages;
1554b5bade97SBaolin Wang 	/*
1555b5bade97SBaolin Wang 	 * Try to migrate the subpages of fail-to-migrate THPs; no nr_failed
1556b5bade97SBaolin Wang 	 * counting in this round, since all subpages of a THP are counted
1557b5bade97SBaolin Wang 	 * as 1 failure in the first round.
1558b5bade97SBaolin Wang 	 */
1559b5bade97SBaolin Wang 	if (!list_empty(&thp_split_pages)) {
1560b5bade97SBaolin Wang 		/*
1561b5bade97SBaolin Wang 		 * Move non-migrated pages (after 10 retries) to ret_pages
1562b5bade97SBaolin Wang 		 * to avoid migrating them again.
1563b5bade97SBaolin Wang 		 */
1564b5bade97SBaolin Wang 		list_splice_init(from, &ret_pages);
1565b5bade97SBaolin Wang 		list_splice_init(&thp_split_pages, from);
1566b5bade97SBaolin Wang 		no_subpage_counting = true;
1567b5bade97SBaolin Wang 		retry = 1;
1568b5bade97SBaolin Wang 		goto thp_subpage_migration;
1569b5bade97SBaolin Wang 	}
1570b5bade97SBaolin Wang 
1571b5bade97SBaolin Wang 	rc = nr_failed + nr_thp_failed;
157295a402c3SChristoph Lameter out:
1573dd4ae78aSYang Shi 	/*
1574dd4ae78aSYang Shi 	 * Put the permanently failed pages back on the migration list; they
1575dd4ae78aSYang Shi 	 * will be put back on the right list by the caller.
1576dd4ae78aSYang Shi 	 */
1577dd4ae78aSYang Shi 	list_splice(&ret_pages, from);
1578dd4ae78aSYang Shi 
15795647bc29SMel Gorman 	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1580b5bade97SBaolin Wang 	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
15811a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
15821a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
15831a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1584b5bade97SBaolin Wang 	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
15851a5bae25SAnshuman Khandual 			       nr_thp_failed, nr_thp_split, mode, reason);
15867b2a2d4aSMel Gorman 
15875ac95884SYang Shi 	if (ret_succeeded)
15885ac95884SYang Shi 		*ret_succeeded = nr_succeeded;
15895ac95884SYang Shi 
159095a402c3SChristoph Lameter 	return rc;
1591b20a3503SChristoph Lameter }
1592b20a3503SChristoph Lameter 
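/*
 * Illustrative sketch of a typical migrate_pages() caller (placeholder
 * names, not a definitive recipe); it mirrors do_move_pages_to_node()
 * further down, which allocates targets with alloc_migration_target()
 * driven by a struct migration_target_control:
 *
 *	struct migration_target_control mtc = {
 *		.nid = target_nid,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
 *	};
 *	LIST_HEAD(pagelist);
 *
 *	... isolate candidate pages onto &pagelist ...
 *
 *	if (migrate_pages(&pagelist, alloc_migration_target, NULL,
 *			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL))
 *		putback_movable_pages(&pagelist);
 */
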
159319fc7bedSJoonsoo Kim struct page *alloc_migration_target(struct page *page, unsigned long private)
1594b4b38223SJoonsoo Kim {
1595ffe06786SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
159619fc7bedSJoonsoo Kim 	struct migration_target_control *mtc;
159719fc7bedSJoonsoo Kim 	gfp_t gfp_mask;
1598b4b38223SJoonsoo Kim 	unsigned int order = 0;
1599ffe06786SMatthew Wilcox (Oracle) 	struct folio *new_folio = NULL;
160019fc7bedSJoonsoo Kim 	int nid;
160119fc7bedSJoonsoo Kim 	int zidx;
160219fc7bedSJoonsoo Kim 
160319fc7bedSJoonsoo Kim 	mtc = (struct migration_target_control *)private;
160419fc7bedSJoonsoo Kim 	gfp_mask = mtc->gfp_mask;
160519fc7bedSJoonsoo Kim 	nid = mtc->nid;
160619fc7bedSJoonsoo Kim 	if (nid == NUMA_NO_NODE)
1607ffe06786SMatthew Wilcox (Oracle) 		nid = folio_nid(folio);
1608b4b38223SJoonsoo Kim 
1609ffe06786SMatthew Wilcox (Oracle) 	if (folio_test_hugetlb(folio)) {
1610ffe06786SMatthew Wilcox (Oracle) 		struct hstate *h = page_hstate(&folio->page);
1611d92bbc27SJoonsoo Kim 
161219fc7bedSJoonsoo Kim 		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
161319fc7bedSJoonsoo Kim 		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1614d92bbc27SJoonsoo Kim 	}
1615b4b38223SJoonsoo Kim 
1616ffe06786SMatthew Wilcox (Oracle) 	if (folio_test_large(folio)) {
16179933a0c8SJoonsoo Kim 		/*
16189933a0c8SJoonsoo Kim 		 * clear __GFP_RECLAIM to make the migration callback
16199933a0c8SJoonsoo Kim 		 * consistent with regular THP allocations.
16209933a0c8SJoonsoo Kim 		 */
16219933a0c8SJoonsoo Kim 		gfp_mask &= ~__GFP_RECLAIM;
1622b4b38223SJoonsoo Kim 		gfp_mask |= GFP_TRANSHUGE;
1623ffe06786SMatthew Wilcox (Oracle) 		order = folio_order(folio);
1624b4b38223SJoonsoo Kim 	}
1625ffe06786SMatthew Wilcox (Oracle) 	zidx = zone_idx(folio_zone(folio));
162619fc7bedSJoonsoo Kim 	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1627b4b38223SJoonsoo Kim 		gfp_mask |= __GFP_HIGHMEM;
1628b4b38223SJoonsoo Kim 
1629ffe06786SMatthew Wilcox (Oracle) 	new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
1630b4b38223SJoonsoo Kim 
1631ffe06786SMatthew Wilcox (Oracle) 	return &new_folio->page;
1632b4b38223SJoonsoo Kim }
1633b4b38223SJoonsoo Kim 
1634742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1635742755a1SChristoph Lameter 
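/*
 * Write @value to @nr consecutive entries of the user @status array,
 * starting at index @start.
 */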
1636a49bd4d7SMichal Hocko static int store_status(int __user *status, int start, int value, int nr)
1637742755a1SChristoph Lameter {
1638a49bd4d7SMichal Hocko 	while (nr-- > 0) {
1639a49bd4d7SMichal Hocko 		if (put_user(value, status + start))
1640a49bd4d7SMichal Hocko 			return -EFAULT;
1641a49bd4d7SMichal Hocko 		start++;
1642a49bd4d7SMichal Hocko 	}
1643742755a1SChristoph Lameter 
1644a49bd4d7SMichal Hocko 	return 0;
1645a49bd4d7SMichal Hocko }
1646742755a1SChristoph Lameter 
1647a49bd4d7SMichal Hocko static int do_move_pages_to_node(struct mm_struct *mm,
1648a49bd4d7SMichal Hocko 		struct list_head *pagelist, int node)
1649a49bd4d7SMichal Hocko {
1650a49bd4d7SMichal Hocko 	int err;
1651a0976311SJoonsoo Kim 	struct migration_target_control mtc = {
1652a0976311SJoonsoo Kim 		.nid = node,
1653a0976311SJoonsoo Kim 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1654a0976311SJoonsoo Kim 	};
1655742755a1SChristoph Lameter 
1656a0976311SJoonsoo Kim 	err = migrate_pages(pagelist, alloc_migration_target, NULL,
16575ac95884SYang Shi 		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1658a49bd4d7SMichal Hocko 	if (err)
1659a49bd4d7SMichal Hocko 		putback_movable_pages(pagelist);
1660a49bd4d7SMichal Hocko 	return err;
1661742755a1SChristoph Lameter }
1662742755a1SChristoph Lameter 
1663742755a1SChristoph Lameter /*
1664a49bd4d7SMichal Hocko  * Resolves the given address to a struct page, isolates it from the LRU and
1665a49bd4d7SMichal Hocko  * puts it on the given pagelist.
1666e0153fc2SYang Shi  * Returns:
1667e0153fc2SYang Shi  *     errno - if the page cannot be found/isolated
1668e0153fc2SYang Shi  *     0 - when it doesn't have to be migrated because it is already on the
1669e0153fc2SYang Shi  *         target node
1670e0153fc2SYang Shi  *     1 - when it has been queued
1671742755a1SChristoph Lameter  */
1672a49bd4d7SMichal Hocko static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1673a49bd4d7SMichal Hocko 		int node, struct list_head *pagelist, bool migrate_all)
1674742755a1SChristoph Lameter {
1675742755a1SChristoph Lameter 	struct vm_area_struct *vma;
1676742755a1SChristoph Lameter 	struct page *page;
1677a49bd4d7SMichal Hocko 	int err;
1678742755a1SChristoph Lameter 
1679d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
1680742755a1SChristoph Lameter 	err = -EFAULT;
1681cb1c37b1SMiaohe Lin 	vma = vma_lookup(mm, addr);
1682cb1c37b1SMiaohe Lin 	if (!vma || !vma_migratable(vma))
1683a49bd4d7SMichal Hocko 		goto out;
1684742755a1SChristoph Lameter 
1685d899844eSKirill A. Shutemov 	/* FOLL_DUMP to ignore special (like zero) pages */
168687d2762eSMiaohe Lin 	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
168789f5b7daSLinus Torvalds 
168889f5b7daSLinus Torvalds 	err = PTR_ERR(page);
168989f5b7daSLinus Torvalds 	if (IS_ERR(page))
1690a49bd4d7SMichal Hocko 		goto out;
169189f5b7daSLinus Torvalds 
1692742755a1SChristoph Lameter 	err = -ENOENT;
1693f7091ed6SHaiyue Wang 	if (!page)
1694a49bd4d7SMichal Hocko 		goto out;
1695742755a1SChristoph Lameter 
1696f7091ed6SHaiyue Wang 	if (is_zone_device_page(page))
1697f7091ed6SHaiyue Wang 		goto out_putpage;
1698f7091ed6SHaiyue Wang 
1699a49bd4d7SMichal Hocko 	err = 0;
1700a49bd4d7SMichal Hocko 	if (page_to_nid(page) == node)
1701a49bd4d7SMichal Hocko 		goto out_putpage;
1702742755a1SChristoph Lameter 
1703742755a1SChristoph Lameter 	err = -EACCES;
1704a49bd4d7SMichal Hocko 	if (page_mapcount(page) > 1 && !migrate_all)
1705a49bd4d7SMichal Hocko 		goto out_putpage;
1706742755a1SChristoph Lameter 
1707e632a938SNaoya Horiguchi 	if (PageHuge(page)) {
1708e8db67ebSNaoya Horiguchi 		if (PageHead(page)) {
17097ce82f4cSMiaohe Lin 			err = isolate_hugetlb(page, pagelist);
17107ce82f4cSMiaohe Lin 			if (!err)
1711e0153fc2SYang Shi 				err = 1;
1712e8db67ebSNaoya Horiguchi 		}
1713a49bd4d7SMichal Hocko 	} else {
1714a49bd4d7SMichal Hocko 		struct page *head;
1715e632a938SNaoya Horiguchi 
1716e8db67ebSNaoya Horiguchi 		head = compound_head(page);
1717e8db67ebSNaoya Horiguchi 		err = isolate_lru_page(head);
1718a49bd4d7SMichal Hocko 		if (err)
1719a49bd4d7SMichal Hocko 			goto out_putpage;
1720a49bd4d7SMichal Hocko 
1721e0153fc2SYang Shi 		err = 1;
1722a49bd4d7SMichal Hocko 		list_add_tail(&head->lru, pagelist);
1723e8db67ebSNaoya Horiguchi 		mod_node_page_state(page_pgdat(head),
17249de4f22aSHuang Ying 			NR_ISOLATED_ANON + page_is_file_lru(head),
17256c357848SMatthew Wilcox (Oracle) 			thp_nr_pages(head));
17266d9c285aSKOSAKI Motohiro 	}
1727a49bd4d7SMichal Hocko out_putpage:
1728742755a1SChristoph Lameter 	/*
1729742755a1SChristoph Lameter 	 * Either remove the duplicate refcount from
1730742755a1SChristoph Lameter 	 * isolate_lru_page() or drop the page ref if it was
1731742755a1SChristoph Lameter 	 * not isolated.
1732742755a1SChristoph Lameter 	 */
1733742755a1SChristoph Lameter 	put_page(page);
1734a49bd4d7SMichal Hocko out:
1735d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
1736742755a1SChristoph Lameter 	return err;
1737742755a1SChristoph Lameter }
1738742755a1SChristoph Lameter 
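/*
 * Migrate the pages queued on @pagelist to @node and record the result for
 * entries [@start, @i) of the user @status array.
 */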
17397ca8783aSWei Yang static int move_pages_and_store_status(struct mm_struct *mm, int node,
17407ca8783aSWei Yang 		struct list_head *pagelist, int __user *status,
17417ca8783aSWei Yang 		int start, int i, unsigned long nr_pages)
17427ca8783aSWei Yang {
17437ca8783aSWei Yang 	int err;
17447ca8783aSWei Yang 
17455d7ae891SWei Yang 	if (list_empty(pagelist))
17465d7ae891SWei Yang 		return 0;
17475d7ae891SWei Yang 
17487ca8783aSWei Yang 	err = do_move_pages_to_node(mm, pagelist, node);
17497ca8783aSWei Yang 	if (err) {
17507ca8783aSWei Yang 		/*
17517ca8783aSWei Yang 		 * A positive err means the number of pages that
17527ca8783aSWei Yang 		 * failed to migrate.  Since we are going to
17537ca8783aSWei Yang 		 * abort and return the number of non-migrated
1754ab9dd4f8SLong Li 		 * pages, we need to include the rest of the
17557ca8783aSWei Yang 		 * nr_pages that have not been attempted as
17567ca8783aSWei Yang 		 * well.
17577ca8783aSWei Yang 		 */
17587ca8783aSWei Yang 		if (err > 0)
1759a7504ed1SHuang Ying 			err += nr_pages - i;
17607ca8783aSWei Yang 		return err;
17617ca8783aSWei Yang 	}
17627ca8783aSWei Yang 	return store_status(status, start, node, i - start);
17637ca8783aSWei Yang }
17647ca8783aSWei Yang 
1765742755a1SChristoph Lameter /*
17665e9a0f02SBrice Goglin  * Migrate an array of page address onto an array of nodes and fill
17675e9a0f02SBrice Goglin  * the corresponding array of status.
17685e9a0f02SBrice Goglin  */
17693268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
17705e9a0f02SBrice Goglin 			 unsigned long nr_pages,
17715e9a0f02SBrice Goglin 			 const void __user * __user *pages,
17725e9a0f02SBrice Goglin 			 const int __user *nodes,
17735e9a0f02SBrice Goglin 			 int __user *status, int flags)
17745e9a0f02SBrice Goglin {
1775a49bd4d7SMichal Hocko 	int current_node = NUMA_NO_NODE;
1776a49bd4d7SMichal Hocko 	LIST_HEAD(pagelist);
1777a49bd4d7SMichal Hocko 	int start, i;
1778a49bd4d7SMichal Hocko 	int err = 0, err1;
177935282a2dSBrice Goglin 
1780361a2a22SMinchan Kim 	lru_cache_disable();
178135282a2dSBrice Goglin 
1782a49bd4d7SMichal Hocko 	for (i = start = 0; i < nr_pages; i++) {
17835e9a0f02SBrice Goglin 		const void __user *p;
1784a49bd4d7SMichal Hocko 		unsigned long addr;
17855e9a0f02SBrice Goglin 		int node;
17865e9a0f02SBrice Goglin 
17873140a227SBrice Goglin 		err = -EFAULT;
1788a49bd4d7SMichal Hocko 		if (get_user(p, pages + i))
1789a49bd4d7SMichal Hocko 			goto out_flush;
1790a49bd4d7SMichal Hocko 		if (get_user(node, nodes + i))
1791a49bd4d7SMichal Hocko 			goto out_flush;
1792057d3389SAndrey Konovalov 		addr = (unsigned long)untagged_addr(p);
17935e9a0f02SBrice Goglin 
17945e9a0f02SBrice Goglin 		err = -ENODEV;
17956f5a55f1SLinus Torvalds 		if (node < 0 || node >= MAX_NUMNODES)
1796a49bd4d7SMichal Hocko 			goto out_flush;
1797389162c2SLai Jiangshan 		if (!node_state(node, N_MEMORY))
1798a49bd4d7SMichal Hocko 			goto out_flush;
17995e9a0f02SBrice Goglin 
18005e9a0f02SBrice Goglin 		err = -EACCES;
18015e9a0f02SBrice Goglin 		if (!node_isset(node, task_nodes))
1802a49bd4d7SMichal Hocko 			goto out_flush;
18035e9a0f02SBrice Goglin 
1804a49bd4d7SMichal Hocko 		if (current_node == NUMA_NO_NODE) {
1805a49bd4d7SMichal Hocko 			current_node = node;
1806a49bd4d7SMichal Hocko 			start = i;
1807a49bd4d7SMichal Hocko 		} else if (node != current_node) {
18087ca8783aSWei Yang 			err = move_pages_and_store_status(mm, current_node,
18097ca8783aSWei Yang 					&pagelist, status, start, i, nr_pages);
1810a49bd4d7SMichal Hocko 			if (err)
1811a49bd4d7SMichal Hocko 				goto out;
1812a49bd4d7SMichal Hocko 			start = i;
1813a49bd4d7SMichal Hocko 			current_node = node;
18145e9a0f02SBrice Goglin 		}
18155e9a0f02SBrice Goglin 
1816a49bd4d7SMichal Hocko 		/*
1817a49bd4d7SMichal Hocko 		 * Errors in the page lookup or isolation are not fatal and we simply
1818a49bd4d7SMichal Hocko 		 * report them via status
1819a49bd4d7SMichal Hocko 		 */
1820a49bd4d7SMichal Hocko 		err = add_page_for_migration(mm, addr, current_node,
1821a49bd4d7SMichal Hocko 				&pagelist, flags & MPOL_MF_MOVE_ALL);
1822e0153fc2SYang Shi 
1823d08221a0SWei Yang 		if (err > 0) {
1824e0153fc2SYang Shi 			/* The page is successfully queued for migration */
1825e0153fc2SYang Shi 			continue;
1826e0153fc2SYang Shi 		}
18273140a227SBrice Goglin 
1828d08221a0SWei Yang 		/*
182965462462SJohn Hubbard 		 * The move_pages() man page does not have an -EEXIST choice, so
183065462462SJohn Hubbard 		 * use -EFAULT instead.
183165462462SJohn Hubbard 		 */
183265462462SJohn Hubbard 		if (err == -EEXIST)
183365462462SJohn Hubbard 			err = -EFAULT;
183465462462SJohn Hubbard 
183565462462SJohn Hubbard 		/*
1836d08221a0SWei Yang 		 * If the page is already on the target node (!err), store the
1837d08221a0SWei Yang 		 * node; otherwise, store the error.
1838d08221a0SWei Yang 		 */
1839d08221a0SWei Yang 		err = store_status(status, i, err ? : current_node, 1);
1840a49bd4d7SMichal Hocko 		if (err)
1841a49bd4d7SMichal Hocko 			goto out_flush;
18423140a227SBrice Goglin 
18437ca8783aSWei Yang 		err = move_pages_and_store_status(mm, current_node, &pagelist,
18447ca8783aSWei Yang 				status, start, i, nr_pages);
1845a7504ed1SHuang Ying 		if (err) {
1846a7504ed1SHuang Ying 			/* We have accounted for page i */
1847a7504ed1SHuang Ying 			if (err > 0)
1848a7504ed1SHuang Ying 				err--;
1849a49bd4d7SMichal Hocko 			goto out;
1850a7504ed1SHuang Ying 		}
1851a49bd4d7SMichal Hocko 		current_node = NUMA_NO_NODE;
18523140a227SBrice Goglin 	}
1853a49bd4d7SMichal Hocko out_flush:
1854a49bd4d7SMichal Hocko 	/* Make sure we do not overwrite the existing error */
18557ca8783aSWei Yang 	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
18567ca8783aSWei Yang 				status, start, i, nr_pages);
1857dfe9aa23SWei Yang 	if (err >= 0)
1858a49bd4d7SMichal Hocko 		err = err1;
18595e9a0f02SBrice Goglin out:
1860361a2a22SMinchan Kim 	lru_cache_enable();
18615e9a0f02SBrice Goglin 	return err;
18625e9a0f02SBrice Goglin }
18635e9a0f02SBrice Goglin 
18645e9a0f02SBrice Goglin /*
18652f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status.
1866742755a1SChristoph Lameter  */
186780bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
186880bba129SBrice Goglin 				const void __user **pages, int *status)
1869742755a1SChristoph Lameter {
18702f007e74SBrice Goglin 	unsigned long i;
1871742755a1SChristoph Lameter 
1872d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
18732f007e74SBrice Goglin 
18742f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
187580bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
187683156821SHaiyue Wang 		unsigned int foll_flags = FOLL_DUMP;
18772f007e74SBrice Goglin 		struct vm_area_struct *vma;
18782f007e74SBrice Goglin 		struct page *page;
1879c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
18802f007e74SBrice Goglin 
1881059b8b48SLiam Howlett 		vma = vma_lookup(mm, addr);
1882059b8b48SLiam Howlett 		if (!vma)
1883742755a1SChristoph Lameter 			goto set_status;
1884742755a1SChristoph Lameter 
188583156821SHaiyue Wang 		/* Not all huge page follow APIs support 'FOLL_GET' */
188683156821SHaiyue Wang 		if (!is_vm_hugetlb_page(vma))
188783156821SHaiyue Wang 			foll_flags |= FOLL_GET;
188883156821SHaiyue Wang 
1889d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
189083156821SHaiyue Wang 		page = follow_page(vma, addr, foll_flags);
189189f5b7daSLinus Torvalds 
189289f5b7daSLinus Torvalds 		err = PTR_ERR(page);
189389f5b7daSLinus Torvalds 		if (IS_ERR(page))
189489f5b7daSLinus Torvalds 			goto set_status;
189589f5b7daSLinus Torvalds 
1896f7091ed6SHaiyue Wang 		err = -ENOENT;
1897f7091ed6SHaiyue Wang 		if (!page)
1898f7091ed6SHaiyue Wang 			goto set_status;
1899f7091ed6SHaiyue Wang 
1900f7091ed6SHaiyue Wang 		if (!is_zone_device_page(page))
19014cd61484SMiaohe Lin 			err = page_to_nid(page);
1902f7091ed6SHaiyue Wang 
190383156821SHaiyue Wang 		if (foll_flags & FOLL_GET)
19044cd61484SMiaohe Lin 			put_page(page);
1905742755a1SChristoph Lameter set_status:
190680bba129SBrice Goglin 		*status = err;
190780bba129SBrice Goglin 
190880bba129SBrice Goglin 		pages++;
190980bba129SBrice Goglin 		status++;
191080bba129SBrice Goglin 	}
191180bba129SBrice Goglin 
1912d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
191380bba129SBrice Goglin }
191480bba129SBrice Goglin 
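/*
 * Copy a chunk of page pointers from a compat (32-bit) user array into the
 * native chunk_pages[] array used by do_pages_stat().
 */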
19155b1b561bSArnd Bergmann static int get_compat_pages_array(const void __user *chunk_pages[],
19165b1b561bSArnd Bergmann 				  const void __user * __user *pages,
19175b1b561bSArnd Bergmann 				  unsigned long chunk_nr)
19185b1b561bSArnd Bergmann {
19195b1b561bSArnd Bergmann 	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
19205b1b561bSArnd Bergmann 	compat_uptr_t p;
19215b1b561bSArnd Bergmann 	int i;
19225b1b561bSArnd Bergmann 
19235b1b561bSArnd Bergmann 	for (i = 0; i < chunk_nr; i++) {
19245b1b561bSArnd Bergmann 		if (get_user(p, pages32 + i))
19255b1b561bSArnd Bergmann 			return -EFAULT;
19265b1b561bSArnd Bergmann 		chunk_pages[i] = compat_ptr(p);
19275b1b561bSArnd Bergmann 	}
19285b1b561bSArnd Bergmann 
19295b1b561bSArnd Bergmann 	return 0;
19305b1b561bSArnd Bergmann }
19315b1b561bSArnd Bergmann 
193280bba129SBrice Goglin /*
193380bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
193480bba129SBrice Goglin  * a user array of status.
193580bba129SBrice Goglin  */
193680bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
193780bba129SBrice Goglin 			 const void __user * __user *pages,
193880bba129SBrice Goglin 			 int __user *status)
193980bba129SBrice Goglin {
19403eefb826SMiaohe Lin #define DO_PAGES_STAT_CHUNK_NR 16UL
194180bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
194280bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
194380bba129SBrice Goglin 
194487b8d1adSH. Peter Anvin 	while (nr_pages) {
19453eefb826SMiaohe Lin 		unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
194687b8d1adSH. Peter Anvin 
19475b1b561bSArnd Bergmann 		if (in_compat_syscall()) {
19485b1b561bSArnd Bergmann 			if (get_compat_pages_array(chunk_pages, pages,
19495b1b561bSArnd Bergmann 						   chunk_nr))
195087b8d1adSH. Peter Anvin 				break;
19515b1b561bSArnd Bergmann 		} else {
19525b1b561bSArnd Bergmann 			if (copy_from_user(chunk_pages, pages,
19535b1b561bSArnd Bergmann 				      chunk_nr * sizeof(*chunk_pages)))
19545b1b561bSArnd Bergmann 				break;
19555b1b561bSArnd Bergmann 		}
195680bba129SBrice Goglin 
195780bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
195880bba129SBrice Goglin 
195987b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
196087b8d1adSH. Peter Anvin 			break;
1961742755a1SChristoph Lameter 
196287b8d1adSH. Peter Anvin 		pages += chunk_nr;
196387b8d1adSH. Peter Anvin 		status += chunk_nr;
196487b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
196587b8d1adSH. Peter Anvin 	}
196687b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
1967742755a1SChristoph Lameter }
1968742755a1SChristoph Lameter 
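/*
 * Resolve @pid to an mm_struct for the move_pages() syscall, performing the
 * ptrace/security permission checks, and return the task's allowed memory
 * nodes in @mem_nodes.  Returns an ERR_PTR() on failure.
 */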
19694dc200ceSMiaohe Lin static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
19704dc200ceSMiaohe Lin {
19714dc200ceSMiaohe Lin 	struct task_struct *task;
19724dc200ceSMiaohe Lin 	struct mm_struct *mm;
19734dc200ceSMiaohe Lin 
19744dc200ceSMiaohe Lin 	/*
19754dc200ceSMiaohe Lin 	 * There is no need to check if the current process has the right to modify
19764dc200ceSMiaohe Lin 	 * the specified process when they are the same.
19774dc200ceSMiaohe Lin 	 */
19784dc200ceSMiaohe Lin 	if (!pid) {
19794dc200ceSMiaohe Lin 		mmget(current->mm);
19804dc200ceSMiaohe Lin 		*mem_nodes = cpuset_mems_allowed(current);
19814dc200ceSMiaohe Lin 		return current->mm;
19824dc200ceSMiaohe Lin 	}
19834dc200ceSMiaohe Lin 
19844dc200ceSMiaohe Lin 	/* Find the mm_struct */
19854dc200ceSMiaohe Lin 	rcu_read_lock();
19864dc200ceSMiaohe Lin 	task = find_task_by_vpid(pid);
19874dc200ceSMiaohe Lin 	if (!task) {
19884dc200ceSMiaohe Lin 		rcu_read_unlock();
19894dc200ceSMiaohe Lin 		return ERR_PTR(-ESRCH);
19904dc200ceSMiaohe Lin 	}
19914dc200ceSMiaohe Lin 	get_task_struct(task);
19924dc200ceSMiaohe Lin 
19934dc200ceSMiaohe Lin 	/*
19944dc200ceSMiaohe Lin 	 * Check if this process has the right to modify the specified
19954dc200ceSMiaohe Lin 	 * process. Use the regular "ptrace_may_access()" checks.
19964dc200ceSMiaohe Lin 	 */
19974dc200ceSMiaohe Lin 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
19984dc200ceSMiaohe Lin 		rcu_read_unlock();
19994dc200ceSMiaohe Lin 		mm = ERR_PTR(-EPERM);
20004dc200ceSMiaohe Lin 		goto out;
20014dc200ceSMiaohe Lin 	}
20024dc200ceSMiaohe Lin 	rcu_read_unlock();
20034dc200ceSMiaohe Lin 
20044dc200ceSMiaohe Lin 	mm = ERR_PTR(security_task_movememory(task));
20054dc200ceSMiaohe Lin 	if (IS_ERR(mm))
20064dc200ceSMiaohe Lin 		goto out;
20074dc200ceSMiaohe Lin 	*mem_nodes = cpuset_mems_allowed(task);
20084dc200ceSMiaohe Lin 	mm = get_task_mm(task);
20094dc200ceSMiaohe Lin out:
20104dc200ceSMiaohe Lin 	put_task_struct(task);
20114dc200ceSMiaohe Lin 	if (!mm)
20124dc200ceSMiaohe Lin 		mm = ERR_PTR(-EINVAL);
20134dc200ceSMiaohe Lin 	return mm;
20144dc200ceSMiaohe Lin }
20154dc200ceSMiaohe Lin 
2016742755a1SChristoph Lameter /*
2017742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
2018742755a1SChristoph Lameter  * process.
2019742755a1SChristoph Lameter  */
20207addf443SDominik Brodowski static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
20217addf443SDominik Brodowski 			     const void __user * __user *pages,
20227addf443SDominik Brodowski 			     const int __user *nodes,
20237addf443SDominik Brodowski 			     int __user *status, int flags)
2024742755a1SChristoph Lameter {
2025742755a1SChristoph Lameter 	struct mm_struct *mm;
20265e9a0f02SBrice Goglin 	int err;
20273268c63eSChristoph Lameter 	nodemask_t task_nodes;
2028742755a1SChristoph Lameter 
2029742755a1SChristoph Lameter 	/* Check flags */
2030742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2031742755a1SChristoph Lameter 		return -EINVAL;
2032742755a1SChristoph Lameter 
2033742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2034742755a1SChristoph Lameter 		return -EPERM;
2035742755a1SChristoph Lameter 
20364dc200ceSMiaohe Lin 	mm = find_mm_struct(pid, &task_nodes);
20374dc200ceSMiaohe Lin 	if (IS_ERR(mm))
20384dc200ceSMiaohe Lin 		return PTR_ERR(mm);
20396e8b09eaSSasha Levin 
20403268c63eSChristoph Lameter 	if (nodes)
20413268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
20423268c63eSChristoph Lameter 				    nodes, status, flags);
20433268c63eSChristoph Lameter 	else
20445e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
20453268c63eSChristoph Lameter 
20463268c63eSChristoph Lameter 	mmput(mm);
20473268c63eSChristoph Lameter 	return err;
2048742755a1SChristoph Lameter }
2049742755a1SChristoph Lameter 
20507addf443SDominik Brodowski SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
20517addf443SDominik Brodowski 		const void __user * __user *, pages,
20527addf443SDominik Brodowski 		const int __user *, nodes,
20537addf443SDominik Brodowski 		int __user *, status, int, flags)
20547addf443SDominik Brodowski {
20557addf443SDominik Brodowski 	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
20567addf443SDominik Brodowski }
20577addf443SDominik Brodowski 
20587039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING
20597039e1dbSPeter Zijlstra /*
20607039e1dbSPeter Zijlstra  * Returns true if this is a safe migration target node for misplaced NUMA
2061bc53008eSWei Yang  * pages. Currently it only checks the watermarks, which is crude.
20627039e1dbSPeter Zijlstra  */
20637039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
20643abef4e6SMel Gorman 				   unsigned long nr_migrate_pages)
20657039e1dbSPeter Zijlstra {
20667039e1dbSPeter Zijlstra 	int z;
2067599d0c95SMel Gorman 
20687039e1dbSPeter Zijlstra 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
20697039e1dbSPeter Zijlstra 		struct zone *zone = pgdat->node_zones + z;
20707039e1dbSPeter Zijlstra 
2071bc53008eSWei Yang 		if (!managed_zone(zone))
20727039e1dbSPeter Zijlstra 			continue;
20737039e1dbSPeter Zijlstra 
20747039e1dbSPeter Zijlstra 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
20757039e1dbSPeter Zijlstra 		if (!zone_watermark_ok(zone, 0,
20767039e1dbSPeter Zijlstra 				       high_wmark_pages(zone) +
20777039e1dbSPeter Zijlstra 				       nr_migrate_pages,
2078bfe9d006SHuang Ying 				       ZONE_MOVABLE, 0))
20797039e1dbSPeter Zijlstra 			continue;
20807039e1dbSPeter Zijlstra 		return true;
20817039e1dbSPeter Zijlstra 	}
20827039e1dbSPeter Zijlstra 	return false;
20837039e1dbSPeter Zijlstra }
20847039e1dbSPeter Zijlstra 
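/*
 * Allocation callback for NUMA-balancing migration: allocate the destination
 * page on node @data, matching the order of the misplaced source page and
 * using a lightweight (no reclaim, no retry) GFP mask so a miss simply fails.
 */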
20857039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page,
2086666feb21SMichal Hocko 					   unsigned long data)
20877039e1dbSPeter Zijlstra {
20887039e1dbSPeter Zijlstra 	int nid = (int) data;
2089c185e494SMatthew Wilcox (Oracle) 	int order = compound_order(page);
2090c185e494SMatthew Wilcox (Oracle) 	gfp_t gfp = __GFP_THISNODE;
2091c185e494SMatthew Wilcox (Oracle) 	struct folio *new;
20927039e1dbSPeter Zijlstra 
2093c185e494SMatthew Wilcox (Oracle) 	if (order > 0)
2094c185e494SMatthew Wilcox (Oracle) 		gfp |= GFP_TRANSHUGE_LIGHT;
2095c185e494SMatthew Wilcox (Oracle) 	else {
2096c185e494SMatthew Wilcox (Oracle) 		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2097c185e494SMatthew Wilcox (Oracle) 			__GFP_NOWARN;
2098c185e494SMatthew Wilcox (Oracle) 		gfp &= ~__GFP_RECLAIM;
20997039e1dbSPeter Zijlstra 	}
2100c185e494SMatthew Wilcox (Oracle) 	new = __folio_alloc_node(gfp, order, nid);
21017039e1dbSPeter Zijlstra 
2102c185e494SMatthew Wilcox (Oracle) 	return &new->page;
2103c5b5a3ddSYang Shi }
2104c5b5a3ddSYang Shi 
21051c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
2106b32967ffSMel Gorman {
21072b9b624fSBaolin Wang 	int nr_pages = thp_nr_pages(page);
2108c574bbe9SHuang Ying 	int order = compound_order(page);
2109b32967ffSMel Gorman 
2110c574bbe9SHuang Ying 	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
21113abef4e6SMel Gorman 
2112662aeea7SYang Shi 	/* Do not migrate a THP that is mapped by multiple processes */
2113662aeea7SYang Shi 	if (PageTransHuge(page) && total_mapcount(page) > 1)
2114662aeea7SYang Shi 		return 0;
2115662aeea7SYang Shi 
2116b32967ffSMel Gorman 	/* Avoid migrating to a node that is nearly full */
2117c574bbe9SHuang Ying 	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2118c574bbe9SHuang Ying 		int z;
2119c574bbe9SHuang Ying 
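		/*
		 * With memory tiering, the top tier is expected to fill with
		 * hot pages.  Rather than failing outright, wake kswapd on
		 * the highest managed zone so reclaim/demotion can make room;
		 * a later hint fault may retry the promotion.
		 */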
2120c574bbe9SHuang Ying 		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2121340ef390SHugh Dickins 			return 0;
2122c574bbe9SHuang Ying 		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2123bc53008eSWei Yang 			if (managed_zone(pgdat->node_zones + z))
2124c574bbe9SHuang Ying 				break;
2125c574bbe9SHuang Ying 		}
2126c574bbe9SHuang Ying 		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
2127c574bbe9SHuang Ying 		return 0;
2128c574bbe9SHuang Ying 	}
2129b32967ffSMel Gorman 
2130340ef390SHugh Dickins 	if (isolate_lru_page(page))
2131340ef390SHugh Dickins 		return 0;
2132340ef390SHugh Dickins 
2133b75454e1SMiaohe Lin 	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
21342b9b624fSBaolin Wang 			    nr_pages);
2135b32967ffSMel Gorman 
2136b32967ffSMel Gorman 	/*
2137340ef390SHugh Dickins 	 * Isolating the page has taken another reference, so the
2138340ef390SHugh Dickins 	 * caller's reference can be safely dropped without the page
2139340ef390SHugh Dickins 	 * disappearing underneath us during migration.
2140b32967ffSMel Gorman 	 */
2141b32967ffSMel Gorman 	put_page(page);
2142340ef390SHugh Dickins 	return 1;
2143b32967ffSMel Gorman }
2144b32967ffSMel Gorman 
2145a8f60772SMel Gorman /*
21467039e1dbSPeter Zijlstra  * Attempt to migrate a misplaced page to the specified destination
21477039e1dbSPeter Zijlstra  * node. Caller is expected to have an elevated reference count on
21487039e1dbSPeter Zijlstra  * the page that will be dropped by this function before returning.
21497039e1dbSPeter Zijlstra  */
21501bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
21511bc115d8SMel Gorman 			   int node)
21527039e1dbSPeter Zijlstra {
2153a8f60772SMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
2154340ef390SHugh Dickins 	int isolated;
2155b32967ffSMel Gorman 	int nr_remaining;
2156e39bb6beSHuang Ying 	unsigned int nr_succeeded;
21577039e1dbSPeter Zijlstra 	LIST_HEAD(migratepages);
2158b5916c02SAneesh Kumar K.V 	int nr_pages = thp_nr_pages(page);
2159c5b5a3ddSYang Shi 
2160c5b5a3ddSYang Shi 	/*
21611bc115d8SMel Gorman 	 * Don't migrate file pages that are mapped in multiple processes
21621bc115d8SMel Gorman 	 * with execute permissions, as they are probably shared libraries.
21637039e1dbSPeter Zijlstra 	 */
21647ee820eeSMiaohe Lin 	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
21657ee820eeSMiaohe Lin 	    (vma->vm_flags & VM_EXEC))
21667039e1dbSPeter Zijlstra 		goto out;
21677039e1dbSPeter Zijlstra 
2168a8f60772SMel Gorman 	/*
216909a913a7SMel Gorman 	 * Also do not migrate dirty pages: not all filesystems can move dirty
217009a913a7SMel Gorman 	 * pages in MIGRATE_ASYNC mode, so attempting it would waste cycles.
217109a913a7SMel Gorman 	 */
21729de4f22aSHuang Ying 	if (page_is_file_lru(page) && PageDirty(page))
217309a913a7SMel Gorman 		goto out;
217409a913a7SMel Gorman 
2175b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
2176b32967ffSMel Gorman 	if (!isolated)
21777039e1dbSPeter Zijlstra 		goto out;
21787039e1dbSPeter Zijlstra 
21797039e1dbSPeter Zijlstra 	list_add(&page->lru, &migratepages);
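	/* MIGRATE_ASYNC: best effort, never blocks on page locks or writeback. */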
2180c185e494SMatthew Wilcox (Oracle) 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
2181c185e494SMatthew Wilcox (Oracle) 				     NULL, node, MIGRATE_ASYNC,
2182c185e494SMatthew Wilcox (Oracle) 				     MR_NUMA_MISPLACED, &nr_succeeded);
21837039e1dbSPeter Zijlstra 	if (nr_remaining) {
218459c82b70SJoonsoo Kim 		if (!list_empty(&migratepages)) {
218559c82b70SJoonsoo Kim 			list_del(&page->lru);
2186c5fc5c3aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2187c5fc5c3aSYang Shi 					page_is_file_lru(page), -nr_pages);
218859c82b70SJoonsoo Kim 			putback_lru_page(page);
218959c82b70SJoonsoo Kim 		}
21907039e1dbSPeter Zijlstra 		isolated = 0;
2191e39bb6beSHuang Ying 	}
2192e39bb6beSHuang Ying 	if (nr_succeeded) {
2193e39bb6beSHuang Ying 		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2194e39bb6beSHuang Ying 		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
2195e39bb6beSHuang Ying 			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
2196e39bb6beSHuang Ying 					    nr_succeeded);
2197e39bb6beSHuang Ying 	}
21987039e1dbSPeter Zijlstra 	BUG_ON(!list_empty(&migratepages));
21997039e1dbSPeter Zijlstra 	return isolated;
2200340ef390SHugh Dickins 
2201340ef390SHugh Dickins out:
2202340ef390SHugh Dickins 	put_page(page);
2203340ef390SHugh Dickins 	return 0;
22047039e1dbSPeter Zijlstra }
2205220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */
22067d6e2d96SOscar Salvador #endif /* CONFIG_NUMA */
2207