xref: /linux/mm/migrate.c (revision 2e3468778dbe3ec389a10c21a703bb8e5be5cfbc)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
2b20a3503SChristoph Lameter /*
314e0f9bcSHugh Dickins  * Memory Migration functionality - linux/mm/migrate.c
4b20a3503SChristoph Lameter  *
5b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
6b20a3503SChristoph Lameter  *
7b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
8b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
9b20a3503SChristoph Lameter  *
10b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
11b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
12b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
13cde53535SChristoph Lameter  * Christoph Lameter
14b20a3503SChristoph Lameter  */
15b20a3503SChristoph Lameter 
16b20a3503SChristoph Lameter #include <linux/migrate.h>
17b95f1b31SPaul Gortmaker #include <linux/export.h>
18b20a3503SChristoph Lameter #include <linux/swap.h>
190697212aSChristoph Lameter #include <linux/swapops.h>
20b20a3503SChristoph Lameter #include <linux/pagemap.h>
21e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
22b20a3503SChristoph Lameter #include <linux/mm_inline.h>
23b488893aSPavel Emelyanov #include <linux/nsproxy.h>
24b20a3503SChristoph Lameter #include <linux/pagevec.h>
25e9995ef9SHugh Dickins #include <linux/ksm.h>
26b20a3503SChristoph Lameter #include <linux/rmap.h>
27b20a3503SChristoph Lameter #include <linux/topology.h>
28b20a3503SChristoph Lameter #include <linux/cpu.h>
29b20a3503SChristoph Lameter #include <linux/cpuset.h>
3004e62a29SChristoph Lameter #include <linux/writeback.h>
31742755a1SChristoph Lameter #include <linux/mempolicy.h>
32742755a1SChristoph Lameter #include <linux/vmalloc.h>
3386c3a764SDavid Quigley #include <linux/security.h>
3442cb14b1SHugh Dickins #include <linux/backing-dev.h>
35bda807d4SMinchan Kim #include <linux/compaction.h>
364f5ca265SAdrian Bunk #include <linux/syscalls.h>
377addf443SDominik Brodowski #include <linux/compat.h>
38290408d4SNaoya Horiguchi #include <linux/hugetlb.h>
398e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
405a0e3ad6STejun Heo #include <linux/gfp.h>
41df6ad698SJérôme Glisse #include <linux/pfn_t.h>
42a5430ddaSJérôme Glisse #include <linux/memremap.h>
438315ada7SJérôme Glisse #include <linux/userfaultfd_k.h>
44bf6bddf1SRafael Aquini #include <linux/balloon_compaction.h>
4533c3fc71SVladimir Davydov #include <linux/page_idle.h>
46d435edcaSVlastimil Babka #include <linux/page_owner.h>
476e84f315SIngo Molnar #include <linux/sched/mm.h>
48197e7e52SLinus Torvalds #include <linux/ptrace.h>
4934290e2cSRalph Campbell #include <linux/oom.h>
50884a6e5dSDave Hansen #include <linux/memory.h>
51ac16ec83SBaolin Wang #include <linux/random.h>
52c574bbe9SHuang Ying #include <linux/sched/sysctl.h>
53b20a3503SChristoph Lameter 
540d1836c3SMichal Nazarewicz #include <asm/tlbflush.h>
550d1836c3SMichal Nazarewicz 
567b2a2d4aSMel Gorman #include <trace/events/migrate.h>
577b2a2d4aSMel Gorman 
58b20a3503SChristoph Lameter #include "internal.h"
59b20a3503SChristoph Lameter 
609e5bcd61SYisheng Xie int isolate_movable_page(struct page *page, isolate_mode_t mode)
61bda807d4SMinchan Kim {
6268f2736aSMatthew Wilcox (Oracle) 	const struct movable_operations *mops;
63bda807d4SMinchan Kim 
64bda807d4SMinchan Kim 	/*
65bda807d4SMinchan Kim 	 * Avoid burning cycles with pages that are still under __free_pages(),
66bda807d4SMinchan Kim 	 * or just got freed under us.
67bda807d4SMinchan Kim 	 *
68bda807d4SMinchan Kim 	 * In case we 'win' a race for a movable page being freed under us and
69bda807d4SMinchan Kim 	 * raise its refcount, preventing __free_pages() from doing its job,
70bda807d4SMinchan Kim 	 * the put_page() at the end of this block will take care of
71bda807d4SMinchan Kim 	 * releasing this page, thus avoiding a nasty leak.
72bda807d4SMinchan Kim 	 */
73bda807d4SMinchan Kim 	if (unlikely(!get_page_unless_zero(page)))
74bda807d4SMinchan Kim 		goto out;
75bda807d4SMinchan Kim 
76bda807d4SMinchan Kim 	/*
77bda807d4SMinchan Kim 	 * Check PageMovable before taking the page lock: the page's owner
78bda807d4SMinchan Kim 	 * assumes that nobody touches the PG_lock of a newly allocated page,
798bb4e7a2SWei Yang 	 * so grabbing the lock unconditionally would break that assumption.
80bda807d4SMinchan Kim 	 */
81bda807d4SMinchan Kim 	if (unlikely(!__PageMovable(page)))
82bda807d4SMinchan Kim 		goto out_putpage;
83bda807d4SMinchan Kim 	/*
84bda807d4SMinchan Kim 	 * As movable pages are not isolated from LRU lists, concurrent
85bda807d4SMinchan Kim 	 * compaction threads can race against page migration functions
86bda807d4SMinchan Kim 	 * as well as race against the page being released.
87bda807d4SMinchan Kim 	 *
88bda807d4SMinchan Kim 	 * In order to avoid having an already isolated movable page
89bda807d4SMinchan Kim 	 * being (wrongly) re-isolated while it is under migration,
90bda807d4SMinchan Kim 	 * or to avoid attempting to isolate pages being released,
91bda807d4SMinchan Kim 	 * let's make sure we hold the page lock
92bda807d4SMinchan Kim 	 * before proceeding with the movable page isolation steps.
93bda807d4SMinchan Kim 	 */
94bda807d4SMinchan Kim 	if (unlikely(!trylock_page(page)))
95bda807d4SMinchan Kim 		goto out_putpage;
96bda807d4SMinchan Kim 
97bda807d4SMinchan Kim 	if (!PageMovable(page) || PageIsolated(page))
98bda807d4SMinchan Kim 		goto out_no_isolated;
99bda807d4SMinchan Kim 
10068f2736aSMatthew Wilcox (Oracle) 	mops = page_movable_ops(page);
10168f2736aSMatthew Wilcox (Oracle) 	VM_BUG_ON_PAGE(!mops, page);
102bda807d4SMinchan Kim 
10368f2736aSMatthew Wilcox (Oracle) 	if (!mops->isolate_page(page, mode))
104bda807d4SMinchan Kim 		goto out_no_isolated;
105bda807d4SMinchan Kim 
106bda807d4SMinchan Kim 	/* Driver shouldn't use PG_isolated bit of page->flags */
107bda807d4SMinchan Kim 	WARN_ON_ONCE(PageIsolated(page));
108356ea386Sandrew.yang 	SetPageIsolated(page);
109bda807d4SMinchan Kim 	unlock_page(page);
110bda807d4SMinchan Kim 
1119e5bcd61SYisheng Xie 	return 0;
112bda807d4SMinchan Kim 
113bda807d4SMinchan Kim out_no_isolated:
114bda807d4SMinchan Kim 	unlock_page(page);
115bda807d4SMinchan Kim out_putpage:
116bda807d4SMinchan Kim 	put_page(page);
117bda807d4SMinchan Kim out:
1189e5bcd61SYisheng Xie 	return -EBUSY;
119bda807d4SMinchan Kim }
120bda807d4SMinchan Kim 
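/*
 * Hand an isolated non-LRU movable page back to its driver via the
 * movable_operations putback callback and clear PG_isolated again.
 * Callers in this file invoke this with the page lock held.
 */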
121606a6f71SMiaohe Lin static void putback_movable_page(struct page *page)
122bda807d4SMinchan Kim {
12368f2736aSMatthew Wilcox (Oracle) 	const struct movable_operations *mops = page_movable_ops(page);
124bda807d4SMinchan Kim 
12568f2736aSMatthew Wilcox (Oracle) 	mops->putback_page(page);
126356ea386Sandrew.yang 	ClearPageIsolated(page);
127bda807d4SMinchan Kim }
128bda807d4SMinchan Kim 
129b20a3503SChristoph Lameter /*
1305733c7d1SRafael Aquini  * Put previously isolated pages back onto the appropriate lists
1315733c7d1SRafael Aquini  * from where they were once taken off for compaction/migration.
1325733c7d1SRafael Aquini  *
13359c82b70SJoonsoo Kim  * This function shall be used whenever the isolated pageset has been
13459c82b70SJoonsoo Kim  * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
1357ce82f4cSMiaohe Lin  * and isolate_hugetlb().
1365733c7d1SRafael Aquini  */
1375733c7d1SRafael Aquini void putback_movable_pages(struct list_head *l)
1385733c7d1SRafael Aquini {
1395733c7d1SRafael Aquini 	struct page *page;
1405733c7d1SRafael Aquini 	struct page *page2;
1415733c7d1SRafael Aquini 
1425733c7d1SRafael Aquini 	list_for_each_entry_safe(page, page2, l, lru) {
14331caf665SNaoya Horiguchi 		if (unlikely(PageHuge(page))) {
14431caf665SNaoya Horiguchi 			putback_active_hugepage(page);
14531caf665SNaoya Horiguchi 			continue;
14631caf665SNaoya Horiguchi 		}
1475733c7d1SRafael Aquini 		list_del(&page->lru);
148bda807d4SMinchan Kim 		/*
149bda807d4SMinchan Kim 		 * We isolated a non-LRU movable page, so here we can use
150bda807d4SMinchan Kim 		 * __PageMovable because an LRU page's mapping cannot have
151bda807d4SMinchan Kim 		 * PAGE_MAPPING_MOVABLE.
152bda807d4SMinchan Kim 		 */
153b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(page))) {
154bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
155bda807d4SMinchan Kim 			lock_page(page);
156bda807d4SMinchan Kim 			if (PageMovable(page))
157bda807d4SMinchan Kim 				putback_movable_page(page);
158bf6bddf1SRafael Aquini 			else
159356ea386Sandrew.yang 				ClearPageIsolated(page);
160bda807d4SMinchan Kim 			unlock_page(page);
161bda807d4SMinchan Kim 			put_page(page);
162bda807d4SMinchan Kim 		} else {
163e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1646c357848SMatthew Wilcox (Oracle) 					page_is_file_lru(page), -thp_nr_pages(page));
165fc280fe8SRabin Vincent 			putback_lru_page(page);
166b20a3503SChristoph Lameter 		}
167b20a3503SChristoph Lameter 	}
168bda807d4SMinchan Kim }
169b20a3503SChristoph Lameter 
1700697212aSChristoph Lameter /*
1710697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
1720697212aSChristoph Lameter  */
1732f031c6fSMatthew Wilcox (Oracle) static bool remove_migration_pte(struct folio *folio,
1742f031c6fSMatthew Wilcox (Oracle) 		struct vm_area_struct *vma, unsigned long addr, void *old)
1750697212aSChristoph Lameter {
1764eecb8b9SMatthew Wilcox (Oracle) 	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
1770697212aSChristoph Lameter 
1783fe87967SKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
1796c287605SDavid Hildenbrand 		rmap_t rmap_flags = RMAP_NONE;
1800697212aSChristoph Lameter 		pte_t pte;
1810697212aSChristoph Lameter 		swp_entry_t entry;
1824eecb8b9SMatthew Wilcox (Oracle) 		struct page *new;
1834eecb8b9SMatthew Wilcox (Oracle) 		unsigned long idx = 0;
1840697212aSChristoph Lameter 
1854eecb8b9SMatthew Wilcox (Oracle) 		/* pgoff is invalid for ksm pages, but they are never large */
1864eecb8b9SMatthew Wilcox (Oracle) 		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
1874eecb8b9SMatthew Wilcox (Oracle) 			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
1884eecb8b9SMatthew Wilcox (Oracle) 		new = folio_page(folio, idx);
1890697212aSChristoph Lameter 
190616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
191616b8371SZi Yan 		/* PMD-mapped THP migration entry */
192616b8371SZi Yan 		if (!pvmw.pte) {
1934eecb8b9SMatthew Wilcox (Oracle) 			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
1944eecb8b9SMatthew Wilcox (Oracle) 					!folio_test_pmd_mappable(folio), folio);
195616b8371SZi Yan 			remove_migration_pmd(&pvmw, new);
196616b8371SZi Yan 			continue;
197616b8371SZi Yan 		}
198616b8371SZi Yan #endif
199616b8371SZi Yan 
2004eecb8b9SMatthew Wilcox (Oracle) 		folio_get(folio);
201*2e346877SPeter Xu 		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
2023fe87967SKirill A. Shutemov 		if (pte_swp_soft_dirty(*pvmw.pte))
203c3d16e16SCyrill Gorcunov 			pte = pte_mksoft_dirty(pte);
204d3cb8bf6SMel Gorman 
2053fe87967SKirill A. Shutemov 		/*
2063fe87967SKirill A. Shutemov 		 * Recheck VMA as permissions can change since migration started
2073fe87967SKirill A. Shutemov 		 */
2083fe87967SKirill A. Shutemov 		entry = pte_to_swp_entry(*pvmw.pte);
209*2e346877SPeter Xu 		if (!is_migration_entry_young(entry))
210*2e346877SPeter Xu 			pte = pte_mkold(pte);
211*2e346877SPeter Xu 		if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
212*2e346877SPeter Xu 			pte = pte_mkdirty(pte);
2134dd845b5SAlistair Popple 		if (is_writable_migration_entry(entry))
214d3cb8bf6SMel Gorman 			pte = maybe_mkwrite(pte, vma);
215f45ec5ffSPeter Xu 		else if (pte_swp_uffd_wp(*pvmw.pte))
216f45ec5ffSPeter Xu 			pte = pte_mkuffd_wp(pte);
217d3cb8bf6SMel Gorman 
2186c287605SDavid Hildenbrand 		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
2196c287605SDavid Hildenbrand 			rmap_flags |= RMAP_EXCLUSIVE;
2206c287605SDavid Hildenbrand 
2216128763fSRalph Campbell 		if (unlikely(is_device_private_page(new))) {
2224dd845b5SAlistair Popple 			if (pte_write(pte))
2234dd845b5SAlistair Popple 				entry = make_writable_device_private_entry(
2244dd845b5SAlistair Popple 							page_to_pfn(new));
2254dd845b5SAlistair Popple 			else
2264dd845b5SAlistair Popple 				entry = make_readable_device_private_entry(
2274dd845b5SAlistair Popple 							page_to_pfn(new));
228a5430ddaSJérôme Glisse 			pte = swp_entry_to_pte(entry);
2293d321bf8SRalph Campbell 			if (pte_swp_soft_dirty(*pvmw.pte))
2303d321bf8SRalph Campbell 				pte = pte_swp_mksoft_dirty(pte);
231f45ec5ffSPeter Xu 			if (pte_swp_uffd_wp(*pvmw.pte))
232ebdf8321SAlistair Popple 				pte = pte_swp_mkuffd_wp(pte);
233df6ad698SJérôme Glisse 		}
234a5430ddaSJérôme Glisse 
2353ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE
2364eecb8b9SMatthew Wilcox (Oracle) 		if (folio_test_hugetlb(folio)) {
23779c1c594SChristophe Leroy 			unsigned int shift = huge_page_shift(hstate_vma(vma));
23879c1c594SChristophe Leroy 
239290408d4SNaoya Horiguchi 			pte = pte_mkhuge(pte);
24079c1c594SChristophe Leroy 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
2414eecb8b9SMatthew Wilcox (Oracle) 			if (folio_test_anon(folio))
24228c5209dSDavid Hildenbrand 				hugepage_add_anon_rmap(new, vma, pvmw.address,
2436c287605SDavid Hildenbrand 						       rmap_flags);
244290408d4SNaoya Horiguchi 			else
245fb3d824dSDavid Hildenbrand 				page_dup_file_rmap(new, true);
2461eba86c0SPasha Tatashin 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
247383321abSAneesh Kumar K.V 		} else
248383321abSAneesh Kumar K.V #endif
249383321abSAneesh Kumar K.V 		{
2504eecb8b9SMatthew Wilcox (Oracle) 			if (folio_test_anon(folio))
251f1e2db12SDavid Hildenbrand 				page_add_anon_rmap(new, vma, pvmw.address,
2526c287605SDavid Hildenbrand 						   rmap_flags);
25304e62a29SChristoph Lameter 			else
254cea86fe2SHugh Dickins 				page_add_file_rmap(new, vma, false);
2551eba86c0SPasha Tatashin 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
256383321abSAneesh Kumar K.V 		}
257b7435507SHugh Dickins 		if (vma->vm_flags & VM_LOCKED)
258adb11e78SSebastian Andrzej Siewior 			mlock_page_drain_local();
259e125fe40SKirill A. Shutemov 
2604cc79b33SAnshuman Khandual 		trace_remove_migration_pte(pvmw.address, pte_val(pte),
2614cc79b33SAnshuman Khandual 					   compound_order(new));
2624cc79b33SAnshuman Khandual 
26304e62a29SChristoph Lameter 		/* No need to invalidate - it was non-present before */
2643fe87967SKirill A. Shutemov 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
2653fe87967SKirill A. Shutemov 	}
2663fe87967SKirill A. Shutemov 
267e4b82222SMinchan Kim 	return true;
2680697212aSChristoph Lameter }
2690697212aSChristoph Lameter 
2700697212aSChristoph Lameter /*
27104e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
27204e62a29SChristoph Lameter  * references to the indicated page.
27304e62a29SChristoph Lameter  */
2744eecb8b9SMatthew Wilcox (Oracle) void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
27504e62a29SChristoph Lameter {
276051ac83aSJoonsoo Kim 	struct rmap_walk_control rwc = {
277051ac83aSJoonsoo Kim 		.rmap_one = remove_migration_pte,
2784eecb8b9SMatthew Wilcox (Oracle) 		.arg = src,
279051ac83aSJoonsoo Kim 	};
280051ac83aSJoonsoo Kim 
281e388466dSKirill A. Shutemov 	if (locked)
2822f031c6fSMatthew Wilcox (Oracle) 		rmap_walk_locked(dst, &rwc);
283e388466dSKirill A. Shutemov 	else
2842f031c6fSMatthew Wilcox (Oracle) 		rmap_walk(dst, &rwc);
28504e62a29SChristoph Lameter }
28604e62a29SChristoph Lameter 
28704e62a29SChristoph Lameter /*
2880697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
2890697212aSChristoph Lameter  * get to the page and wait until migration is finished.
2900697212aSChristoph Lameter  * When we return from this function the fault will be retried.
2910697212aSChristoph Lameter  */
292e66f17ffSNaoya Horiguchi void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
29330dad309SNaoya Horiguchi 				spinlock_t *ptl)
2940697212aSChristoph Lameter {
29530dad309SNaoya Horiguchi 	pte_t pte;
2960697212aSChristoph Lameter 	swp_entry_t entry;
2970697212aSChristoph Lameter 
29830dad309SNaoya Horiguchi 	spin_lock(ptl);
2990697212aSChristoph Lameter 	pte = *ptep;
3000697212aSChristoph Lameter 	if (!is_swap_pte(pte))
3010697212aSChristoph Lameter 		goto out;
3020697212aSChristoph Lameter 
3030697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
3040697212aSChristoph Lameter 	if (!is_migration_entry(entry))
3050697212aSChristoph Lameter 		goto out;
3060697212aSChristoph Lameter 
307ffa65753SAlistair Popple 	migration_entry_wait_on_locked(entry, ptep, ptl);
3080697212aSChristoph Lameter 	return;
3090697212aSChristoph Lameter out:
3100697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3110697212aSChristoph Lameter }
3120697212aSChristoph Lameter 
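/*
 * Convenience wrapper for the common (non-hugetlb) case: look up the pte
 * for @address under @pmd and, if it holds a migration entry, wait until
 * the migration is finished before returning.
 */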
31330dad309SNaoya Horiguchi void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
31430dad309SNaoya Horiguchi 				unsigned long address)
31530dad309SNaoya Horiguchi {
31630dad309SNaoya Horiguchi 	spinlock_t *ptl = pte_lockptr(mm, pmd);
31730dad309SNaoya Horiguchi 	pte_t *ptep = pte_offset_map(pmd, address);
31830dad309SNaoya Horiguchi 	__migration_entry_wait(mm, ptep, ptl);
31930dad309SNaoya Horiguchi }
32030dad309SNaoya Horiguchi 
321ad1ac596SMiaohe Lin #ifdef CONFIG_HUGETLB_PAGE
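/*
 * Hugetlb variant of __migration_entry_wait(): takes @ptl itself and only
 * waits if the entry at @ptep really is a hugetlb migration entry,
 * otherwise it just drops the lock.
 */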
322ad1ac596SMiaohe Lin void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
32330dad309SNaoya Horiguchi {
324ad1ac596SMiaohe Lin 	pte_t pte;
325ad1ac596SMiaohe Lin 
326ad1ac596SMiaohe Lin 	spin_lock(ptl);
327ad1ac596SMiaohe Lin 	pte = huge_ptep_get(ptep);
328ad1ac596SMiaohe Lin 
329ad1ac596SMiaohe Lin 	if (unlikely(!is_hugetlb_entry_migration(pte)))
330ad1ac596SMiaohe Lin 		spin_unlock(ptl);
331ad1ac596SMiaohe Lin 	else
332ad1ac596SMiaohe Lin 		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
33330dad309SNaoya Horiguchi }
33430dad309SNaoya Horiguchi 
335ad1ac596SMiaohe Lin void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
336ad1ac596SMiaohe Lin {
337ad1ac596SMiaohe Lin 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
338ad1ac596SMiaohe Lin 
339ad1ac596SMiaohe Lin 	__migration_entry_wait_huge(pte, ptl);
340ad1ac596SMiaohe Lin }
341ad1ac596SMiaohe Lin #endif
342ad1ac596SMiaohe Lin 
343616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
344616b8371SZi Yan void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
345616b8371SZi Yan {
346616b8371SZi Yan 	spinlock_t *ptl;
347616b8371SZi Yan 
348616b8371SZi Yan 	ptl = pmd_lock(mm, pmd);
349616b8371SZi Yan 	if (!is_pmd_migration_entry(*pmd))
350616b8371SZi Yan 		goto unlock;
351ffa65753SAlistair Popple 	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
352616b8371SZi Yan 	return;
353616b8371SZi Yan unlock:
354616b8371SZi Yan 	spin_unlock(ptl);
355616b8371SZi Yan }
356616b8371SZi Yan #endif
357616b8371SZi Yan 
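/*
 * The number of references a migration candidate is expected to hold:
 * one from the isolating caller, plus one per page taken by the page
 * cache when the folio has a mapping, plus one more when private data
 * (e.g. buffer heads) is attached.
 */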
358108ca835SMatthew Wilcox (Oracle) static int folio_expected_refs(struct address_space *mapping,
359108ca835SMatthew Wilcox (Oracle) 		struct folio *folio)
3600b3901b3SJan Kara {
361108ca835SMatthew Wilcox (Oracle) 	int refs = 1;
362108ca835SMatthew Wilcox (Oracle) 	if (!mapping)
363108ca835SMatthew Wilcox (Oracle) 		return refs;
3640b3901b3SJan Kara 
365108ca835SMatthew Wilcox (Oracle) 	refs += folio_nr_pages(folio);
366108ca835SMatthew Wilcox (Oracle) 	if (folio_test_private(folio))
367108ca835SMatthew Wilcox (Oracle) 		refs++;
368108ca835SMatthew Wilcox (Oracle) 
369108ca835SMatthew Wilcox (Oracle) 	return refs;
3700b3901b3SJan Kara }
3710b3901b3SJan Kara 
372b20a3503SChristoph Lameter /*
373c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
3745b5c7120SChristoph Lameter  *
3755b5c7120SChristoph Lameter  * The number of remaining references must be:
3765b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
3775b5c7120SChristoph Lameter  * 2 for pages with a mapping
378266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
379b20a3503SChristoph Lameter  */
3803417013eSMatthew Wilcox (Oracle) int folio_migrate_mapping(struct address_space *mapping,
3813417013eSMatthew Wilcox (Oracle) 		struct folio *newfolio, struct folio *folio, int extra_count)
382b20a3503SChristoph Lameter {
3833417013eSMatthew Wilcox (Oracle) 	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
38442cb14b1SHugh Dickins 	struct zone *oldzone, *newzone;
38542cb14b1SHugh Dickins 	int dirty;
386108ca835SMatthew Wilcox (Oracle) 	int expected_count = folio_expected_refs(mapping, folio) + extra_count;
3873417013eSMatthew Wilcox (Oracle) 	long nr = folio_nr_pages(folio);
3888763cb45SJérôme Glisse 
3896c5240aeSChristoph Lameter 	if (!mapping) {
3900e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
3913417013eSMatthew Wilcox (Oracle) 		if (folio_ref_count(folio) != expected_count)
3926c5240aeSChristoph Lameter 			return -EAGAIN;
393cf4b769aSHugh Dickins 
394cf4b769aSHugh Dickins 		/* No turning back from here */
3953417013eSMatthew Wilcox (Oracle) 		newfolio->index = folio->index;
3963417013eSMatthew Wilcox (Oracle) 		newfolio->mapping = folio->mapping;
3973417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapbacked(folio))
3983417013eSMatthew Wilcox (Oracle) 			__folio_set_swapbacked(newfolio);
399cf4b769aSHugh Dickins 
40078bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
4016c5240aeSChristoph Lameter 	}
4026c5240aeSChristoph Lameter 
4033417013eSMatthew Wilcox (Oracle) 	oldzone = folio_zone(folio);
4043417013eSMatthew Wilcox (Oracle) 	newzone = folio_zone(newfolio);
40542cb14b1SHugh Dickins 
40689eb946aSMatthew Wilcox 	xas_lock_irq(&xas);
4073417013eSMatthew Wilcox (Oracle) 	if (!folio_ref_freeze(folio, expected_count)) {
40889eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
409e286781dSNick Piggin 		return -EAGAIN;
410e286781dSNick Piggin 	}
411e286781dSNick Piggin 
412b20a3503SChristoph Lameter 	/*
4133417013eSMatthew Wilcox (Oracle) 	 * Now we know that no one else is looking at the folio:
414cf4b769aSHugh Dickins 	 * no turning back from here.
415b20a3503SChristoph Lameter 	 */
4163417013eSMatthew Wilcox (Oracle) 	newfolio->index = folio->index;
4173417013eSMatthew Wilcox (Oracle) 	newfolio->mapping = folio->mapping;
4183417013eSMatthew Wilcox (Oracle) 	folio_ref_add(newfolio, nr); /* add cache reference */
4193417013eSMatthew Wilcox (Oracle) 	if (folio_test_swapbacked(folio)) {
4203417013eSMatthew Wilcox (Oracle) 		__folio_set_swapbacked(newfolio);
4213417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapcache(folio)) {
4223417013eSMatthew Wilcox (Oracle) 			folio_set_swapcache(newfolio);
4233417013eSMatthew Wilcox (Oracle) 			newfolio->private = folio_get_private(folio);
424b20a3503SChristoph Lameter 		}
4256326fec1SNicholas Piggin 	} else {
4263417013eSMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
4276326fec1SNicholas Piggin 	}
428b20a3503SChristoph Lameter 
42942cb14b1SHugh Dickins 	/* Move dirty while page refs frozen and newpage not yet exposed */
4303417013eSMatthew Wilcox (Oracle) 	dirty = folio_test_dirty(folio);
43142cb14b1SHugh Dickins 	if (dirty) {
4323417013eSMatthew Wilcox (Oracle) 		folio_clear_dirty(folio);
4333417013eSMatthew Wilcox (Oracle) 		folio_set_dirty(newfolio);
43442cb14b1SHugh Dickins 	}
43542cb14b1SHugh Dickins 
4363417013eSMatthew Wilcox (Oracle) 	xas_store(&xas, newfolio);
4377cf9c2c7SNick Piggin 
4387cf9c2c7SNick Piggin 	/*
439937a94c9SJacobo Giralt 	 * Drop cache reference from old page by unfreezing
440937a94c9SJacobo Giralt 	 * to one less reference.
4417cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
4427cf9c2c7SNick Piggin 	 */
4433417013eSMatthew Wilcox (Oracle) 	folio_ref_unfreeze(folio, expected_count - nr);
4447cf9c2c7SNick Piggin 
44589eb946aSMatthew Wilcox 	xas_unlock(&xas);
44642cb14b1SHugh Dickins 	/* Leave irq disabled to prevent preemption while updating stats */
44742cb14b1SHugh Dickins 
4480e8c7d0fSChristoph Lameter 	/*
4490e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
4500e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
4510e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
4520e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
4530e8c7d0fSChristoph Lameter 	 *
4540e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
4554b9d0fabSMel Gorman 	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
4560e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
4570e8c7d0fSChristoph Lameter 	 */
45842cb14b1SHugh Dickins 	if (newzone != oldzone) {
4590d1c2072SJohannes Weiner 		struct lruvec *old_lruvec, *new_lruvec;
4600d1c2072SJohannes Weiner 		struct mem_cgroup *memcg;
4610d1c2072SJohannes Weiner 
4623417013eSMatthew Wilcox (Oracle) 		memcg = folio_memcg(folio);
4630d1c2072SJohannes Weiner 		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
4640d1c2072SJohannes Weiner 		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
4650d1c2072SJohannes Weiner 
4665c447d27SShakeel Butt 		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
4675c447d27SShakeel Butt 		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
4683417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
4695c447d27SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
4705c447d27SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
4714b02108aSKOSAKI Motohiro 		}
472b6038942SShakeel Butt #ifdef CONFIG_SWAP
4733417013eSMatthew Wilcox (Oracle) 		if (folio_test_swapcache(folio)) {
474b6038942SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
475b6038942SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
476b6038942SShakeel Butt 		}
477b6038942SShakeel Butt #endif
478f56753acSChristoph Hellwig 		if (dirty && mapping_can_writeback(mapping)) {
4795c447d27SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
4805c447d27SShakeel Butt 			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
4815c447d27SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
4825c447d27SShakeel Butt 			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
48342cb14b1SHugh Dickins 		}
48442cb14b1SHugh Dickins 	}
48542cb14b1SHugh Dickins 	local_irq_enable();
486b20a3503SChristoph Lameter 
48778bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
488b20a3503SChristoph Lameter }
4893417013eSMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_migrate_mapping);
490b20a3503SChristoph Lameter 
491b20a3503SChristoph Lameter /*
492290408d4SNaoya Horiguchi  * The expected number of remaining references is the same as that
4933417013eSMatthew Wilcox (Oracle)  * of folio_migrate_mapping().
494290408d4SNaoya Horiguchi  */
495290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping,
496b890ec2aSMatthew Wilcox (Oracle) 				   struct folio *dst, struct folio *src)
497290408d4SNaoya Horiguchi {
498b890ec2aSMatthew Wilcox (Oracle) 	XA_STATE(xas, &mapping->i_pages, folio_index(src));
499290408d4SNaoya Horiguchi 	int expected_count;
500290408d4SNaoya Horiguchi 
50189eb946aSMatthew Wilcox 	xas_lock_irq(&xas);
502b890ec2aSMatthew Wilcox (Oracle) 	expected_count = 2 + folio_has_private(src);
503b890ec2aSMatthew Wilcox (Oracle) 	if (!folio_ref_freeze(src, expected_count)) {
50489eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
505290408d4SNaoya Horiguchi 		return -EAGAIN;
506290408d4SNaoya Horiguchi 	}
507290408d4SNaoya Horiguchi 
508b890ec2aSMatthew Wilcox (Oracle) 	dst->index = src->index;
509b890ec2aSMatthew Wilcox (Oracle) 	dst->mapping = src->mapping;
5106a93ca8fSJohannes Weiner 
511b890ec2aSMatthew Wilcox (Oracle) 	folio_get(dst);
512290408d4SNaoya Horiguchi 
513b890ec2aSMatthew Wilcox (Oracle) 	xas_store(&xas, dst);
514290408d4SNaoya Horiguchi 
515b890ec2aSMatthew Wilcox (Oracle) 	folio_ref_unfreeze(src, expected_count - 1);
516290408d4SNaoya Horiguchi 
51789eb946aSMatthew Wilcox 	xas_unlock_irq(&xas);
5186a93ca8fSJohannes Weiner 
51978bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
520290408d4SNaoya Horiguchi }
521290408d4SNaoya Horiguchi 
522290408d4SNaoya Horiguchi /*
52319138349SMatthew Wilcox (Oracle)  * Copy the flags and some other ancillary information
524b20a3503SChristoph Lameter  */
52519138349SMatthew Wilcox (Oracle) void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
526b20a3503SChristoph Lameter {
5277851a45cSRik van Riel 	int cpupid;
5287851a45cSRik van Riel 
52919138349SMatthew Wilcox (Oracle) 	if (folio_test_error(folio))
53019138349SMatthew Wilcox (Oracle) 		folio_set_error(newfolio);
53119138349SMatthew Wilcox (Oracle) 	if (folio_test_referenced(folio))
53219138349SMatthew Wilcox (Oracle) 		folio_set_referenced(newfolio);
53319138349SMatthew Wilcox (Oracle) 	if (folio_test_uptodate(folio))
53419138349SMatthew Wilcox (Oracle) 		folio_mark_uptodate(newfolio);
53519138349SMatthew Wilcox (Oracle) 	if (folio_test_clear_active(folio)) {
53619138349SMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
53719138349SMatthew Wilcox (Oracle) 		folio_set_active(newfolio);
53819138349SMatthew Wilcox (Oracle) 	} else if (folio_test_clear_unevictable(folio))
53919138349SMatthew Wilcox (Oracle) 		folio_set_unevictable(newfolio);
54019138349SMatthew Wilcox (Oracle) 	if (folio_test_workingset(folio))
54119138349SMatthew Wilcox (Oracle) 		folio_set_workingset(newfolio);
54219138349SMatthew Wilcox (Oracle) 	if (folio_test_checked(folio))
54319138349SMatthew Wilcox (Oracle) 		folio_set_checked(newfolio);
5446c287605SDavid Hildenbrand 	/*
5456c287605SDavid Hildenbrand 	 * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
5466c287605SDavid Hildenbrand 	 * migration entries. We can still have PG_anon_exclusive set on the
5476c287605SDavid Hildenbrand 	 * effectively unmapped and unreferenced first sub-page of an
5486c287605SDavid Hildenbrand 	 * anonymous THP: we can simply copy it here via PG_mappedtodisk.
5496c287605SDavid Hildenbrand 	 */
55019138349SMatthew Wilcox (Oracle) 	if (folio_test_mappedtodisk(folio))
55119138349SMatthew Wilcox (Oracle) 		folio_set_mappedtodisk(newfolio);
552b20a3503SChristoph Lameter 
5533417013eSMatthew Wilcox (Oracle) 	/* Move dirty on pages not done by folio_migrate_mapping() */
55419138349SMatthew Wilcox (Oracle) 	if (folio_test_dirty(folio))
55519138349SMatthew Wilcox (Oracle) 		folio_set_dirty(newfolio);
556b20a3503SChristoph Lameter 
55719138349SMatthew Wilcox (Oracle) 	if (folio_test_young(folio))
55819138349SMatthew Wilcox (Oracle) 		folio_set_young(newfolio);
55919138349SMatthew Wilcox (Oracle) 	if (folio_test_idle(folio))
56019138349SMatthew Wilcox (Oracle) 		folio_set_idle(newfolio);
56133c3fc71SVladimir Davydov 
5627851a45cSRik van Riel 	/*
5637851a45cSRik van Riel 	 * Copy NUMA information to the new page, to prevent over-eager
5647851a45cSRik van Riel 	 * future migrations of this same page.
5657851a45cSRik van Riel 	 */
56619138349SMatthew Wilcox (Oracle) 	cpupid = page_cpupid_xchg_last(&folio->page, -1);
56733024536SHuang Ying 	/*
56833024536SHuang Ying 	 * For memory tiering mode, when migrating between slow and fast
56933024536SHuang Ying 	 * memory nodes, reset cpupid, because it is used to record the
57033024536SHuang Ying 	 * page access time in the slow memory node.
57133024536SHuang Ying 	 */
57233024536SHuang Ying 	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
57333024536SHuang Ying 		bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
57433024536SHuang Ying 		bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
57533024536SHuang Ying 
57633024536SHuang Ying 		if (f_toptier != t_toptier)
57733024536SHuang Ying 			cpupid = -1;
57833024536SHuang Ying 	}
57919138349SMatthew Wilcox (Oracle) 	page_cpupid_xchg_last(&newfolio->page, cpupid);
5807851a45cSRik van Riel 
58119138349SMatthew Wilcox (Oracle) 	folio_migrate_ksm(newfolio, folio);
582c8d6553bSHugh Dickins 	/*
583c8d6553bSHugh Dickins 	 * Please do not reorder this without considering how mm/ksm.c's
584c8d6553bSHugh Dickins 	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
585c8d6553bSHugh Dickins 	 */
58619138349SMatthew Wilcox (Oracle) 	if (folio_test_swapcache(folio))
58719138349SMatthew Wilcox (Oracle) 		folio_clear_swapcache(folio);
58819138349SMatthew Wilcox (Oracle) 	folio_clear_private(folio);
589ad2fa371SMuchun Song 
590ad2fa371SMuchun Song 	/* page->private contains hugetlb specific flags */
59119138349SMatthew Wilcox (Oracle) 	if (!folio_test_hugetlb(folio))
59219138349SMatthew Wilcox (Oracle) 		folio->private = NULL;
593b20a3503SChristoph Lameter 
594b20a3503SChristoph Lameter 	/*
595b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
596b20a3503SChristoph Lameter 	 * wake them up.
597b20a3503SChristoph Lameter 	 */
59819138349SMatthew Wilcox (Oracle) 	if (folio_test_writeback(newfolio))
59919138349SMatthew Wilcox (Oracle) 		folio_end_writeback(newfolio);
600d435edcaSVlastimil Babka 
6016aeff241SYang Shi 	/*
6026aeff241SYang Shi 	 * PG_readahead shares the same bit with PG_reclaim.  The above
6036aeff241SYang Shi 	 * folio_end_writeback() may clear PG_readahead mistakenly, so set the
6046aeff241SYang Shi 	 * bit after that.
6056aeff241SYang Shi 	 */
60619138349SMatthew Wilcox (Oracle) 	if (folio_test_readahead(folio))
60719138349SMatthew Wilcox (Oracle) 		folio_set_readahead(newfolio);
6086aeff241SYang Shi 
60919138349SMatthew Wilcox (Oracle) 	folio_copy_owner(newfolio, folio);
61074485cf2SJohannes Weiner 
61119138349SMatthew Wilcox (Oracle) 	if (!folio_test_hugetlb(folio))
612d21bba2bSMatthew Wilcox (Oracle) 		mem_cgroup_migrate(folio, newfolio);
613b20a3503SChristoph Lameter }
61419138349SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_migrate_flags);
6152916ecc0SJérôme Glisse 
616715cbfd6SMatthew Wilcox (Oracle) void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
6172916ecc0SJérôme Glisse {
618715cbfd6SMatthew Wilcox (Oracle) 	folio_copy(newfolio, folio);
619715cbfd6SMatthew Wilcox (Oracle) 	folio_migrate_flags(newfolio, folio);
6202916ecc0SJérôme Glisse }
621715cbfd6SMatthew Wilcox (Oracle) EXPORT_SYMBOL(folio_migrate_copy);
622b20a3503SChristoph Lameter 
6231d8b85ccSChristoph Lameter /************************************************************
6241d8b85ccSChristoph Lameter  *                    Migration functions
6251d8b85ccSChristoph Lameter  ***********************************************************/
6261d8b85ccSChristoph Lameter 
62754184650SMatthew Wilcox (Oracle) /**
62854184650SMatthew Wilcox (Oracle)  * migrate_folio() - Simple folio migration.
62954184650SMatthew Wilcox (Oracle)  * @mapping: The address_space containing the folio.
63054184650SMatthew Wilcox (Oracle)  * @dst: The folio to migrate the data to.
63154184650SMatthew Wilcox (Oracle)  * @src: The folio containing the current data.
63254184650SMatthew Wilcox (Oracle)  * @mode: How to migrate the page.
633b20a3503SChristoph Lameter  *
63454184650SMatthew Wilcox (Oracle)  * Common logic to directly migrate a single LRU folio suitable for
63554184650SMatthew Wilcox (Oracle)  * folios that do not use PagePrivate/PagePrivate2.
63654184650SMatthew Wilcox (Oracle)  *
63754184650SMatthew Wilcox (Oracle)  * Folios are locked upon entry and exit.
638b20a3503SChristoph Lameter  */
63954184650SMatthew Wilcox (Oracle) int migrate_folio(struct address_space *mapping, struct folio *dst,
64054184650SMatthew Wilcox (Oracle) 		struct folio *src, enum migrate_mode mode)
641b20a3503SChristoph Lameter {
642b20a3503SChristoph Lameter 	int rc;
643b20a3503SChristoph Lameter 
64454184650SMatthew Wilcox (Oracle) 	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
645b20a3503SChristoph Lameter 
64654184650SMatthew Wilcox (Oracle) 	rc = folio_migrate_mapping(mapping, dst, src, 0);
647b20a3503SChristoph Lameter 
64878bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
649b20a3503SChristoph Lameter 		return rc;
650b20a3503SChristoph Lameter 
6512916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
65254184650SMatthew Wilcox (Oracle) 		folio_migrate_copy(dst, src);
6532916ecc0SJérôme Glisse 	else
65454184650SMatthew Wilcox (Oracle) 		folio_migrate_flags(dst, src);
65578bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
656b20a3503SChristoph Lameter }
65754184650SMatthew Wilcox (Oracle) EXPORT_SYMBOL(migrate_folio);
658b20a3503SChristoph Lameter 
6599361401eSDavid Howells #ifdef CONFIG_BLOCK
66084ade7c1SJan Kara /* Returns true if all buffers are successfully locked */
66184ade7c1SJan Kara static bool buffer_migrate_lock_buffers(struct buffer_head *head,
66284ade7c1SJan Kara 							enum migrate_mode mode)
66384ade7c1SJan Kara {
66484ade7c1SJan Kara 	struct buffer_head *bh = head;
66584ade7c1SJan Kara 
66684ade7c1SJan Kara 	/* Simple case, sync compaction */
66784ade7c1SJan Kara 	if (mode != MIGRATE_ASYNC) {
66884ade7c1SJan Kara 		do {
66984ade7c1SJan Kara 			lock_buffer(bh);
67084ade7c1SJan Kara 			bh = bh->b_this_page;
67184ade7c1SJan Kara 
67284ade7c1SJan Kara 		} while (bh != head);
67384ade7c1SJan Kara 
67484ade7c1SJan Kara 		return true;
67584ade7c1SJan Kara 	}
67684ade7c1SJan Kara 
67784ade7c1SJan Kara 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
67884ade7c1SJan Kara 	do {
67984ade7c1SJan Kara 		if (!trylock_buffer(bh)) {
68084ade7c1SJan Kara 			/*
68184ade7c1SJan Kara 			 * We failed to lock the buffer and cannot stall in
68284ade7c1SJan Kara 			 * async migration. Release the taken locks
68384ade7c1SJan Kara 			 */
68484ade7c1SJan Kara 			struct buffer_head *failed_bh = bh;
68584ade7c1SJan Kara 			bh = head;
68684ade7c1SJan Kara 			while (bh != failed_bh) {
68784ade7c1SJan Kara 				unlock_buffer(bh);
68884ade7c1SJan Kara 				bh = bh->b_this_page;
68984ade7c1SJan Kara 			}
69084ade7c1SJan Kara 			return false;
69184ade7c1SJan Kara 		}
69284ade7c1SJan Kara 
69384ade7c1SJan Kara 		bh = bh->b_this_page;
69484ade7c1SJan Kara 	} while (bh != head);
69584ade7c1SJan Kara 	return true;
69684ade7c1SJan Kara }
69784ade7c1SJan Kara 
69867235182SMatthew Wilcox (Oracle) static int __buffer_migrate_folio(struct address_space *mapping,
69967235182SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode,
70089cb0888SJan Kara 		bool check_refs)
7011d8b85ccSChristoph Lameter {
7021d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
7031d8b85ccSChristoph Lameter 	int rc;
704cc4f11e6SJan Kara 	int expected_count;
7051d8b85ccSChristoph Lameter 
70667235182SMatthew Wilcox (Oracle) 	head = folio_buffers(src);
70767235182SMatthew Wilcox (Oracle) 	if (!head)
70854184650SMatthew Wilcox (Oracle) 		return migrate_folio(mapping, dst, src, mode);
7091d8b85ccSChristoph Lameter 
710cc4f11e6SJan Kara 	/* Check whether page does not have extra refs before we do more work */
711108ca835SMatthew Wilcox (Oracle) 	expected_count = folio_expected_refs(mapping, src);
71267235182SMatthew Wilcox (Oracle) 	if (folio_ref_count(src) != expected_count)
713cc4f11e6SJan Kara 		return -EAGAIN;
714cc4f11e6SJan Kara 
715cc4f11e6SJan Kara 	if (!buffer_migrate_lock_buffers(head, mode))
716cc4f11e6SJan Kara 		return -EAGAIN;
7171d8b85ccSChristoph Lameter 
71889cb0888SJan Kara 	if (check_refs) {
71989cb0888SJan Kara 		bool busy;
72089cb0888SJan Kara 		bool invalidated = false;
72189cb0888SJan Kara 
72289cb0888SJan Kara recheck_buffers:
72389cb0888SJan Kara 		busy = false;
72489cb0888SJan Kara 		spin_lock(&mapping->private_lock);
72589cb0888SJan Kara 		bh = head;
72689cb0888SJan Kara 		do {
72789cb0888SJan Kara 			if (atomic_read(&bh->b_count)) {
72889cb0888SJan Kara 				busy = true;
72989cb0888SJan Kara 				break;
73089cb0888SJan Kara 			}
73189cb0888SJan Kara 			bh = bh->b_this_page;
73289cb0888SJan Kara 		} while (bh != head);
73389cb0888SJan Kara 		if (busy) {
73489cb0888SJan Kara 			if (invalidated) {
73589cb0888SJan Kara 				rc = -EAGAIN;
73689cb0888SJan Kara 				goto unlock_buffers;
73789cb0888SJan Kara 			}
738ebdf4de5SJan Kara 			spin_unlock(&mapping->private_lock);
73989cb0888SJan Kara 			invalidate_bh_lrus();
74089cb0888SJan Kara 			invalidated = true;
74189cb0888SJan Kara 			goto recheck_buffers;
74289cb0888SJan Kara 		}
74389cb0888SJan Kara 	}
74489cb0888SJan Kara 
74567235182SMatthew Wilcox (Oracle) 	rc = folio_migrate_mapping(mapping, dst, src, 0);
74678bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
747cc4f11e6SJan Kara 		goto unlock_buffers;
7481d8b85ccSChristoph Lameter 
74967235182SMatthew Wilcox (Oracle) 	folio_attach_private(dst, folio_detach_private(src));
7501d8b85ccSChristoph Lameter 
7511d8b85ccSChristoph Lameter 	bh = head;
7521d8b85ccSChristoph Lameter 	do {
75367235182SMatthew Wilcox (Oracle) 		set_bh_page(bh, &dst->page, bh_offset(bh));
7541d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7551d8b85ccSChristoph Lameter 	} while (bh != head);
7561d8b85ccSChristoph Lameter 
7572916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
75867235182SMatthew Wilcox (Oracle) 		folio_migrate_copy(dst, src);
7592916ecc0SJérôme Glisse 	else
76067235182SMatthew Wilcox (Oracle) 		folio_migrate_flags(dst, src);
7611d8b85ccSChristoph Lameter 
762cc4f11e6SJan Kara 	rc = MIGRATEPAGE_SUCCESS;
763cc4f11e6SJan Kara unlock_buffers:
764ebdf4de5SJan Kara 	if (check_refs)
765ebdf4de5SJan Kara 		spin_unlock(&mapping->private_lock);
7661d8b85ccSChristoph Lameter 	bh = head;
7671d8b85ccSChristoph Lameter 	do {
7681d8b85ccSChristoph Lameter 		unlock_buffer(bh);
7691d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7701d8b85ccSChristoph Lameter 	} while (bh != head);
7711d8b85ccSChristoph Lameter 
772cc4f11e6SJan Kara 	return rc;
7731d8b85ccSChristoph Lameter }
77489cb0888SJan Kara 
77567235182SMatthew Wilcox (Oracle) /**
77667235182SMatthew Wilcox (Oracle)  * buffer_migrate_folio() - Migration function for folios with buffers.
77767235182SMatthew Wilcox (Oracle)  * @mapping: The address space containing @src.
77867235182SMatthew Wilcox (Oracle)  * @dst: The folio to migrate to.
77967235182SMatthew Wilcox (Oracle)  * @src: The folio to migrate from.
78067235182SMatthew Wilcox (Oracle)  * @mode: How to migrate the folio.
78167235182SMatthew Wilcox (Oracle)  *
78267235182SMatthew Wilcox (Oracle)  * This function can only be used if the underlying filesystem guarantees
78367235182SMatthew Wilcox (Oracle)  * that no other references to @src exist. For example attached buffer
78467235182SMatthew Wilcox (Oracle)  * heads are accessed only under the folio lock.  If your filesystem cannot
78567235182SMatthew Wilcox (Oracle)  * provide this guarantee, buffer_migrate_folio_norefs() may be more
78667235182SMatthew Wilcox (Oracle)  * appropriate.
78767235182SMatthew Wilcox (Oracle)  *
78867235182SMatthew Wilcox (Oracle)  * Return: 0 on success or a negative errno on failure.
78989cb0888SJan Kara  */
79067235182SMatthew Wilcox (Oracle) int buffer_migrate_folio(struct address_space *mapping,
79167235182SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
79289cb0888SJan Kara {
79367235182SMatthew Wilcox (Oracle) 	return __buffer_migrate_folio(mapping, dst, src, mode, false);
79489cb0888SJan Kara }
79567235182SMatthew Wilcox (Oracle) EXPORT_SYMBOL(buffer_migrate_folio);
79689cb0888SJan Kara 
79767235182SMatthew Wilcox (Oracle) /**
79867235182SMatthew Wilcox (Oracle)  * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
79967235182SMatthew Wilcox (Oracle)  * @mapping: The address space containing @src.
80067235182SMatthew Wilcox (Oracle)  * @dst: The folio to migrate to.
80167235182SMatthew Wilcox (Oracle)  * @src: The folio to migrate from.
80267235182SMatthew Wilcox (Oracle)  * @mode: How to migrate the folio.
80367235182SMatthew Wilcox (Oracle)  *
80467235182SMatthew Wilcox (Oracle)  * Like buffer_migrate_folio() except that this variant is more careful
80567235182SMatthew Wilcox (Oracle)  * and checks that there are also no buffer head references. This function
80667235182SMatthew Wilcox (Oracle)  * is the right one for mappings where buffer heads are directly looked
80767235182SMatthew Wilcox (Oracle)  * up and referenced (such as block device mappings).
80867235182SMatthew Wilcox (Oracle)  *
80967235182SMatthew Wilcox (Oracle)  * Return: 0 on success or a negative errno on failure.
81089cb0888SJan Kara  */
81167235182SMatthew Wilcox (Oracle) int buffer_migrate_folio_norefs(struct address_space *mapping,
81267235182SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
81389cb0888SJan Kara {
81467235182SMatthew Wilcox (Oracle) 	return __buffer_migrate_folio(mapping, dst, src, mode, true);
81589cb0888SJan Kara }
8169361401eSDavid Howells #endif
8171d8b85ccSChristoph Lameter 
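/**
 * filemap_migrate_folio() - Migration function for folios with private data.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * Moves the pagecache entry, transfers any data attached as folio private
 * state from @src to @dst, then copies the contents and flags (or only the
 * flags for MIGRATE_SYNC_NO_COPY).
 *
 * Return: 0 on success or a negative errno on failure.
 */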
8182ec810d5SMatthew Wilcox (Oracle) int filemap_migrate_folio(struct address_space *mapping,
8192ec810d5SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
8202ec810d5SMatthew Wilcox (Oracle) {
8212ec810d5SMatthew Wilcox (Oracle) 	int ret;
8222ec810d5SMatthew Wilcox (Oracle) 
8232ec810d5SMatthew Wilcox (Oracle) 	ret = folio_migrate_mapping(mapping, dst, src, 0);
8242ec810d5SMatthew Wilcox (Oracle) 	if (ret != MIGRATEPAGE_SUCCESS)
8252ec810d5SMatthew Wilcox (Oracle) 		return ret;
8262ec810d5SMatthew Wilcox (Oracle) 
8272ec810d5SMatthew Wilcox (Oracle) 	if (folio_get_private(src))
8282ec810d5SMatthew Wilcox (Oracle) 		folio_attach_private(dst, folio_detach_private(src));
8292ec810d5SMatthew Wilcox (Oracle) 
8302ec810d5SMatthew Wilcox (Oracle) 	if (mode != MIGRATE_SYNC_NO_COPY)
8312ec810d5SMatthew Wilcox (Oracle) 		folio_migrate_copy(dst, src);
8322ec810d5SMatthew Wilcox (Oracle) 	else
8332ec810d5SMatthew Wilcox (Oracle) 		folio_migrate_flags(dst, src);
8342ec810d5SMatthew Wilcox (Oracle) 	return MIGRATEPAGE_SUCCESS;
8352ec810d5SMatthew Wilcox (Oracle) }
8362ec810d5SMatthew Wilcox (Oracle) EXPORT_SYMBOL_GPL(filemap_migrate_folio);
8372ec810d5SMatthew Wilcox (Oracle) 
83804e62a29SChristoph Lameter /*
8392be7fa10SMatthew Wilcox (Oracle)  * Writeback a folio to clean the dirty state
84004e62a29SChristoph Lameter  */
8412be7fa10SMatthew Wilcox (Oracle) static int writeout(struct address_space *mapping, struct folio *folio)
84204e62a29SChristoph Lameter {
84304e62a29SChristoph Lameter 	struct writeback_control wbc = {
84404e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
84504e62a29SChristoph Lameter 		.nr_to_write = 1,
84604e62a29SChristoph Lameter 		.range_start = 0,
84704e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
84804e62a29SChristoph Lameter 		.for_reclaim = 1
84904e62a29SChristoph Lameter 	};
85004e62a29SChristoph Lameter 	int rc;
85104e62a29SChristoph Lameter 
85204e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
85304e62a29SChristoph Lameter 		/* No write method for the address space */
85404e62a29SChristoph Lameter 		return -EINVAL;
85504e62a29SChristoph Lameter 
8562be7fa10SMatthew Wilcox (Oracle) 	if (!folio_clear_dirty_for_io(folio))
85704e62a29SChristoph Lameter 		/* Someone else already triggered a write */
85804e62a29SChristoph Lameter 		return -EAGAIN;
85904e62a29SChristoph Lameter 
86004e62a29SChristoph Lameter 	/*
8612be7fa10SMatthew Wilcox (Oracle) 	 * A dirty folio may imply that the underlying filesystem has
8622be7fa10SMatthew Wilcox (Oracle) 	 * the folio on some queue. So the folio must be clean for
8632be7fa10SMatthew Wilcox (Oracle) 	 * migration. Writeout may mean we lose the lock and the
8642be7fa10SMatthew Wilcox (Oracle) 	 * folio state is no longer what we checked for earlier.
86504e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
86604e62a29SChristoph Lameter 	 * be successful.
86704e62a29SChristoph Lameter 	 */
8684eecb8b9SMatthew Wilcox (Oracle) 	remove_migration_ptes(folio, folio, false);
86904e62a29SChristoph Lameter 
8702be7fa10SMatthew Wilcox (Oracle) 	rc = mapping->a_ops->writepage(&folio->page, &wbc);
87104e62a29SChristoph Lameter 
87204e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
87304e62a29SChristoph Lameter 		/* unlocked. Relock */
8742be7fa10SMatthew Wilcox (Oracle) 		folio_lock(folio);
87504e62a29SChristoph Lameter 
876bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
87704e62a29SChristoph Lameter }
87804e62a29SChristoph Lameter 
87904e62a29SChristoph Lameter /*
88004e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
88104e62a29SChristoph Lameter  */
8828faa8ef5SMatthew Wilcox (Oracle) static int fallback_migrate_folio(struct address_space *mapping,
8838faa8ef5SMatthew Wilcox (Oracle) 		struct folio *dst, struct folio *src, enum migrate_mode mode)
8848351a6e4SChristoph Lameter {
8858faa8ef5SMatthew Wilcox (Oracle) 	if (folio_test_dirty(src)) {
8868faa8ef5SMatthew Wilcox (Oracle) 		/* Only writeback folios in full synchronous migration */
8872916ecc0SJérôme Glisse 		switch (mode) {
8882916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
8892916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
8902916ecc0SJérôme Glisse 			break;
8912916ecc0SJérôme Glisse 		default:
892b969c4abSMel Gorman 			return -EBUSY;
8932916ecc0SJérôme Glisse 		}
8942be7fa10SMatthew Wilcox (Oracle) 		return writeout(mapping, src);
895b969c4abSMel Gorman 	}
8968351a6e4SChristoph Lameter 
8978351a6e4SChristoph Lameter 	/*
8988351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
8998351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
9008351a6e4SChristoph Lameter 	 */
9018faa8ef5SMatthew Wilcox (Oracle) 	if (folio_test_private(src) &&
9028faa8ef5SMatthew Wilcox (Oracle) 	    !filemap_release_folio(src, GFP_KERNEL))
903806031bbSMel Gorman 		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
9048351a6e4SChristoph Lameter 
90554184650SMatthew Wilcox (Oracle) 	return migrate_folio(mapping, dst, src, mode);
9068351a6e4SChristoph Lameter }
9078351a6e4SChristoph Lameter 
9081d8b85ccSChristoph Lameter /*
909e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page
910e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
911b20a3503SChristoph Lameter  *
912e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
913e24f0b8fSChristoph Lameter  * is successful.
914894bc310SLee Schermerhorn  *
915894bc310SLee Schermerhorn  * Return value:
916894bc310SLee Schermerhorn  *   < 0 - error code
91778bd5209SRafael Aquini  *  MIGRATEPAGE_SUCCESS - success
918b20a3503SChristoph Lameter  */
919e7e3ffebSMatthew Wilcox (Oracle) static int move_to_new_folio(struct folio *dst, struct folio *src,
9205c3f9a67SHugh Dickins 				enum migrate_mode mode)
921b20a3503SChristoph Lameter {
922bda807d4SMinchan Kim 	int rc = -EAGAIN;
923e7e3ffebSMatthew Wilcox (Oracle) 	bool is_lru = !__PageMovable(&src->page);
924b20a3503SChristoph Lameter 
925e7e3ffebSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
926e7e3ffebSMatthew Wilcox (Oracle) 	VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
927b20a3503SChristoph Lameter 
928bda807d4SMinchan Kim 	if (likely(is_lru)) {
92968f2736aSMatthew Wilcox (Oracle) 		struct address_space *mapping = folio_mapping(src);
93068f2736aSMatthew Wilcox (Oracle) 
931b20a3503SChristoph Lameter 		if (!mapping)
93254184650SMatthew Wilcox (Oracle) 			rc = migrate_folio(mapping, dst, src, mode);
9335490da4fSMatthew Wilcox (Oracle) 		else if (mapping->a_ops->migrate_folio)
934b20a3503SChristoph Lameter 			/*
9355490da4fSMatthew Wilcox (Oracle) 			 * Most folios have a mapping and most filesystems
9365490da4fSMatthew Wilcox (Oracle) 			 * provide a migrate_folio callback. Anonymous folios
937bda807d4SMinchan Kim 			 * are part of swap space which also has its own
9385490da4fSMatthew Wilcox (Oracle) 			 * migrate_folio callback. This is the most common path
939bda807d4SMinchan Kim 			 * for page migration.
940b20a3503SChristoph Lameter 			 */
9415490da4fSMatthew Wilcox (Oracle) 			rc = mapping->a_ops->migrate_folio(mapping, dst, src,
9425490da4fSMatthew Wilcox (Oracle) 								mode);
9438351a6e4SChristoph Lameter 		else
9448faa8ef5SMatthew Wilcox (Oracle) 			rc = fallback_migrate_folio(mapping, dst, src, mode);
945bda807d4SMinchan Kim 	} else {
94668f2736aSMatthew Wilcox (Oracle) 		const struct movable_operations *mops;
94768f2736aSMatthew Wilcox (Oracle) 
948bda807d4SMinchan Kim 		/*
949bda807d4SMinchan Kim 		 * In the case of a non-LRU page, it could be released after
950bda807d4SMinchan Kim 		 * the isolation step. In that case, we shouldn't try migration.
951bda807d4SMinchan Kim 		 */
952e7e3ffebSMatthew Wilcox (Oracle) 		VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
953e7e3ffebSMatthew Wilcox (Oracle) 		if (!folio_test_movable(src)) {
954bda807d4SMinchan Kim 			rc = MIGRATEPAGE_SUCCESS;
955e7e3ffebSMatthew Wilcox (Oracle) 			folio_clear_isolated(src);
956bda807d4SMinchan Kim 			goto out;
957bda807d4SMinchan Kim 		}
958bda807d4SMinchan Kim 
95968f2736aSMatthew Wilcox (Oracle) 		mops = page_movable_ops(&src->page);
96068f2736aSMatthew Wilcox (Oracle) 		rc = mops->migrate_page(&dst->page, &src->page, mode);
961bda807d4SMinchan Kim 		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
962e7e3ffebSMatthew Wilcox (Oracle) 				!folio_test_isolated(src));
963bda807d4SMinchan Kim 	}
964b20a3503SChristoph Lameter 
9655c3f9a67SHugh Dickins 	/*
966e7e3ffebSMatthew Wilcox (Oracle) 	 * When successful, old pagecache src->mapping must be cleared before
967e7e3ffebSMatthew Wilcox (Oracle) 	 * src is freed; but stats require that PageAnon be left as PageAnon.
9685c3f9a67SHugh Dickins 	 */
9695c3f9a67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
970e7e3ffebSMatthew Wilcox (Oracle) 		if (__PageMovable(&src->page)) {
971e7e3ffebSMatthew Wilcox (Oracle) 			VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
972bda807d4SMinchan Kim 
973bda807d4SMinchan Kim 			/*
974bda807d4SMinchan Kim 			 * We clear PG_movable under page_lock so any compactor
975bda807d4SMinchan Kim 			 * cannot try to migrate this page.
976bda807d4SMinchan Kim 			 */
977e7e3ffebSMatthew Wilcox (Oracle) 			folio_clear_isolated(src);
978bda807d4SMinchan Kim 		}
979bda807d4SMinchan Kim 
980bda807d4SMinchan Kim 		/*
981e7e3ffebSMatthew Wilcox (Oracle) 		 * Anonymous and movable src->mapping will be cleared by
982bda807d4SMinchan Kim 		 * free_pages_prepare, so don't reset it here; that keeps
983bda807d4SMinchan Kim 		 * type checks such as PageAnon working.
984bda807d4SMinchan Kim 		 */
985e7e3ffebSMatthew Wilcox (Oracle) 		if (!folio_mapping_flags(src))
986e7e3ffebSMatthew Wilcox (Oracle) 			src->mapping = NULL;
987d2b2c6ddSLars Persson 
988e7e3ffebSMatthew Wilcox (Oracle) 		if (likely(!folio_is_zone_device(dst)))
989e7e3ffebSMatthew Wilcox (Oracle) 			flush_dcache_folio(dst);
9903fe2011fSMel Gorman 	}
991bda807d4SMinchan Kim out:
992e24f0b8fSChristoph Lameter 	return rc;
993e24f0b8fSChristoph Lameter }
994e24f0b8fSChristoph Lameter 
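/*
 * Lock the source page (and the newly allocated destination page), unmap
 * the source and move it to @newpage.  Returns MIGRATEPAGE_SUCCESS, or a
 * negative error code such as -EAGAIN when the caller should retry.
 */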
9950dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage,
9969c620e2bSHugh Dickins 				int force, enum migrate_mode mode)
997e24f0b8fSChristoph Lameter {
9984b8554c5SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
9994eecb8b9SMatthew Wilcox (Oracle) 	struct folio *dst = page_folio(newpage);
10000dabec93SMinchan Kim 	int rc = -EAGAIN;
1001213ecb31SBaolin Wang 	bool page_was_mapped = false;
10023f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
1003bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
100495a402c3SChristoph Lameter 
1005529ae9aaSNick Piggin 	if (!trylock_page(page)) {
1006a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
10070dabec93SMinchan Kim 			goto out;
10083e7d3449SMel Gorman 
10093e7d3449SMel Gorman 		/*
10103e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
10113e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
10123e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
10133e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
10143e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
1015d4388340SMatthew Wilcox (Oracle) 		 * mpage_readahead). If an allocation happens for the
10163e7d3449SMel Gorman 		 * second or third page, the process can end up locking
10173e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
10183e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
10193e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
10203e7d3449SMel Gorman 		 * altogether.
10213e7d3449SMel Gorman 		 */
10223e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
10230dabec93SMinchan Kim 			goto out;
10243e7d3449SMel Gorman 
1025e24f0b8fSChristoph Lameter 		lock_page(page);
1026e24f0b8fSChristoph Lameter 	}
1027e24f0b8fSChristoph Lameter 
1028e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
102911bc82d6SAndrea Arcangeli 		/*
1030fed5b64aSJianguo Wu 		 * Only in the case of a full synchronous migration is it
1031a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
1032a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
1033a6bc32b8SMel Gorman 		 * the overhead of stalling is too much
103411bc82d6SAndrea Arcangeli 		 */
10352916ecc0SJérôme Glisse 		switch (mode) {
10362916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
10372916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
10382916ecc0SJérôme Glisse 			break;
10392916ecc0SJérôme Glisse 		default:
104011bc82d6SAndrea Arcangeli 			rc = -EBUSY;
10410a31bc97SJohannes Weiner 			goto out_unlock;
104211bc82d6SAndrea Arcangeli 		}
104311bc82d6SAndrea Arcangeli 		if (!force)
10440a31bc97SJohannes Weiner 			goto out_unlock;
1045e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
1046e24f0b8fSChristoph Lameter 	}
104703f15c86SHugh Dickins 
1048e24f0b8fSChristoph Lameter 	/*
104968a9843fSBaolin Wang 	 * try_to_migrate() below will drop page->mapcount to 0, and we would
1050dc386d4dSKAMEZAWA Hiroyuki 	 * not notice if the anon_vma were freed while we migrate the page.
10511ce82b69SHugh Dickins 	 * This get_anon_vma() delays freeing the anon_vma pointer until the
1052dc386d4dSKAMEZAWA Hiroyuki 	 * end of migration.  File cache pages are no problem: they are
1053989f89c5SKAMEZAWA Hiroyuki 	 * protected by page_lock() (migration may use write_page() or
1054989f89c5SKAMEZAWA Hiroyuki 	 * lock_page() on them), so only anon pages need this care.
10553fe2011fSMel Gorman 	 *
105603f15c86SHugh Dickins 	 * Only page_get_anon_vma() understands the subtleties of
105703f15c86SHugh Dickins 	 * getting a hold on an anon_vma from outside one of its mms.
105803f15c86SHugh Dickins 	 * But if we cannot get anon_vma, then we won't need it anyway,
105903f15c86SHugh Dickins 	 * because that implies that the anon page is no longer mapped
106003f15c86SHugh Dickins 	 * (and cannot be remapped so long as we hold the page lock).
10613fe2011fSMel Gorman 	 */
106203f15c86SHugh Dickins 	if (PageAnon(page) && !PageKsm(page))
106303f15c86SHugh Dickins 		anon_vma = page_get_anon_vma(page);
106462e1c553SShaohua Li 
10657db7671fSHugh Dickins 	/*
10667db7671fSHugh Dickins 	 * Block others from accessing the new page when we get around to
10677db7671fSHugh Dickins 	 * establishing additional references. We are usually the only one
10687db7671fSHugh Dickins 	 * holding a reference to newpage at this point. We used to have a BUG
10697db7671fSHugh Dickins 	 * here if trylock_page(newpage) fails, but would like to allow for
10707db7671fSHugh Dickins 	 * cases where there might be a race with the previous use of newpage.
10717db7671fSHugh Dickins 	 * This is much like races on refcount of oldpage: just don't BUG().
10727db7671fSHugh Dickins 	 */
10737db7671fSHugh Dickins 	if (unlikely(!trylock_page(newpage)))
10747db7671fSHugh Dickins 		goto out_unlock;
10757db7671fSHugh Dickins 
1076bda807d4SMinchan Kim 	if (unlikely(!is_lru)) {
1077e7e3ffebSMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, folio, mode);
1078bda807d4SMinchan Kim 		goto out_unlock_both;
1079bda807d4SMinchan Kim 	}
1080bda807d4SMinchan Kim 
1081dc386d4dSKAMEZAWA Hiroyuki 	/*
108262e1c553SShaohua Li 	 * Corner case handling:
108362e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
108462e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
108562e1c553SShaohua Li 	 * Calling try_to_migrate() against a page->mapping==NULL page will
108662e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
1087d12b8951SYang Shi 	 * 2. An orphaned page (see truncate_cleanup_page) might have
108862e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
108962e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
109062e1c553SShaohua Li 	 * invisible to the vm, so it cannot be migrated.  So try to
109162e1c553SShaohua Li 	 * free the metadata here, so that the page can be freed.
1092dc386d4dSKAMEZAWA Hiroyuki 	 */
109362e1c553SShaohua Li 	if (!page->mapping) {
1094309381feSSasha Levin 		VM_BUG_ON_PAGE(PageAnon(page), page);
10951ce82b69SHugh Dickins 		if (page_has_private(page)) {
109668189fefSMatthew Wilcox (Oracle) 			try_to_free_buffers(folio);
10977db7671fSHugh Dickins 			goto out_unlock_both;
109862e1c553SShaohua Li 		}
10997db7671fSHugh Dickins 	} else if (page_mapped(page)) {
11007db7671fSHugh Dickins 		/* Establish migration ptes */
110103f15c86SHugh Dickins 		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
110203f15c86SHugh Dickins 				page);
11034b8554c5SMatthew Wilcox (Oracle) 		try_to_migrate(folio, 0);
1104213ecb31SBaolin Wang 		page_was_mapped = true;
11052ebba6b7SHugh Dickins 	}
1106dc386d4dSKAMEZAWA Hiroyuki 
1107e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
1108e7e3ffebSMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, folio, mode);
1109e24f0b8fSChristoph Lameter 
1110c3096e67SHugh Dickins 	/*
1111c3096e67SHugh Dickins 	 * When successful, push newpage to LRU immediately: so that if it
1112c3096e67SHugh Dickins 	 * turns out to be an mlocked page, remove_migration_ptes() will
1113c3096e67SHugh Dickins 	 * automatically build up the correct newpage->mlock_count for it.
1114c3096e67SHugh Dickins 	 *
1115c3096e67SHugh Dickins 	 * We would like to do something similar for the old page, when
1116c3096e67SHugh Dickins 	 * unsuccessful, and other cases when a page has been temporarily
1117c3096e67SHugh Dickins 	 * isolated from the unevictable LRU: but this case is the easiest.
1118c3096e67SHugh Dickins 	 */
1119c3096e67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1120c3096e67SHugh Dickins 		lru_cache_add(newpage);
11215c3f9a67SHugh Dickins 		if (page_was_mapped)
1122c3096e67SHugh Dickins 			lru_add_drain();
1123c3096e67SHugh Dickins 	}
1124c3096e67SHugh Dickins 
11255c3f9a67SHugh Dickins 	if (page_was_mapped)
11264eecb8b9SMatthew Wilcox (Oracle) 		remove_migration_ptes(folio,
11274eecb8b9SMatthew Wilcox (Oracle) 			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
11283f6c8272SMel Gorman 
11297db7671fSHugh Dickins out_unlock_both:
11307db7671fSHugh Dickins 	unlock_page(newpage);
11317db7671fSHugh Dickins out_unlock:
11323f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
113376545066SRik van Riel 	if (anon_vma)
11349e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
1135b20a3503SChristoph Lameter 	unlock_page(page);
11360dabec93SMinchan Kim out:
1137c6c919ebSMinchan Kim 	/*
1138c3096e67SHugh Dickins 	 * If migration is successful, drop our reference to the newpage;
1139c6c919ebSMinchan Kim 	 * this will not free the page because the new page owner still
1140c3096e67SHugh Dickins 	 * holds a reference.
1141c6c919ebSMinchan Kim 	 */
1142c3096e67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS)
1143c6c919ebSMinchan Kim 		put_page(newpage);
1144c6c919ebSMinchan Kim 
11450dabec93SMinchan Kim 	return rc;
11460dabec93SMinchan Kim }
114795a402c3SChristoph Lameter 
11480dabec93SMinchan Kim /*
11490dabec93SMinchan Kim  * Obtain the lock on page, remove all ptes and migrate the page
11500dabec93SMinchan Kim  * to the newly allocated page in newpage.
11510dabec93SMinchan Kim  */
11526ec4476aSLinus Torvalds static int unmap_and_move(new_page_t get_new_page,
1153ef2a5153SGeert Uytterhoeven 				   free_page_t put_new_page,
1154ef2a5153SGeert Uytterhoeven 				   unsigned long private, struct page *page,
1155add05cecSNaoya Horiguchi 				   int force, enum migrate_mode mode,
1156dd4ae78aSYang Shi 				   enum migrate_reason reason,
1157dd4ae78aSYang Shi 				   struct list_head *ret)
11580dabec93SMinchan Kim {
11592def7424SHugh Dickins 	int rc = MIGRATEPAGE_SUCCESS;
116074d4a579SYang Shi 	struct page *newpage = NULL;
11610dabec93SMinchan Kim 
116294723aafSMichal Hocko 	if (!thp_migration_supported() && PageTransHuge(page))
1163d532e2e5SYang Shi 		return -ENOSYS;
116494723aafSMichal Hocko 
11650dabec93SMinchan Kim 	if (page_count(page) == 1) {
1166160088b3SMiaohe Lin 		/* Page was freed from under us. So we are done. */
1167c6c919ebSMinchan Kim 		ClearPageActive(page);
1168c6c919ebSMinchan Kim 		ClearPageUnevictable(page);
1169160088b3SMiaohe Lin 		/* free_pages_prepare() will clear PG_isolated. */
11700dabec93SMinchan Kim 		goto out;
11710dabec93SMinchan Kim 	}
11720dabec93SMinchan Kim 
117374d4a579SYang Shi 	newpage = get_new_page(page, private);
117474d4a579SYang Shi 	if (!newpage)
117574d4a579SYang Shi 		return -ENOMEM;
117674d4a579SYang Shi 
1177b653db77SMatthew Wilcox (Oracle) 	newpage->private = 0;
11789c620e2bSHugh Dickins 	rc = __unmap_and_move(page, newpage, force, mode);
1179c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS)
11807cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(newpage, reason);
1181bf6bddf1SRafael Aquini 
11820dabec93SMinchan Kim out:
1183e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
1184aaa994b3SChristoph Lameter 		/*
1185aaa994b3SChristoph Lameter 		 * A page that has been migrated has all references
1186aaa994b3SChristoph Lameter 		 * removed and will be freed. A page that has not been
1187c23a0c99SRalph Campbell 		 * migrated will have kept its references and be restored.
1188aaa994b3SChristoph Lameter 		 */
1189aaa994b3SChristoph Lameter 		list_del(&page->lru);
1190e24f0b8fSChristoph Lameter 	}
119168711a74SDavid Rientjes 
119295a402c3SChristoph Lameter 	/*
1193c6c919ebSMinchan Kim 	 * If migration is successful, release the reference grabbed during
1194c6c919ebSMinchan Kim 	 * isolation.  Otherwise, restore the page to the right list unless
1195c6c919ebSMinchan Kim 	 * we want to retry.
119695a402c3SChristoph Lameter 	 */
1197c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1198dd4ae78aSYang Shi 		/*
1199dd4ae78aSYang Shi 		 * Compaction can migrate also non-LRU pages which are
1200dd4ae78aSYang Shi 		 * not accounted to NR_ISOLATED_*. They can be recognized
1201dd4ae78aSYang Shi 		 * as __PageMovable
1202dd4ae78aSYang Shi 		 */
1203dd4ae78aSYang Shi 		if (likely(!__PageMovable(page)))
1204dd4ae78aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1205dd4ae78aSYang Shi 					page_is_file_lru(page), -thp_nr_pages(page));
1206dd4ae78aSYang Shi 
120779f5f8faSOscar Salvador 		if (reason != MR_MEMORY_FAILURE)
1208c6c919ebSMinchan Kim 			/*
120979f5f8faSOscar Salvador 			 * For MR_MEMORY_FAILURE, the page is released in page_handle_poison().
1210c6c919ebSMinchan Kim 			 */
121179f5f8faSOscar Salvador 			put_page(page);
1212c6c919ebSMinchan Kim 	} else {
1213dd4ae78aSYang Shi 		if (rc != -EAGAIN)
1214dd4ae78aSYang Shi 			list_add_tail(&page->lru, ret);
1215bda807d4SMinchan Kim 
1216cf4b769aSHugh Dickins 		if (put_new_page)
121768711a74SDavid Rientjes 			put_new_page(newpage, private);
1218c6c919ebSMinchan Kim 		else
1219d6d86c0aSKonstantin Khlebnikov 			put_page(newpage);
1220c6c919ebSMinchan Kim 	}
122168711a74SDavid Rientjes 
1222e24f0b8fSChristoph Lameter 	return rc;
1223e24f0b8fSChristoph Lameter }
1224b20a3503SChristoph Lameter 
1225e24f0b8fSChristoph Lameter /*
1226290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
1227290408d4SNaoya Horiguchi  *
1228290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
1229290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepage.
1230290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
1231290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
1232290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
1233290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
1234290408d4SNaoya Horiguchi  * under direct I/O, the reference of the head page is 512 and a bit more.)
1235290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
1236290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
1237290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
1238290408d4SNaoya Horiguchi  *
1239290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on the page under migration,
1240290408d4SNaoya Horiguchi  * because then the pte is replaced with a migration swap entry and the
1241290408d4SNaoya Horiguchi  * direct I/O code will wait in the page fault for migration to complete.
1242290408d4SNaoya Horiguchi  */
1243290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
124468711a74SDavid Rientjes 				free_page_t put_new_page, unsigned long private,
124568711a74SDavid Rientjes 				struct page *hpage, int force,
1246dd4ae78aSYang Shi 				enum migrate_mode mode, int reason,
1247dd4ae78aSYang Shi 				struct list_head *ret)
1248290408d4SNaoya Horiguchi {
12494eecb8b9SMatthew Wilcox (Oracle) 	struct folio *dst, *src = page_folio(hpage);
12502def7424SHugh Dickins 	int rc = -EAGAIN;
12512ebba6b7SHugh Dickins 	int page_was_mapped = 0;
125232665f2bSJoonsoo Kim 	struct page *new_hpage;
1253290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
1254c0d0381aSMike Kravetz 	struct address_space *mapping = NULL;
1255290408d4SNaoya Horiguchi 
125683467efbSNaoya Horiguchi 	/*
12577ed2c31dSAnshuman Khandual 	 * Migratability of hugepages depends on the architecture and the hugepage size.
125883467efbSNaoya Horiguchi 	 * This check is necessary because some callers of hugepage migration
125983467efbSNaoya Horiguchi 	 * like soft offline and memory hotremove don't walk through page
126083467efbSNaoya Horiguchi 	 * tables or check whether the hugepage is pmd-based or not before
126183467efbSNaoya Horiguchi 	 * kicking migration.
126283467efbSNaoya Horiguchi 	 */
1263100873d7SNaoya Horiguchi 	if (!hugepage_migration_supported(page_hstate(hpage))) {
1264dd4ae78aSYang Shi 		list_move_tail(&hpage->lru, ret);
126583467efbSNaoya Horiguchi 		return -ENOSYS;
126632665f2bSJoonsoo Kim 	}
126783467efbSNaoya Horiguchi 
126871a64f61SMuchun Song 	if (page_count(hpage) == 1) {
126971a64f61SMuchun Song 		/* page was freed from under us. So we are done. */
127071a64f61SMuchun Song 		putback_active_hugepage(hpage);
127171a64f61SMuchun Song 		return MIGRATEPAGE_SUCCESS;
127271a64f61SMuchun Song 	}
127371a64f61SMuchun Song 
1274666feb21SMichal Hocko 	new_hpage = get_new_page(hpage, private);
1275290408d4SNaoya Horiguchi 	if (!new_hpage)
1276290408d4SNaoya Horiguchi 		return -ENOMEM;
12774eecb8b9SMatthew Wilcox (Oracle) 	dst = page_folio(new_hpage);
1278290408d4SNaoya Horiguchi 
1279290408d4SNaoya Horiguchi 	if (!trylock_page(hpage)) {
12802916ecc0SJérôme Glisse 		if (!force)
1281290408d4SNaoya Horiguchi 			goto out;
12822916ecc0SJérôme Glisse 		switch (mode) {
12832916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
12842916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
12852916ecc0SJérôme Glisse 			break;
12862916ecc0SJérôme Glisse 		default:
12872916ecc0SJérôme Glisse 			goto out;
12882916ecc0SJérôme Glisse 		}
1289290408d4SNaoya Horiguchi 		lock_page(hpage);
1290290408d4SNaoya Horiguchi 	}
1291290408d4SNaoya Horiguchi 
1292cb6acd01SMike Kravetz 	/*
1293cb6acd01SMike Kravetz 	 * Check for pages which are in the process of being freed.  Without
1294cb6acd01SMike Kravetz 	 * page_mapping() set, hugetlbfs specific move page routine will not
1295cb6acd01SMike Kravetz 	 * be called and we could leak usage counts for subpools.
1296cb6acd01SMike Kravetz 	 */
12976acfb5baSMuchun Song 	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
1298cb6acd01SMike Kravetz 		rc = -EBUSY;
1299cb6acd01SMike Kravetz 		goto out_unlock;
1300cb6acd01SMike Kravetz 	}
1301cb6acd01SMike Kravetz 
1302746b18d4SPeter Zijlstra 	if (PageAnon(hpage))
1303746b18d4SPeter Zijlstra 		anon_vma = page_get_anon_vma(hpage);
1304290408d4SNaoya Horiguchi 
13057db7671fSHugh Dickins 	if (unlikely(!trylock_page(new_hpage)))
13067db7671fSHugh Dickins 		goto put_anon;
13077db7671fSHugh Dickins 
13082ebba6b7SHugh Dickins 	if (page_mapped(hpage)) {
1309a98a2f0cSAlistair Popple 		enum ttu_flags ttu = 0;
1310336bf30eSMike Kravetz 
1311336bf30eSMike Kravetz 		if (!PageAnon(hpage)) {
1312c0d0381aSMike Kravetz 			/*
1313336bf30eSMike Kravetz 			 * In shared mappings, try_to_migrate() could potentially
1314336bf30eSMike Kravetz 			 * call huge_pmd_unshare().  Because of this, take the i_mmap
1315336bf30eSMike Kravetz 			 * semaphore in write mode here and set TTU_RMAP_LOCKED
1316336bf30eSMike Kravetz 			 * to let lower levels know we have taken the lock.
1317c0d0381aSMike Kravetz 			 */
1318c0d0381aSMike Kravetz 			mapping = hugetlb_page_mapping_lock_write(hpage);
1319c0d0381aSMike Kravetz 			if (unlikely(!mapping))
1320c0d0381aSMike Kravetz 				goto unlock_put_anon;
1321c0d0381aSMike Kravetz 
13225202978bSMiaohe Lin 			ttu = TTU_RMAP_LOCKED;
1323336bf30eSMike Kravetz 		}
1324336bf30eSMike Kravetz 
13254b8554c5SMatthew Wilcox (Oracle) 		try_to_migrate(src, ttu);
13262ebba6b7SHugh Dickins 		page_was_mapped = 1;
1327336bf30eSMike Kravetz 
13285202978bSMiaohe Lin 		if (ttu & TTU_RMAP_LOCKED)
1329336bf30eSMike Kravetz 			i_mmap_unlock_write(mapping);
13302ebba6b7SHugh Dickins 	}
1331290408d4SNaoya Horiguchi 
1332290408d4SNaoya Horiguchi 	if (!page_mapped(hpage))
1333e7e3ffebSMatthew Wilcox (Oracle) 		rc = move_to_new_folio(dst, src, mode);
1334290408d4SNaoya Horiguchi 
1335336bf30eSMike Kravetz 	if (page_was_mapped)
13364eecb8b9SMatthew Wilcox (Oracle) 		remove_migration_ptes(src,
13374eecb8b9SMatthew Wilcox (Oracle) 			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
1338290408d4SNaoya Horiguchi 
1339c0d0381aSMike Kravetz unlock_put_anon:
13407db7671fSHugh Dickins 	unlock_page(new_hpage);
13417db7671fSHugh Dickins 
13427db7671fSHugh Dickins put_anon:
1343fd4a4663SHugh Dickins 	if (anon_vma)
13449e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
13458e6ac7faSAneesh Kumar K.V 
13462def7424SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1347ab5ac90aSMichal Hocko 		move_hugetlb_state(hpage, new_hpage, reason);
13482def7424SHugh Dickins 		put_new_page = NULL;
13492def7424SHugh Dickins 	}
13508e6ac7faSAneesh Kumar K.V 
1351cb6acd01SMike Kravetz out_unlock:
1352290408d4SNaoya Horiguchi 	unlock_page(hpage);
135309761333SHillf Danton out:
1354dd4ae78aSYang Shi 	if (rc == MIGRATEPAGE_SUCCESS)
1355b8ec1ceeSNaoya Horiguchi 		putback_active_hugepage(hpage);
1356a04840c6SMiaohe Lin 	else if (rc != -EAGAIN)
1357dd4ae78aSYang Shi 		list_move_tail(&hpage->lru, ret);
135868711a74SDavid Rientjes 
135968711a74SDavid Rientjes 	/*
136068711a74SDavid Rientjes 	 * If migration was not successful and there's a freeing callback,
136168711a74SDavid Rientjes 	 * use it.  Otherwise, putback_active_hugepage() will drop our
136268711a74SDavid Rientjes 	 * reference to the new hugepage.
136368711a74SDavid Rientjes 	 */
13642def7424SHugh Dickins 	if (put_new_page)
136568711a74SDavid Rientjes 		put_new_page(new_hpage, private);
136668711a74SDavid Rientjes 	else
13673aaa76e1SNaoya Horiguchi 		putback_active_hugepage(new_hpage);
136868711a74SDavid Rientjes 
1369290408d4SNaoya Horiguchi 	return rc;
1370290408d4SNaoya Horiguchi }
1371290408d4SNaoya Horiguchi 
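/*
 * Split a THP into base pages, adding the tail pages to @from, and resync
 * the caller's list cursor (@page2) so that the newly added entries are
 * visited later in the loop.  Returns 0 on success or the error from
 * split_huge_page_to_list().
 */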
1372d532e2e5SYang Shi static inline int try_split_thp(struct page *page, struct page **page2,
1373d532e2e5SYang Shi 				struct list_head *from)
1374d532e2e5SYang Shi {
1375d532e2e5SYang Shi 	int rc = 0;
1376d532e2e5SYang Shi 
1377d532e2e5SYang Shi 	lock_page(page);
1378d532e2e5SYang Shi 	rc = split_huge_page_to_list(page, from);
1379d532e2e5SYang Shi 	unlock_page(page);
1380d532e2e5SYang Shi 	if (!rc)
1381d532e2e5SYang Shi 		list_safe_reset_next(page, *page2, lru);
1382d532e2e5SYang Shi 
1383d532e2e5SYang Shi 	return rc;
1384d532e2e5SYang Shi }
1385d532e2e5SYang Shi 
1386290408d4SNaoya Horiguchi /*
1387c73e5c9cSSrivatsa S. Bhat  * migrate_pages - migrate the pages specified in a list, to the free pages
1388c73e5c9cSSrivatsa S. Bhat  *		   supplied as the target for the page migration
1389e24f0b8fSChristoph Lameter  *
1390c73e5c9cSSrivatsa S. Bhat  * @from:		The list of pages to be migrated.
1391c73e5c9cSSrivatsa S. Bhat  * @get_new_page:	The function used to allocate free pages to be used
1392c73e5c9cSSrivatsa S. Bhat  *			as the target of the page migration.
139368711a74SDavid Rientjes  * @put_new_page:	The function used to free target pages if migration
139468711a74SDavid Rientjes  *			fails, or NULL if no special handling is necessary.
1395c73e5c9cSSrivatsa S. Bhat  * @private:		Private data to be passed on to get_new_page()
1396c73e5c9cSSrivatsa S. Bhat  * @mode:		The migration mode that specifies the constraints for
1397c73e5c9cSSrivatsa S. Bhat  *			page migration, if any.
1398c73e5c9cSSrivatsa S. Bhat  * @reason:		The reason for page migration.
1399b5bade97SBaolin Wang  * @ret_succeeded:	Set to the number of normal pages migrated successfully if
14005ac95884SYang Shi  *			the caller passes a non-NULL pointer.
1401e24f0b8fSChristoph Lameter  *
1402c73e5c9cSSrivatsa S. Bhat  * The function returns after 10 attempts or if no pages are movable any more,
1403c73e5c9cSSrivatsa S. Bhat  * because the list has become empty or no retryable pages exist any more.
1404dd4ae78aSYang Shi  * It is the caller's responsibility to call putback_movable_pages() to return
1405dd4ae78aSYang Shi  * pages to the LRU or free list, but only if ret != 0.
1406e24f0b8fSChristoph Lameter  *
14075d39a7ebSBaolin Wang  * Returns the number of {normal pages, THPs, hugetlb pages} that were not
14085d39a7ebSBaolin Wang  * migrated, or an error code. Each THP split is counted as one non-migrated
14095d39a7ebSBaolin Wang  * THP, no matter how many of its subpages are migrated successfully.
1410e24f0b8fSChristoph Lameter  */
14119c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page,
141268711a74SDavid Rientjes 		free_page_t put_new_page, unsigned long private,
14135ac95884SYang Shi 		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1414e24f0b8fSChristoph Lameter {
1415e24f0b8fSChristoph Lameter 	int retry = 1;
14161a5bae25SAnshuman Khandual 	int thp_retry = 1;
1417e24f0b8fSChristoph Lameter 	int nr_failed = 0;
1418b5bade97SBaolin Wang 	int nr_failed_pages = 0;
14195647bc29SMel Gorman 	int nr_succeeded = 0;
14201a5bae25SAnshuman Khandual 	int nr_thp_succeeded = 0;
14211a5bae25SAnshuman Khandual 	int nr_thp_failed = 0;
14221a5bae25SAnshuman Khandual 	int nr_thp_split = 0;
1423e24f0b8fSChristoph Lameter 	int pass = 0;
14241a5bae25SAnshuman Khandual 	bool is_thp = false;
1425e24f0b8fSChristoph Lameter 	struct page *page;
1426e24f0b8fSChristoph Lameter 	struct page *page2;
14271a5bae25SAnshuman Khandual 	int rc, nr_subpages;
1428dd4ae78aSYang Shi 	LIST_HEAD(ret_pages);
1429b5bade97SBaolin Wang 	LIST_HEAD(thp_split_pages);
1430b0b515bfSYang Shi 	bool nosplit = (reason == MR_NUMA_MISPLACED);
1431b5bade97SBaolin Wang 	bool no_subpage_counting = false;
14322d1db3b1SChristoph Lameter 
14337bc1aec5SLiam Mark 	trace_mm_migrate_pages_start(mode, reason);
14347bc1aec5SLiam Mark 
1435b5bade97SBaolin Wang thp_subpage_migration:
14361a5bae25SAnshuman Khandual 	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
1437e24f0b8fSChristoph Lameter 		retry = 0;
14381a5bae25SAnshuman Khandual 		thp_retry = 0;
1439e24f0b8fSChristoph Lameter 
1440e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
144194723aafSMichal Hocko retry:
14421a5bae25SAnshuman Khandual 			/*
14431a5bae25SAnshuman Khandual 			 * THP statistics are based on the source huge page.
14441a5bae25SAnshuman Khandual 			 * Capture required information that might get lost
14451a5bae25SAnshuman Khandual 			 * during migration.
14461a5bae25SAnshuman Khandual 			 */
14476c5c7b9fSZi Yan 			is_thp = PageTransHuge(page) && !PageHuge(page);
14485d39a7ebSBaolin Wang 			nr_subpages = compound_nr(page);
1449e24f0b8fSChristoph Lameter 			cond_resched();
1450e24f0b8fSChristoph Lameter 
145131caf665SNaoya Horiguchi 			if (PageHuge(page))
145231caf665SNaoya Horiguchi 				rc = unmap_and_move_huge_page(get_new_page,
145368711a74SDavid Rientjes 						put_new_page, private, page,
1454dd4ae78aSYang Shi 						pass > 2, mode, reason,
1455dd4ae78aSYang Shi 						&ret_pages);
145631caf665SNaoya Horiguchi 			else
145768711a74SDavid Rientjes 				rc = unmap_and_move(get_new_page, put_new_page,
1458add05cecSNaoya Horiguchi 						private, page, pass > 2, mode,
1459dd4ae78aSYang Shi 						reason, &ret_pages);
1460dd4ae78aSYang Shi 			/*
1461dd4ae78aSYang Shi 			 * The rules are:
1462dd4ae78aSYang Shi 			 *	Success: a non-hugetlb page will be freed, a hugetlb
1463dd4ae78aSYang Shi 			 *		 page will be put back
1464dd4ae78aSYang Shi 			 *	-EAGAIN: stay on the from list
1465dd4ae78aSYang Shi 			 *	-ENOMEM: stay on the from list
1466dd4ae78aSYang Shi 			 *	Other errno: put on ret_pages list then splice to
1467dd4ae78aSYang Shi 			 *		     from list
1468dd4ae78aSYang Shi 			 */
1469e24f0b8fSChristoph Lameter 			switch(rc) {
147094723aafSMichal Hocko 			/*
147194723aafSMichal Hocko 			 * THP migration might be unsupported or the
147294723aafSMichal Hocko 			 * allocation could've failed so we should
147394723aafSMichal Hocko 			 * retry on the same page with the THP split
147494723aafSMichal Hocko 			 * to base pages.
147594723aafSMichal Hocko 			 *
147694723aafSMichal Hocko 			 * Head page is retried immediately and tail
147794723aafSMichal Hocko 			 * pages are added to the tail of the list so
147894723aafSMichal Hocko 			 * we encounter them after the rest of the list
147994723aafSMichal Hocko 			 * is processed.
148094723aafSMichal Hocko 			 */
1481d532e2e5SYang Shi 			case -ENOSYS:
1482d532e2e5SYang Shi 				/* THP migration is unsupported */
14836c5c7b9fSZi Yan 				if (is_thp) {
1484b5bade97SBaolin Wang 					nr_thp_failed++;
1485b5bade97SBaolin Wang 					if (!try_split_thp(page, &page2, &thp_split_pages)) {
1486d532e2e5SYang Shi 						nr_thp_split++;
1487d532e2e5SYang Shi 						goto retry;
1488d532e2e5SYang Shi 					}
1489f430893bSMiaohe Lin 				/* Hugetlb migration is unsupported */
1490f430893bSMiaohe Lin 				} else if (!no_subpage_counting) {
1491f430893bSMiaohe Lin 					nr_failed++;
1492d532e2e5SYang Shi 				}
1493d532e2e5SYang Shi 
14945d39a7ebSBaolin Wang 				nr_failed_pages += nr_subpages;
1495d532e2e5SYang Shi 				break;
1496d532e2e5SYang Shi 			case -ENOMEM:
1497d532e2e5SYang Shi 				/*
1498d532e2e5SYang Shi 				 * When memory is low, don't bother to try to migrate
1499d532e2e5SYang Shi 				 * other pages, just exit.
1500b0b515bfSYang Shi 				 * THP NUMA faulting doesn't split THP to retry.
1501d532e2e5SYang Shi 				 */
1502b0b515bfSYang Shi 				if (is_thp && !nosplit) {
1503b5bade97SBaolin Wang 					nr_thp_failed++;
1504b5bade97SBaolin Wang 					if (!try_split_thp(page, &page2, &thp_split_pages)) {
15051a5bae25SAnshuman Khandual 						nr_thp_split++;
150694723aafSMichal Hocko 						goto retry;
150794723aafSMichal Hocko 					}
1508f430893bSMiaohe Lin 				} else if (!no_subpage_counting) {
1509f430893bSMiaohe Lin 					nr_failed++;
15101a5bae25SAnshuman Khandual 				}
1511b5bade97SBaolin Wang 
15125d39a7ebSBaolin Wang 				nr_failed_pages += nr_subpages;
151369a041ffSMiaohe Lin 				/*
151469a041ffSMiaohe Lin 				 * There might be some subpages of fail-to-migrate THPs
151569a041ffSMiaohe Lin 				 * left in the thp_split_pages list. Move them back to the
151669a041ffSMiaohe Lin 				 * migration list so that they can be put back to the right
151769a041ffSMiaohe Lin 				 * list by the caller; otherwise the page refcount will be leaked.
151869a041ffSMiaohe Lin 				 */
151969a041ffSMiaohe Lin 				list_splice_init(&thp_split_pages, from);
152069a041ffSMiaohe Lin 				nr_thp_failed += thp_retry;
152195a402c3SChristoph Lameter 				goto out;
1522e24f0b8fSChristoph Lameter 			case -EAGAIN:
1523f430893bSMiaohe Lin 				if (is_thp)
15241a5bae25SAnshuman Khandual 					thp_retry++;
1525f430893bSMiaohe Lin 				else
1526b20a3503SChristoph Lameter 					retry++;
1527e24f0b8fSChristoph Lameter 				break;
152878bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
15295d39a7ebSBaolin Wang 				nr_succeeded += nr_subpages;
1530f430893bSMiaohe Lin 				if (is_thp)
15311a5bae25SAnshuman Khandual 					nr_thp_succeeded++;
15321a5bae25SAnshuman Khandual 				break;
1533e24f0b8fSChristoph Lameter 			default:
1534354a3363SNaoya Horiguchi 				/*
1535d532e2e5SYang Shi 				 * Permanent failure (-EBUSY, etc.):
1536354a3363SNaoya Horiguchi 				 * unlike -EAGAIN case, the failed page is
1537354a3363SNaoya Horiguchi 				 * removed from migration page list and not
1538354a3363SNaoya Horiguchi 				 * retried in the next outer loop.
1539354a3363SNaoya Horiguchi 				 */
1540f430893bSMiaohe Lin 				if (is_thp)
15411a5bae25SAnshuman Khandual 					nr_thp_failed++;
1542f430893bSMiaohe Lin 				else if (!no_subpage_counting)
1543b20a3503SChristoph Lameter 					nr_failed++;
1544f430893bSMiaohe Lin 
15455d39a7ebSBaolin Wang 				nr_failed_pages += nr_subpages;
1546e24f0b8fSChristoph Lameter 				break;
1547b20a3503SChristoph Lameter 			}
1548b20a3503SChristoph Lameter 		}
1549e24f0b8fSChristoph Lameter 	}
1550b5bade97SBaolin Wang 	nr_failed += retry;
15511a5bae25SAnshuman Khandual 	nr_thp_failed += thp_retry;
1552b5bade97SBaolin Wang 	/*
1553b5bade97SBaolin Wang 	 * Try to migrate the subpages of fail-to-migrate THPs; don't add to
1554b5bade97SBaolin Wang 	 * nr_failed in this round, since all subpages of a THP are counted
1555b5bade97SBaolin Wang 	 * as one failure in the first round.
1556b5bade97SBaolin Wang 	 */
1557b5bade97SBaolin Wang 	if (!list_empty(&thp_split_pages)) {
1558b5bade97SBaolin Wang 		/*
1559b5bade97SBaolin Wang 		 * Move non-migrated pages (after 10 retries) to ret_pages
1560b5bade97SBaolin Wang 		 * to avoid migrating them again.
1561b5bade97SBaolin Wang 		 */
1562b5bade97SBaolin Wang 		list_splice_init(from, &ret_pages);
1563b5bade97SBaolin Wang 		list_splice_init(&thp_split_pages, from);
1564b5bade97SBaolin Wang 		no_subpage_counting = true;
1565b5bade97SBaolin Wang 		retry = 1;
1566b5bade97SBaolin Wang 		goto thp_subpage_migration;
1567b5bade97SBaolin Wang 	}
1568b5bade97SBaolin Wang 
1569b5bade97SBaolin Wang 	rc = nr_failed + nr_thp_failed;
157095a402c3SChristoph Lameter out:
1571dd4ae78aSYang Shi 	/*
1572dd4ae78aSYang Shi 	 * Put the permanently-failed pages back on the migration list;
1573dd4ae78aSYang Shi 	 * they will be put back to the right list by the caller.
1574dd4ae78aSYang Shi 	 */
1575dd4ae78aSYang Shi 	list_splice(&ret_pages, from);
1576dd4ae78aSYang Shi 
15775647bc29SMel Gorman 	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1578b5bade97SBaolin Wang 	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
15791a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
15801a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
15811a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1582b5bade97SBaolin Wang 	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
15831a5bae25SAnshuman Khandual 			       nr_thp_failed, nr_thp_split, mode, reason);
15847b2a2d4aSMel Gorman 
15855ac95884SYang Shi 	if (ret_succeeded)
15865ac95884SYang Shi 		*ret_succeeded = nr_succeeded;
15875ac95884SYang Shi 
158895a402c3SChristoph Lameter 	return rc;
1589b20a3503SChristoph Lameter }
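/*
 * Illustrative example of a typical migrate_pages() caller: build a
 * migration_target_control and let alloc_migration_target() pick the
 * destination pages, much as do_move_pages_to_node() below does.
 * "target_nid" and "pagelist" are placeholders:
 *
 *	struct migration_target_control mtc = {
 *		.nid = target_nid,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
 *	};
 *
 *	err = migrate_pages(&pagelist, alloc_migration_target, NULL,
 *			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
 *	if (err)
 *		putback_movable_pages(&pagelist);
 */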
1590b20a3503SChristoph Lameter 
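/*
 * Allocation callback for migrate_pages(): allocate a destination page that
 * matches the source page (hugetlb, large folio or base page), using the
 * node, nodemask and gfp_mask from the migration_target_control passed in
 * @private.
 */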
159119fc7bedSJoonsoo Kim struct page *alloc_migration_target(struct page *page, unsigned long private)
1592b4b38223SJoonsoo Kim {
1593ffe06786SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(page);
159419fc7bedSJoonsoo Kim 	struct migration_target_control *mtc;
159519fc7bedSJoonsoo Kim 	gfp_t gfp_mask;
1596b4b38223SJoonsoo Kim 	unsigned int order = 0;
1597ffe06786SMatthew Wilcox (Oracle) 	struct folio *new_folio = NULL;
159819fc7bedSJoonsoo Kim 	int nid;
159919fc7bedSJoonsoo Kim 	int zidx;
160019fc7bedSJoonsoo Kim 
160119fc7bedSJoonsoo Kim 	mtc = (struct migration_target_control *)private;
160219fc7bedSJoonsoo Kim 	gfp_mask = mtc->gfp_mask;
160319fc7bedSJoonsoo Kim 	nid = mtc->nid;
160419fc7bedSJoonsoo Kim 	if (nid == NUMA_NO_NODE)
1605ffe06786SMatthew Wilcox (Oracle) 		nid = folio_nid(folio);
1606b4b38223SJoonsoo Kim 
1607ffe06786SMatthew Wilcox (Oracle) 	if (folio_test_hugetlb(folio)) {
1608ffe06786SMatthew Wilcox (Oracle) 		struct hstate *h = page_hstate(&folio->page);
1609d92bbc27SJoonsoo Kim 
161019fc7bedSJoonsoo Kim 		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
161119fc7bedSJoonsoo Kim 		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1612d92bbc27SJoonsoo Kim 	}
1613b4b38223SJoonsoo Kim 
1614ffe06786SMatthew Wilcox (Oracle) 	if (folio_test_large(folio)) {
16159933a0c8SJoonsoo Kim 		/*
16169933a0c8SJoonsoo Kim 		 * clear __GFP_RECLAIM to make the migration callback
16179933a0c8SJoonsoo Kim 		 * consistent with regular THP allocations.
16189933a0c8SJoonsoo Kim 		 */
16199933a0c8SJoonsoo Kim 		gfp_mask &= ~__GFP_RECLAIM;
1620b4b38223SJoonsoo Kim 		gfp_mask |= GFP_TRANSHUGE;
1621ffe06786SMatthew Wilcox (Oracle) 		order = folio_order(folio);
1622b4b38223SJoonsoo Kim 	}
1623ffe06786SMatthew Wilcox (Oracle) 	zidx = zone_idx(folio_zone(folio));
162419fc7bedSJoonsoo Kim 	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1625b4b38223SJoonsoo Kim 		gfp_mask |= __GFP_HIGHMEM;
1626b4b38223SJoonsoo Kim 
1627ffe06786SMatthew Wilcox (Oracle) 	new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
1628b4b38223SJoonsoo Kim 
1629ffe06786SMatthew Wilcox (Oracle) 	return &new_folio->page;
1630b4b38223SJoonsoo Kim }
1631b4b38223SJoonsoo Kim 
1632742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1633742755a1SChristoph Lameter 
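/*
 * Write @value into @nr consecutive entries of the user @status array,
 * starting at index @start.  Returns 0 on success or -EFAULT.
 */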
1634a49bd4d7SMichal Hocko static int store_status(int __user *status, int start, int value, int nr)
1635742755a1SChristoph Lameter {
1636a49bd4d7SMichal Hocko 	while (nr-- > 0) {
1637a49bd4d7SMichal Hocko 		if (put_user(value, status + start))
1638a49bd4d7SMichal Hocko 			return -EFAULT;
1639a49bd4d7SMichal Hocko 		start++;
1640a49bd4d7SMichal Hocko 	}
1641742755a1SChristoph Lameter 
1642a49bd4d7SMichal Hocko 	return 0;
1643a49bd4d7SMichal Hocko }
1644742755a1SChristoph Lameter 
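/*
 * Migrate all pages on @pagelist to @node.  On failure, the pages that were
 * not migrated are put back where they came from.  Returns 0 on success,
 * the number of pages that could not be migrated, or an errno.
 */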
1645a49bd4d7SMichal Hocko static int do_move_pages_to_node(struct mm_struct *mm,
1646a49bd4d7SMichal Hocko 		struct list_head *pagelist, int node)
1647a49bd4d7SMichal Hocko {
1648a49bd4d7SMichal Hocko 	int err;
1649a0976311SJoonsoo Kim 	struct migration_target_control mtc = {
1650a0976311SJoonsoo Kim 		.nid = node,
1651a0976311SJoonsoo Kim 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1652a0976311SJoonsoo Kim 	};
1653742755a1SChristoph Lameter 
1654a0976311SJoonsoo Kim 	err = migrate_pages(pagelist, alloc_migration_target, NULL,
16555ac95884SYang Shi 		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1656a49bd4d7SMichal Hocko 	if (err)
1657a49bd4d7SMichal Hocko 		putback_movable_pages(pagelist);
1658a49bd4d7SMichal Hocko 	return err;
1659742755a1SChristoph Lameter }
1660742755a1SChristoph Lameter 
1661742755a1SChristoph Lameter /*
1662a49bd4d7SMichal Hocko  * Resolves the given address to a struct page, isolates it from the LRU and
1663a49bd4d7SMichal Hocko  * puts it on the given pagelist.
1664e0153fc2SYang Shi  * Returns:
1665e0153fc2SYang Shi  *     errno - if the page cannot be found/isolated
1666e0153fc2SYang Shi  *     0 - when it doesn't have to be migrated because it is already on the
1667e0153fc2SYang Shi  *         target node
1668e0153fc2SYang Shi  *     1 - when it has been queued
1669742755a1SChristoph Lameter  */
1670a49bd4d7SMichal Hocko static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1671a49bd4d7SMichal Hocko 		int node, struct list_head *pagelist, bool migrate_all)
1672742755a1SChristoph Lameter {
1673742755a1SChristoph Lameter 	struct vm_area_struct *vma;
1674742755a1SChristoph Lameter 	struct page *page;
1675a49bd4d7SMichal Hocko 	int err;
1676742755a1SChristoph Lameter 
1677d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
1678742755a1SChristoph Lameter 	err = -EFAULT;
1679cb1c37b1SMiaohe Lin 	vma = vma_lookup(mm, addr);
1680cb1c37b1SMiaohe Lin 	if (!vma || !vma_migratable(vma))
1681a49bd4d7SMichal Hocko 		goto out;
1682742755a1SChristoph Lameter 
1683d899844eSKirill A. Shutemov 	/* FOLL_DUMP to ignore special (like zero) pages */
168487d2762eSMiaohe Lin 	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
168589f5b7daSLinus Torvalds 
168689f5b7daSLinus Torvalds 	err = PTR_ERR(page);
168789f5b7daSLinus Torvalds 	if (IS_ERR(page))
1688a49bd4d7SMichal Hocko 		goto out;
168989f5b7daSLinus Torvalds 
1690742755a1SChristoph Lameter 	err = -ENOENT;
16913218f871SAlex Sierra 	if (!page || is_zone_device_page(page))
1692a49bd4d7SMichal Hocko 		goto out;
1693742755a1SChristoph Lameter 
1694a49bd4d7SMichal Hocko 	err = 0;
1695a49bd4d7SMichal Hocko 	if (page_to_nid(page) == node)
1696a49bd4d7SMichal Hocko 		goto out_putpage;
1697742755a1SChristoph Lameter 
1698742755a1SChristoph Lameter 	err = -EACCES;
1699a49bd4d7SMichal Hocko 	if (page_mapcount(page) > 1 && !migrate_all)
1700a49bd4d7SMichal Hocko 		goto out_putpage;
1701742755a1SChristoph Lameter 
1702e632a938SNaoya Horiguchi 	if (PageHuge(page)) {
1703e8db67ebSNaoya Horiguchi 		if (PageHead(page)) {
17047ce82f4cSMiaohe Lin 			err = isolate_hugetlb(page, pagelist);
17057ce82f4cSMiaohe Lin 			if (!err)
1706e0153fc2SYang Shi 				err = 1;
1707e8db67ebSNaoya Horiguchi 		}
1708a49bd4d7SMichal Hocko 	} else {
1709a49bd4d7SMichal Hocko 		struct page *head;
1710e632a938SNaoya Horiguchi 
1711e8db67ebSNaoya Horiguchi 		head = compound_head(page);
1712e8db67ebSNaoya Horiguchi 		err = isolate_lru_page(head);
1713a49bd4d7SMichal Hocko 		if (err)
1714a49bd4d7SMichal Hocko 			goto out_putpage;
1715a49bd4d7SMichal Hocko 
1716e0153fc2SYang Shi 		err = 1;
1717a49bd4d7SMichal Hocko 		list_add_tail(&head->lru, pagelist);
1718e8db67ebSNaoya Horiguchi 		mod_node_page_state(page_pgdat(head),
17199de4f22aSHuang Ying 			NR_ISOLATED_ANON + page_is_file_lru(head),
17206c357848SMatthew Wilcox (Oracle) 			thp_nr_pages(head));
17216d9c285aSKOSAKI Motohiro 	}
1722a49bd4d7SMichal Hocko out_putpage:
1723742755a1SChristoph Lameter 	/*
1724742755a1SChristoph Lameter 	 * Either remove the duplicate refcount from
1725742755a1SChristoph Lameter 	 * isolate_lru_page() or drop the page ref if it was
1726742755a1SChristoph Lameter 	 * not isolated.
1727742755a1SChristoph Lameter 	 */
1728742755a1SChristoph Lameter 	put_page(page);
1729a49bd4d7SMichal Hocko out:
1730d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
1731742755a1SChristoph Lameter 	return err;
1732742755a1SChristoph Lameter }
1733742755a1SChristoph Lameter 
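/*
 * Migrate the pages queued on @pagelist to @node and, on success, store
 * @node in the user status entries for indices [@start, @i).  On failure,
 * return the number of pages that were not migrated (including the tail of
 * @nr_pages that was never attempted) or an errno.
 */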
17347ca8783aSWei Yang static int move_pages_and_store_status(struct mm_struct *mm, int node,
17357ca8783aSWei Yang 		struct list_head *pagelist, int __user *status,
17367ca8783aSWei Yang 		int start, int i, unsigned long nr_pages)
17377ca8783aSWei Yang {
17387ca8783aSWei Yang 	int err;
17397ca8783aSWei Yang 
17405d7ae891SWei Yang 	if (list_empty(pagelist))
17415d7ae891SWei Yang 		return 0;
17425d7ae891SWei Yang 
17437ca8783aSWei Yang 	err = do_move_pages_to_node(mm, pagelist, node);
17447ca8783aSWei Yang 	if (err) {
17457ca8783aSWei Yang 		/*
17467ca8783aSWei Yang 		 * A positive err means the number of pages that
17477ca8783aSWei Yang 		 * failed to migrate.  Since we are going to
17487ca8783aSWei Yang 		 * abort and return the number of non-migrated
1749ab9dd4f8SLong Li 		 * pages, we need to include the rest of the
17507ca8783aSWei Yang 		 * nr_pages that have not been attempted as
17517ca8783aSWei Yang 		 * well.
17527ca8783aSWei Yang 		 */
17537ca8783aSWei Yang 		if (err > 0)
17547ca8783aSWei Yang 			err += nr_pages - i - 1;
17557ca8783aSWei Yang 		return err;
17567ca8783aSWei Yang 	}
17577ca8783aSWei Yang 	return store_status(status, start, node, i - start);
17587ca8783aSWei Yang }
17597ca8783aSWei Yang 
1760742755a1SChristoph Lameter /*
17615e9a0f02SBrice Goglin  * Migrate an array of page addresses to their requested target nodes and
17625e9a0f02SBrice Goglin  * fill in the corresponding array of status.
17635e9a0f02SBrice Goglin  */
17643268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
17655e9a0f02SBrice Goglin 			 unsigned long nr_pages,
17665e9a0f02SBrice Goglin 			 const void __user * __user *pages,
17675e9a0f02SBrice Goglin 			 const int __user *nodes,
17685e9a0f02SBrice Goglin 			 int __user *status, int flags)
17695e9a0f02SBrice Goglin {
1770a49bd4d7SMichal Hocko 	int current_node = NUMA_NO_NODE;
1771a49bd4d7SMichal Hocko 	LIST_HEAD(pagelist);
1772a49bd4d7SMichal Hocko 	int start, i;
1773a49bd4d7SMichal Hocko 	int err = 0, err1;
177435282a2dSBrice Goglin 
1775361a2a22SMinchan Kim 	lru_cache_disable();
177635282a2dSBrice Goglin 
1777a49bd4d7SMichal Hocko 	for (i = start = 0; i < nr_pages; i++) {
17785e9a0f02SBrice Goglin 		const void __user *p;
1779a49bd4d7SMichal Hocko 		unsigned long addr;
17805e9a0f02SBrice Goglin 		int node;
17815e9a0f02SBrice Goglin 
17823140a227SBrice Goglin 		err = -EFAULT;
1783a49bd4d7SMichal Hocko 		if (get_user(p, pages + i))
1784a49bd4d7SMichal Hocko 			goto out_flush;
1785a49bd4d7SMichal Hocko 		if (get_user(node, nodes + i))
1786a49bd4d7SMichal Hocko 			goto out_flush;
1787057d3389SAndrey Konovalov 		addr = (unsigned long)untagged_addr(p);
17885e9a0f02SBrice Goglin 
17895e9a0f02SBrice Goglin 		err = -ENODEV;
17906f5a55f1SLinus Torvalds 		if (node < 0 || node >= MAX_NUMNODES)
1791a49bd4d7SMichal Hocko 			goto out_flush;
1792389162c2SLai Jiangshan 		if (!node_state(node, N_MEMORY))
1793a49bd4d7SMichal Hocko 			goto out_flush;
17945e9a0f02SBrice Goglin 
17955e9a0f02SBrice Goglin 		err = -EACCES;
17965e9a0f02SBrice Goglin 		if (!node_isset(node, task_nodes))
1797a49bd4d7SMichal Hocko 			goto out_flush;
17985e9a0f02SBrice Goglin 
1799a49bd4d7SMichal Hocko 		if (current_node == NUMA_NO_NODE) {
1800a49bd4d7SMichal Hocko 			current_node = node;
1801a49bd4d7SMichal Hocko 			start = i;
1802a49bd4d7SMichal Hocko 		} else if (node != current_node) {
18037ca8783aSWei Yang 			err = move_pages_and_store_status(mm, current_node,
18047ca8783aSWei Yang 					&pagelist, status, start, i, nr_pages);
1805a49bd4d7SMichal Hocko 			if (err)
1806a49bd4d7SMichal Hocko 				goto out;
1807a49bd4d7SMichal Hocko 			start = i;
1808a49bd4d7SMichal Hocko 			current_node = node;
18095e9a0f02SBrice Goglin 		}
18105e9a0f02SBrice Goglin 
1811a49bd4d7SMichal Hocko 		/*
1812a49bd4d7SMichal Hocko 		 * Errors in the page lookup or isolation are not fatal; we simply
1813a49bd4d7SMichal Hocko 		 * report them via the status array.
1814a49bd4d7SMichal Hocko 		 */
1815a49bd4d7SMichal Hocko 		err = add_page_for_migration(mm, addr, current_node,
1816a49bd4d7SMichal Hocko 				&pagelist, flags & MPOL_MF_MOVE_ALL);
1817e0153fc2SYang Shi 
1818d08221a0SWei Yang 		if (err > 0) {
1819e0153fc2SYang Shi 			/* The page is successfully queued for migration */
1820e0153fc2SYang Shi 			continue;
1821e0153fc2SYang Shi 		}
18223140a227SBrice Goglin 
1823d08221a0SWei Yang 		/*
182465462462SJohn Hubbard 		 * The move_pages() man page does not have an -EEXIST choice, so
182565462462SJohn Hubbard 		 * use -EFAULT instead.
182665462462SJohn Hubbard 		 */
182765462462SJohn Hubbard 		if (err == -EEXIST)
182865462462SJohn Hubbard 			err = -EFAULT;
182965462462SJohn Hubbard 
183065462462SJohn Hubbard 		/*
1831d08221a0SWei Yang 		 * If the page is already on the target node (!err), store the
1832d08221a0SWei Yang 		 * node, otherwise, store the err.
1833d08221a0SWei Yang 		 */
1834d08221a0SWei Yang 		err = store_status(status, i, err ? : current_node, 1);
1835a49bd4d7SMichal Hocko 		if (err)
1836a49bd4d7SMichal Hocko 			goto out_flush;
18373140a227SBrice Goglin 
18387ca8783aSWei Yang 		err = move_pages_and_store_status(mm, current_node, &pagelist,
18397ca8783aSWei Yang 				status, start, i, nr_pages);
1840a49bd4d7SMichal Hocko 		if (err)
1841a49bd4d7SMichal Hocko 			goto out;
1842a49bd4d7SMichal Hocko 		current_node = NUMA_NO_NODE;
18433140a227SBrice Goglin 	}
1844a49bd4d7SMichal Hocko out_flush:
1845a49bd4d7SMichal Hocko 	/* Make sure we do not overwrite the existing error */
18467ca8783aSWei Yang 	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
18477ca8783aSWei Yang 				status, start, i, nr_pages);
1848dfe9aa23SWei Yang 	if (err >= 0)
1849a49bd4d7SMichal Hocko 		err = err1;
18505e9a0f02SBrice Goglin out:
1851361a2a22SMinchan Kim 	lru_cache_enable();
18525e9a0f02SBrice Goglin 	return err;
18535e9a0f02SBrice Goglin }
18545e9a0f02SBrice Goglin 
18555e9a0f02SBrice Goglin /*
18562f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status.
1857742755a1SChristoph Lameter  */
185880bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
185980bba129SBrice Goglin 				const void __user **pages, int *status)
1860742755a1SChristoph Lameter {
18612f007e74SBrice Goglin 	unsigned long i;
1862742755a1SChristoph Lameter 
1863d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
18642f007e74SBrice Goglin 
18652f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
186680bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
186783156821SHaiyue Wang 		unsigned int foll_flags = FOLL_DUMP;
18682f007e74SBrice Goglin 		struct vm_area_struct *vma;
18692f007e74SBrice Goglin 		struct page *page;
1870c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
18712f007e74SBrice Goglin 
1872059b8b48SLiam Howlett 		vma = vma_lookup(mm, addr);
1873059b8b48SLiam Howlett 		if (!vma)
1874742755a1SChristoph Lameter 			goto set_status;
1875742755a1SChristoph Lameter 
187683156821SHaiyue Wang 		/* Not all huge page follow APIs support 'FOLL_GET' */
187783156821SHaiyue Wang 		if (!is_vm_hugetlb_page(vma))
187883156821SHaiyue Wang 			foll_flags |= FOLL_GET;
187983156821SHaiyue Wang 
1880d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
188183156821SHaiyue Wang 		page = follow_page(vma, addr, foll_flags);
188289f5b7daSLinus Torvalds 
188389f5b7daSLinus Torvalds 		err = PTR_ERR(page);
188489f5b7daSLinus Torvalds 		if (IS_ERR(page))
188589f5b7daSLinus Torvalds 			goto set_status;
188689f5b7daSLinus Torvalds 
18873218f871SAlex Sierra 		if (page && !is_zone_device_page(page)) {
18884cd61484SMiaohe Lin 			err = page_to_nid(page);
188983156821SHaiyue Wang 			if (foll_flags & FOLL_GET)
18904cd61484SMiaohe Lin 				put_page(page);
18914cd61484SMiaohe Lin 		} else {
18924cd61484SMiaohe Lin 			err = -ENOENT;
18934cd61484SMiaohe Lin 		}
1894742755a1SChristoph Lameter set_status:
189580bba129SBrice Goglin 		*status = err;
189680bba129SBrice Goglin 
189780bba129SBrice Goglin 		pages++;
189880bba129SBrice Goglin 		status++;
189980bba129SBrice Goglin 	}
190080bba129SBrice Goglin 
1901d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
190280bba129SBrice Goglin }
190380bba129SBrice Goglin 
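/*
 * For compat (32-bit) callers of move_pages(): copy @chunk_nr user page
 * pointers into the native-width chunk_pages[] array.  Returns 0 or -EFAULT.
 */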
19045b1b561bSArnd Bergmann static int get_compat_pages_array(const void __user *chunk_pages[],
19055b1b561bSArnd Bergmann 				  const void __user * __user *pages,
19065b1b561bSArnd Bergmann 				  unsigned long chunk_nr)
19075b1b561bSArnd Bergmann {
19085b1b561bSArnd Bergmann 	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
19095b1b561bSArnd Bergmann 	compat_uptr_t p;
19105b1b561bSArnd Bergmann 	int i;
19115b1b561bSArnd Bergmann 
19125b1b561bSArnd Bergmann 	for (i = 0; i < chunk_nr; i++) {
19135b1b561bSArnd Bergmann 		if (get_user(p, pages32 + i))
19145b1b561bSArnd Bergmann 			return -EFAULT;
19155b1b561bSArnd Bergmann 		chunk_pages[i] = compat_ptr(p);
19165b1b561bSArnd Bergmann 	}
19175b1b561bSArnd Bergmann 
19185b1b561bSArnd Bergmann 	return 0;
19195b1b561bSArnd Bergmann }
19205b1b561bSArnd Bergmann 
192180bba129SBrice Goglin /*
192280bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
192380bba129SBrice Goglin  * a user array of status.
192480bba129SBrice Goglin  */
192580bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
192680bba129SBrice Goglin 			 const void __user * __user *pages,
192780bba129SBrice Goglin 			 int __user *status)
192880bba129SBrice Goglin {
19293eefb826SMiaohe Lin #define DO_PAGES_STAT_CHUNK_NR 16UL
193080bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
193180bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
193280bba129SBrice Goglin 
193387b8d1adSH. Peter Anvin 	while (nr_pages) {
19343eefb826SMiaohe Lin 		unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
193587b8d1adSH. Peter Anvin 
19365b1b561bSArnd Bergmann 		if (in_compat_syscall()) {
19375b1b561bSArnd Bergmann 			if (get_compat_pages_array(chunk_pages, pages,
19385b1b561bSArnd Bergmann 						   chunk_nr))
193987b8d1adSH. Peter Anvin 				break;
19405b1b561bSArnd Bergmann 		} else {
19415b1b561bSArnd Bergmann 			if (copy_from_user(chunk_pages, pages,
19425b1b561bSArnd Bergmann 				      chunk_nr * sizeof(*chunk_pages)))
19435b1b561bSArnd Bergmann 				break;
19445b1b561bSArnd Bergmann 		}
194580bba129SBrice Goglin 
194680bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
194780bba129SBrice Goglin 
194887b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
194987b8d1adSH. Peter Anvin 			break;
1950742755a1SChristoph Lameter 
195187b8d1adSH. Peter Anvin 		pages += chunk_nr;
195287b8d1adSH. Peter Anvin 		status += chunk_nr;
195387b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
195487b8d1adSH. Peter Anvin 	}
195587b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
1956742755a1SChristoph Lameter }
1957742755a1SChristoph Lameter 
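/*
 * Look up the mm_struct and allowed memory nodes for @pid (the current task
 * if @pid is 0), after checking that the caller is allowed to modify that
 * task.  Returns the mm with a reference held, or an ERR_PTR() on failure.
 */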
19584dc200ceSMiaohe Lin static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
19594dc200ceSMiaohe Lin {
19604dc200ceSMiaohe Lin 	struct task_struct *task;
19614dc200ceSMiaohe Lin 	struct mm_struct *mm;
19624dc200ceSMiaohe Lin 
19634dc200ceSMiaohe Lin 	/*
19644dc200ceSMiaohe Lin 	 * There is no need to check if the current process has the right to modify
19654dc200ceSMiaohe Lin 	 * the specified process when they are the same.
19664dc200ceSMiaohe Lin 	 */
19674dc200ceSMiaohe Lin 	if (!pid) {
19684dc200ceSMiaohe Lin 		mmget(current->mm);
19694dc200ceSMiaohe Lin 		*mem_nodes = cpuset_mems_allowed(current);
19704dc200ceSMiaohe Lin 		return current->mm;
19714dc200ceSMiaohe Lin 	}
19724dc200ceSMiaohe Lin 
19734dc200ceSMiaohe Lin 	/* Find the mm_struct */
19744dc200ceSMiaohe Lin 	rcu_read_lock();
19754dc200ceSMiaohe Lin 	task = find_task_by_vpid(pid);
19764dc200ceSMiaohe Lin 	if (!task) {
19774dc200ceSMiaohe Lin 		rcu_read_unlock();
19784dc200ceSMiaohe Lin 		return ERR_PTR(-ESRCH);
19794dc200ceSMiaohe Lin 	}
19804dc200ceSMiaohe Lin 	get_task_struct(task);
19814dc200ceSMiaohe Lin 
19824dc200ceSMiaohe Lin 	/*
19834dc200ceSMiaohe Lin 	 * Check if this process has the right to modify the specified
19844dc200ceSMiaohe Lin 	 * process. Use the regular "ptrace_may_access()" checks.
19854dc200ceSMiaohe Lin 	 */
19864dc200ceSMiaohe Lin 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
19874dc200ceSMiaohe Lin 		rcu_read_unlock();
19884dc200ceSMiaohe Lin 		mm = ERR_PTR(-EPERM);
19894dc200ceSMiaohe Lin 		goto out;
19904dc200ceSMiaohe Lin 	}
19914dc200ceSMiaohe Lin 	rcu_read_unlock();
19924dc200ceSMiaohe Lin 
19934dc200ceSMiaohe Lin 	mm = ERR_PTR(security_task_movememory(task));
19944dc200ceSMiaohe Lin 	if (IS_ERR(mm))
19954dc200ceSMiaohe Lin 		goto out;
19964dc200ceSMiaohe Lin 	*mem_nodes = cpuset_mems_allowed(task);
19974dc200ceSMiaohe Lin 	mm = get_task_mm(task);
19984dc200ceSMiaohe Lin out:
19994dc200ceSMiaohe Lin 	put_task_struct(task);
20004dc200ceSMiaohe Lin 	if (!mm)
20014dc200ceSMiaohe Lin 		mm = ERR_PTR(-EINVAL);
20024dc200ceSMiaohe Lin 	return mm;
20034dc200ceSMiaohe Lin }
20044dc200ceSMiaohe Lin 
2005742755a1SChristoph Lameter /*
2006742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
2007742755a1SChristoph Lameter  * process.
2008742755a1SChristoph Lameter  */
20097addf443SDominik Brodowski static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
20107addf443SDominik Brodowski 			     const void __user * __user *pages,
20117addf443SDominik Brodowski 			     const int __user *nodes,
20127addf443SDominik Brodowski 			     int __user *status, int flags)
2013742755a1SChristoph Lameter {
2014742755a1SChristoph Lameter 	struct mm_struct *mm;
20155e9a0f02SBrice Goglin 	int err;
20163268c63eSChristoph Lameter 	nodemask_t task_nodes;
2017742755a1SChristoph Lameter 
2018742755a1SChristoph Lameter 	/* Check flags */
2019742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2020742755a1SChristoph Lameter 		return -EINVAL;
2021742755a1SChristoph Lameter 
2022742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2023742755a1SChristoph Lameter 		return -EPERM;
2024742755a1SChristoph Lameter 
20254dc200ceSMiaohe Lin 	mm = find_mm_struct(pid, &task_nodes);
20264dc200ceSMiaohe Lin 	if (IS_ERR(mm))
20274dc200ceSMiaohe Lin 		return PTR_ERR(mm);
20286e8b09eaSSasha Levin 
20293268c63eSChristoph Lameter 	if (nodes)
20303268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
20313268c63eSChristoph Lameter 				    nodes, status, flags);
20323268c63eSChristoph Lameter 	else
20335e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
20343268c63eSChristoph Lameter 
20353268c63eSChristoph Lameter 	mmput(mm);
20363268c63eSChristoph Lameter 	return err;
2037742755a1SChristoph Lameter }
2038742755a1SChristoph Lameter 
20397addf443SDominik Brodowski SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
20407addf443SDominik Brodowski 		const void __user * __user *, pages,
20417addf443SDominik Brodowski 		const int __user *, nodes,
20427addf443SDominik Brodowski 		int __user *, status, int, flags)
20437addf443SDominik Brodowski {
20447addf443SDominik Brodowski 	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
20457addf443SDominik Brodowski }
20467addf443SDominik Brodowski 
20477039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING
20487039e1dbSPeter Zijlstra /*
20497039e1dbSPeter Zijlstra  * Returns true if this is a safe migration target node for misplaced NUMA
2050bc53008eSWei Yang  * pages. Currently it only checks the watermarks, which is crude.
20517039e1dbSPeter Zijlstra  */
20527039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
20533abef4e6SMel Gorman 				   unsigned long nr_migrate_pages)
20547039e1dbSPeter Zijlstra {
20557039e1dbSPeter Zijlstra 	int z;
2056599d0c95SMel Gorman 
20577039e1dbSPeter Zijlstra 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
20587039e1dbSPeter Zijlstra 		struct zone *zone = pgdat->node_zones + z;
20597039e1dbSPeter Zijlstra 
2060bc53008eSWei Yang 		if (!managed_zone(zone))
20617039e1dbSPeter Zijlstra 			continue;
20627039e1dbSPeter Zijlstra 
20637039e1dbSPeter Zijlstra 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
20647039e1dbSPeter Zijlstra 		if (!zone_watermark_ok(zone, 0,
20657039e1dbSPeter Zijlstra 				       high_wmark_pages(zone) +
20667039e1dbSPeter Zijlstra 				       nr_migrate_pages,
2067bfe9d006SHuang Ying 				       ZONE_MOVABLE, 0))
20687039e1dbSPeter Zijlstra 			continue;
20697039e1dbSPeter Zijlstra 		return true;
20707039e1dbSPeter Zijlstra 	}
20717039e1dbSPeter Zijlstra 	return false;
20727039e1dbSPeter Zijlstra }
20737039e1dbSPeter Zijlstra 
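/*
 * Allocation callback for NUMA-balancing migration: allocate a base page or
 * THP only on the target node (__GFP_THISNODE) and without entering reclaim.
 */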
20747039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page,
2075666feb21SMichal Hocko 					   unsigned long data)
20767039e1dbSPeter Zijlstra {
20777039e1dbSPeter Zijlstra 	int nid = (int) data;
2078c185e494SMatthew Wilcox (Oracle) 	int order = compound_order(page);
2079c185e494SMatthew Wilcox (Oracle) 	gfp_t gfp = __GFP_THISNODE;
2080c185e494SMatthew Wilcox (Oracle) 	struct folio *new;
20817039e1dbSPeter Zijlstra 
2082c185e494SMatthew Wilcox (Oracle) 	if (order > 0)
2083c185e494SMatthew Wilcox (Oracle) 		gfp |= GFP_TRANSHUGE_LIGHT;
2084c185e494SMatthew Wilcox (Oracle) 	else {
2085c185e494SMatthew Wilcox (Oracle) 		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2086c185e494SMatthew Wilcox (Oracle) 			__GFP_NOWARN;
2087c185e494SMatthew Wilcox (Oracle) 		gfp &= ~__GFP_RECLAIM;
20887039e1dbSPeter Zijlstra 	}
2089c185e494SMatthew Wilcox (Oracle) 	new = __folio_alloc_node(gfp, order, nid);
20907039e1dbSPeter Zijlstra 
2091c185e494SMatthew Wilcox (Oracle) 	return &new->page;
2092c5b5a3ddSYang Shi }
2093c5b5a3ddSYang Shi 
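/*
 * Isolate @page from the LRU in preparation for migrating it to @pgdat.
 * Returns 1 if the page was isolated (the caller's page reference is dropped,
 * since isolation took its own), or 0 if migration should be skipped
 * (shared THP, target node too full, or isolation failure).
 */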
20941c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
2095b32967ffSMel Gorman {
20962b9b624fSBaolin Wang 	int nr_pages = thp_nr_pages(page);
2097c574bbe9SHuang Ying 	int order = compound_order(page);
2098b32967ffSMel Gorman 
2099c574bbe9SHuang Ying 	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
21003abef4e6SMel Gorman 
2101662aeea7SYang Shi 	/* Do not migrate THP mapped by multiple processes */
2102662aeea7SYang Shi 	if (PageTransHuge(page) && total_mapcount(page) > 1)
2103662aeea7SYang Shi 		return 0;
2104662aeea7SYang Shi 
2105b32967ffSMel Gorman 	/* Avoid migrating to a node that is nearly full */
2106c574bbe9SHuang Ying 	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2107c574bbe9SHuang Ying 		int z;
2108c574bbe9SHuang Ying 
2109c574bbe9SHuang Ying 		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2110340ef390SHugh Dickins 			return 0;
2111c574bbe9SHuang Ying 		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2112bc53008eSWei Yang 			if (managed_zone(pgdat->node_zones + z))
2113c574bbe9SHuang Ying 				break;
2114c574bbe9SHuang Ying 		}
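		/*
		 * Editor's note: in memory tiering mode the promotion is
		 * still refused, but kswapd is woken on the highest managed
		 * zone of the target node so that demotion can free up room
		 * for later promotions.
		 */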
2115c574bbe9SHuang Ying 		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
2116c574bbe9SHuang Ying 		return 0;
2117c574bbe9SHuang Ying 	}
2118b32967ffSMel Gorman 
2119340ef390SHugh Dickins 	if (isolate_lru_page(page))
2120340ef390SHugh Dickins 		return 0;
2121340ef390SHugh Dickins 
2122b75454e1SMiaohe Lin 	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
21232b9b624fSBaolin Wang 			    nr_pages);
2124b32967ffSMel Gorman 
2125b32967ffSMel Gorman 	/*
2126340ef390SHugh Dickins 	 * Isolating the page has taken another reference, so the
2127340ef390SHugh Dickins 	 * caller's reference can be safely dropped without the page
2128340ef390SHugh Dickins 	 * disappearing underneath us during migration.
2129b32967ffSMel Gorman 	 */
2130b32967ffSMel Gorman 	put_page(page);
2131340ef390SHugh Dickins 	return 1;
2132b32967ffSMel Gorman }
2133b32967ffSMel Gorman 
2134a8f60772SMel Gorman /*
21357039e1dbSPeter Zijlstra  * Attempt to migrate a misplaced page to the specified destination
21367039e1dbSPeter Zijlstra  * node. Caller is expected to have an elevated reference count on
21377039e1dbSPeter Zijlstra  * the page that will be dropped by this function before returning.
21387039e1dbSPeter Zijlstra  */
21391bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
21401bc115d8SMel Gorman 			   int node)
21417039e1dbSPeter Zijlstra {
2142a8f60772SMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
2143340ef390SHugh Dickins 	int isolated;
2144b32967ffSMel Gorman 	int nr_remaining;
2145e39bb6beSHuang Ying 	unsigned int nr_succeeded;
21467039e1dbSPeter Zijlstra 	LIST_HEAD(migratepages);
2147b5916c02SAneesh Kumar K.V 	int nr_pages = thp_nr_pages(page);
2148c5b5a3ddSYang Shi 
2149c5b5a3ddSYang Shi 	/*
21501bc115d8SMel Gorman 	 * Don't migrate file pages that are mapped in multiple processes
21511bc115d8SMel Gorman 	 * with execute permissions as they are probably shared libraries.
21527039e1dbSPeter Zijlstra 	 */
21537ee820eeSMiaohe Lin 	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
21547ee820eeSMiaohe Lin 	    (vma->vm_flags & VM_EXEC))
21557039e1dbSPeter Zijlstra 		goto out;
21567039e1dbSPeter Zijlstra 
2157a8f60772SMel Gorman 	/*
215809a913a7SMel Gorman 	 * Also do not migrate dirty pages, as not all filesystems can move
215909a913a7SMel Gorman 	 * dirty pages in MIGRATE_ASYNC mode, which would be a waste of cycles.
216009a913a7SMel Gorman 	 */
21619de4f22aSHuang Ying 	if (page_is_file_lru(page) && PageDirty(page))
216209a913a7SMel Gorman 		goto out;
216309a913a7SMel Gorman 
2164b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
2165b32967ffSMel Gorman 	if (!isolated)
21667039e1dbSPeter Zijlstra 		goto out;
21677039e1dbSPeter Zijlstra 
21687039e1dbSPeter Zijlstra 	list_add(&page->lru, &migratepages);
2169c185e494SMatthew Wilcox (Oracle) 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
2170c185e494SMatthew Wilcox (Oracle) 				     NULL, node, MIGRATE_ASYNC,
2171c185e494SMatthew Wilcox (Oracle) 				     MR_NUMA_MISPLACED, &nr_succeeded);
21727039e1dbSPeter Zijlstra 	if (nr_remaining) {
217359c82b70SJoonsoo Kim 		if (!list_empty(&migratepages)) {
217459c82b70SJoonsoo Kim 			list_del(&page->lru);
2175c5fc5c3aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2176c5fc5c3aSYang Shi 					page_is_file_lru(page), -nr_pages);
217759c82b70SJoonsoo Kim 			putback_lru_page(page);
217859c82b70SJoonsoo Kim 		}
21797039e1dbSPeter Zijlstra 		isolated = 0;
2180e39bb6beSHuang Ying 	}
2181e39bb6beSHuang Ying 	if (nr_succeeded) {
2182e39bb6beSHuang Ying 		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2183e39bb6beSHuang Ying 		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
2184e39bb6beSHuang Ying 			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
2185e39bb6beSHuang Ying 					    nr_succeeded);
2186e39bb6beSHuang Ying 	}
21877039e1dbSPeter Zijlstra 	BUG_ON(!list_empty(&migratepages));
21887039e1dbSPeter Zijlstra 	return isolated;
2189340ef390SHugh Dickins 
2190340ef390SHugh Dickins out:
2191340ef390SHugh Dickins 	put_page(page);
2192340ef390SHugh Dickins 	return 0;
21937039e1dbSPeter Zijlstra }
2194220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */
21958763cb45SJérôme Glisse 
2196dcee9bf5SHuang Ying /*
2197dcee9bf5SHuang Ying  * node_demotion[] example:
2198dcee9bf5SHuang Ying  *
2199dcee9bf5SHuang Ying  * Consider a system with two sockets.  Each socket has
2200dcee9bf5SHuang Ying  * three classes of memory attached: fast, medium and slow.
2201dcee9bf5SHuang Ying  * Each memory class is placed in its own NUMA node.  The
2202dcee9bf5SHuang Ying  * CPUs are placed in the node with the "fast" memory.  The
2203dcee9bf5SHuang Ying  * 6 NUMA nodes (0-5) might be split among the sockets like
2204dcee9bf5SHuang Ying  * this:
2205dcee9bf5SHuang Ying  *
2206dcee9bf5SHuang Ying  *	Socket A: 0, 1, 2
2207dcee9bf5SHuang Ying  *	Socket B: 3, 4, 5
2208dcee9bf5SHuang Ying  *
2209dcee9bf5SHuang Ying  * When Node 0 fills up, its memory should be migrated to
2210dcee9bf5SHuang Ying  * Node 1.  When Node 1 fills up, it should be migrated to
2211dcee9bf5SHuang Ying  * Node 2.  The migration path starts on the nodes with the
2212dcee9bf5SHuang Ying  * processors (since allocations default to the local node) and
2213dcee9bf5SHuang Ying  * fast memory, progresses through medium and ends with the
2214dcee9bf5SHuang Ying  * slow memory:
2215dcee9bf5SHuang Ying  *
2216dcee9bf5SHuang Ying  *	0 -> 1 -> 2 -> stop
2217dcee9bf5SHuang Ying  *	3 -> 4 -> 5 -> stop
2218dcee9bf5SHuang Ying  *
2219dcee9bf5SHuang Ying  * This is represented in the node_demotion[] like this:
2220dcee9bf5SHuang Ying  *
2221dcee9bf5SHuang Ying  *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
2222dcee9bf5SHuang Ying  *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
2223dcee9bf5SHuang Ying  *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
2224dcee9bf5SHuang Ying  *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
2225dcee9bf5SHuang Ying  *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
2226dcee9bf5SHuang Ying  *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
2227dcee9bf5SHuang Ying  *
2228dcee9bf5SHuang Ying  * Moreover, some systems may have multiple slow memory nodes.
2229dcee9bf5SHuang Ying  * Suppose a system has one socket with 3 memory nodes: node 0
2230dcee9bf5SHuang Ying  * is fast memory, while nodes 1 and 2 are both slow memory
2231dcee9bf5SHuang Ying  * and sit at the same distance from the fast node.  The
2232dcee9bf5SHuang Ying  * migration path should then be:
2233dcee9bf5SHuang Ying  *
2234dcee9bf5SHuang Ying  *	0 -> 1/2 -> stop
2235dcee9bf5SHuang Ying  *
2236dcee9bf5SHuang Ying  * This is represented in the node_demotion[] like this:
2237dcee9bf5SHuang Ying  *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
2238dcee9bf5SHuang Ying  *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
2239dcee9bf5SHuang Ying  *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
2240dcee9bf5SHuang Ying  */
2241dcee9bf5SHuang Ying 
2242dcee9bf5SHuang Ying /*
2243dcee9bf5SHuang Ying  * Writes to this array occur without locking.  Cycles are
2244dcee9bf5SHuang Ying  * not allowed: Node X demotes to Y which demotes to X...
2245dcee9bf5SHuang Ying  *
2246dcee9bf5SHuang Ying  * If multiple reads are performed, a single rcu_read_lock()
2247dcee9bf5SHuang Ying  * must be held over all reads to ensure that no cycles are
2248dcee9bf5SHuang Ying  * observed.
2249dcee9bf5SHuang Ying  */
2250dcee9bf5SHuang Ying #define DEFAULT_DEMOTION_TARGET_NODES 15
2251dcee9bf5SHuang Ying 
2252dcee9bf5SHuang Ying #if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
2253dcee9bf5SHuang Ying #define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1)
2254dcee9bf5SHuang Ying #else
2255dcee9bf5SHuang Ying #define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES
2256dcee9bf5SHuang Ying #endif
2257dcee9bf5SHuang Ying 
2258dcee9bf5SHuang Ying struct demotion_nodes {
2259dcee9bf5SHuang Ying 	unsigned short nr;
2260dcee9bf5SHuang Ying 	short nodes[DEMOTION_TARGET_NODES];
2261dcee9bf5SHuang Ying };
2262dcee9bf5SHuang Ying 
2263dcee9bf5SHuang Ying static struct demotion_nodes *node_demotion __read_mostly;
2264dcee9bf5SHuang Ying 
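/*
 * Editor's illustrative sketch (not in the original source): the
 * two-socket example from the comment above, spelled out as C
 * designated initializers for struct demotion_nodes.  The real
 * node_demotion[] is kcalloc()'d and filled at run time by
 * __set_migration_target_nodes(); this table only shows the layout and
 * assumes DEMOTION_TARGET_NODES >= 1.  Unused nodes[] slots are never
 * read because .nr bounds the lookup.
 */
static const struct demotion_nodes example_demotion[6] __maybe_unused = {
	[0] = { .nr = 1, .nodes = { 1 } },	/* Node 0 demotes to 1 */
	[1] = { .nr = 1, .nodes = { 2 } },	/* Node 1 demotes to 2 */
	[2] = { .nr = 0 },			/* terminal */
	[3] = { .nr = 1, .nodes = { 4 } },	/* Node 3 demotes to 4 */
	[4] = { .nr = 1, .nodes = { 5 } },	/* Node 4 demotes to 5 */
	[5] = { .nr = 0 },			/* terminal */
};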
2265dcee9bf5SHuang Ying /**
2266dcee9bf5SHuang Ying  * next_demotion_node() - Get the next node in the demotion path
2267dcee9bf5SHuang Ying  * @node: The starting node to lookup the next node
2268dcee9bf5SHuang Ying  *
2269dcee9bf5SHuang Ying  * Return: node id for next memory node in the demotion path hierarchy
2270dcee9bf5SHuang Ying  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
2271dcee9bf5SHuang Ying  * @node online or guarantee that it *continues* to be the next demotion
2272dcee9bf5SHuang Ying  * target.
2273dcee9bf5SHuang Ying  */
2274dcee9bf5SHuang Ying int next_demotion_node(int node)
2275dcee9bf5SHuang Ying {
2276dcee9bf5SHuang Ying 	struct demotion_nodes *nd;
2277dcee9bf5SHuang Ying 	unsigned short target_nr, index;
2278dcee9bf5SHuang Ying 	int target;
2279dcee9bf5SHuang Ying 
2280dcee9bf5SHuang Ying 	if (!node_demotion)
2281dcee9bf5SHuang Ying 		return NUMA_NO_NODE;
2282dcee9bf5SHuang Ying 
2283dcee9bf5SHuang Ying 	nd = &node_demotion[node];
2284dcee9bf5SHuang Ying 
2285dcee9bf5SHuang Ying 	/*
2286dcee9bf5SHuang Ying 	 * node_demotion[] is updated without excluding this
2287dcee9bf5SHuang Ying 	 * function from running.  RCU doesn't provide any
2288dcee9bf5SHuang Ying 	 * compiler barriers, so the READ_ONCE() is required
2289dcee9bf5SHuang Ying 	 * to avoid compiler reordering or read merging.
2290dcee9bf5SHuang Ying 	 *
2291dcee9bf5SHuang Ying 	 * Make sure to use RCU over entire code blocks if
2292dcee9bf5SHuang Ying 	 * node_demotion[] reads need to be consistent.
2293dcee9bf5SHuang Ying 	 */
2294dcee9bf5SHuang Ying 	rcu_read_lock();
2295dcee9bf5SHuang Ying 	target_nr = READ_ONCE(nd->nr);
2296dcee9bf5SHuang Ying 
2297dcee9bf5SHuang Ying 	switch (target_nr) {
2298dcee9bf5SHuang Ying 	case 0:
2299dcee9bf5SHuang Ying 		target = NUMA_NO_NODE;
2300dcee9bf5SHuang Ying 		goto out;
2301dcee9bf5SHuang Ying 	case 1:
2302dcee9bf5SHuang Ying 		index = 0;
2303dcee9bf5SHuang Ying 		break;
2304dcee9bf5SHuang Ying 	default:
2305dcee9bf5SHuang Ying 		/*
2306dcee9bf5SHuang Ying 		 * If there are multiple target nodes, just select one
2307dcee9bf5SHuang Ying 		 * target node randomly.
2308dcee9bf5SHuang Ying 		 *
2309dcee9bf5SHuang Ying 		 * Round-robin selection would be an alternative, but it
2310dcee9bf5SHuang Ying 		 * would require an extra field in node_demotion[] to
2311dcee9bf5SHuang Ying 		 * record the last selected target node, and updating that
2312dcee9bf5SHuang Ying 		 * field could cause cache ping-pong.  Per-CPU state would
2313dcee9bf5SHuang Ying 		 * avoid the caching issue but is more complicated, so
2314dcee9bf5SHuang Ying 		 * random selection seems best for now.
2316dcee9bf5SHuang Ying 		 */
2317dcee9bf5SHuang Ying 		index = get_random_int() % target_nr;
2318dcee9bf5SHuang Ying 		break;
2319dcee9bf5SHuang Ying 	}
2320dcee9bf5SHuang Ying 
2321dcee9bf5SHuang Ying 	target = READ_ONCE(nd->nodes[index]);
2322dcee9bf5SHuang Ying 
2323dcee9bf5SHuang Ying out:
2324dcee9bf5SHuang Ying 	rcu_read_unlock();
2325dcee9bf5SHuang Ying 	return target;
2326dcee9bf5SHuang Ying }
2327dcee9bf5SHuang Ying 
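/*
 * Editor's illustrative sketch (not part of the original file): walking
 * the whole demotion chain from a starting node.  The helper name and
 * the pr_debug() output are hypothetical; real callers only ever ask
 * for the single next hop.
 */
static void __maybe_unused walk_demotion_chain(int node)
{
	/*
	 * Hold a single rcu_read_lock() across all the reads so that a
	 * concurrent update cannot make this walk observe a cycle (see
	 * the comment above the node_demotion definitions).
	 */
	rcu_read_lock();
	while ((node = next_demotion_node(node)) != NUMA_NO_NODE)
		pr_debug("next demotion target: %d\n", node);
	rcu_read_unlock();
}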
232879c28a41SDave Hansen /* Disable reclaim-based migration. */
232979c28a41SDave Hansen static void __disable_all_migrate_targets(void)
233079c28a41SDave Hansen {
2331ac16ec83SBaolin Wang 	int node, i;
233279c28a41SDave Hansen 
2333ac16ec83SBaolin Wang 	if (!node_demotion)
2334ac16ec83SBaolin Wang 		return;
2335ac16ec83SBaolin Wang 
2336ac16ec83SBaolin Wang 	for_each_online_node(node) {
2337ac16ec83SBaolin Wang 		node_demotion[node].nr = 0;
2338ac16ec83SBaolin Wang 		for (i = 0; i < DEMOTION_TARGET_NODES; i++)
2339ac16ec83SBaolin Wang 			node_demotion[node].nodes[i] = NUMA_NO_NODE;
2340ac16ec83SBaolin Wang 	}
234179c28a41SDave Hansen }
234279c28a41SDave Hansen 
234379c28a41SDave Hansen static void disable_all_migrate_targets(void)
234479c28a41SDave Hansen {
234579c28a41SDave Hansen 	__disable_all_migrate_targets();
234679c28a41SDave Hansen 
234779c28a41SDave Hansen 	/*
234879c28a41SDave Hansen 	 * Ensure that the "disable" is visible across the system.
234979c28a41SDave Hansen 	 * Readers will see either a combination of before+disable
235079c28a41SDave Hansen 	 * state or disable+after.  They will never see before and
235179c28a41SDave Hansen 	 * after state together.
235279c28a41SDave Hansen 	 *
235379c28a41SDave Hansen 	 * The before+after state together might have cycles and
235479c28a41SDave Hansen 	 * could cause readers to do things like loop until this
235579c28a41SDave Hansen 	 * function finishes.  This ensures they can only see a
235679c28a41SDave Hansen 	 * single "bad" read and would, for instance, only loop
235779c28a41SDave Hansen 	 * once.
235879c28a41SDave Hansen 	 */
235979c28a41SDave Hansen 	synchronize_rcu();
236079c28a41SDave Hansen }
236179c28a41SDave Hansen 
236279c28a41SDave Hansen /*
236379c28a41SDave Hansen  * Find an automatic demotion target for 'node'.
236479c28a41SDave Hansen  * Failing here is OK.  It might just indicate
236579c28a41SDave Hansen  * being at the end of a chain.
236679c28a41SDave Hansen  */
2367ac16ec83SBaolin Wang static int establish_migrate_target(int node, nodemask_t *used,
2368ac16ec83SBaolin Wang 				    int best_distance)
236979c28a41SDave Hansen {
2370ac16ec83SBaolin Wang 	int migration_target, index, val;
2371ac16ec83SBaolin Wang 	struct demotion_nodes *nd;
237279c28a41SDave Hansen 
2373ac16ec83SBaolin Wang 	if (!node_demotion)
237479c28a41SDave Hansen 		return NUMA_NO_NODE;
237579c28a41SDave Hansen 
2376ac16ec83SBaolin Wang 	nd = &node_demotion[node];
2377ac16ec83SBaolin Wang 
237879c28a41SDave Hansen 	migration_target = find_next_best_node(node, used);
237979c28a41SDave Hansen 	if (migration_target == NUMA_NO_NODE)
238079c28a41SDave Hansen 		return NUMA_NO_NODE;
238179c28a41SDave Hansen 
2382ac16ec83SBaolin Wang 	/*
2383ac16ec83SBaolin Wang 	 * If a migration target has already been chosen for this node,
2384ac16ec83SBaolin Wang 	 * best_distance holds the distance to it.  Still check whether
2385ac16ec83SBaolin Wang 	 * this node can also be demoted to other target nodes that
2386ac16ec83SBaolin Wang 	 * share that same best distance.
2387ac16ec83SBaolin Wang 	 */
2388ac16ec83SBaolin Wang 	if (best_distance != -1) {
2389ac16ec83SBaolin Wang 		val = node_distance(node, migration_target);
2390ac16ec83SBaolin Wang 		if (val > best_distance)
2391fc89213aSHuang Ying 			goto out_clear;
2392ac16ec83SBaolin Wang 	}
2393ac16ec83SBaolin Wang 
2394ac16ec83SBaolin Wang 	index = nd->nr;
2395ac16ec83SBaolin Wang 	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
2396ac16ec83SBaolin Wang 		      "Exceeds maximum demotion target nodes\n"))
2397fc89213aSHuang Ying 		goto out_clear;
2398ac16ec83SBaolin Wang 
2399ac16ec83SBaolin Wang 	nd->nodes[index] = migration_target;
2400ac16ec83SBaolin Wang 	nd->nr++;
240179c28a41SDave Hansen 
240279c28a41SDave Hansen 	return migration_target;
2403fc89213aSHuang Ying out_clear:
2404fc89213aSHuang Ying 	node_clear(migration_target, *used);
2405fc89213aSHuang Ying 	return NUMA_NO_NODE;
240679c28a41SDave Hansen }
240779c28a41SDave Hansen 
240879c28a41SDave Hansen /*
240979c28a41SDave Hansen  * When memory fills up on a node, memory contents can be
241079c28a41SDave Hansen  * automatically migrated to another node instead of
241179c28a41SDave Hansen  * discarded at reclaim.
241279c28a41SDave Hansen  *
241379c28a41SDave Hansen  * Establish a "migration path" which will start at nodes
241479c28a41SDave Hansen  * with CPUs and will follow the priorities used to build the
241579c28a41SDave Hansen  * page allocator zonelists.
241679c28a41SDave Hansen  *
241779c28a41SDave Hansen  * The difference here is that cycles must be avoided.  If
241879c28a41SDave Hansen  * node0 migrates to node1, then neither node1, nor anything
2419ac16ec83SBaolin Wang  * node1 migrates to can migrate to node0. Also, one node can
2420ac16ec83SBaolin Wang  * demote to multiple target nodes if those nodes all have
2421ac16ec83SBaolin Wang  * the same best distance from the source node.
242279c28a41SDave Hansen  *
242379c28a41SDave Hansen  * This function can run simultaneously with readers of
242479c28a41SDave Hansen  * node_demotion[].  However, it can not run simultaneously
242579c28a41SDave Hansen  * with itself.  Exclusion is provided by memory hotplug events
242679c28a41SDave Hansen  * being single-threaded.
242779c28a41SDave Hansen  */
242879c28a41SDave Hansen static void __set_migration_target_nodes(void)
242979c28a41SDave Hansen {
243091925ab8SMiaohe Lin 	nodemask_t next_pass;
243191925ab8SMiaohe Lin 	nodemask_t this_pass;
243279c28a41SDave Hansen 	nodemask_t used_targets = NODE_MASK_NONE;
2433ac16ec83SBaolin Wang 	int node, best_distance;
243479c28a41SDave Hansen 
243579c28a41SDave Hansen 	/*
243679c28a41SDave Hansen 	 * Avoid any oddities like cycles that could occur
243779c28a41SDave Hansen 	 * from changes in the topology.  This will leave
243879c28a41SDave Hansen 	 * a momentary gap during which migration is disabled.
243979c28a41SDave Hansen 	 */
244079c28a41SDave Hansen 	disable_all_migrate_targets();
244179c28a41SDave Hansen 
244279c28a41SDave Hansen 	/*
244379c28a41SDave Hansen 	 * Allocations go close to CPUs, first.  Assume that
244479c28a41SDave Hansen 	 * the migration path starts at the nodes with CPUs.
244579c28a41SDave Hansen 	 */
244679c28a41SDave Hansen 	next_pass = node_states[N_CPU];
244779c28a41SDave Hansen again:
244879c28a41SDave Hansen 	this_pass = next_pass;
244979c28a41SDave Hansen 	next_pass = NODE_MASK_NONE;
245079c28a41SDave Hansen 	/*
245179c28a41SDave Hansen 	 * To avoid cycles in the migration "graph", ensure
245279c28a41SDave Hansen 	 * that migration sources are not future targets by
245379c28a41SDave Hansen 	 * setting them in 'used_targets'.  Do this only
245479c28a41SDave Hansen 	 * once per pass so that multiple source nodes can
245579c28a41SDave Hansen 	 * share a target node.
245679c28a41SDave Hansen 	 *
245779c28a41SDave Hansen 	 * 'used_targets' will become unavailable in future
245879c28a41SDave Hansen 	 * passes.  This limits some opportunities for
245979c28a41SDave Hansen 	 * multiple source nodes to share a destination.
246079c28a41SDave Hansen 	 */
246179c28a41SDave Hansen 	nodes_or(used_targets, used_targets, this_pass);
2462ac16ec83SBaolin Wang 
246379c28a41SDave Hansen 	for_each_node_mask(node, this_pass) {
2464ac16ec83SBaolin Wang 		best_distance = -1;
2465ac16ec83SBaolin Wang 
2466ac16ec83SBaolin Wang 		/*
2467ac16ec83SBaolin Wang 		 * Try to set up the migration path for this node.  There may
2468ac16ec83SBaolin Wang 		 * be multiple target nodes, so loop until every target that
2469ac16ec83SBaolin Wang 		 * shares the best distance has been found.
2470ac16ec83SBaolin Wang 		 */
2471ac16ec83SBaolin Wang 		do {
2472ac16ec83SBaolin Wang 			int target_node =
2473ac16ec83SBaolin Wang 				establish_migrate_target(node, &used_targets,
2474ac16ec83SBaolin Wang 							 best_distance);
247579c28a41SDave Hansen 
247679c28a41SDave Hansen 			if (target_node == NUMA_NO_NODE)
2477ac16ec83SBaolin Wang 				break;
2478ac16ec83SBaolin Wang 
2479ac16ec83SBaolin Wang 			if (best_distance == -1)
2480ac16ec83SBaolin Wang 				best_distance = node_distance(node, target_node);
248179c28a41SDave Hansen 
248279c28a41SDave Hansen 			/*
248379c28a41SDave Hansen 			 * Visit targets from this pass in the next pass.
248479c28a41SDave Hansen 			 * Eventually, every node will have been part of
248579c28a41SDave Hansen 			 * a pass, and will become set in 'used_targets'.
248679c28a41SDave Hansen 			 */
248779c28a41SDave Hansen 			node_set(target_node, next_pass);
2488ac16ec83SBaolin Wang 		} while (1);
248979c28a41SDave Hansen 	}
249079c28a41SDave Hansen 	/*
249179c28a41SDave Hansen 	 * 'next_pass' contains nodes which became migration
249279c28a41SDave Hansen 	 * targets in this pass.  Make additional passes until
249379c28a41SDave Hansen 	 * no more migration targets are available.
249479c28a41SDave Hansen 	 */
249579c28a41SDave Hansen 	if (!nodes_empty(next_pass))
249679c28a41SDave Hansen 		goto again;
249779c28a41SDave Hansen }
249879c28a41SDave Hansen 
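/*
 * Editor's note, tracing the passes above for the two-socket example in
 * the node_demotion[] comment (CPUs on nodes 0 and 3):
 *
 *	pass 1: this_pass = { 0, 3 } -> targets { 1, 4 }
 *	pass 2: this_pass = { 1, 4 } -> targets { 2, 5 }
 *	pass 3: this_pass = { 2, 5 } -> no targets left, done
 *
 * Each pass adds its sources to 'used_targets' before looking for
 * targets, so no later pass can pick an earlier node and form a cycle.
 */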
249979c28a41SDave Hansen /*
250079c28a41SDave Hansen  * For callers that do not hold get_online_mems() already.
250179c28a41SDave Hansen  */
2502734c1570SOscar Salvador void set_migration_target_nodes(void)
250379c28a41SDave Hansen {
250479c28a41SDave Hansen 	get_online_mems();
250579c28a41SDave Hansen 	__set_migration_target_nodes();
250679c28a41SDave Hansen 	put_online_mems();
250779c28a41SDave Hansen }
2508884a6e5dSDave Hansen 
2509884a6e5dSDave Hansen /*
2510884a6e5dSDave Hansen  * This leaves migrate-on-reclaim transiently disabled between
2511884a6e5dSDave Hansen  * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
2512884a6e5dSDave Hansen  * whether reclaim-based migration is enabled or not, which
2513884a6e5dSDave Hansen  * ensures that the user can turn reclaim-based migration on or off
2514884a6e5dSDave Hansen  * at any time without needing to recalculate migration targets.
2515884a6e5dSDave Hansen  *
2516884a6e5dSDave Hansen  * These callbacks already hold get_online_mems().  That is why
2517884a6e5dSDave Hansen  * __set_migration_target_nodes() can be used as opposed to
2518884a6e5dSDave Hansen  * set_migration_target_nodes().
2519884a6e5dSDave Hansen  */
25207d6e2d96SOscar Salvador #ifdef CONFIG_MEMORY_HOTPLUG
2521884a6e5dSDave Hansen static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
2522295be91fSDave Hansen 						 unsigned long action, void *_arg)
2523884a6e5dSDave Hansen {
2524295be91fSDave Hansen 	struct memory_notify *arg = _arg;
2525295be91fSDave Hansen 
2526295be91fSDave Hansen 	/*
2527295be91fSDave Hansen 	 * Only update the node migration order when a node is
2528295be91fSDave Hansen 	 * changing status, like online->offline.  This avoids
2529295be91fSDave Hansen 	 * the overhead of synchronize_rcu() in most cases.
2530295be91fSDave Hansen 	 */
2531295be91fSDave Hansen 	if (arg->status_change_nid < 0)
2532295be91fSDave Hansen 		return notifier_from_errno(0);
2533295be91fSDave Hansen 
2534884a6e5dSDave Hansen 	switch (action) {
2535884a6e5dSDave Hansen 	case MEM_GOING_OFFLINE:
2536884a6e5dSDave Hansen 		/*
2537884a6e5dSDave Hansen 		 * Make sure there are not transient states where
2538884a6e5dSDave Hansen 		 * an offline node is a migration target.  This
2539884a6e5dSDave Hansen 		 * will leave migration disabled until the offline
2540884a6e5dSDave Hansen 		 * completes and the MEM_OFFLINE case below runs.
2541884a6e5dSDave Hansen 		 */
2542884a6e5dSDave Hansen 		disable_all_migrate_targets();
2543884a6e5dSDave Hansen 		break;
2544884a6e5dSDave Hansen 	case MEM_OFFLINE:
2545884a6e5dSDave Hansen 	case MEM_ONLINE:
2546884a6e5dSDave Hansen 		/*
2547884a6e5dSDave Hansen 		 * Recalculate the target nodes once the node
2548884a6e5dSDave Hansen 		 * reaches its final state (online or offline).
2549884a6e5dSDave Hansen 		 */
2550884a6e5dSDave Hansen 		__set_migration_target_nodes();
2551884a6e5dSDave Hansen 		break;
2552884a6e5dSDave Hansen 	case MEM_CANCEL_OFFLINE:
2553884a6e5dSDave Hansen 		/*
2554884a6e5dSDave Hansen 		 * MEM_GOING_OFFLINE disabled all the migration
2555884a6e5dSDave Hansen 		 * targets.  Reenable them.
2556884a6e5dSDave Hansen 		 */
2557884a6e5dSDave Hansen 		__set_migration_target_nodes();
2558884a6e5dSDave Hansen 		break;
2559884a6e5dSDave Hansen 	case MEM_GOING_ONLINE:
2560884a6e5dSDave Hansen 	case MEM_CANCEL_ONLINE:
2561884a6e5dSDave Hansen 		break;
2562884a6e5dSDave Hansen 	}
2563884a6e5dSDave Hansen 
2564884a6e5dSDave Hansen 	return notifier_from_errno(0);
2565884a6e5dSDave Hansen }
25667d6e2d96SOscar Salvador #endif
2567884a6e5dSDave Hansen 
2568734c1570SOscar Salvador void __init migrate_on_reclaim_init(void)
256976af6a05SDave Hansen {
25703f26c88bSMiaohe Lin 	node_demotion = kcalloc(nr_node_ids,
2571ac16ec83SBaolin Wang 				sizeof(struct demotion_nodes),
2572ac16ec83SBaolin Wang 				GFP_KERNEL);
2573ac16ec83SBaolin Wang 	WARN_ON(!node_demotion);
25747d6e2d96SOscar Salvador #ifdef CONFIG_MEMORY_HOTPLUG
2575884a6e5dSDave Hansen 	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
25767d6e2d96SOscar Salvador #endif
2577734c1570SOscar Salvador 	/*
2578734c1570SOscar Salvador 	 * At this point, all NUMA nodes with memory/CPUs have their state
2579734c1570SOscar Salvador 	 * properly set, so we can build the demotion order now.
2580734c1570SOscar Salvador 	 * Hold the cpu_hotplug lock just in case, as we could possibly
2581734c1570SOscar Salvador 	 * see CPU hotplug events during boot.
2582734c1570SOscar Salvador 	 */
2583734c1570SOscar Salvador 	cpus_read_lock();
2584734c1570SOscar Salvador 	set_migration_target_nodes();
2585734c1570SOscar Salvador 	cpus_read_unlock();
2586884a6e5dSDave Hansen }
258720f9ba4fSYang Shi 
258820f9ba4fSYang Shi bool numa_demotion_enabled = false;
258920f9ba4fSYang Shi 
259020f9ba4fSYang Shi #ifdef CONFIG_SYSFS
259120f9ba4fSYang Shi static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
259220f9ba4fSYang Shi 					  struct kobj_attribute *attr, char *buf)
259320f9ba4fSYang Shi {
259420f9ba4fSYang Shi 	return sysfs_emit(buf, "%s\n",
259520f9ba4fSYang Shi 			  numa_demotion_enabled ? "true" : "false");
259620f9ba4fSYang Shi }
259720f9ba4fSYang Shi 
259820f9ba4fSYang Shi static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
259920f9ba4fSYang Shi 					   struct kobj_attribute *attr,
260020f9ba4fSYang Shi 					   const char *buf, size_t count)
260120f9ba4fSYang Shi {
2602717aeab4SJagdish Gediya 	ssize_t ret;
2603717aeab4SJagdish Gediya 
2604717aeab4SJagdish Gediya 	ret = kstrtobool(buf, &numa_demotion_enabled);
2605717aeab4SJagdish Gediya 	if (ret)
2606717aeab4SJagdish Gediya 		return ret;
260720f9ba4fSYang Shi 
260820f9ba4fSYang Shi 	return count;
260920f9ba4fSYang Shi }
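/*
 * Editor's note: with CONFIG_SYSFS the attribute below appears as
 * /sys/kernel/mm/numa/demotion_enabled, so reclaim-based demotion can
 * be toggled at run time, e.g.:
 *
 *	echo true  > /sys/kernel/mm/numa/demotion_enabled
 *	echo false > /sys/kernel/mm/numa/demotion_enabled
 */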
261020f9ba4fSYang Shi 
261120f9ba4fSYang Shi static struct kobj_attribute numa_demotion_enabled_attr =
261220f9ba4fSYang Shi 	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
261320f9ba4fSYang Shi 	       numa_demotion_enabled_store);
261420f9ba4fSYang Shi 
261520f9ba4fSYang Shi static struct attribute *numa_attrs[] = {
261620f9ba4fSYang Shi 	&numa_demotion_enabled_attr.attr,
261720f9ba4fSYang Shi 	NULL,
261820f9ba4fSYang Shi };
261920f9ba4fSYang Shi 
262020f9ba4fSYang Shi static const struct attribute_group numa_attr_group = {
262120f9ba4fSYang Shi 	.attrs = numa_attrs,
262220f9ba4fSYang Shi };
262320f9ba4fSYang Shi 
262420f9ba4fSYang Shi static int __init numa_init_sysfs(void)
262520f9ba4fSYang Shi {
262620f9ba4fSYang Shi 	int err;
262720f9ba4fSYang Shi 	struct kobject *numa_kobj;
262820f9ba4fSYang Shi 
262920f9ba4fSYang Shi 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
263020f9ba4fSYang Shi 	if (!numa_kobj) {
263120f9ba4fSYang Shi 		pr_err("failed to create numa kobject\n");
263220f9ba4fSYang Shi 		return -ENOMEM;
263320f9ba4fSYang Shi 	}
263420f9ba4fSYang Shi 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
263520f9ba4fSYang Shi 	if (err) {
263620f9ba4fSYang Shi 		pr_err("failed to register numa group\n");
263720f9ba4fSYang Shi 		goto delete_obj;
263820f9ba4fSYang Shi 	}
263920f9ba4fSYang Shi 	return 0;
264020f9ba4fSYang Shi 
264120f9ba4fSYang Shi delete_obj:
264220f9ba4fSYang Shi 	kobject_put(numa_kobj);
264320f9ba4fSYang Shi 	return err;
264420f9ba4fSYang Shi }
264520f9ba4fSYang Shi subsys_initcall(numa_init_sysfs);
26467d6e2d96SOscar Salvador #endif /* CONFIG_SYSFS */
26477d6e2d96SOscar Salvador #endif /* CONFIG_NUMA */
2648