xref: /linux/mm/migrate.c (revision 5b1b561ba73c8ab9c98e5dfd14dc7ee47efb6530)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
2b20a3503SChristoph Lameter /*
314e0f9bcSHugh Dickins  * Memory Migration functionality - linux/mm/migrate.c
4b20a3503SChristoph Lameter  *
5b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
6b20a3503SChristoph Lameter  *
7b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
8b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
9b20a3503SChristoph Lameter  *
10b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
11b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
12b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
13cde53535SChristoph Lameter  * Christoph Lameter
14b20a3503SChristoph Lameter  */
15b20a3503SChristoph Lameter 
16b20a3503SChristoph Lameter #include <linux/migrate.h>
17b95f1b31SPaul Gortmaker #include <linux/export.h>
18b20a3503SChristoph Lameter #include <linux/swap.h>
190697212aSChristoph Lameter #include <linux/swapops.h>
20b20a3503SChristoph Lameter #include <linux/pagemap.h>
21e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
22b20a3503SChristoph Lameter #include <linux/mm_inline.h>
23b488893aSPavel Emelyanov #include <linux/nsproxy.h>
24b20a3503SChristoph Lameter #include <linux/pagevec.h>
25e9995ef9SHugh Dickins #include <linux/ksm.h>
26b20a3503SChristoph Lameter #include <linux/rmap.h>
27b20a3503SChristoph Lameter #include <linux/topology.h>
28b20a3503SChristoph Lameter #include <linux/cpu.h>
29b20a3503SChristoph Lameter #include <linux/cpuset.h>
3004e62a29SChristoph Lameter #include <linux/writeback.h>
31742755a1SChristoph Lameter #include <linux/mempolicy.h>
32742755a1SChristoph Lameter #include <linux/vmalloc.h>
3386c3a764SDavid Quigley #include <linux/security.h>
3442cb14b1SHugh Dickins #include <linux/backing-dev.h>
35bda807d4SMinchan Kim #include <linux/compaction.h>
364f5ca265SAdrian Bunk #include <linux/syscalls.h>
377addf443SDominik Brodowski #include <linux/compat.h>
38290408d4SNaoya Horiguchi #include <linux/hugetlb.h>
398e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
405a0e3ad6STejun Heo #include <linux/gfp.h>
41a520110eSChristoph Hellwig #include <linux/pagewalk.h>
42df6ad698SJérôme Glisse #include <linux/pfn_t.h>
43a5430ddaSJérôme Glisse #include <linux/memremap.h>
448315ada7SJérôme Glisse #include <linux/userfaultfd_k.h>
45bf6bddf1SRafael Aquini #include <linux/balloon_compaction.h>
46f714f4f2SMel Gorman #include <linux/mmu_notifier.h>
4733c3fc71SVladimir Davydov #include <linux/page_idle.h>
48d435edcaSVlastimil Babka #include <linux/page_owner.h>
496e84f315SIngo Molnar #include <linux/sched/mm.h>
50197e7e52SLinus Torvalds #include <linux/ptrace.h>
5134290e2cSRalph Campbell #include <linux/oom.h>
52884a6e5dSDave Hansen #include <linux/memory.h>
53b20a3503SChristoph Lameter 
540d1836c3SMichal Nazarewicz #include <asm/tlbflush.h>
550d1836c3SMichal Nazarewicz 
567b2a2d4aSMel Gorman #define CREATE_TRACE_POINTS
577b2a2d4aSMel Gorman #include <trace/events/migrate.h>
587b2a2d4aSMel Gorman 
59b20a3503SChristoph Lameter #include "internal.h"
60b20a3503SChristoph Lameter 
619e5bcd61SYisheng Xie int isolate_movable_page(struct page *page, isolate_mode_t mode)
62bda807d4SMinchan Kim {
63bda807d4SMinchan Kim 	struct address_space *mapping;
64bda807d4SMinchan Kim 
65bda807d4SMinchan Kim 	/*
66bda807d4SMinchan Kim 	 * Avoid burning cycles with pages that are still under __free_pages(),
67bda807d4SMinchan Kim 	 * or that just got freed under us.
68bda807d4SMinchan Kim 	 *
69bda807d4SMinchan Kim 	 * In case we 'win' a race for a movable page being freed under us and
70bda807d4SMinchan Kim 	 * raise its refcount, preventing __free_pages() from doing its job,
71bda807d4SMinchan Kim 	 * the put_page() at the end of this block will take care of
72bda807d4SMinchan Kim 	 * releasing this page, thus avoiding a nasty leak.
73bda807d4SMinchan Kim 	 */
74bda807d4SMinchan Kim 	if (unlikely(!get_page_unless_zero(page)))
75bda807d4SMinchan Kim 		goto out;
76bda807d4SMinchan Kim 
77bda807d4SMinchan Kim 	/*
78bda807d4SMinchan Kim 	 * Check PageMovable before taking the page lock, because the page's
79bda807d4SMinchan Kim 	 * owner assumes that nobody touches the PG_locked bit of a newly
808bb4e7a2SWei Yang 	 * allocated page, so grabbing the lock unconditionally would break that.
81bda807d4SMinchan Kim 	 */
82bda807d4SMinchan Kim 	if (unlikely(!__PageMovable(page)))
83bda807d4SMinchan Kim 		goto out_putpage;
84bda807d4SMinchan Kim 	/*
85bda807d4SMinchan Kim 	 * As movable pages are not isolated from LRU lists, concurrent
86bda807d4SMinchan Kim 	 * compaction threads can race against page migration functions
87bda807d4SMinchan Kim 	 * as well as against a page being released.
88bda807d4SMinchan Kim 	 *
89bda807d4SMinchan Kim 	 * In order to avoid having an already isolated movable page
90bda807d4SMinchan Kim 	 * being (wrongly) re-isolated while it is under migration,
91bda807d4SMinchan Kim 	 * or to avoid attempting to isolate pages being released,
92bda807d4SMinchan Kim 	 * let's make sure we have the page lock
93bda807d4SMinchan Kim 	 * before proceeding with the movable page isolation steps.
94bda807d4SMinchan Kim 	 */
95bda807d4SMinchan Kim 	if (unlikely(!trylock_page(page)))
96bda807d4SMinchan Kim 		goto out_putpage;
97bda807d4SMinchan Kim 
98bda807d4SMinchan Kim 	if (!PageMovable(page) || PageIsolated(page))
99bda807d4SMinchan Kim 		goto out_no_isolated;
100bda807d4SMinchan Kim 
101bda807d4SMinchan Kim 	mapping = page_mapping(page);
102bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!mapping, page);
103bda807d4SMinchan Kim 
104bda807d4SMinchan Kim 	if (!mapping->a_ops->isolate_page(page, mode))
105bda807d4SMinchan Kim 		goto out_no_isolated;
106bda807d4SMinchan Kim 
107bda807d4SMinchan Kim 	/* Driver shouldn't use PG_isolated bit of page->flags */
108bda807d4SMinchan Kim 	WARN_ON_ONCE(PageIsolated(page));
109bda807d4SMinchan Kim 	__SetPageIsolated(page);
110bda807d4SMinchan Kim 	unlock_page(page);
111bda807d4SMinchan Kim 
1129e5bcd61SYisheng Xie 	return 0;
113bda807d4SMinchan Kim 
114bda807d4SMinchan Kim out_no_isolated:
115bda807d4SMinchan Kim 	unlock_page(page);
116bda807d4SMinchan Kim out_putpage:
117bda807d4SMinchan Kim 	put_page(page);
118bda807d4SMinchan Kim out:
1199e5bcd61SYisheng Xie 	return -EBUSY;
120bda807d4SMinchan Kim }
121bda807d4SMinchan Kim 
122606a6f71SMiaohe Lin static void putback_movable_page(struct page *page)
123bda807d4SMinchan Kim {
124bda807d4SMinchan Kim 	struct address_space *mapping;
125bda807d4SMinchan Kim 
126bda807d4SMinchan Kim 	mapping = page_mapping(page);
127bda807d4SMinchan Kim 	mapping->a_ops->putback_page(page);
128bda807d4SMinchan Kim 	__ClearPageIsolated(page);
129bda807d4SMinchan Kim }
130bda807d4SMinchan Kim 
131b20a3503SChristoph Lameter /*
1325733c7d1SRafael Aquini  * Put previously isolated pages back onto the appropriate lists
1335733c7d1SRafael Aquini  * from where they were once taken off for compaction/migration.
1345733c7d1SRafael Aquini  *
13559c82b70SJoonsoo Kim  * This function shall be used whenever the isolated pageset has been
13659c82b70SJoonsoo Kim  * built from LRU, balloon, or hugetlbfs pages. See isolate_migratepages_range()
13759c82b70SJoonsoo Kim  * and isolate_huge_page().
1385733c7d1SRafael Aquini  */
1395733c7d1SRafael Aquini void putback_movable_pages(struct list_head *l)
1405733c7d1SRafael Aquini {
1415733c7d1SRafael Aquini 	struct page *page;
1425733c7d1SRafael Aquini 	struct page *page2;
1435733c7d1SRafael Aquini 
1445733c7d1SRafael Aquini 	list_for_each_entry_safe(page, page2, l, lru) {
14531caf665SNaoya Horiguchi 		if (unlikely(PageHuge(page))) {
14631caf665SNaoya Horiguchi 			putback_active_hugepage(page);
14731caf665SNaoya Horiguchi 			continue;
14831caf665SNaoya Horiguchi 		}
1495733c7d1SRafael Aquini 		list_del(&page->lru);
150bda807d4SMinchan Kim 		/*
151bda807d4SMinchan Kim 		 * We isolated a non-LRU movable page, so we can use
152bda807d4SMinchan Kim 		 * __PageMovable here because an LRU page's mapping cannot
153bda807d4SMinchan Kim 		 * have PAGE_MAPPING_MOVABLE set.
154bda807d4SMinchan Kim 		 */
155b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(page))) {
156bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
157bda807d4SMinchan Kim 			lock_page(page);
158bda807d4SMinchan Kim 			if (PageMovable(page))
159bda807d4SMinchan Kim 				putback_movable_page(page);
160bf6bddf1SRafael Aquini 			else
161bda807d4SMinchan Kim 				__ClearPageIsolated(page);
162bda807d4SMinchan Kim 			unlock_page(page);
163bda807d4SMinchan Kim 			put_page(page);
164bda807d4SMinchan Kim 		} else {
165e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1666c357848SMatthew Wilcox (Oracle) 					page_is_file_lru(page), -thp_nr_pages(page));
167fc280fe8SRabin Vincent 			putback_lru_page(page);
168b20a3503SChristoph Lameter 		}
169b20a3503SChristoph Lameter 	}
170bda807d4SMinchan Kim }
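
/*
 * Example (illustrative sketch, not part of this file): how a typical
 * caller such as compaction or memory hotplug pairs isolation with
 * putback.  The list name below is made up; isolate_movable_page(),
 * migrate_pages() and putback_movable_pages() are the real entry points:
 *
 *	LIST_HEAD(pagelist);
 *
 *	if (!isolate_movable_page(page, ISOLATE_UNEVICTABLE))
 *		list_add(&page->lru, &pagelist);	// 0 means isolated
 *
 *	// ... hand &pagelist to migrate_pages() here ...
 *
 *	// anything left on the list (e.g. on failure) must be put back
 *	if (!list_empty(&pagelist))
 *		putback_movable_pages(&pagelist);
 */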
171b20a3503SChristoph Lameter 
1720697212aSChristoph Lameter /*
1730697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
1740697212aSChristoph Lameter  */
175e4b82222SMinchan Kim static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
176e9995ef9SHugh Dickins 				 unsigned long addr, void *old)
1770697212aSChristoph Lameter {
1783fe87967SKirill A. Shutemov 	struct page_vma_mapped_walk pvmw = {
1793fe87967SKirill A. Shutemov 		.page = old,
1803fe87967SKirill A. Shutemov 		.vma = vma,
1813fe87967SKirill A. Shutemov 		.address = addr,
1823fe87967SKirill A. Shutemov 		.flags = PVMW_SYNC | PVMW_MIGRATION,
1833fe87967SKirill A. Shutemov 	};
1843fe87967SKirill A. Shutemov 	struct page *new;
1853fe87967SKirill A. Shutemov 	pte_t pte;
1860697212aSChristoph Lameter 	swp_entry_t entry;
1870697212aSChristoph Lameter 
1883fe87967SKirill A. Shutemov 	VM_BUG_ON_PAGE(PageTail(page), page);
1893fe87967SKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
1904b0ece6fSNaoya Horiguchi 		if (PageKsm(page))
1914b0ece6fSNaoya Horiguchi 			new = page;
1924b0ece6fSNaoya Horiguchi 		else
1933fe87967SKirill A. Shutemov 			new = page - pvmw.page->index +
1943fe87967SKirill A. Shutemov 				linear_page_index(vma, pvmw.address);
1950697212aSChristoph Lameter 
196616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
197616b8371SZi Yan 		/* PMD-mapped THP migration entry */
198616b8371SZi Yan 		if (!pvmw.pte) {
199616b8371SZi Yan 			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
200616b8371SZi Yan 			remove_migration_pmd(&pvmw, new);
201616b8371SZi Yan 			continue;
202616b8371SZi Yan 		}
203616b8371SZi Yan #endif
204616b8371SZi Yan 
2050697212aSChristoph Lameter 		get_page(new);
2066d2329f8SAndrea Arcangeli 		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
2073fe87967SKirill A. Shutemov 		if (pte_swp_soft_dirty(*pvmw.pte))
208c3d16e16SCyrill Gorcunov 			pte = pte_mksoft_dirty(pte);
209d3cb8bf6SMel Gorman 
2103fe87967SKirill A. Shutemov 		/*
2113fe87967SKirill A. Shutemov 		 * Recheck the VMA as permissions may have changed since migration started
2123fe87967SKirill A. Shutemov 		 */
2133fe87967SKirill A. Shutemov 		entry = pte_to_swp_entry(*pvmw.pte);
2144dd845b5SAlistair Popple 		if (is_writable_migration_entry(entry))
215d3cb8bf6SMel Gorman 			pte = maybe_mkwrite(pte, vma);
216f45ec5ffSPeter Xu 		else if (pte_swp_uffd_wp(*pvmw.pte))
217f45ec5ffSPeter Xu 			pte = pte_mkuffd_wp(pte);
218d3cb8bf6SMel Gorman 
2196128763fSRalph Campbell 		if (unlikely(is_device_private_page(new))) {
2204dd845b5SAlistair Popple 			if (pte_write(pte))
2214dd845b5SAlistair Popple 				entry = make_writable_device_private_entry(
2224dd845b5SAlistair Popple 							page_to_pfn(new));
2234dd845b5SAlistair Popple 			else
2244dd845b5SAlistair Popple 				entry = make_readable_device_private_entry(
2254dd845b5SAlistair Popple 							page_to_pfn(new));
226a5430ddaSJérôme Glisse 			pte = swp_entry_to_pte(entry);
2273d321bf8SRalph Campbell 			if (pte_swp_soft_dirty(*pvmw.pte))
2283d321bf8SRalph Campbell 				pte = pte_swp_mksoft_dirty(pte);
229f45ec5ffSPeter Xu 			if (pte_swp_uffd_wp(*pvmw.pte))
230ebdf8321SAlistair Popple 				pte = pte_swp_mkuffd_wp(pte);
231df6ad698SJérôme Glisse 		}
232a5430ddaSJérôme Glisse 
2333ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE
234be7517d6STony Lu 		if (PageHuge(new)) {
23579c1c594SChristophe Leroy 			unsigned int shift = huge_page_shift(hstate_vma(vma));
23679c1c594SChristophe Leroy 
237290408d4SNaoya Horiguchi 			pte = pte_mkhuge(pte);
23879c1c594SChristophe Leroy 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
239383321abSAneesh Kumar K.V 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
24004e62a29SChristoph Lameter 			if (PageAnon(new))
2413fe87967SKirill A. Shutemov 				hugepage_add_anon_rmap(new, vma, pvmw.address);
242290408d4SNaoya Horiguchi 			else
24353f9263bSKirill A. Shutemov 				page_dup_rmap(new, true);
244383321abSAneesh Kumar K.V 		} else
245383321abSAneesh Kumar K.V #endif
246383321abSAneesh Kumar K.V 		{
247383321abSAneesh Kumar K.V 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
248383321abSAneesh Kumar K.V 
249383321abSAneesh Kumar K.V 			if (PageAnon(new))
2503fe87967SKirill A. Shutemov 				page_add_anon_rmap(new, vma, pvmw.address, false);
25104e62a29SChristoph Lameter 			else
252dd78feddSKirill A. Shutemov 				page_add_file_rmap(new, false);
253383321abSAneesh Kumar K.V 		}
254e388466dSKirill A. Shutemov 		if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
25551afb12bSHugh Dickins 			mlock_vma_page(new);
25651afb12bSHugh Dickins 
257e125fe40SKirill A. Shutemov 		if (PageTransHuge(page) && PageMlocked(page))
258e125fe40SKirill A. Shutemov 			clear_page_mlock(page);
259e125fe40SKirill A. Shutemov 
26004e62a29SChristoph Lameter 		/* No need to invalidate - it was non-present before */
2613fe87967SKirill A. Shutemov 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
2623fe87967SKirill A. Shutemov 	}
2633fe87967SKirill A. Shutemov 
264e4b82222SMinchan Kim 	return true;
2650697212aSChristoph Lameter }
2660697212aSChristoph Lameter 
2670697212aSChristoph Lameter /*
26804e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
26904e62a29SChristoph Lameter  * references to the indicated page.
27004e62a29SChristoph Lameter  */
271e388466dSKirill A. Shutemov void remove_migration_ptes(struct page *old, struct page *new, bool locked)
27204e62a29SChristoph Lameter {
273051ac83aSJoonsoo Kim 	struct rmap_walk_control rwc = {
274051ac83aSJoonsoo Kim 		.rmap_one = remove_migration_pte,
275051ac83aSJoonsoo Kim 		.arg = old,
276051ac83aSJoonsoo Kim 	};
277051ac83aSJoonsoo Kim 
278e388466dSKirill A. Shutemov 	if (locked)
279e388466dSKirill A. Shutemov 		rmap_walk_locked(new, &rwc);
280e388466dSKirill A. Shutemov 	else
281051ac83aSJoonsoo Kim 		rmap_walk(new, &rwc);
28204e62a29SChristoph Lameter }
28304e62a29SChristoph Lameter 
28404e62a29SChristoph Lameter /*
2850697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
2860697212aSChristoph Lameter  * get to the page and wait until migration is finished.
2870697212aSChristoph Lameter  * When we return from this function the fault will be retried.
2880697212aSChristoph Lameter  */
289e66f17ffSNaoya Horiguchi void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
29030dad309SNaoya Horiguchi 				spinlock_t *ptl)
2910697212aSChristoph Lameter {
29230dad309SNaoya Horiguchi 	pte_t pte;
2930697212aSChristoph Lameter 	swp_entry_t entry;
2940697212aSChristoph Lameter 	struct page *page;
2950697212aSChristoph Lameter 
29630dad309SNaoya Horiguchi 	spin_lock(ptl);
2970697212aSChristoph Lameter 	pte = *ptep;
2980697212aSChristoph Lameter 	if (!is_swap_pte(pte))
2990697212aSChristoph Lameter 		goto out;
3000697212aSChristoph Lameter 
3010697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
3020697212aSChristoph Lameter 	if (!is_migration_entry(entry))
3030697212aSChristoph Lameter 		goto out;
3040697212aSChristoph Lameter 
305af5cdaf8SAlistair Popple 	page = pfn_swap_entry_to_page(entry);
306ffc90cbbSXu Yu 	page = compound_head(page);
3070697212aSChristoph Lameter 
308e286781dSNick Piggin 	/*
30989eb946aSMatthew Wilcox 	 * Once the page cache replacement step of page migration has started,
3109a1ea439SHugh Dickins 	 * page_count is zero; but we must not call put_and_wait_on_page_locked()
3119a1ea439SHugh Dickins 	 * without a ref. Use get_page_unless_zero(), and just fault again if it fails.
312e286781dSNick Piggin 	 */
313e286781dSNick Piggin 	if (!get_page_unless_zero(page))
314e286781dSNick Piggin 		goto out;
3150697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
31648054625SMatthew Wilcox (Oracle) 	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
3170697212aSChristoph Lameter 	return;
3180697212aSChristoph Lameter out:
3190697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3200697212aSChristoph Lameter }
3210697212aSChristoph Lameter 
32230dad309SNaoya Horiguchi void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
32330dad309SNaoya Horiguchi 				unsigned long address)
32430dad309SNaoya Horiguchi {
32530dad309SNaoya Horiguchi 	spinlock_t *ptl = pte_lockptr(mm, pmd);
32630dad309SNaoya Horiguchi 	pte_t *ptep = pte_offset_map(pmd, address);
32730dad309SNaoya Horiguchi 	__migration_entry_wait(mm, ptep, ptl);
32830dad309SNaoya Horiguchi }
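
/*
 * Example (illustrative sketch, not part of this file): the main caller
 * is the fault path.  When do_swap_page() in mm/memory.c finds a
 * migration entry it waits here and then lets the fault be retried,
 * roughly:
 *
 *	entry = pte_to_swp_entry(vmf->orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address);
 *		return 0;	// the fault is simply retried afterwards
 *	}
 */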
32930dad309SNaoya Horiguchi 
330cb900f41SKirill A. Shutemov void migration_entry_wait_huge(struct vm_area_struct *vma,
331cb900f41SKirill A. Shutemov 		struct mm_struct *mm, pte_t *pte)
33230dad309SNaoya Horiguchi {
333cb900f41SKirill A. Shutemov 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
33430dad309SNaoya Horiguchi 	__migration_entry_wait(mm, pte, ptl);
33530dad309SNaoya Horiguchi }
33630dad309SNaoya Horiguchi 
337616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
338616b8371SZi Yan void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
339616b8371SZi Yan {
340616b8371SZi Yan 	spinlock_t *ptl;
341616b8371SZi Yan 	struct page *page;
342616b8371SZi Yan 
343616b8371SZi Yan 	ptl = pmd_lock(mm, pmd);
344616b8371SZi Yan 	if (!is_pmd_migration_entry(*pmd))
345616b8371SZi Yan 		goto unlock;
346af5cdaf8SAlistair Popple 	page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
347616b8371SZi Yan 	if (!get_page_unless_zero(page))
348616b8371SZi Yan 		goto unlock;
349616b8371SZi Yan 	spin_unlock(ptl);
35048054625SMatthew Wilcox (Oracle) 	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
351616b8371SZi Yan 	return;
352616b8371SZi Yan unlock:
353616b8371SZi Yan 	spin_unlock(ptl);
354616b8371SZi Yan }
355616b8371SZi Yan #endif
356616b8371SZi Yan 
357f900482dSJan Kara static int expected_page_refs(struct address_space *mapping, struct page *page)
3580b3901b3SJan Kara {
3590b3901b3SJan Kara 	int expected_count = 1;
3600b3901b3SJan Kara 
3610b3901b3SJan Kara 	/*
362f1f4f3abSRalph Campbell 	 * Device private pages have an extra refcount as they are
3630b3901b3SJan Kara 	 * ZONE_DEVICE pages.
3640b3901b3SJan Kara 	 */
3650b3901b3SJan Kara 	expected_count += is_device_private_page(page);
366f900482dSJan Kara 	if (mapping)
3676c357848SMatthew Wilcox (Oracle) 		expected_count += thp_nr_pages(page) + page_has_private(page);
3680b3901b3SJan Kara 
3690b3901b3SJan Kara 	return expected_count;
3700b3901b3SJan Kara }
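
/*
 * Worked example (illustrative): for an order-0 page cache page with
 * buffer heads attached, this gives
 *
 *	expected_count = 1			// ref held by the migration caller
 *			+ 0			// not a device private page
 *			+ thp_nr_pages(page)	// 1 page cache reference
 *			+ page_has_private(page)	// 1 for the buffers
 *		       = 3
 *
 * which matches the "3 for pages with a mapping and PagePrivate/PagePrivate2
 * set" case documented below.  An anonymous page with no mapping is just 1.
 */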
3710b3901b3SJan Kara 
372b20a3503SChristoph Lameter /*
373c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
3745b5c7120SChristoph Lameter  *
3755b5c7120SChristoph Lameter  * The number of remaining references must be:
3765b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
3775b5c7120SChristoph Lameter  * 2 for pages with a mapping
378266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
379b20a3503SChristoph Lameter  */
38036bc08ccSGu Zheng int migrate_page_move_mapping(struct address_space *mapping,
38137109694SKeith Busch 		struct page *newpage, struct page *page, int extra_count)
382b20a3503SChristoph Lameter {
38389eb946aSMatthew Wilcox 	XA_STATE(xas, &mapping->i_pages, page_index(page));
38442cb14b1SHugh Dickins 	struct zone *oldzone, *newzone;
38542cb14b1SHugh Dickins 	int dirty;
386f900482dSJan Kara 	int expected_count = expected_page_refs(mapping, page) + extra_count;
3875c447d27SShakeel Butt 	int nr = thp_nr_pages(page);
3888763cb45SJérôme Glisse 
3896c5240aeSChristoph Lameter 	if (!mapping) {
3900e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
3918e321fefSBenjamin LaHaise 		if (page_count(page) != expected_count)
3926c5240aeSChristoph Lameter 			return -EAGAIN;
393cf4b769aSHugh Dickins 
394cf4b769aSHugh Dickins 		/* No turning back from here */
395cf4b769aSHugh Dickins 		newpage->index = page->index;
396cf4b769aSHugh Dickins 		newpage->mapping = page->mapping;
397cf4b769aSHugh Dickins 		if (PageSwapBacked(page))
398fa9949daSHugh Dickins 			__SetPageSwapBacked(newpage);
399cf4b769aSHugh Dickins 
40078bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
4016c5240aeSChristoph Lameter 	}
4026c5240aeSChristoph Lameter 
40342cb14b1SHugh Dickins 	oldzone = page_zone(page);
40442cb14b1SHugh Dickins 	newzone = page_zone(newpage);
40542cb14b1SHugh Dickins 
40689eb946aSMatthew Wilcox 	xas_lock_irq(&xas);
40789eb946aSMatthew Wilcox 	if (page_count(page) != expected_count || xas_load(&xas) != page) {
40889eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
409e23ca00bSChristoph Lameter 		return -EAGAIN;
410b20a3503SChristoph Lameter 	}
411b20a3503SChristoph Lameter 
412fe896d18SJoonsoo Kim 	if (!page_ref_freeze(page, expected_count)) {
41389eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
414e286781dSNick Piggin 		return -EAGAIN;
415e286781dSNick Piggin 	}
416e286781dSNick Piggin 
417b20a3503SChristoph Lameter 	/*
418cf4b769aSHugh Dickins 	 * Now we know that no one else is looking at the page:
419cf4b769aSHugh Dickins 	 * no turning back from here.
420b20a3503SChristoph Lameter 	 */
421cf4b769aSHugh Dickins 	newpage->index = page->index;
422cf4b769aSHugh Dickins 	newpage->mapping = page->mapping;
4235c447d27SShakeel Butt 	page_ref_add(newpage, nr); /* add cache reference */
4246326fec1SNicholas Piggin 	if (PageSwapBacked(page)) {
4256326fec1SNicholas Piggin 		__SetPageSwapBacked(newpage);
426b20a3503SChristoph Lameter 		if (PageSwapCache(page)) {
427b20a3503SChristoph Lameter 			SetPageSwapCache(newpage);
428b20a3503SChristoph Lameter 			set_page_private(newpage, page_private(page));
429b20a3503SChristoph Lameter 		}
4306326fec1SNicholas Piggin 	} else {
4316326fec1SNicholas Piggin 		VM_BUG_ON_PAGE(PageSwapCache(page), page);
4326326fec1SNicholas Piggin 	}
433b20a3503SChristoph Lameter 
43442cb14b1SHugh Dickins 	/* Move dirty while page refs frozen and newpage not yet exposed */
43542cb14b1SHugh Dickins 	dirty = PageDirty(page);
43642cb14b1SHugh Dickins 	if (dirty) {
43742cb14b1SHugh Dickins 		ClearPageDirty(page);
43842cb14b1SHugh Dickins 		SetPageDirty(newpage);
43942cb14b1SHugh Dickins 	}
44042cb14b1SHugh Dickins 
44189eb946aSMatthew Wilcox 	xas_store(&xas, newpage);
442e71769aeSNaoya Horiguchi 	if (PageTransHuge(page)) {
443e71769aeSNaoya Horiguchi 		int i;
444e71769aeSNaoya Horiguchi 
4455c447d27SShakeel Butt 		for (i = 1; i < nr; i++) {
44689eb946aSMatthew Wilcox 			xas_next(&xas);
4474101196bSMatthew Wilcox (Oracle) 			xas_store(&xas, newpage);
448e71769aeSNaoya Horiguchi 		}
449e71769aeSNaoya Horiguchi 	}
4507cf9c2c7SNick Piggin 
4517cf9c2c7SNick Piggin 	/*
452937a94c9SJacobo Giralt 	 * Drop cache reference from old page by unfreezing
453937a94c9SJacobo Giralt 	 * to one less reference.
4547cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
4557cf9c2c7SNick Piggin 	 */
4565c447d27SShakeel Butt 	page_ref_unfreeze(page, expected_count - nr);
4577cf9c2c7SNick Piggin 
45889eb946aSMatthew Wilcox 	xas_unlock(&xas);
45942cb14b1SHugh Dickins 	/* Leave irq disabled to prevent preemption while updating stats */
46042cb14b1SHugh Dickins 
4610e8c7d0fSChristoph Lameter 	/*
4620e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
4630e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
4640e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
4650e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
4660e8c7d0fSChristoph Lameter 	 *
4670e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
4684b9d0fabSMel Gorman 	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
4690e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
4700e8c7d0fSChristoph Lameter 	 */
47142cb14b1SHugh Dickins 	if (newzone != oldzone) {
4720d1c2072SJohannes Weiner 		struct lruvec *old_lruvec, *new_lruvec;
4730d1c2072SJohannes Weiner 		struct mem_cgroup *memcg;
4740d1c2072SJohannes Weiner 
4750d1c2072SJohannes Weiner 		memcg = page_memcg(page);
4760d1c2072SJohannes Weiner 		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
4770d1c2072SJohannes Weiner 		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
4780d1c2072SJohannes Weiner 
4795c447d27SShakeel Butt 		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
4805c447d27SShakeel Butt 		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
48142cb14b1SHugh Dickins 		if (PageSwapBacked(page) && !PageSwapCache(page)) {
4825c447d27SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
4835c447d27SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
4844b02108aSKOSAKI Motohiro 		}
485b6038942SShakeel Butt #ifdef CONFIG_SWAP
486b6038942SShakeel Butt 		if (PageSwapCache(page)) {
487b6038942SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
488b6038942SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
489b6038942SShakeel Butt 		}
490b6038942SShakeel Butt #endif
491f56753acSChristoph Hellwig 		if (dirty && mapping_can_writeback(mapping)) {
4925c447d27SShakeel Butt 			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
4935c447d27SShakeel Butt 			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
4945c447d27SShakeel Butt 			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
4955c447d27SShakeel Butt 			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
49642cb14b1SHugh Dickins 		}
49742cb14b1SHugh Dickins 	}
49842cb14b1SHugh Dickins 	local_irq_enable();
499b20a3503SChristoph Lameter 
50078bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
501b20a3503SChristoph Lameter }
5021118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_move_mapping);
503b20a3503SChristoph Lameter 
504b20a3503SChristoph Lameter /*
505290408d4SNaoya Horiguchi  * The expected number of remaining references is the same as that
506290408d4SNaoya Horiguchi  * of migrate_page_move_mapping().
507290408d4SNaoya Horiguchi  */
508290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping,
509290408d4SNaoya Horiguchi 				   struct page *newpage, struct page *page)
510290408d4SNaoya Horiguchi {
51189eb946aSMatthew Wilcox 	XA_STATE(xas, &mapping->i_pages, page_index(page));
512290408d4SNaoya Horiguchi 	int expected_count;
513290408d4SNaoya Horiguchi 
51489eb946aSMatthew Wilcox 	xas_lock_irq(&xas);
515290408d4SNaoya Horiguchi 	expected_count = 2 + page_has_private(page);
51689eb946aSMatthew Wilcox 	if (page_count(page) != expected_count || xas_load(&xas) != page) {
51789eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
518290408d4SNaoya Horiguchi 		return -EAGAIN;
519290408d4SNaoya Horiguchi 	}
520290408d4SNaoya Horiguchi 
521fe896d18SJoonsoo Kim 	if (!page_ref_freeze(page, expected_count)) {
52289eb946aSMatthew Wilcox 		xas_unlock_irq(&xas);
523290408d4SNaoya Horiguchi 		return -EAGAIN;
524290408d4SNaoya Horiguchi 	}
525290408d4SNaoya Horiguchi 
526cf4b769aSHugh Dickins 	newpage->index = page->index;
527cf4b769aSHugh Dickins 	newpage->mapping = page->mapping;
5286a93ca8fSJohannes Weiner 
529290408d4SNaoya Horiguchi 	get_page(newpage);
530290408d4SNaoya Horiguchi 
53189eb946aSMatthew Wilcox 	xas_store(&xas, newpage);
532290408d4SNaoya Horiguchi 
533fe896d18SJoonsoo Kim 	page_ref_unfreeze(page, expected_count - 1);
534290408d4SNaoya Horiguchi 
53589eb946aSMatthew Wilcox 	xas_unlock_irq(&xas);
5366a93ca8fSJohannes Weiner 
53778bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
538290408d4SNaoya Horiguchi }
539290408d4SNaoya Horiguchi 
540290408d4SNaoya Horiguchi /*
541b20a3503SChristoph Lameter  * Copy the page to its new location
542b20a3503SChristoph Lameter  */
5432916ecc0SJérôme Glisse void migrate_page_states(struct page *newpage, struct page *page)
544b20a3503SChristoph Lameter {
5457851a45cSRik van Riel 	int cpupid;
5467851a45cSRik van Riel 
547b20a3503SChristoph Lameter 	if (PageError(page))
548b20a3503SChristoph Lameter 		SetPageError(newpage);
549b20a3503SChristoph Lameter 	if (PageReferenced(page))
550b20a3503SChristoph Lameter 		SetPageReferenced(newpage);
551b20a3503SChristoph Lameter 	if (PageUptodate(page))
552b20a3503SChristoph Lameter 		SetPageUptodate(newpage);
553894bc310SLee Schermerhorn 	if (TestClearPageActive(page)) {
554309381feSSasha Levin 		VM_BUG_ON_PAGE(PageUnevictable(page), page);
555b20a3503SChristoph Lameter 		SetPageActive(newpage);
556418b27efSLee Schermerhorn 	} else if (TestClearPageUnevictable(page))
557418b27efSLee Schermerhorn 		SetPageUnevictable(newpage);
5581899ad18SJohannes Weiner 	if (PageWorkingset(page))
5591899ad18SJohannes Weiner 		SetPageWorkingset(newpage);
560b20a3503SChristoph Lameter 	if (PageChecked(page))
561b20a3503SChristoph Lameter 		SetPageChecked(newpage);
562b20a3503SChristoph Lameter 	if (PageMappedToDisk(page))
563b20a3503SChristoph Lameter 		SetPageMappedToDisk(newpage);
564b20a3503SChristoph Lameter 
56542cb14b1SHugh Dickins 	/* Move the dirty flag for pages not handled by migrate_page_move_mapping() */
56642cb14b1SHugh Dickins 	if (PageDirty(page))
567752dc185SHugh Dickins 		SetPageDirty(newpage);
568b20a3503SChristoph Lameter 
56933c3fc71SVladimir Davydov 	if (page_is_young(page))
57033c3fc71SVladimir Davydov 		set_page_young(newpage);
57133c3fc71SVladimir Davydov 	if (page_is_idle(page))
57233c3fc71SVladimir Davydov 		set_page_idle(newpage);
57333c3fc71SVladimir Davydov 
5747851a45cSRik van Riel 	/*
5757851a45cSRik van Riel 	 * Copy NUMA information to the new page, to prevent over-eager
5767851a45cSRik van Riel 	 * future migrations of this same page.
5777851a45cSRik van Riel 	 */
5787851a45cSRik van Riel 	cpupid = page_cpupid_xchg_last(page, -1);
5797851a45cSRik van Riel 	page_cpupid_xchg_last(newpage, cpupid);
5807851a45cSRik van Riel 
581e9995ef9SHugh Dickins 	ksm_migrate_page(newpage, page);
582c8d6553bSHugh Dickins 	/*
583c8d6553bSHugh Dickins 	 * Please do not reorder this without considering how mm/ksm.c's
584c8d6553bSHugh Dickins 	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
585c8d6553bSHugh Dickins 	 */
586b3b3a99cSNaoya Horiguchi 	if (PageSwapCache(page))
587b20a3503SChristoph Lameter 		ClearPageSwapCache(page);
588b20a3503SChristoph Lameter 	ClearPagePrivate(page);
589ad2fa371SMuchun Song 
590ad2fa371SMuchun Song 	/* page->private contains hugetlb specific flags */
591ad2fa371SMuchun Song 	if (!PageHuge(page))
592b20a3503SChristoph Lameter 		set_page_private(page, 0);
593b20a3503SChristoph Lameter 
594b20a3503SChristoph Lameter 	/*
595b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
596b20a3503SChristoph Lameter 	 * wake them up.
597b20a3503SChristoph Lameter 	 */
598b20a3503SChristoph Lameter 	if (PageWriteback(newpage))
599b20a3503SChristoph Lameter 		end_page_writeback(newpage);
600d435edcaSVlastimil Babka 
6016aeff241SYang Shi 	/*
6026aeff241SYang Shi 	 * PG_readahead shares the same bit with PG_reclaim.  The above
6036aeff241SYang Shi 	 * end_page_writeback() may clear PG_readahead mistakenly, so set the
6046aeff241SYang Shi 	 * bit after that.
6056aeff241SYang Shi 	 */
6066aeff241SYang Shi 	if (PageReadahead(page))
6076aeff241SYang Shi 		SetPageReadahead(newpage);
6086aeff241SYang Shi 
609d435edcaSVlastimil Babka 	copy_page_owner(page, newpage);
61074485cf2SJohannes Weiner 
611a333e3e7SHugh Dickins 	if (!PageHuge(page))
61274485cf2SJohannes Weiner 		mem_cgroup_migrate(page, newpage);
613b20a3503SChristoph Lameter }
6142916ecc0SJérôme Glisse EXPORT_SYMBOL(migrate_page_states);
6152916ecc0SJérôme Glisse 
6162916ecc0SJérôme Glisse void migrate_page_copy(struct page *newpage, struct page *page)
6172916ecc0SJérôme Glisse {
6182916ecc0SJérôme Glisse 	if (PageHuge(page) || PageTransHuge(page))
6192916ecc0SJérôme Glisse 		copy_huge_page(newpage, page);
6202916ecc0SJérôme Glisse 	else
6212916ecc0SJérôme Glisse 		copy_highpage(newpage, page);
6222916ecc0SJérôme Glisse 
6232916ecc0SJérôme Glisse 	migrate_page_states(newpage, page);
6242916ecc0SJérôme Glisse }
6251118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_copy);
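
/*
 * Example (illustrative sketch, not part of this file): a filesystem with
 * page-private state can build its ->migratepage() callback from the two
 * exported helpers above; the name example_migratepage() is hypothetical:
 *
 *	static int example_migratepage(struct address_space *mapping,
 *			struct page *newpage, struct page *page,
 *			enum migrate_mode mode)
 *	{
 *		int rc;
 *
 *		rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 *		if (rc != MIGRATEPAGE_SUCCESS)
 *			return rc;
 *
 *		// transfer any fs-private state from page to newpage here
 *
 *		migrate_page_copy(newpage, page);
 *		return MIGRATEPAGE_SUCCESS;
 *	}
 */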
626b20a3503SChristoph Lameter 
6271d8b85ccSChristoph Lameter /************************************************************
6281d8b85ccSChristoph Lameter  *                    Migration functions
6291d8b85ccSChristoph Lameter  ***********************************************************/
6301d8b85ccSChristoph Lameter 
631b20a3503SChristoph Lameter /*
632bda807d4SMinchan Kim  * Common logic to directly migrate a single LRU page suitable for
633266cf658SDavid Howells  * pages that do not use PagePrivate/PagePrivate2.
634b20a3503SChristoph Lameter  *
635b20a3503SChristoph Lameter  * Pages are locked upon entry and exit.
636b20a3503SChristoph Lameter  */
6372d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping,
638a6bc32b8SMel Gorman 		struct page *newpage, struct page *page,
639a6bc32b8SMel Gorman 		enum migrate_mode mode)
640b20a3503SChristoph Lameter {
641b20a3503SChristoph Lameter 	int rc;
642b20a3503SChristoph Lameter 
643b20a3503SChristoph Lameter 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
644b20a3503SChristoph Lameter 
64537109694SKeith Busch 	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
646b20a3503SChristoph Lameter 
64778bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
648b20a3503SChristoph Lameter 		return rc;
649b20a3503SChristoph Lameter 
6502916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
651b20a3503SChristoph Lameter 		migrate_page_copy(newpage, page);
6522916ecc0SJérôme Glisse 	else
6532916ecc0SJérôme Glisse 		migrate_page_states(newpage, page);
65478bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
655b20a3503SChristoph Lameter }
656b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page);
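
/*
 * Example (illustrative sketch, not part of this file): mappings that keep
 * no page-private state can point their address_space_operations straight
 * at migrate_page(); example_aops is a hypothetical name:
 *
 *	static const struct address_space_operations example_aops = {
 *		...
 *		.migratepage	= migrate_page,
 *	};
 */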
657b20a3503SChristoph Lameter 
6589361401eSDavid Howells #ifdef CONFIG_BLOCK
65984ade7c1SJan Kara /* Returns true if all buffers are successfully locked */
66084ade7c1SJan Kara static bool buffer_migrate_lock_buffers(struct buffer_head *head,
66184ade7c1SJan Kara 							enum migrate_mode mode)
66284ade7c1SJan Kara {
66384ade7c1SJan Kara 	struct buffer_head *bh = head;
66484ade7c1SJan Kara 
66584ade7c1SJan Kara 	/* Simple case, sync compaction */
66684ade7c1SJan Kara 	if (mode != MIGRATE_ASYNC) {
66784ade7c1SJan Kara 		do {
66884ade7c1SJan Kara 			lock_buffer(bh);
66984ade7c1SJan Kara 			bh = bh->b_this_page;
67084ade7c1SJan Kara 
67184ade7c1SJan Kara 		} while (bh != head);
67284ade7c1SJan Kara 
67384ade7c1SJan Kara 		return true;
67484ade7c1SJan Kara 	}
67584ade7c1SJan Kara 
67684ade7c1SJan Kara 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
67784ade7c1SJan Kara 	do {
67884ade7c1SJan Kara 		if (!trylock_buffer(bh)) {
67984ade7c1SJan Kara 			/*
68084ade7c1SJan Kara 			 * We failed to lock the buffer and cannot stall in
68184ade7c1SJan Kara 			 * async migration. Release the taken locks
68284ade7c1SJan Kara 			 */
68384ade7c1SJan Kara 			struct buffer_head *failed_bh = bh;
68484ade7c1SJan Kara 			bh = head;
68584ade7c1SJan Kara 			while (bh != failed_bh) {
68684ade7c1SJan Kara 				unlock_buffer(bh);
68784ade7c1SJan Kara 				bh = bh->b_this_page;
68884ade7c1SJan Kara 			}
68984ade7c1SJan Kara 			return false;
69084ade7c1SJan Kara 		}
69184ade7c1SJan Kara 
69284ade7c1SJan Kara 		bh = bh->b_this_page;
69384ade7c1SJan Kara 	} while (bh != head);
69484ade7c1SJan Kara 	return true;
69584ade7c1SJan Kara }
69684ade7c1SJan Kara 
69789cb0888SJan Kara static int __buffer_migrate_page(struct address_space *mapping,
69889cb0888SJan Kara 		struct page *newpage, struct page *page, enum migrate_mode mode,
69989cb0888SJan Kara 		bool check_refs)
7001d8b85ccSChristoph Lameter {
7011d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
7021d8b85ccSChristoph Lameter 	int rc;
703cc4f11e6SJan Kara 	int expected_count;
7041d8b85ccSChristoph Lameter 
7051d8b85ccSChristoph Lameter 	if (!page_has_buffers(page))
706a6bc32b8SMel Gorman 		return migrate_page(mapping, newpage, page, mode);
7071d8b85ccSChristoph Lameter 
708cc4f11e6SJan Kara 	/* Check that the page does not have extra refs before we do more work */
709f900482dSJan Kara 	expected_count = expected_page_refs(mapping, page);
710cc4f11e6SJan Kara 	if (page_count(page) != expected_count)
711cc4f11e6SJan Kara 		return -EAGAIN;
712cc4f11e6SJan Kara 
7131d8b85ccSChristoph Lameter 	head = page_buffers(page);
714cc4f11e6SJan Kara 	if (!buffer_migrate_lock_buffers(head, mode))
715cc4f11e6SJan Kara 		return -EAGAIN;
7161d8b85ccSChristoph Lameter 
71789cb0888SJan Kara 	if (check_refs) {
71889cb0888SJan Kara 		bool busy;
71989cb0888SJan Kara 		bool invalidated = false;
72089cb0888SJan Kara 
72189cb0888SJan Kara recheck_buffers:
72289cb0888SJan Kara 		busy = false;
72389cb0888SJan Kara 		spin_lock(&mapping->private_lock);
72489cb0888SJan Kara 		bh = head;
72589cb0888SJan Kara 		do {
72689cb0888SJan Kara 			if (atomic_read(&bh->b_count)) {
72789cb0888SJan Kara 				busy = true;
72889cb0888SJan Kara 				break;
72989cb0888SJan Kara 			}
73089cb0888SJan Kara 			bh = bh->b_this_page;
73189cb0888SJan Kara 		} while (bh != head);
73289cb0888SJan Kara 		if (busy) {
73389cb0888SJan Kara 			if (invalidated) {
73489cb0888SJan Kara 				rc = -EAGAIN;
73589cb0888SJan Kara 				goto unlock_buffers;
73689cb0888SJan Kara 			}
737ebdf4de5SJan Kara 			spin_unlock(&mapping->private_lock);
73889cb0888SJan Kara 			invalidate_bh_lrus();
73989cb0888SJan Kara 			invalidated = true;
74089cb0888SJan Kara 			goto recheck_buffers;
74189cb0888SJan Kara 		}
74289cb0888SJan Kara 	}
74389cb0888SJan Kara 
74437109694SKeith Busch 	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
74578bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
746cc4f11e6SJan Kara 		goto unlock_buffers;
7471d8b85ccSChristoph Lameter 
748cd0f3715SGuoqing Jiang 	attach_page_private(newpage, detach_page_private(page));
7491d8b85ccSChristoph Lameter 
7501d8b85ccSChristoph Lameter 	bh = head;
7511d8b85ccSChristoph Lameter 	do {
7521d8b85ccSChristoph Lameter 		set_bh_page(bh, newpage, bh_offset(bh));
7531d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7541d8b85ccSChristoph Lameter 
7551d8b85ccSChristoph Lameter 	} while (bh != head);
7561d8b85ccSChristoph Lameter 
7572916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
7581d8b85ccSChristoph Lameter 		migrate_page_copy(newpage, page);
7592916ecc0SJérôme Glisse 	else
7602916ecc0SJérôme Glisse 		migrate_page_states(newpage, page);
7611d8b85ccSChristoph Lameter 
762cc4f11e6SJan Kara 	rc = MIGRATEPAGE_SUCCESS;
763cc4f11e6SJan Kara unlock_buffers:
764ebdf4de5SJan Kara 	if (check_refs)
765ebdf4de5SJan Kara 		spin_unlock(&mapping->private_lock);
7661d8b85ccSChristoph Lameter 	bh = head;
7671d8b85ccSChristoph Lameter 	do {
7681d8b85ccSChristoph Lameter 		unlock_buffer(bh);
7691d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7701d8b85ccSChristoph Lameter 
7711d8b85ccSChristoph Lameter 	} while (bh != head);
7721d8b85ccSChristoph Lameter 
773cc4f11e6SJan Kara 	return rc;
7741d8b85ccSChristoph Lameter }
77589cb0888SJan Kara 
77689cb0888SJan Kara /*
77789cb0888SJan Kara  * Migration function for pages with buffers. This function can only be used
77889cb0888SJan Kara  * if the underlying filesystem guarantees that no other references to "page"
77989cb0888SJan Kara  * exist. For example, attached buffer heads are accessed only under the page lock.
78089cb0888SJan Kara  */
78189cb0888SJan Kara int buffer_migrate_page(struct address_space *mapping,
78289cb0888SJan Kara 		struct page *newpage, struct page *page, enum migrate_mode mode)
78389cb0888SJan Kara {
78489cb0888SJan Kara 	return __buffer_migrate_page(mapping, newpage, page, mode, false);
78589cb0888SJan Kara }
7861d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page);
78789cb0888SJan Kara 
78889cb0888SJan Kara /*
78989cb0888SJan Kara  * Same as above except that this variant is more careful and checks that there
79089cb0888SJan Kara  * are also no buffer head references. This function is the right one for
79189cb0888SJan Kara  * mappings where buffer heads are directly looked up and referenced (such as
79289cb0888SJan Kara  * block device mappings).
79389cb0888SJan Kara  */
79489cb0888SJan Kara int buffer_migrate_page_norefs(struct address_space *mapping,
79589cb0888SJan Kara 		struct page *newpage, struct page *page, enum migrate_mode mode)
79689cb0888SJan Kara {
79789cb0888SJan Kara 	return __buffer_migrate_page(mapping, newpage, page, mode, true);
79889cb0888SJan Kara }
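
/*
 * Example (illustrative sketch, not part of this file): choosing between
 * the two buffer_head-aware variants above when wiring up an
 * address_space_operations:
 *
 *	.migratepage = buffer_migrate_page,	   // buffers only touched
 *						   // under the page lock
 *	.migratepage = buffer_migrate_page_norefs, // buffers also looked up
 *						   // directly, e.g. the block
 *						   // device mapping
 */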
7999361401eSDavid Howells #endif
8001d8b85ccSChristoph Lameter 
80104e62a29SChristoph Lameter /*
80204e62a29SChristoph Lameter  * Writeback a page to clean the dirty state
80304e62a29SChristoph Lameter  */
80404e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page)
80504e62a29SChristoph Lameter {
80604e62a29SChristoph Lameter 	struct writeback_control wbc = {
80704e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
80804e62a29SChristoph Lameter 		.nr_to_write = 1,
80904e62a29SChristoph Lameter 		.range_start = 0,
81004e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
81104e62a29SChristoph Lameter 		.for_reclaim = 1
81204e62a29SChristoph Lameter 	};
81304e62a29SChristoph Lameter 	int rc;
81404e62a29SChristoph Lameter 
81504e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
81604e62a29SChristoph Lameter 		/* No write method for the address space */
81704e62a29SChristoph Lameter 		return -EINVAL;
81804e62a29SChristoph Lameter 
81904e62a29SChristoph Lameter 	if (!clear_page_dirty_for_io(page))
82004e62a29SChristoph Lameter 		/* Someone else already triggered a write */
82104e62a29SChristoph Lameter 		return -EAGAIN;
82204e62a29SChristoph Lameter 
82304e62a29SChristoph Lameter 	/*
82404e62a29SChristoph Lameter 	 * A dirty page may imply that the underlying filesystem has
82504e62a29SChristoph Lameter 	 * the page on some queue. So the page must be clean for
82604e62a29SChristoph Lameter 	 * migration. Writeout may mean we lose the lock and that the
82704e62a29SChristoph Lameter 	 * page state is no longer what we checked for earlier.
82804e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
82904e62a29SChristoph Lameter 	 * be successful.
83004e62a29SChristoph Lameter 	 */
831e388466dSKirill A. Shutemov 	remove_migration_ptes(page, page, false);
83204e62a29SChristoph Lameter 
83304e62a29SChristoph Lameter 	rc = mapping->a_ops->writepage(page, &wbc);
83404e62a29SChristoph Lameter 
83504e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
83604e62a29SChristoph Lameter 		/* unlocked. Relock */
83704e62a29SChristoph Lameter 		lock_page(page);
83804e62a29SChristoph Lameter 
839bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
84004e62a29SChristoph Lameter }
84104e62a29SChristoph Lameter 
84204e62a29SChristoph Lameter /*
84304e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
84404e62a29SChristoph Lameter  */
8458351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
846a6bc32b8SMel Gorman 	struct page *newpage, struct page *page, enum migrate_mode mode)
8478351a6e4SChristoph Lameter {
848b969c4abSMel Gorman 	if (PageDirty(page)) {
849a6bc32b8SMel Gorman 		/* Only writeback pages in full synchronous migration */
8502916ecc0SJérôme Glisse 		switch (mode) {
8512916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
8522916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
8532916ecc0SJérôme Glisse 			break;
8542916ecc0SJérôme Glisse 		default:
855b969c4abSMel Gorman 			return -EBUSY;
8562916ecc0SJérôme Glisse 		}
85704e62a29SChristoph Lameter 		return writeout(mapping, page);
858b969c4abSMel Gorman 	}
8598351a6e4SChristoph Lameter 
8608351a6e4SChristoph Lameter 	/*
8618351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
8628351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
8638351a6e4SChristoph Lameter 	 */
864266cf658SDavid Howells 	if (page_has_private(page) &&
8658351a6e4SChristoph Lameter 	    !try_to_release_page(page, GFP_KERNEL))
866806031bbSMel Gorman 		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
8678351a6e4SChristoph Lameter 
868a6bc32b8SMel Gorman 	return migrate_page(mapping, newpage, page, mode);
8698351a6e4SChristoph Lameter }
8708351a6e4SChristoph Lameter 
8711d8b85ccSChristoph Lameter /*
872e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page
873e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
874b20a3503SChristoph Lameter  *
875e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
876e24f0b8fSChristoph Lameter  * is successful.
877894bc310SLee Schermerhorn  *
878894bc310SLee Schermerhorn  * Return value:
879894bc310SLee Schermerhorn  *   < 0 - error code
88078bd5209SRafael Aquini  *  MIGRATEPAGE_SUCCESS - success
881b20a3503SChristoph Lameter  */
8823fe2011fSMel Gorman static int move_to_new_page(struct page *newpage, struct page *page,
8835c3f9a67SHugh Dickins 				enum migrate_mode mode)
884b20a3503SChristoph Lameter {
885e24f0b8fSChristoph Lameter 	struct address_space *mapping;
886bda807d4SMinchan Kim 	int rc = -EAGAIN;
887bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
888b20a3503SChristoph Lameter 
8897db7671fSHugh Dickins 	VM_BUG_ON_PAGE(!PageLocked(page), page);
8907db7671fSHugh Dickins 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
891b20a3503SChristoph Lameter 
892b20a3503SChristoph Lameter 	mapping = page_mapping(page);
893bda807d4SMinchan Kim 
894bda807d4SMinchan Kim 	if (likely(is_lru)) {
895b20a3503SChristoph Lameter 		if (!mapping)
896a6bc32b8SMel Gorman 			rc = migrate_page(mapping, newpage, page, mode);
8976c5240aeSChristoph Lameter 		else if (mapping->a_ops->migratepage)
898b20a3503SChristoph Lameter 			/*
899bda807d4SMinchan Kim 			 * Most pages have a mapping and most filesystems
900bda807d4SMinchan Kim 			 * provide a migratepage callback. Anonymous pages
901bda807d4SMinchan Kim 			 * are part of swap space which also has its own
902bda807d4SMinchan Kim 			 * migratepage callback. This is the most common path
903bda807d4SMinchan Kim 			 * for page migration.
904b20a3503SChristoph Lameter 			 */
905bda807d4SMinchan Kim 			rc = mapping->a_ops->migratepage(mapping, newpage,
906bda807d4SMinchan Kim 							page, mode);
9078351a6e4SChristoph Lameter 		else
908bda807d4SMinchan Kim 			rc = fallback_migrate_page(mapping, newpage,
909bda807d4SMinchan Kim 							page, mode);
910bda807d4SMinchan Kim 	} else {
911bda807d4SMinchan Kim 		/*
912bda807d4SMinchan Kim 		 * In the case of a non-LRU page, it could have been released
913bda807d4SMinchan Kim 		 * after the isolation step. In that case, we shouldn't try migration.
914bda807d4SMinchan Kim 		 */
915bda807d4SMinchan Kim 		VM_BUG_ON_PAGE(!PageIsolated(page), page);
916bda807d4SMinchan Kim 		if (!PageMovable(page)) {
917bda807d4SMinchan Kim 			rc = MIGRATEPAGE_SUCCESS;
918bda807d4SMinchan Kim 			__ClearPageIsolated(page);
919bda807d4SMinchan Kim 			goto out;
920bda807d4SMinchan Kim 		}
921bda807d4SMinchan Kim 
922bda807d4SMinchan Kim 		rc = mapping->a_ops->migratepage(mapping, newpage,
923bda807d4SMinchan Kim 						page, mode);
924bda807d4SMinchan Kim 		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
925bda807d4SMinchan Kim 			!PageIsolated(page));
926bda807d4SMinchan Kim 	}
927b20a3503SChristoph Lameter 
9285c3f9a67SHugh Dickins 	/*
9295c3f9a67SHugh Dickins 	 * When successful, old pagecache page->mapping must be cleared before
9305c3f9a67SHugh Dickins 	 * page is freed; but stats require that PageAnon be left as PageAnon.
9315c3f9a67SHugh Dickins 	 */
9325c3f9a67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
933bda807d4SMinchan Kim 		if (__PageMovable(page)) {
934bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
935bda807d4SMinchan Kim 
936bda807d4SMinchan Kim 			/*
937bda807d4SMinchan Kim 			 * We clear PG_movable under the page lock so that no
938bda807d4SMinchan Kim 			 * compactor can try to migrate this page.
939bda807d4SMinchan Kim 			 */
940bda807d4SMinchan Kim 			__ClearPageIsolated(page);
941bda807d4SMinchan Kim 		}
942bda807d4SMinchan Kim 
943bda807d4SMinchan Kim 		/*
944c23a0c99SRalph Campbell 		 * An anonymous or movable page's ->mapping will be cleared by
945bda807d4SMinchan Kim 		 * free_pages_prepare(), so don't reset it here; leaving it set
946bda807d4SMinchan Kim 		 * keeps type checks such as PageAnon() working.
947bda807d4SMinchan Kim 		 */
948bda807d4SMinchan Kim 		if (!PageMappingFlags(page))
9495c3f9a67SHugh Dickins 			page->mapping = NULL;
950d2b2c6ddSLars Persson 
95125b2995aSChristoph Hellwig 		if (likely(!is_zone_device_page(newpage)))
952d2b2c6ddSLars Persson 			flush_dcache_page(newpage);
953d2b2c6ddSLars Persson 
9543fe2011fSMel Gorman 	}
955bda807d4SMinchan Kim out:
956e24f0b8fSChristoph Lameter 	return rc;
957e24f0b8fSChristoph Lameter }
958e24f0b8fSChristoph Lameter 
9590dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage,
9609c620e2bSHugh Dickins 				int force, enum migrate_mode mode)
961e24f0b8fSChristoph Lameter {
9620dabec93SMinchan Kim 	int rc = -EAGAIN;
963213ecb31SBaolin Wang 	bool page_was_mapped = false;
9643f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
965bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
96695a402c3SChristoph Lameter 
967529ae9aaSNick Piggin 	if (!trylock_page(page)) {
968a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
9690dabec93SMinchan Kim 			goto out;
9703e7d3449SMel Gorman 
9713e7d3449SMel Gorman 		/*
9723e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
9733e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
9743e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
9753e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
9763e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
977d4388340SMatthew Wilcox (Oracle) 		 * mpage_readahead). If an allocation happens for the
9783e7d3449SMel Gorman 		 * second or third page, the process can end up locking
9793e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
9803e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
9813e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
9823e7d3449SMel Gorman 		 * altogether.
9833e7d3449SMel Gorman 		 */
9843e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
9850dabec93SMinchan Kim 			goto out;
9863e7d3449SMel Gorman 
987e24f0b8fSChristoph Lameter 		lock_page(page);
988e24f0b8fSChristoph Lameter 	}
989e24f0b8fSChristoph Lameter 
990e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
99111bc82d6SAndrea Arcangeli 		/*
992fed5b64aSJianguo Wu 		 * Only in the case of a full synchronous migration is it
993a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
994a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
995a6bc32b8SMel Gorman 		 * the overhead of stalling is too much
99611bc82d6SAndrea Arcangeli 		 */
9972916ecc0SJérôme Glisse 		switch (mode) {
9982916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
9992916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
10002916ecc0SJérôme Glisse 			break;
10012916ecc0SJérôme Glisse 		default:
100211bc82d6SAndrea Arcangeli 			rc = -EBUSY;
10030a31bc97SJohannes Weiner 			goto out_unlock;
100411bc82d6SAndrea Arcangeli 		}
100511bc82d6SAndrea Arcangeli 		if (!force)
10060a31bc97SJohannes Weiner 			goto out_unlock;
1007e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
1008e24f0b8fSChristoph Lameter 	}
100903f15c86SHugh Dickins 
1010e24f0b8fSChristoph Lameter 	/*
101168a9843fSBaolin Wang 	 * Once try_to_migrate() has dropped page->mapcount to 0, we could not
1012dc386d4dSKAMEZAWA Hiroyuki 	 * otherwise notice the anon_vma being freed while we migrate the page.
10131ce82b69SHugh Dickins 	 * This get_anon_vma() delays freeing the anon_vma pointer until the end
1014dc386d4dSKAMEZAWA Hiroyuki 	 * of migration. File cache pages are no problem because of the page
1015989f89c5SKAMEZAWA Hiroyuki 	 * lock: file caches may use write_page() or lock_page() during migration,
1016989f89c5SKAMEZAWA Hiroyuki 	 * so only anon pages need this care here.
10173fe2011fSMel Gorman 	 *
101803f15c86SHugh Dickins 	 * Only page_get_anon_vma() understands the subtleties of
101903f15c86SHugh Dickins 	 * getting a hold on an anon_vma from outside one of its mms.
102003f15c86SHugh Dickins 	 * But if we cannot get anon_vma, then we won't need it anyway,
102103f15c86SHugh Dickins 	 * because that implies that the anon page is no longer mapped
102203f15c86SHugh Dickins 	 * (and cannot be remapped so long as we hold the page lock).
10233fe2011fSMel Gorman 	 */
102403f15c86SHugh Dickins 	if (PageAnon(page) && !PageKsm(page))
102503f15c86SHugh Dickins 		anon_vma = page_get_anon_vma(page);
102662e1c553SShaohua Li 
10277db7671fSHugh Dickins 	/*
10287db7671fSHugh Dickins 	 * Block others from accessing the new page when we get around to
10297db7671fSHugh Dickins 	 * establishing additional references. We are usually the only one
10307db7671fSHugh Dickins 	 * holding a reference to newpage at this point. We used to have a BUG
10317db7671fSHugh Dickins 	 * here if trylock_page(newpage) fails, but would like to allow for
10327db7671fSHugh Dickins 	 * cases where there might be a race with the previous use of newpage.
10337db7671fSHugh Dickins 	 * This is much like races on refcount of oldpage: just don't BUG().
10347db7671fSHugh Dickins 	 */
10357db7671fSHugh Dickins 	if (unlikely(!trylock_page(newpage)))
10367db7671fSHugh Dickins 		goto out_unlock;
10377db7671fSHugh Dickins 
1038bda807d4SMinchan Kim 	if (unlikely(!is_lru)) {
1039bda807d4SMinchan Kim 		rc = move_to_new_page(newpage, page, mode);
1040bda807d4SMinchan Kim 		goto out_unlock_both;
1041bda807d4SMinchan Kim 	}
1042bda807d4SMinchan Kim 
1043dc386d4dSKAMEZAWA Hiroyuki 	/*
104462e1c553SShaohua Li 	 * Corner case handling:
104562e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
104662e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
104762e1c553SShaohua Li 	 * Calling try_to_unmap() against a page->mapping==NULL page will
104862e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
1049d12b8951SYang Shi 	 * 2. An orphaned page (see truncate_cleanup_page) might have
105062e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
105162e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
105262e1c553SShaohua Li 	 * invisible to the vm, so the page cannot be migrated.  So try to
105362e1c553SShaohua Li 	 * free the metadata so that the page can be freed.
1054dc386d4dSKAMEZAWA Hiroyuki 	 */
105562e1c553SShaohua Li 	if (!page->mapping) {
1056309381feSSasha Levin 		VM_BUG_ON_PAGE(PageAnon(page), page);
10571ce82b69SHugh Dickins 		if (page_has_private(page)) {
105862e1c553SShaohua Li 			try_to_free_buffers(page);
10597db7671fSHugh Dickins 			goto out_unlock_both;
106062e1c553SShaohua Li 		}
10617db7671fSHugh Dickins 	} else if (page_mapped(page)) {
10627db7671fSHugh Dickins 		/* Establish migration ptes */
106303f15c86SHugh Dickins 		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
106403f15c86SHugh Dickins 				page);
1065a98a2f0cSAlistair Popple 		try_to_migrate(page, 0);
1066213ecb31SBaolin Wang 		page_was_mapped = true;
10672ebba6b7SHugh Dickins 	}
1068dc386d4dSKAMEZAWA Hiroyuki 
1069e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
10705c3f9a67SHugh Dickins 		rc = move_to_new_page(newpage, page, mode);
1071e24f0b8fSChristoph Lameter 
10725c3f9a67SHugh Dickins 	if (page_was_mapped)
10735c3f9a67SHugh Dickins 		remove_migration_ptes(page,
1074e388466dSKirill A. Shutemov 			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
10753f6c8272SMel Gorman 
10767db7671fSHugh Dickins out_unlock_both:
10777db7671fSHugh Dickins 	unlock_page(newpage);
10787db7671fSHugh Dickins out_unlock:
10793f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
108076545066SRik van Riel 	if (anon_vma)
10819e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
1082b20a3503SChristoph Lameter 	unlock_page(page);
10830dabec93SMinchan Kim out:
1084c6c919ebSMinchan Kim 	/*
1085c6c919ebSMinchan Kim 	 * If migration was successful, drop our reference to newpage; this
1086c6c919ebSMinchan Kim 	 * will not free the page because the new page owner took its own
1087c6c919ebSMinchan Kim 	 * reference. Also, if it is an LRU page, add it to the LRU list
1088e0a352faSDavid Hildenbrand 	 * here. Use the old state of the isolated source page to
1089e0a352faSDavid Hildenbrand 	 * determine whether we migrated an LRU page. newpage was already
1090e0a352faSDavid Hildenbrand 	 * unlocked and possibly modified by its owner - don't rely on its
1091e0a352faSDavid Hildenbrand 	 * page state.
1092c6c919ebSMinchan Kim 	 */
1093c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1094e0a352faSDavid Hildenbrand 		if (unlikely(!is_lru))
1095c6c919ebSMinchan Kim 			put_page(newpage);
1096c6c919ebSMinchan Kim 		else
1097c6c919ebSMinchan Kim 			putback_lru_page(newpage);
1098c6c919ebSMinchan Kim 	}
1099c6c919ebSMinchan Kim 
11000dabec93SMinchan Kim 	return rc;
11010dabec93SMinchan Kim }
110295a402c3SChristoph Lameter 
110379c28a41SDave Hansen 
110479c28a41SDave Hansen /*
110579c28a41SDave Hansen  * node_demotion[] example:
110679c28a41SDave Hansen  *
110779c28a41SDave Hansen  * Consider a system with two sockets.  Each socket has
110879c28a41SDave Hansen  * three classes of memory attached: fast, medium and slow.
110979c28a41SDave Hansen  * Each memory class is placed in its own NUMA node.  The
111079c28a41SDave Hansen  * CPUs are placed in the node with the "fast" memory.  The
111179c28a41SDave Hansen  * 6 NUMA nodes (0-5) might be split among the sockets like
111279c28a41SDave Hansen  * this:
111379c28a41SDave Hansen  *
111479c28a41SDave Hansen  *	Socket A: 0, 1, 2
111579c28a41SDave Hansen  *	Socket B: 3, 4, 5
111679c28a41SDave Hansen  *
111779c28a41SDave Hansen  * When Node 0 fills up, its memory should be migrated to
111879c28a41SDave Hansen  * Node 1.  When Node 1 fills up, it should be migrated to
111979c28a41SDave Hansen  * Node 2.  The migration path starts on the nodes with the
112079c28a41SDave Hansen  * processors (since allocations default to this node) and
112179c28a41SDave Hansen  * fast memory, progress through medium and end with the
112279c28a41SDave Hansen  * slow memory:
112379c28a41SDave Hansen  *
112479c28a41SDave Hansen  *	0 -> 1 -> 2 -> stop
112579c28a41SDave Hansen  *	3 -> 4 -> 5 -> stop
112679c28a41SDave Hansen  *
112779c28a41SDave Hansen  * This is represented in the node_demotion[] like this:
112879c28a41SDave Hansen  *
112979c28a41SDave Hansen  *	{  1, // Node 0 migrates to 1
113079c28a41SDave Hansen  *	   2, // Node 1 migrates to 2
113179c28a41SDave Hansen  *	  -1, // Node 2 does not migrate
113279c28a41SDave Hansen  *	   4, // Node 3 migrates to 4
113379c28a41SDave Hansen  *	   5, // Node 4 migrates to 5
113479c28a41SDave Hansen  *	  -1} // Node 5 does not migrate
113579c28a41SDave Hansen  */
113679c28a41SDave Hansen 
113779c28a41SDave Hansen /*
113879c28a41SDave Hansen  * Writes to this array occur without locking.  Cycles are
113979c28a41SDave Hansen  * not allowed: Node X demotes to Y which demotes to X...
114079c28a41SDave Hansen  *
114179c28a41SDave Hansen  * If multiple reads are performed, a single rcu_read_lock()
114279c28a41SDave Hansen  * must be held over all reads to ensure that no cycles are
114379c28a41SDave Hansen  * observed.
114479c28a41SDave Hansen  */
114579c28a41SDave Hansen static int node_demotion[MAX_NUMNODES] __read_mostly =
114679c28a41SDave Hansen 	{[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
114779c28a41SDave Hansen 
114879c28a41SDave Hansen /**
114979c28a41SDave Hansen  * next_demotion_node() - Get the next node in the demotion path
115079c28a41SDave Hansen  * @node: The starting node to lookup the next node
115179c28a41SDave Hansen  *
1152c9bd7d18SRandy Dunlap  * Return: node id for next memory node in the demotion path hierarchy
115379c28a41SDave Hansen  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
115479c28a41SDave Hansen  * @node online or guarantee that it *continues* to be the next demotion
115579c28a41SDave Hansen  * target.
115679c28a41SDave Hansen  */
115779c28a41SDave Hansen int next_demotion_node(int node)
115879c28a41SDave Hansen {
115979c28a41SDave Hansen 	int target;
116079c28a41SDave Hansen 
116179c28a41SDave Hansen 	/*
116279c28a41SDave Hansen 	 * node_demotion[] is updated without excluding this
116379c28a41SDave Hansen 	 * function from running.  RCU doesn't provide any
116479c28a41SDave Hansen 	 * compiler barriers, so the READ_ONCE() is required
116579c28a41SDave Hansen 	 * to avoid compiler reordering or read merging.
116679c28a41SDave Hansen 	 *
116779c28a41SDave Hansen 	 * Make sure to use RCU over entire code blocks if
116879c28a41SDave Hansen 	 * node_demotion[] reads need to be consistent.
116979c28a41SDave Hansen 	 */
117079c28a41SDave Hansen 	rcu_read_lock();
117179c28a41SDave Hansen 	target = READ_ONCE(node_demotion[node]);
117279c28a41SDave Hansen 	rcu_read_unlock();
117379c28a41SDave Hansen 
117479c28a41SDave Hansen 	return target;
117579c28a41SDave Hansen }
117679c28a41SDave Hansen 
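/*
 * Illustrative sketch, not part of the original source: walking the whole
 * demotion chain with next_demotion_node().  The helper name
 * pr_demotion_path() is hypothetical and exists only for this example;
 * with the node_demotion[] layout shown above it would report
 * 0 -> 1 -> 2 and then stop at NUMA_NO_NODE.
 */
static void __maybe_unused pr_demotion_path(int node)
{
	int cur = node;
	int next;

	while ((next = next_demotion_node(cur)) != NUMA_NO_NODE) {
		pr_info("node %d demotes to node %d\n", cur, next);
		cur = next;
	}
}
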
11770dabec93SMinchan Kim /*
11780dabec93SMinchan Kim  * Obtain the lock on page, remove all ptes and migrate the page
11790dabec93SMinchan Kim  * to the newly allocated page in newpage.
11800dabec93SMinchan Kim  */
11816ec4476aSLinus Torvalds static int unmap_and_move(new_page_t get_new_page,
1182ef2a5153SGeert Uytterhoeven 				   free_page_t put_new_page,
1183ef2a5153SGeert Uytterhoeven 				   unsigned long private, struct page *page,
1184add05cecSNaoya Horiguchi 				   int force, enum migrate_mode mode,
1185dd4ae78aSYang Shi 				   enum migrate_reason reason,
1186dd4ae78aSYang Shi 				   struct list_head *ret)
11870dabec93SMinchan Kim {
11882def7424SHugh Dickins 	int rc = MIGRATEPAGE_SUCCESS;
118974d4a579SYang Shi 	struct page *newpage = NULL;
11900dabec93SMinchan Kim 
119194723aafSMichal Hocko 	if (!thp_migration_supported() && PageTransHuge(page))
1192d532e2e5SYang Shi 		return -ENOSYS;
119394723aafSMichal Hocko 
11940dabec93SMinchan Kim 	if (page_count(page) == 1) {
11950dabec93SMinchan Kim 		/* page was freed from under us. So we are done. */
1196c6c919ebSMinchan Kim 		ClearPageActive(page);
1197c6c919ebSMinchan Kim 		ClearPageUnevictable(page);
1198bda807d4SMinchan Kim 		if (unlikely(__PageMovable(page))) {
1199bda807d4SMinchan Kim 			lock_page(page);
1200bda807d4SMinchan Kim 			if (!PageMovable(page))
1201bda807d4SMinchan Kim 				__ClearPageIsolated(page);
1202bda807d4SMinchan Kim 			unlock_page(page);
1203bda807d4SMinchan Kim 		}
12040dabec93SMinchan Kim 		goto out;
12050dabec93SMinchan Kim 	}
12060dabec93SMinchan Kim 
120774d4a579SYang Shi 	newpage = get_new_page(page, private);
120874d4a579SYang Shi 	if (!newpage)
120974d4a579SYang Shi 		return -ENOMEM;
121074d4a579SYang Shi 
12119c620e2bSHugh Dickins 	rc = __unmap_and_move(page, newpage, force, mode);
1212c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS)
12137cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(newpage, reason);
1214bf6bddf1SRafael Aquini 
12150dabec93SMinchan Kim out:
1216e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
1217aaa994b3SChristoph Lameter 		/*
1218aaa994b3SChristoph Lameter 		 * A page that has been migrated has all references
1219aaa994b3SChristoph Lameter 		 * removed and will be freed. A page that has not been
1220c23a0c99SRalph Campbell 		 * migrated will have kept its references and be restored.
1221aaa994b3SChristoph Lameter 		 */
1222aaa994b3SChristoph Lameter 		list_del(&page->lru);
1223e24f0b8fSChristoph Lameter 	}
122468711a74SDavid Rientjes 
122595a402c3SChristoph Lameter 	/*
1226c6c919ebSMinchan Kim 	 * If migration was successful, release the reference grabbed during
1227c6c919ebSMinchan Kim 	 * isolation. Otherwise, restore the page to the right list unless
1228c6c919ebSMinchan Kim 	 * we want to retry.
122995a402c3SChristoph Lameter 	 */
1230c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1231dd4ae78aSYang Shi 		/*
1232dd4ae78aSYang Shi 		 * Compaction can also migrate non-LRU pages, which are
1233dd4ae78aSYang Shi 		 * not accounted to NR_ISOLATED_*. They can be recognized
1234dd4ae78aSYang Shi 		 * as __PageMovable
1235dd4ae78aSYang Shi 		 */
1236dd4ae78aSYang Shi 		if (likely(!__PageMovable(page)))
1237dd4ae78aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1238dd4ae78aSYang Shi 					page_is_file_lru(page), -thp_nr_pages(page));
1239dd4ae78aSYang Shi 
124079f5f8faSOscar Salvador 		if (reason != MR_MEMORY_FAILURE)
1241c6c919ebSMinchan Kim 			/*
124279f5f8faSOscar Salvador 			 * For MR_MEMORY_FAILURE, the page is released in page_handle_poison().
1243c6c919ebSMinchan Kim 			 */
124479f5f8faSOscar Salvador 			put_page(page);
1245c6c919ebSMinchan Kim 	} else {
1246dd4ae78aSYang Shi 		if (rc != -EAGAIN)
1247dd4ae78aSYang Shi 			list_add_tail(&page->lru, ret);
1248bda807d4SMinchan Kim 
1249cf4b769aSHugh Dickins 		if (put_new_page)
125068711a74SDavid Rientjes 			put_new_page(newpage, private);
1251c6c919ebSMinchan Kim 		else
1252d6d86c0aSKonstantin Khlebnikov 			put_page(newpage);
1253c6c919ebSMinchan Kim 	}
125468711a74SDavid Rientjes 
1255e24f0b8fSChristoph Lameter 	return rc;
1256e24f0b8fSChristoph Lameter }
1257b20a3503SChristoph Lameter 
1258e24f0b8fSChristoph Lameter /*
1259290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
1260290408d4SNaoya Horiguchi  *
1261290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
1262290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepages.
1263290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
1264290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
1265290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
1266290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
1267290408d4SNaoya Horiguchi  * under direct I/O, the reference count of the head page is 512 and a bit more.)
1268290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
1269290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
1270290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
1271290408d4SNaoya Horiguchi  *
1272290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on the page under migration,
1273290408d4SNaoya Horiguchi  * because then pte is replaced with migration swap entry and direct I/O code
1274290408d4SNaoya Horiguchi  * will wait in the page fault for migration to complete.
1275290408d4SNaoya Horiguchi  */
1276290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
127768711a74SDavid Rientjes 				free_page_t put_new_page, unsigned long private,
127868711a74SDavid Rientjes 				struct page *hpage, int force,
1279dd4ae78aSYang Shi 				enum migrate_mode mode, int reason,
1280dd4ae78aSYang Shi 				struct list_head *ret)
1281290408d4SNaoya Horiguchi {
12822def7424SHugh Dickins 	int rc = -EAGAIN;
12832ebba6b7SHugh Dickins 	int page_was_mapped = 0;
128432665f2bSJoonsoo Kim 	struct page *new_hpage;
1285290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
1286c0d0381aSMike Kravetz 	struct address_space *mapping = NULL;
1287290408d4SNaoya Horiguchi 
128883467efbSNaoya Horiguchi 	/*
12897ed2c31dSAnshuman Khandual 	 * Migratability of hugepages depends on architectures and their size.
129083467efbSNaoya Horiguchi 	 * This check is necessary because some callers of hugepage migration
129183467efbSNaoya Horiguchi 	 * like soft offline and memory hotremove don't walk through page
129283467efbSNaoya Horiguchi 	 * tables or check whether the hugepage is pmd-based or not before
129383467efbSNaoya Horiguchi 	 * kicking migration.
129483467efbSNaoya Horiguchi 	 */
1295100873d7SNaoya Horiguchi 	if (!hugepage_migration_supported(page_hstate(hpage))) {
1296dd4ae78aSYang Shi 		list_move_tail(&hpage->lru, ret);
129783467efbSNaoya Horiguchi 		return -ENOSYS;
129832665f2bSJoonsoo Kim 	}
129983467efbSNaoya Horiguchi 
130071a64f61SMuchun Song 	if (page_count(hpage) == 1) {
130171a64f61SMuchun Song 		/* page was freed from under us. So we are done. */
130271a64f61SMuchun Song 		putback_active_hugepage(hpage);
130371a64f61SMuchun Song 		return MIGRATEPAGE_SUCCESS;
130471a64f61SMuchun Song 	}
130571a64f61SMuchun Song 
1306666feb21SMichal Hocko 	new_hpage = get_new_page(hpage, private);
1307290408d4SNaoya Horiguchi 	if (!new_hpage)
1308290408d4SNaoya Horiguchi 		return -ENOMEM;
1309290408d4SNaoya Horiguchi 
1310290408d4SNaoya Horiguchi 	if (!trylock_page(hpage)) {
13112916ecc0SJérôme Glisse 		if (!force)
1312290408d4SNaoya Horiguchi 			goto out;
13132916ecc0SJérôme Glisse 		switch (mode) {
13142916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
13152916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
13162916ecc0SJérôme Glisse 			break;
13172916ecc0SJérôme Glisse 		default:
13182916ecc0SJérôme Glisse 			goto out;
13192916ecc0SJérôme Glisse 		}
1320290408d4SNaoya Horiguchi 		lock_page(hpage);
1321290408d4SNaoya Horiguchi 	}
1322290408d4SNaoya Horiguchi 
1323cb6acd01SMike Kravetz 	/*
1324cb6acd01SMike Kravetz 	 * Check for pages which are in the process of being freed.  Without
1325cb6acd01SMike Kravetz 	 * page_mapping() set, hugetlbfs specific move page routine will not
1326cb6acd01SMike Kravetz 	 * be called and we could leak usage counts for subpools.
1327cb6acd01SMike Kravetz 	 */
13286acfb5baSMuchun Song 	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
1329cb6acd01SMike Kravetz 		rc = -EBUSY;
1330cb6acd01SMike Kravetz 		goto out_unlock;
1331cb6acd01SMike Kravetz 	}
1332cb6acd01SMike Kravetz 
1333746b18d4SPeter Zijlstra 	if (PageAnon(hpage))
1334746b18d4SPeter Zijlstra 		anon_vma = page_get_anon_vma(hpage);
1335290408d4SNaoya Horiguchi 
13367db7671fSHugh Dickins 	if (unlikely(!trylock_page(new_hpage)))
13377db7671fSHugh Dickins 		goto put_anon;
13387db7671fSHugh Dickins 
13392ebba6b7SHugh Dickins 	if (page_mapped(hpage)) {
1340336bf30eSMike Kravetz 		bool mapping_locked = false;
1341a98a2f0cSAlistair Popple 		enum ttu_flags ttu = 0;
1342336bf30eSMike Kravetz 
1343336bf30eSMike Kravetz 		if (!PageAnon(hpage)) {
1344c0d0381aSMike Kravetz 			/*
1345336bf30eSMike Kravetz 			 * In shared mappings, try_to_unmap could potentially
1346336bf30eSMike Kravetz 			 * call huge_pmd_unshare.  Because of this, take the
1347336bf30eSMike Kravetz 			 * i_mmap semaphore in write mode here and set TTU_RMAP_LOCKED
1348336bf30eSMike Kravetz 			 * to let lower levels know we have taken the lock.
1349c0d0381aSMike Kravetz 			 */
1350c0d0381aSMike Kravetz 			mapping = hugetlb_page_mapping_lock_write(hpage);
1351c0d0381aSMike Kravetz 			if (unlikely(!mapping))
1352c0d0381aSMike Kravetz 				goto unlock_put_anon;
1353c0d0381aSMike Kravetz 
1354336bf30eSMike Kravetz 			mapping_locked = true;
1355336bf30eSMike Kravetz 			ttu |= TTU_RMAP_LOCKED;
1356336bf30eSMike Kravetz 		}
1357336bf30eSMike Kravetz 
1358a98a2f0cSAlistair Popple 		try_to_migrate(hpage, ttu);
13592ebba6b7SHugh Dickins 		page_was_mapped = 1;
1360336bf30eSMike Kravetz 
1361336bf30eSMike Kravetz 		if (mapping_locked)
1362336bf30eSMike Kravetz 			i_mmap_unlock_write(mapping);
13632ebba6b7SHugh Dickins 	}
1364290408d4SNaoya Horiguchi 
1365290408d4SNaoya Horiguchi 	if (!page_mapped(hpage))
13665c3f9a67SHugh Dickins 		rc = move_to_new_page(new_hpage, hpage, mode);
1367290408d4SNaoya Horiguchi 
1368336bf30eSMike Kravetz 	if (page_was_mapped)
13695c3f9a67SHugh Dickins 		remove_migration_ptes(hpage,
1370336bf30eSMike Kravetz 			rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
1371290408d4SNaoya Horiguchi 
1372c0d0381aSMike Kravetz unlock_put_anon:
13737db7671fSHugh Dickins 	unlock_page(new_hpage);
13747db7671fSHugh Dickins 
13757db7671fSHugh Dickins put_anon:
1376fd4a4663SHugh Dickins 	if (anon_vma)
13779e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
13788e6ac7faSAneesh Kumar K.V 
13792def7424SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
1380ab5ac90aSMichal Hocko 		move_hugetlb_state(hpage, new_hpage, reason);
13812def7424SHugh Dickins 		put_new_page = NULL;
13822def7424SHugh Dickins 	}
13838e6ac7faSAneesh Kumar K.V 
1384cb6acd01SMike Kravetz out_unlock:
1385290408d4SNaoya Horiguchi 	unlock_page(hpage);
138609761333SHillf Danton out:
1387dd4ae78aSYang Shi 	if (rc == MIGRATEPAGE_SUCCESS)
1388b8ec1ceeSNaoya Horiguchi 		putback_active_hugepage(hpage);
1389a04840c6SMiaohe Lin 	else if (rc != -EAGAIN)
1390dd4ae78aSYang Shi 		list_move_tail(&hpage->lru, ret);
139168711a74SDavid Rientjes 
139268711a74SDavid Rientjes 	/*
139368711a74SDavid Rientjes 	 * If migration was not successful and there's a freeing callback, use
139468711a74SDavid Rientjes 	 * it.  Otherwise, putback_active_hugepage() will drop the reference grabbed during
139568711a74SDavid Rientjes 	 * isolation.
139668711a74SDavid Rientjes 	 */
13972def7424SHugh Dickins 	if (put_new_page)
139868711a74SDavid Rientjes 		put_new_page(new_hpage, private);
139968711a74SDavid Rientjes 	else
14003aaa76e1SNaoya Horiguchi 		putback_active_hugepage(new_hpage);
140168711a74SDavid Rientjes 
1402290408d4SNaoya Horiguchi 	return rc;
1403290408d4SNaoya Horiguchi }
1404290408d4SNaoya Horiguchi 
1405d532e2e5SYang Shi static inline int try_split_thp(struct page *page, struct page **page2,
1406d532e2e5SYang Shi 				struct list_head *from)
1407d532e2e5SYang Shi {
1408d532e2e5SYang Shi 	int rc = 0;
1409d532e2e5SYang Shi 
1410d532e2e5SYang Shi 	lock_page(page);
1411d532e2e5SYang Shi 	rc = split_huge_page_to_list(page, from);
1412d532e2e5SYang Shi 	unlock_page(page);
1413d532e2e5SYang Shi 	if (!rc)
1414d532e2e5SYang Shi 		list_safe_reset_next(page, *page2, lru);
1415d532e2e5SYang Shi 
1416d532e2e5SYang Shi 	return rc;
1417d532e2e5SYang Shi }
1418d532e2e5SYang Shi 
1419290408d4SNaoya Horiguchi /*
1420c73e5c9cSSrivatsa S. Bhat  * migrate_pages - migrate the pages specified in a list, to the free pages
1421c73e5c9cSSrivatsa S. Bhat  *		   supplied as the target for the page migration
1422e24f0b8fSChristoph Lameter  *
1423c73e5c9cSSrivatsa S. Bhat  * @from:		The list of pages to be migrated.
1424c73e5c9cSSrivatsa S. Bhat  * @get_new_page:	The function used to allocate free pages to be used
1425c73e5c9cSSrivatsa S. Bhat  *			as the target of the page migration.
142668711a74SDavid Rientjes  * @put_new_page:	The function used to free target pages if migration
142768711a74SDavid Rientjes  *			fails, or NULL if no special handling is necessary.
1428c73e5c9cSSrivatsa S. Bhat  * @private:		Private data to be passed on to get_new_page()
1429c73e5c9cSSrivatsa S. Bhat  * @mode:		The migration mode that specifies the constraints for
1430c73e5c9cSSrivatsa S. Bhat  *			page migration, if any.
1431c73e5c9cSSrivatsa S. Bhat  * @reason:		The reason for page migration.
14325ac95884SYang Shi  * @ret_succeeded:	Set to the number of pages migrated successfully if
14335ac95884SYang Shi  *			the caller passes a non-NULL pointer.
1434e24f0b8fSChristoph Lameter  *
1435c73e5c9cSSrivatsa S. Bhat  * The function returns after 10 attempts or when no pages are movable any more
1436c73e5c9cSSrivatsa S. Bhat  * because the list has become empty or no retryable pages exist on it.
1437dd4ae78aSYang Shi  * It is the caller's responsibility to call putback_movable_pages() to return
1438dd4ae78aSYang Shi  * pages to the LRU or free list, but only if the return value is non-zero.
1439e24f0b8fSChristoph Lameter  *
1440c73e5c9cSSrivatsa S. Bhat  * Returns the number of pages that were not migrated, or an error code.
1441e24f0b8fSChristoph Lameter  */
14429c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page,
144368711a74SDavid Rientjes 		free_page_t put_new_page, unsigned long private,
14445ac95884SYang Shi 		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1445e24f0b8fSChristoph Lameter {
1446e24f0b8fSChristoph Lameter 	int retry = 1;
14471a5bae25SAnshuman Khandual 	int thp_retry = 1;
1448e24f0b8fSChristoph Lameter 	int nr_failed = 0;
14495647bc29SMel Gorman 	int nr_succeeded = 0;
14501a5bae25SAnshuman Khandual 	int nr_thp_succeeded = 0;
14511a5bae25SAnshuman Khandual 	int nr_thp_failed = 0;
14521a5bae25SAnshuman Khandual 	int nr_thp_split = 0;
1453e24f0b8fSChristoph Lameter 	int pass = 0;
14541a5bae25SAnshuman Khandual 	bool is_thp = false;
1455e24f0b8fSChristoph Lameter 	struct page *page;
1456e24f0b8fSChristoph Lameter 	struct page *page2;
1457e24f0b8fSChristoph Lameter 	int swapwrite = current->flags & PF_SWAPWRITE;
14581a5bae25SAnshuman Khandual 	int rc, nr_subpages;
1459dd4ae78aSYang Shi 	LIST_HEAD(ret_pages);
1460b0b515bfSYang Shi 	bool nosplit = (reason == MR_NUMA_MISPLACED);
14612d1db3b1SChristoph Lameter 
14627bc1aec5SLiam Mark 	trace_mm_migrate_pages_start(mode, reason);
14637bc1aec5SLiam Mark 
1464e24f0b8fSChristoph Lameter 	if (!swapwrite)
1465e24f0b8fSChristoph Lameter 		current->flags |= PF_SWAPWRITE;
1466e24f0b8fSChristoph Lameter 
14671a5bae25SAnshuman Khandual 	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
1468e24f0b8fSChristoph Lameter 		retry = 0;
14691a5bae25SAnshuman Khandual 		thp_retry = 0;
1470e24f0b8fSChristoph Lameter 
1471e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
147294723aafSMichal Hocko retry:
14731a5bae25SAnshuman Khandual 			/*
14741a5bae25SAnshuman Khandual 			 * THP statistics is based on the source huge page.
14751a5bae25SAnshuman Khandual 			 * Capture required information that might get lost
14761a5bae25SAnshuman Khandual 			 * during migration.
14771a5bae25SAnshuman Khandual 			 */
14786c5c7b9fSZi Yan 			is_thp = PageTransHuge(page) && !PageHuge(page);
14796c357848SMatthew Wilcox (Oracle) 			nr_subpages = thp_nr_pages(page);
1480e24f0b8fSChristoph Lameter 			cond_resched();
1481e24f0b8fSChristoph Lameter 
148231caf665SNaoya Horiguchi 			if (PageHuge(page))
148331caf665SNaoya Horiguchi 				rc = unmap_and_move_huge_page(get_new_page,
148468711a74SDavid Rientjes 						put_new_page, private, page,
1485dd4ae78aSYang Shi 						pass > 2, mode, reason,
1486dd4ae78aSYang Shi 						&ret_pages);
148731caf665SNaoya Horiguchi 			else
148868711a74SDavid Rientjes 				rc = unmap_and_move(get_new_page, put_new_page,
1489add05cecSNaoya Horiguchi 						private, page, pass > 2, mode,
1490dd4ae78aSYang Shi 						reason, &ret_pages);
1491dd4ae78aSYang Shi 			/*
1492dd4ae78aSYang Shi 			 * The rules are:
1493dd4ae78aSYang Shi 			 *	Success: a non-hugetlb page will be freed, a hugetlb
1494dd4ae78aSYang Shi 			 *		 page will be put back
1495dd4ae78aSYang Shi 			 *	-EAGAIN: stay on the from list
1496dd4ae78aSYang Shi 			 *	-ENOMEM: stay on the from list
1497dd4ae78aSYang Shi 			 *	Other errno: put on ret_pages list then splice to
1498dd4ae78aSYang Shi 			 *		     from list
1499dd4ae78aSYang Shi 			 */
1500e24f0b8fSChristoph Lameter 			switch(rc) {
150194723aafSMichal Hocko 			/*
150294723aafSMichal Hocko 			 * THP migration might be unsupported or the
150394723aafSMichal Hocko 			 * allocation could've failed so we should
150494723aafSMichal Hocko 			 * retry on the same page with the THP split
150594723aafSMichal Hocko 			 * to base pages.
150694723aafSMichal Hocko 			 *
150794723aafSMichal Hocko 			 * Head page is retried immediately and tail
150894723aafSMichal Hocko 			 * pages are added to the tail of the list so
150994723aafSMichal Hocko 			 * we encounter them after the rest of the list
151094723aafSMichal Hocko 			 * is processed.
151194723aafSMichal Hocko 			 */
1512d532e2e5SYang Shi 			case -ENOSYS:
1513d532e2e5SYang Shi 				/* THP migration is unsupported */
15146c5c7b9fSZi Yan 				if (is_thp) {
1515d532e2e5SYang Shi 					if (!try_split_thp(page, &page2, from)) {
1516d532e2e5SYang Shi 						nr_thp_split++;
1517d532e2e5SYang Shi 						goto retry;
1518d532e2e5SYang Shi 					}
1519d532e2e5SYang Shi 
1520d532e2e5SYang Shi 					nr_thp_failed++;
1521d532e2e5SYang Shi 					nr_failed += nr_subpages;
1522d532e2e5SYang Shi 					break;
1523d532e2e5SYang Shi 				}
1524d532e2e5SYang Shi 
1525d532e2e5SYang Shi 				/* Hugetlb migration is unsupported */
1526d532e2e5SYang Shi 				nr_failed++;
1527d532e2e5SYang Shi 				break;
1528d532e2e5SYang Shi 			case -ENOMEM:
1529d532e2e5SYang Shi 				/*
1530d532e2e5SYang Shi 				 * When memory is low, don't bother to try to migrate
1531d532e2e5SYang Shi 				 * other pages, just exit.
1532b0b515bfSYang Shi 				 * THP NUMA faulting doesn't split THP to retry.
1533d532e2e5SYang Shi 				 */
1534b0b515bfSYang Shi 				if (is_thp && !nosplit) {
1535d532e2e5SYang Shi 					if (!try_split_thp(page, &page2, from)) {
15361a5bae25SAnshuman Khandual 						nr_thp_split++;
153794723aafSMichal Hocko 						goto retry;
153894723aafSMichal Hocko 					}
15396c5c7b9fSZi Yan 
15401a5bae25SAnshuman Khandual 					nr_thp_failed++;
15411a5bae25SAnshuman Khandual 					nr_failed += nr_subpages;
15421a5bae25SAnshuman Khandual 					goto out;
15431a5bae25SAnshuman Khandual 				}
1544dfef2ef4SDavid Rientjes 				nr_failed++;
154595a402c3SChristoph Lameter 				goto out;
1546e24f0b8fSChristoph Lameter 			case -EAGAIN:
15471a5bae25SAnshuman Khandual 				if (is_thp) {
15481a5bae25SAnshuman Khandual 					thp_retry++;
15491a5bae25SAnshuman Khandual 					break;
15501a5bae25SAnshuman Khandual 				}
1551b20a3503SChristoph Lameter 				retry++;
1552e24f0b8fSChristoph Lameter 				break;
155378bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
15541a5bae25SAnshuman Khandual 				if (is_thp) {
15551a5bae25SAnshuman Khandual 					nr_thp_succeeded++;
15561a5bae25SAnshuman Khandual 					nr_succeeded += nr_subpages;
15571a5bae25SAnshuman Khandual 					break;
15581a5bae25SAnshuman Khandual 				}
15595647bc29SMel Gorman 				nr_succeeded++;
1560e24f0b8fSChristoph Lameter 				break;
1561e24f0b8fSChristoph Lameter 			default:
1562354a3363SNaoya Horiguchi 				/*
1563d532e2e5SYang Shi 				 * Permanent failure (-EBUSY, etc.):
1564354a3363SNaoya Horiguchi 				 * unlike the -EAGAIN case, the failed page is
1565354a3363SNaoya Horiguchi 				 * removed from migration page list and not
1566354a3363SNaoya Horiguchi 				 * retried in the next outer loop.
1567354a3363SNaoya Horiguchi 				 */
15681a5bae25SAnshuman Khandual 				if (is_thp) {
15691a5bae25SAnshuman Khandual 					nr_thp_failed++;
15701a5bae25SAnshuman Khandual 					nr_failed += nr_subpages;
15711a5bae25SAnshuman Khandual 					break;
15721a5bae25SAnshuman Khandual 				}
1573b20a3503SChristoph Lameter 				nr_failed++;
1574e24f0b8fSChristoph Lameter 				break;
1575b20a3503SChristoph Lameter 			}
1576b20a3503SChristoph Lameter 		}
1577e24f0b8fSChristoph Lameter 	}
15781a5bae25SAnshuman Khandual 	nr_failed += retry + thp_retry;
15791a5bae25SAnshuman Khandual 	nr_thp_failed += thp_retry;
1580f2f81fb2SVlastimil Babka 	rc = nr_failed;
158195a402c3SChristoph Lameter out:
1582dd4ae78aSYang Shi 	/*
1583dd4ae78aSYang Shi 	 * Put the permanently failed pages back on the migration list; they
1584dd4ae78aSYang Shi 	 * will be put back on the right list by the caller.
1585dd4ae78aSYang Shi 	 */
1586dd4ae78aSYang Shi 	list_splice(&ret_pages, from);
1587dd4ae78aSYang Shi 
15885647bc29SMel Gorman 	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
15895647bc29SMel Gorman 	count_vm_events(PGMIGRATE_FAIL, nr_failed);
15901a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
15911a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
15921a5bae25SAnshuman Khandual 	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
15931a5bae25SAnshuman Khandual 	trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
15941a5bae25SAnshuman Khandual 			       nr_thp_failed, nr_thp_split, mode, reason);
15957b2a2d4aSMel Gorman 
1596b20a3503SChristoph Lameter 	if (!swapwrite)
1597b20a3503SChristoph Lameter 		current->flags &= ~PF_SWAPWRITE;
1598b20a3503SChristoph Lameter 
15995ac95884SYang Shi 	if (ret_succeeded)
16005ac95884SYang Shi 		*ret_succeeded = nr_succeeded;
16015ac95884SYang Shi 
160295a402c3SChristoph Lameter 	return rc;
1603b20a3503SChristoph Lameter }
1604b20a3503SChristoph Lameter 
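/*
 * Illustrative sketch, not part of the original source: a minimal caller of
 * migrate_pages() honouring the contract documented above.  The names
 * example_new_page(), example_free_page() and example_migrate_list() are
 * hypothetical and exist only for this example; the allocation callback
 * handles plain base pages, the free callback is invoked only for target
 * pages of failed migrations, and pages still left on the list after a
 * non-zero return are handed back via putback_movable_pages().
 */
static struct page *example_new_page(struct page *page, unsigned long private)
{
	/* Allocate the target page on the node passed in via @private. */
	return alloc_pages_node((int)private, GFP_HIGHUSER_MOVABLE, 0);
}

static void example_free_page(struct page *page, unsigned long private)
{
	/* Target page was not used; drop the reference we got at allocation. */
	__free_page(page);
}

static int __maybe_unused example_migrate_list(struct list_head *pagelist,
					       int nid)
{
	int err;

	err = migrate_pages(pagelist, example_new_page, example_free_page,
			    (unsigned long)nid, MIGRATE_SYNC,
			    MR_SYSCALL, NULL);
	if (err)
		putback_movable_pages(pagelist);
	return err;
}
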
160519fc7bedSJoonsoo Kim struct page *alloc_migration_target(struct page *page, unsigned long private)
1606b4b38223SJoonsoo Kim {
160719fc7bedSJoonsoo Kim 	struct migration_target_control *mtc;
160819fc7bedSJoonsoo Kim 	gfp_t gfp_mask;
1609b4b38223SJoonsoo Kim 	unsigned int order = 0;
1610b4b38223SJoonsoo Kim 	struct page *new_page = NULL;
161119fc7bedSJoonsoo Kim 	int nid;
161219fc7bedSJoonsoo Kim 	int zidx;
161319fc7bedSJoonsoo Kim 
161419fc7bedSJoonsoo Kim 	mtc = (struct migration_target_control *)private;
161519fc7bedSJoonsoo Kim 	gfp_mask = mtc->gfp_mask;
161619fc7bedSJoonsoo Kim 	nid = mtc->nid;
161719fc7bedSJoonsoo Kim 	if (nid == NUMA_NO_NODE)
161819fc7bedSJoonsoo Kim 		nid = page_to_nid(page);
1619b4b38223SJoonsoo Kim 
1620d92bbc27SJoonsoo Kim 	if (PageHuge(page)) {
1621d92bbc27SJoonsoo Kim 		struct hstate *h = page_hstate(compound_head(page));
1622d92bbc27SJoonsoo Kim 
162319fc7bedSJoonsoo Kim 		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
162419fc7bedSJoonsoo Kim 		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1625d92bbc27SJoonsoo Kim 	}
1626b4b38223SJoonsoo Kim 
1627b4b38223SJoonsoo Kim 	if (PageTransHuge(page)) {
16289933a0c8SJoonsoo Kim 		/*
16299933a0c8SJoonsoo Kim 		 * clear __GFP_RECLAIM to make the migration callback
16309933a0c8SJoonsoo Kim 		 * consistent with regular THP allocations.
16319933a0c8SJoonsoo Kim 		 */
16329933a0c8SJoonsoo Kim 		gfp_mask &= ~__GFP_RECLAIM;
1633b4b38223SJoonsoo Kim 		gfp_mask |= GFP_TRANSHUGE;
1634b4b38223SJoonsoo Kim 		order = HPAGE_PMD_ORDER;
1635b4b38223SJoonsoo Kim 	}
163619fc7bedSJoonsoo Kim 	zidx = zone_idx(page_zone(page));
163719fc7bedSJoonsoo Kim 	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1638b4b38223SJoonsoo Kim 		gfp_mask |= __GFP_HIGHMEM;
1639b4b38223SJoonsoo Kim 
164084172f4bSMatthew Wilcox (Oracle) 	new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
1641b4b38223SJoonsoo Kim 
1642b4b38223SJoonsoo Kim 	if (new_page && PageTransHuge(new_page))
1643b4b38223SJoonsoo Kim 		prep_transhuge_page(new_page);
1644b4b38223SJoonsoo Kim 
1645b4b38223SJoonsoo Kim 	return new_page;
1646b4b38223SJoonsoo Kim }
1647b4b38223SJoonsoo Kim 
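/*
 * Illustrative sketch, not part of the original source: using
 * alloc_migration_target() as the new_page_t callback together with a
 * migration_target_control that carries an explicit nodemask, so target
 * pages may come from any allowed node rather than one fixed nid.  The
 * name example_migrate_to_nodes() is hypothetical and used only here.
 */
static int __maybe_unused example_migrate_to_nodes(struct list_head *pagelist,
						   nodemask_t *allowed)
{
	struct migration_target_control mtc = {
		.nid = NUMA_NO_NODE,		/* prefer each page's own node */
		.nmask = allowed,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
	};
	int err;

	err = migrate_pages(pagelist, alloc_migration_target, NULL,
			    (unsigned long)&mtc, MIGRATE_SYNC,
			    MR_MEMORY_HOTPLUG, NULL);
	if (err)
		putback_movable_pages(pagelist);
	return err;
}
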
1648742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1649742755a1SChristoph Lameter 
1650a49bd4d7SMichal Hocko static int store_status(int __user *status, int start, int value, int nr)
1651742755a1SChristoph Lameter {
1652a49bd4d7SMichal Hocko 	while (nr-- > 0) {
1653a49bd4d7SMichal Hocko 		if (put_user(value, status + start))
1654a49bd4d7SMichal Hocko 			return -EFAULT;
1655a49bd4d7SMichal Hocko 		start++;
1656a49bd4d7SMichal Hocko 	}
1657742755a1SChristoph Lameter 
1658a49bd4d7SMichal Hocko 	return 0;
1659a49bd4d7SMichal Hocko }
1660742755a1SChristoph Lameter 
1661a49bd4d7SMichal Hocko static int do_move_pages_to_node(struct mm_struct *mm,
1662a49bd4d7SMichal Hocko 		struct list_head *pagelist, int node)
1663a49bd4d7SMichal Hocko {
1664a49bd4d7SMichal Hocko 	int err;
1665a0976311SJoonsoo Kim 	struct migration_target_control mtc = {
1666a0976311SJoonsoo Kim 		.nid = node,
1667a0976311SJoonsoo Kim 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1668a0976311SJoonsoo Kim 	};
1669742755a1SChristoph Lameter 
1670a0976311SJoonsoo Kim 	err = migrate_pages(pagelist, alloc_migration_target, NULL,
16715ac95884SYang Shi 		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1672a49bd4d7SMichal Hocko 	if (err)
1673a49bd4d7SMichal Hocko 		putback_movable_pages(pagelist);
1674a49bd4d7SMichal Hocko 	return err;
1675742755a1SChristoph Lameter }
1676742755a1SChristoph Lameter 
1677742755a1SChristoph Lameter /*
1678a49bd4d7SMichal Hocko  * Resolves the given address to a struct page, isolates it from the LRU and
1679a49bd4d7SMichal Hocko  * puts it on the given pagelist.
1680e0153fc2SYang Shi  * Returns:
1681e0153fc2SYang Shi  *     errno - if the page cannot be found/isolated
1682e0153fc2SYang Shi  *     0 - when it doesn't have to be migrated because it is already on the
1683e0153fc2SYang Shi  *         target node
1684e0153fc2SYang Shi  *     1 - when it has been queued
1685742755a1SChristoph Lameter  */
1686a49bd4d7SMichal Hocko static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1687a49bd4d7SMichal Hocko 		int node, struct list_head *pagelist, bool migrate_all)
1688742755a1SChristoph Lameter {
1689742755a1SChristoph Lameter 	struct vm_area_struct *vma;
1690742755a1SChristoph Lameter 	struct page *page;
1691e8db67ebSNaoya Horiguchi 	unsigned int follflags;
1692a49bd4d7SMichal Hocko 	int err;
1693742755a1SChristoph Lameter 
1694d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
1695742755a1SChristoph Lameter 	err = -EFAULT;
1696a49bd4d7SMichal Hocko 	vma = find_vma(mm, addr);
1697a49bd4d7SMichal Hocko 	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
1698a49bd4d7SMichal Hocko 		goto out;
1699742755a1SChristoph Lameter 
1700d899844eSKirill A. Shutemov 	/* FOLL_DUMP to ignore special (like zero) pages */
1701e8db67ebSNaoya Horiguchi 	follflags = FOLL_GET | FOLL_DUMP;
1702a49bd4d7SMichal Hocko 	page = follow_page(vma, addr, follflags);
170389f5b7daSLinus Torvalds 
170489f5b7daSLinus Torvalds 	err = PTR_ERR(page);
170589f5b7daSLinus Torvalds 	if (IS_ERR(page))
1706a49bd4d7SMichal Hocko 		goto out;
170789f5b7daSLinus Torvalds 
1708742755a1SChristoph Lameter 	err = -ENOENT;
1709742755a1SChristoph Lameter 	if (!page)
1710a49bd4d7SMichal Hocko 		goto out;
1711742755a1SChristoph Lameter 
1712a49bd4d7SMichal Hocko 	err = 0;
1713a49bd4d7SMichal Hocko 	if (page_to_nid(page) == node)
1714a49bd4d7SMichal Hocko 		goto out_putpage;
1715742755a1SChristoph Lameter 
1716742755a1SChristoph Lameter 	err = -EACCES;
1717a49bd4d7SMichal Hocko 	if (page_mapcount(page) > 1 && !migrate_all)
1718a49bd4d7SMichal Hocko 		goto out_putpage;
1719742755a1SChristoph Lameter 
1720e632a938SNaoya Horiguchi 	if (PageHuge(page)) {
1721e8db67ebSNaoya Horiguchi 		if (PageHead(page)) {
1722a49bd4d7SMichal Hocko 			isolate_huge_page(page, pagelist);
1723e0153fc2SYang Shi 			err = 1;
1724e8db67ebSNaoya Horiguchi 		}
1725a49bd4d7SMichal Hocko 	} else {
1726a49bd4d7SMichal Hocko 		struct page *head;
1727e632a938SNaoya Horiguchi 
1728e8db67ebSNaoya Horiguchi 		head = compound_head(page);
1729e8db67ebSNaoya Horiguchi 		err = isolate_lru_page(head);
1730a49bd4d7SMichal Hocko 		if (err)
1731a49bd4d7SMichal Hocko 			goto out_putpage;
1732a49bd4d7SMichal Hocko 
1733e0153fc2SYang Shi 		err = 1;
1734a49bd4d7SMichal Hocko 		list_add_tail(&head->lru, pagelist);
1735e8db67ebSNaoya Horiguchi 		mod_node_page_state(page_pgdat(head),
17369de4f22aSHuang Ying 			NR_ISOLATED_ANON + page_is_file_lru(head),
17376c357848SMatthew Wilcox (Oracle) 			thp_nr_pages(head));
17386d9c285aSKOSAKI Motohiro 	}
1739a49bd4d7SMichal Hocko out_putpage:
1740742755a1SChristoph Lameter 	/*
1741742755a1SChristoph Lameter 	 * Either remove the duplicate refcount from
1742742755a1SChristoph Lameter 	 * isolate_lru_page() or drop the page ref if it was
1743742755a1SChristoph Lameter 	 * not isolated.
1744742755a1SChristoph Lameter 	 */
1745742755a1SChristoph Lameter 	put_page(page);
1746a49bd4d7SMichal Hocko out:
1747d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
1748742755a1SChristoph Lameter 	return err;
1749742755a1SChristoph Lameter }
1750742755a1SChristoph Lameter 
17517ca8783aSWei Yang static int move_pages_and_store_status(struct mm_struct *mm, int node,
17527ca8783aSWei Yang 		struct list_head *pagelist, int __user *status,
17537ca8783aSWei Yang 		int start, int i, unsigned long nr_pages)
17547ca8783aSWei Yang {
17557ca8783aSWei Yang 	int err;
17567ca8783aSWei Yang 
17575d7ae891SWei Yang 	if (list_empty(pagelist))
17585d7ae891SWei Yang 		return 0;
17595d7ae891SWei Yang 
17607ca8783aSWei Yang 	err = do_move_pages_to_node(mm, pagelist, node);
17617ca8783aSWei Yang 	if (err) {
17627ca8783aSWei Yang 		/*
17637ca8783aSWei Yang 		 * Positive err means the number of pages that
17647ca8783aSWei Yang 		 * failed to migrate.  Since we are going to
17657ca8783aSWei Yang 		 * abort and return the number of non-migrated
1766ab9dd4f8SLong Li 		 * pages, we need to include the rest of the
17677ca8783aSWei Yang 		 * nr_pages that have not been attempted as
17687ca8783aSWei Yang 		 * well.
17697ca8783aSWei Yang 		 */
17707ca8783aSWei Yang 		if (err > 0)
17717ca8783aSWei Yang 			err += nr_pages - i - 1;
17727ca8783aSWei Yang 		return err;
17737ca8783aSWei Yang 	}
17747ca8783aSWei Yang 	return store_status(status, start, node, i - start);
17757ca8783aSWei Yang }
17767ca8783aSWei Yang 
1777742755a1SChristoph Lameter /*
17785e9a0f02SBrice Goglin  * Migrate an array of page address onto an array of nodes and fill
17795e9a0f02SBrice Goglin  * the corresponding array of status.
17805e9a0f02SBrice Goglin  */
17813268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
17825e9a0f02SBrice Goglin 			 unsigned long nr_pages,
17835e9a0f02SBrice Goglin 			 const void __user * __user *pages,
17845e9a0f02SBrice Goglin 			 const int __user *nodes,
17855e9a0f02SBrice Goglin 			 int __user *status, int flags)
17865e9a0f02SBrice Goglin {
1787a49bd4d7SMichal Hocko 	int current_node = NUMA_NO_NODE;
1788a49bd4d7SMichal Hocko 	LIST_HEAD(pagelist);
1789a49bd4d7SMichal Hocko 	int start, i;
1790a49bd4d7SMichal Hocko 	int err = 0, err1;
179135282a2dSBrice Goglin 
1792361a2a22SMinchan Kim 	lru_cache_disable();
179335282a2dSBrice Goglin 
1794a49bd4d7SMichal Hocko 	for (i = start = 0; i < nr_pages; i++) {
17955e9a0f02SBrice Goglin 		const void __user *p;
1796a49bd4d7SMichal Hocko 		unsigned long addr;
17975e9a0f02SBrice Goglin 		int node;
17985e9a0f02SBrice Goglin 
17993140a227SBrice Goglin 		err = -EFAULT;
1800a49bd4d7SMichal Hocko 		if (get_user(p, pages + i))
1801a49bd4d7SMichal Hocko 			goto out_flush;
1802a49bd4d7SMichal Hocko 		if (get_user(node, nodes + i))
1803a49bd4d7SMichal Hocko 			goto out_flush;
1804057d3389SAndrey Konovalov 		addr = (unsigned long)untagged_addr(p);
18055e9a0f02SBrice Goglin 
18065e9a0f02SBrice Goglin 		err = -ENODEV;
18076f5a55f1SLinus Torvalds 		if (node < 0 || node >= MAX_NUMNODES)
1808a49bd4d7SMichal Hocko 			goto out_flush;
1809389162c2SLai Jiangshan 		if (!node_state(node, N_MEMORY))
1810a49bd4d7SMichal Hocko 			goto out_flush;
18115e9a0f02SBrice Goglin 
18125e9a0f02SBrice Goglin 		err = -EACCES;
18135e9a0f02SBrice Goglin 		if (!node_isset(node, task_nodes))
1814a49bd4d7SMichal Hocko 			goto out_flush;
18155e9a0f02SBrice Goglin 
1816a49bd4d7SMichal Hocko 		if (current_node == NUMA_NO_NODE) {
1817a49bd4d7SMichal Hocko 			current_node = node;
1818a49bd4d7SMichal Hocko 			start = i;
1819a49bd4d7SMichal Hocko 		} else if (node != current_node) {
18207ca8783aSWei Yang 			err = move_pages_and_store_status(mm, current_node,
18217ca8783aSWei Yang 					&pagelist, status, start, i, nr_pages);
1822a49bd4d7SMichal Hocko 			if (err)
1823a49bd4d7SMichal Hocko 				goto out;
1824a49bd4d7SMichal Hocko 			start = i;
1825a49bd4d7SMichal Hocko 			current_node = node;
18265e9a0f02SBrice Goglin 		}
18275e9a0f02SBrice Goglin 
1828a49bd4d7SMichal Hocko 		/*
1829a49bd4d7SMichal Hocko 		 * Errors in the page lookup or isolation are not fatal and we simply
1830a49bd4d7SMichal Hocko 		 * report them via status
1831a49bd4d7SMichal Hocko 		 */
1832a49bd4d7SMichal Hocko 		err = add_page_for_migration(mm, addr, current_node,
1833a49bd4d7SMichal Hocko 				&pagelist, flags & MPOL_MF_MOVE_ALL);
1834e0153fc2SYang Shi 
1835d08221a0SWei Yang 		if (err > 0) {
1836e0153fc2SYang Shi 			/* The page is successfully queued for migration */
1837e0153fc2SYang Shi 			continue;
1838e0153fc2SYang Shi 		}
18393140a227SBrice Goglin 
1840d08221a0SWei Yang 		/*
1841d08221a0SWei Yang 		 * If the page is already on the target node (!err), store the
1842d08221a0SWei Yang 		 * node, otherwise, store the err.
1843d08221a0SWei Yang 		 */
1844d08221a0SWei Yang 		err = store_status(status, i, err ? : current_node, 1);
1845a49bd4d7SMichal Hocko 		if (err)
1846a49bd4d7SMichal Hocko 			goto out_flush;
18473140a227SBrice Goglin 
18487ca8783aSWei Yang 		err = move_pages_and_store_status(mm, current_node, &pagelist,
18497ca8783aSWei Yang 				status, start, i, nr_pages);
1850a49bd4d7SMichal Hocko 		if (err)
1851a49bd4d7SMichal Hocko 			goto out;
1852a49bd4d7SMichal Hocko 		current_node = NUMA_NO_NODE;
18533140a227SBrice Goglin 	}
1854a49bd4d7SMichal Hocko out_flush:
1855a49bd4d7SMichal Hocko 	/* Make sure we do not overwrite the existing error */
18567ca8783aSWei Yang 	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
18577ca8783aSWei Yang 				status, start, i, nr_pages);
1858dfe9aa23SWei Yang 	if (err >= 0)
1859a49bd4d7SMichal Hocko 		err = err1;
18605e9a0f02SBrice Goglin out:
1861361a2a22SMinchan Kim 	lru_cache_enable();
18625e9a0f02SBrice Goglin 	return err;
18635e9a0f02SBrice Goglin }
18645e9a0f02SBrice Goglin 
18655e9a0f02SBrice Goglin /*
18662f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status values.
1867742755a1SChristoph Lameter  */
186880bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
186980bba129SBrice Goglin 				const void __user **pages, int *status)
1870742755a1SChristoph Lameter {
18712f007e74SBrice Goglin 	unsigned long i;
1872742755a1SChristoph Lameter 
1873d8ed45c5SMichel Lespinasse 	mmap_read_lock(mm);
18742f007e74SBrice Goglin 
18752f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
187680bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
18772f007e74SBrice Goglin 		struct vm_area_struct *vma;
18782f007e74SBrice Goglin 		struct page *page;
1879c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
18802f007e74SBrice Goglin 
1881059b8b48SLiam Howlett 		vma = vma_lookup(mm, addr);
1882059b8b48SLiam Howlett 		if (!vma)
1883742755a1SChristoph Lameter 			goto set_status;
1884742755a1SChristoph Lameter 
1885d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
1886d899844eSKirill A. Shutemov 		page = follow_page(vma, addr, FOLL_DUMP);
188789f5b7daSLinus Torvalds 
188889f5b7daSLinus Torvalds 		err = PTR_ERR(page);
188989f5b7daSLinus Torvalds 		if (IS_ERR(page))
189089f5b7daSLinus Torvalds 			goto set_status;
189189f5b7daSLinus Torvalds 
1892d899844eSKirill A. Shutemov 		err = page ? page_to_nid(page) : -ENOENT;
1893742755a1SChristoph Lameter set_status:
189480bba129SBrice Goglin 		*status = err;
189580bba129SBrice Goglin 
189680bba129SBrice Goglin 		pages++;
189780bba129SBrice Goglin 		status++;
189880bba129SBrice Goglin 	}
189980bba129SBrice Goglin 
1900d8ed45c5SMichel Lespinasse 	mmap_read_unlock(mm);
190180bba129SBrice Goglin }
190280bba129SBrice Goglin 
1903*5b1b561bSArnd Bergmann static int get_compat_pages_array(const void __user *chunk_pages[],
1904*5b1b561bSArnd Bergmann 				  const void __user * __user *pages,
1905*5b1b561bSArnd Bergmann 				  unsigned long chunk_nr)
1906*5b1b561bSArnd Bergmann {
1907*5b1b561bSArnd Bergmann 	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
1908*5b1b561bSArnd Bergmann 	compat_uptr_t p;
1909*5b1b561bSArnd Bergmann 	int i;
1910*5b1b561bSArnd Bergmann 
1911*5b1b561bSArnd Bergmann 	for (i = 0; i < chunk_nr; i++) {
1912*5b1b561bSArnd Bergmann 		if (get_user(p, pages32 + i))
1913*5b1b561bSArnd Bergmann 			return -EFAULT;
1914*5b1b561bSArnd Bergmann 		chunk_pages[i] = compat_ptr(p);
1915*5b1b561bSArnd Bergmann 	}
1916*5b1b561bSArnd Bergmann 
1917*5b1b561bSArnd Bergmann 	return 0;
1918*5b1b561bSArnd Bergmann }
1919*5b1b561bSArnd Bergmann 
192080bba129SBrice Goglin /*
192180bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
192280bba129SBrice Goglin  * a user array of status values.
192380bba129SBrice Goglin  */
192480bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
192580bba129SBrice Goglin 			 const void __user * __user *pages,
192680bba129SBrice Goglin 			 int __user *status)
192780bba129SBrice Goglin {
192880bba129SBrice Goglin #define DO_PAGES_STAT_CHUNK_NR 16
192980bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
193080bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
193180bba129SBrice Goglin 
193287b8d1adSH. Peter Anvin 	while (nr_pages) {
193387b8d1adSH. Peter Anvin 		unsigned long chunk_nr;
193480bba129SBrice Goglin 
193587b8d1adSH. Peter Anvin 		chunk_nr = nr_pages;
193687b8d1adSH. Peter Anvin 		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
193787b8d1adSH. Peter Anvin 			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
193887b8d1adSH. Peter Anvin 
1939*5b1b561bSArnd Bergmann 		if (in_compat_syscall()) {
1940*5b1b561bSArnd Bergmann 			if (get_compat_pages_array(chunk_pages, pages,
1941*5b1b561bSArnd Bergmann 						   chunk_nr))
194287b8d1adSH. Peter Anvin 				break;
1943*5b1b561bSArnd Bergmann 		} else {
1944*5b1b561bSArnd Bergmann 			if (copy_from_user(chunk_pages, pages,
1945*5b1b561bSArnd Bergmann 				      chunk_nr * sizeof(*chunk_pages)))
1946*5b1b561bSArnd Bergmann 				break;
1947*5b1b561bSArnd Bergmann 		}
194880bba129SBrice Goglin 
194980bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
195080bba129SBrice Goglin 
195187b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
195287b8d1adSH. Peter Anvin 			break;
1953742755a1SChristoph Lameter 
195487b8d1adSH. Peter Anvin 		pages += chunk_nr;
195587b8d1adSH. Peter Anvin 		status += chunk_nr;
195687b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
195787b8d1adSH. Peter Anvin 	}
195887b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
1959742755a1SChristoph Lameter }
1960742755a1SChristoph Lameter 
19614dc200ceSMiaohe Lin static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
19624dc200ceSMiaohe Lin {
19634dc200ceSMiaohe Lin 	struct task_struct *task;
19644dc200ceSMiaohe Lin 	struct mm_struct *mm;
19654dc200ceSMiaohe Lin 
19664dc200ceSMiaohe Lin 	/*
19674dc200ceSMiaohe Lin 	 * There is no need to check if the current process has the right to
19684dc200ceSMiaohe Lin 	 * modify the specified process when they are the same.
19694dc200ceSMiaohe Lin 	 */
19704dc200ceSMiaohe Lin 	if (!pid) {
19714dc200ceSMiaohe Lin 		mmget(current->mm);
19724dc200ceSMiaohe Lin 		*mem_nodes = cpuset_mems_allowed(current);
19734dc200ceSMiaohe Lin 		return current->mm;
19744dc200ceSMiaohe Lin 	}
19754dc200ceSMiaohe Lin 
19764dc200ceSMiaohe Lin 	/* Find the mm_struct */
19774dc200ceSMiaohe Lin 	rcu_read_lock();
19784dc200ceSMiaohe Lin 	task = find_task_by_vpid(pid);
19794dc200ceSMiaohe Lin 	if (!task) {
19804dc200ceSMiaohe Lin 		rcu_read_unlock();
19814dc200ceSMiaohe Lin 		return ERR_PTR(-ESRCH);
19824dc200ceSMiaohe Lin 	}
19834dc200ceSMiaohe Lin 	get_task_struct(task);
19844dc200ceSMiaohe Lin 
19854dc200ceSMiaohe Lin 	/*
19864dc200ceSMiaohe Lin 	 * Check if this process has the right to modify the specified
19874dc200ceSMiaohe Lin 	 * process. Use the regular "ptrace_may_access()" checks.
19884dc200ceSMiaohe Lin 	 */
19894dc200ceSMiaohe Lin 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
19904dc200ceSMiaohe Lin 		rcu_read_unlock();
19914dc200ceSMiaohe Lin 		mm = ERR_PTR(-EPERM);
19924dc200ceSMiaohe Lin 		goto out;
19934dc200ceSMiaohe Lin 	}
19944dc200ceSMiaohe Lin 	rcu_read_unlock();
19954dc200ceSMiaohe Lin 
19964dc200ceSMiaohe Lin 	mm = ERR_PTR(security_task_movememory(task));
19974dc200ceSMiaohe Lin 	if (IS_ERR(mm))
19984dc200ceSMiaohe Lin 		goto out;
19994dc200ceSMiaohe Lin 	*mem_nodes = cpuset_mems_allowed(task);
20004dc200ceSMiaohe Lin 	mm = get_task_mm(task);
20014dc200ceSMiaohe Lin out:
20024dc200ceSMiaohe Lin 	put_task_struct(task);
20034dc200ceSMiaohe Lin 	if (!mm)
20044dc200ceSMiaohe Lin 		mm = ERR_PTR(-EINVAL);
20054dc200ceSMiaohe Lin 	return mm;
20064dc200ceSMiaohe Lin }
20074dc200ceSMiaohe Lin 
2008742755a1SChristoph Lameter /*
2009742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
2010742755a1SChristoph Lameter  * process.
2011742755a1SChristoph Lameter  */
20127addf443SDominik Brodowski static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
20137addf443SDominik Brodowski 			     const void __user * __user *pages,
20147addf443SDominik Brodowski 			     const int __user *nodes,
20157addf443SDominik Brodowski 			     int __user *status, int flags)
2016742755a1SChristoph Lameter {
2017742755a1SChristoph Lameter 	struct mm_struct *mm;
20185e9a0f02SBrice Goglin 	int err;
20193268c63eSChristoph Lameter 	nodemask_t task_nodes;
2020742755a1SChristoph Lameter 
2021742755a1SChristoph Lameter 	/* Check flags */
2022742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2023742755a1SChristoph Lameter 		return -EINVAL;
2024742755a1SChristoph Lameter 
2025742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2026742755a1SChristoph Lameter 		return -EPERM;
2027742755a1SChristoph Lameter 
20284dc200ceSMiaohe Lin 	mm = find_mm_struct(pid, &task_nodes);
20294dc200ceSMiaohe Lin 	if (IS_ERR(mm))
20304dc200ceSMiaohe Lin 		return PTR_ERR(mm);
20316e8b09eaSSasha Levin 
20323268c63eSChristoph Lameter 	if (nodes)
20333268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
20343268c63eSChristoph Lameter 				    nodes, status, flags);
20353268c63eSChristoph Lameter 	else
20365e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
20373268c63eSChristoph Lameter 
20383268c63eSChristoph Lameter 	mmput(mm);
20393268c63eSChristoph Lameter 	return err;
2040742755a1SChristoph Lameter }
2041742755a1SChristoph Lameter 
20427addf443SDominik Brodowski SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
20437addf443SDominik Brodowski 		const void __user * __user *, pages,
20447addf443SDominik Brodowski 		const int __user *, nodes,
20457addf443SDominik Brodowski 		int __user *, status, int, flags)
20467addf443SDominik Brodowski {
20477addf443SDominik Brodowski 	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
20487addf443SDominik Brodowski }
20497addf443SDominik Brodowski 
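/*
 * Illustrative userspace sketch, not part of this file: the move_pages(2)
 * syscall implemented above is normally reached through the libnuma
 * wrapper declared in <numaif.h>.  A minimal (assumed) usage pattern,
 * moving one page of the calling process to node 1 and reading back its
 * status, would look roughly like this:
 *
 *	#include <numaif.h>
 *
 *	void *pages[1] = { addr };        // page-aligned user address
 *	int nodes[1]   = { 1 };           // desired target node
 *	int status[1];
 *
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *	// rc < 0: error; otherwise status[0] holds the node id or a
 *	// negative errno for that page.
 *
 * Passing nodes == NULL turns the call into a pure query that only fills
 * in status[], which is the do_pages_stat() path above.
 */
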
20507addf443SDominik Brodowski #ifdef CONFIG_COMPAT
20517addf443SDominik Brodowski COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
2052*5b1b561bSArnd Bergmann 		       compat_uptr_t __user *, pages,
20537addf443SDominik Brodowski 		       const int __user *, nodes,
20547addf443SDominik Brodowski 		       int __user *, status,
20557addf443SDominik Brodowski 		       int, flags)
20567addf443SDominik Brodowski {
2057*5b1b561bSArnd Bergmann 	return kernel_move_pages(pid, nr_pages,
2058*5b1b561bSArnd Bergmann 				 (const void __user *__user *)pages,
2059*5b1b561bSArnd Bergmann 				 nodes, status, flags);
20607addf443SDominik Brodowski }
20617addf443SDominik Brodowski #endif /* CONFIG_COMPAT */
20627addf443SDominik Brodowski 
20637039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING
20647039e1dbSPeter Zijlstra /*
20657039e1dbSPeter Zijlstra  * Returns true if this is a safe migration target node for misplaced NUMA
20667039e1dbSPeter Zijlstra  * pages. Currently it only checks the watermarks, which is crude.
20677039e1dbSPeter Zijlstra  */
20687039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
20693abef4e6SMel Gorman 				   unsigned long nr_migrate_pages)
20707039e1dbSPeter Zijlstra {
20717039e1dbSPeter Zijlstra 	int z;
2072599d0c95SMel Gorman 
20737039e1dbSPeter Zijlstra 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
20747039e1dbSPeter Zijlstra 		struct zone *zone = pgdat->node_zones + z;
20757039e1dbSPeter Zijlstra 
20767039e1dbSPeter Zijlstra 		if (!populated_zone(zone))
20777039e1dbSPeter Zijlstra 			continue;
20787039e1dbSPeter Zijlstra 
20797039e1dbSPeter Zijlstra 		/* Avoid waking kswapd by allocating nr_migrate_pages pages. */
20807039e1dbSPeter Zijlstra 		if (!zone_watermark_ok(zone, 0,
20817039e1dbSPeter Zijlstra 				       high_wmark_pages(zone) +
20827039e1dbSPeter Zijlstra 				       nr_migrate_pages,
2083bfe9d006SHuang Ying 				       ZONE_MOVABLE, 0))
20847039e1dbSPeter Zijlstra 			continue;
20857039e1dbSPeter Zijlstra 		return true;
20867039e1dbSPeter Zijlstra 	}
20877039e1dbSPeter Zijlstra 	return false;
20887039e1dbSPeter Zijlstra }
20897039e1dbSPeter Zijlstra 
20907039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page,
2091666feb21SMichal Hocko 					   unsigned long data)
20927039e1dbSPeter Zijlstra {
20937039e1dbSPeter Zijlstra 	int nid = (int) data;
20947039e1dbSPeter Zijlstra 	struct page *newpage;
20957039e1dbSPeter Zijlstra 
209696db800fSVlastimil Babka 	newpage = __alloc_pages_node(nid,
2097e97ca8e5SJohannes Weiner 					 (GFP_HIGHUSER_MOVABLE |
2098e97ca8e5SJohannes Weiner 					  __GFP_THISNODE | __GFP_NOMEMALLOC |
2099e97ca8e5SJohannes Weiner 					  __GFP_NORETRY | __GFP_NOWARN) &
21008479eba7SMel Gorman 					 ~__GFP_RECLAIM, 0);
2101bac0382cSHillf Danton 
21027039e1dbSPeter Zijlstra 	return newpage;
21037039e1dbSPeter Zijlstra }
21047039e1dbSPeter Zijlstra 
2105c5b5a3ddSYang Shi static struct page *alloc_misplaced_dst_page_thp(struct page *page,
2106c5b5a3ddSYang Shi 						 unsigned long data)
2107c5b5a3ddSYang Shi {
2108c5b5a3ddSYang Shi 	int nid = (int) data;
2109c5b5a3ddSYang Shi 	struct page *newpage;
2110c5b5a3ddSYang Shi 
2111c5b5a3ddSYang Shi 	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
2112c5b5a3ddSYang Shi 				   HPAGE_PMD_ORDER);
2113c5b5a3ddSYang Shi 	if (!newpage)
2114c5b5a3ddSYang Shi 		goto out;
2115c5b5a3ddSYang Shi 
2116c5b5a3ddSYang Shi 	prep_transhuge_page(newpage);
2117c5b5a3ddSYang Shi 
2118c5b5a3ddSYang Shi out:
2119c5b5a3ddSYang Shi 	return newpage;
2120c5b5a3ddSYang Shi }
2121c5b5a3ddSYang Shi 
21221c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
2123b32967ffSMel Gorman {
2124340ef390SHugh Dickins 	int page_lru;
21252b9b624fSBaolin Wang 	int nr_pages = thp_nr_pages(page);
2126b32967ffSMel Gorman 
2127309381feSSasha Levin 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
21283abef4e6SMel Gorman 
2129662aeea7SYang Shi 	/* Do not migrate THP mapped by multiple processes */
2130662aeea7SYang Shi 	if (PageTransHuge(page) && total_mapcount(page) > 1)
2131662aeea7SYang Shi 		return 0;
2132662aeea7SYang Shi 
2133b32967ffSMel Gorman 	/* Avoid migrating to a node that is nearly full */
21342b9b624fSBaolin Wang 	if (!migrate_balanced_pgdat(pgdat, nr_pages))
2135340ef390SHugh Dickins 		return 0;
2136b32967ffSMel Gorman 
2137340ef390SHugh Dickins 	if (isolate_lru_page(page))
2138340ef390SHugh Dickins 		return 0;
2139340ef390SHugh Dickins 
21409de4f22aSHuang Ying 	page_lru = page_is_file_lru(page);
2141599d0c95SMel Gorman 	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
21422b9b624fSBaolin Wang 			    nr_pages);
2143b32967ffSMel Gorman 
2144b32967ffSMel Gorman 	/*
2145340ef390SHugh Dickins 	 * Isolating the page has taken another reference, so the
2146340ef390SHugh Dickins 	 * caller's reference can be safely dropped without the page
2147340ef390SHugh Dickins 	 * disappearing underneath us during migration.
2148b32967ffSMel Gorman 	 */
2149b32967ffSMel Gorman 	put_page(page);
2150340ef390SHugh Dickins 	return 1;
2151b32967ffSMel Gorman }
2152b32967ffSMel Gorman 
2153a8f60772SMel Gorman /*
21547039e1dbSPeter Zijlstra  * Attempt to migrate a misplaced page to the specified destination
21557039e1dbSPeter Zijlstra  * node. Caller is expected to have an elevated reference count on
21567039e1dbSPeter Zijlstra  * the page that will be dropped by this function before returning.
21577039e1dbSPeter Zijlstra  */
21581bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
21591bc115d8SMel Gorman 			   int node)
21607039e1dbSPeter Zijlstra {
2161a8f60772SMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
2162340ef390SHugh Dickins 	int isolated;
2163b32967ffSMel Gorman 	int nr_remaining;
21647039e1dbSPeter Zijlstra 	LIST_HEAD(migratepages);
2165c5b5a3ddSYang Shi 	new_page_t *new;
2166c5b5a3ddSYang Shi 	bool compound;
2167b5916c02SAneesh Kumar K.V 	int nr_pages = thp_nr_pages(page);
2168c5b5a3ddSYang Shi 
2169c5b5a3ddSYang Shi 	/*
2170c5b5a3ddSYang Shi 	 * A PTE-mapped THP or HugeTLB page can't reach here, so the page is
2171c5b5a3ddSYang Shi 	 * either a base page or a THP.  And it must be the head page if it
2172c5b5a3ddSYang Shi 	 * is a THP.
2173c5b5a3ddSYang Shi 	 */
2174c5b5a3ddSYang Shi 	compound = PageTransHuge(page);
2175c5b5a3ddSYang Shi 
2176c5b5a3ddSYang Shi 	if (compound)
2177c5b5a3ddSYang Shi 		new = alloc_misplaced_dst_page_thp;
2178c5b5a3ddSYang Shi 	else
2179c5b5a3ddSYang Shi 		new = alloc_misplaced_dst_page;
21807039e1dbSPeter Zijlstra 
21817039e1dbSPeter Zijlstra 	/*
21821bc115d8SMel Gorman 	 * Don't migrate file pages that are mapped in multiple processes
21831bc115d8SMel Gorman 	 * with execute permissions as they are probably shared libraries.
21847039e1dbSPeter Zijlstra 	 */
21857ee820eeSMiaohe Lin 	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
21867ee820eeSMiaohe Lin 	    (vma->vm_flags & VM_EXEC))
21877039e1dbSPeter Zijlstra 		goto out;
21887039e1dbSPeter Zijlstra 
2189a8f60772SMel Gorman 	/*
219009a913a7SMel Gorman 	 * Also do not migrate dirty pages: not all filesystems can move
219109a913a7SMel Gorman 	 * dirty pages in MIGRATE_ASYNC mode, so trying would be a waste of cycles.
219209a913a7SMel Gorman 	 */
21939de4f22aSHuang Ying 	if (page_is_file_lru(page) && PageDirty(page))
219409a913a7SMel Gorman 		goto out;
219509a913a7SMel Gorman 
2196b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
2197b32967ffSMel Gorman 	if (!isolated)
21987039e1dbSPeter Zijlstra 		goto out;
21997039e1dbSPeter Zijlstra 
22007039e1dbSPeter Zijlstra 	list_add(&page->lru, &migratepages);
2201c5b5a3ddSYang Shi 	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
22025ac95884SYang Shi 				     MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
22037039e1dbSPeter Zijlstra 	if (nr_remaining) {
220459c82b70SJoonsoo Kim 		if (!list_empty(&migratepages)) {
220559c82b70SJoonsoo Kim 			list_del(&page->lru);
2206c5fc5c3aSYang Shi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2207c5fc5c3aSYang Shi 					page_is_file_lru(page), -nr_pages);
220859c82b70SJoonsoo Kim 			putback_lru_page(page);
220959c82b70SJoonsoo Kim 		}
22107039e1dbSPeter Zijlstra 		isolated = 0;
221103c5a6e1SMel Gorman 	} else
2212c5fc5c3aSYang Shi 		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
22137039e1dbSPeter Zijlstra 	BUG_ON(!list_empty(&migratepages));
22147039e1dbSPeter Zijlstra 	return isolated;
2215340ef390SHugh Dickins 
2216340ef390SHugh Dickins out:
2217340ef390SHugh Dickins 	put_page(page);
2218340ef390SHugh Dickins 	return 0;
22197039e1dbSPeter Zijlstra }
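
/*
 * Illustrative sketch (not part of the original file): a NUMA hinting
 * fault handler is expected to call migrate_misplaced_page() roughly as
 * below, holding an elevated reference on @page that the callee (or the
 * early-return path) drops.  The helper name and its caller are
 * hypothetical.
 */
static inline bool numa_hint_fault_try_migrate(struct page *page,
					       struct vm_area_struct *vma,
					       int target_nid)
{
	/* Nothing to do when no target was chosen or the page is already there. */
	if (target_nid == NUMA_NO_NODE || target_nid == page_to_nid(page)) {
		put_page(page);
		return false;
	}

	/* Returns 1 if the page was isolated and successfully migrated. */
	return migrate_misplaced_page(page, vma, target_nid) != 0;
}
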
2220220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */
22217039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA */
22228763cb45SJérôme Glisse 
22239b2ed9cbSChristoph Hellwig #ifdef CONFIG_DEVICE_PRIVATE
22248315ada7SJérôme Glisse static int migrate_vma_collect_skip(unsigned long start,
22258315ada7SJérôme Glisse 				    unsigned long end,
22268315ada7SJérôme Glisse 				    struct mm_walk *walk)
22278315ada7SJérôme Glisse {
22288315ada7SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
22298315ada7SJérôme Glisse 	unsigned long addr;
22308315ada7SJérôme Glisse 
2231872ea707SRalph Campbell 	for (addr = start; addr < end; addr += PAGE_SIZE) {
22328763cb45SJérôme Glisse 		migrate->dst[migrate->npages] = 0;
22338763cb45SJérôme Glisse 		migrate->src[migrate->npages++] = 0;
22348763cb45SJérôme Glisse 	}
22358763cb45SJérôme Glisse 
22368763cb45SJérôme Glisse 	return 0;
22378763cb45SJérôme Glisse }
22388763cb45SJérôme Glisse 
2239843e1be1SMiaohe Lin static int migrate_vma_collect_hole(unsigned long start,
2240843e1be1SMiaohe Lin 				    unsigned long end,
2241843e1be1SMiaohe Lin 				    __always_unused int depth,
2242843e1be1SMiaohe Lin 				    struct mm_walk *walk)
2243843e1be1SMiaohe Lin {
2244843e1be1SMiaohe Lin 	struct migrate_vma *migrate = walk->private;
2245843e1be1SMiaohe Lin 	unsigned long addr;
2246843e1be1SMiaohe Lin 
2247843e1be1SMiaohe Lin 	/* Only allow populating anonymous memory. */
2248843e1be1SMiaohe Lin 	if (!vma_is_anonymous(walk->vma))
2249843e1be1SMiaohe Lin 		return migrate_vma_collect_skip(start, end, walk);
2250843e1be1SMiaohe Lin 
2251843e1be1SMiaohe Lin 	for (addr = start; addr < end; addr += PAGE_SIZE) {
2252843e1be1SMiaohe Lin 		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
2253843e1be1SMiaohe Lin 		migrate->dst[migrate->npages] = 0;
2254843e1be1SMiaohe Lin 		migrate->npages++;
2255843e1be1SMiaohe Lin 		migrate->cpages++;
2256843e1be1SMiaohe Lin 	}
2257843e1be1SMiaohe Lin 
2258843e1be1SMiaohe Lin 	return 0;
2259843e1be1SMiaohe Lin }
2260843e1be1SMiaohe Lin 
22618763cb45SJérôme Glisse static int migrate_vma_collect_pmd(pmd_t *pmdp,
22628763cb45SJérôme Glisse 				   unsigned long start,
22638763cb45SJérôme Glisse 				   unsigned long end,
22648763cb45SJérôme Glisse 				   struct mm_walk *walk)
22658763cb45SJérôme Glisse {
22668763cb45SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
22678763cb45SJérôme Glisse 	struct vm_area_struct *vma = walk->vma;
22688763cb45SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
22698c3328f1SJérôme Glisse 	unsigned long addr = start, unmapped = 0;
22708763cb45SJérôme Glisse 	spinlock_t *ptl;
22718763cb45SJérôme Glisse 	pte_t *ptep;
22728763cb45SJérôme Glisse 
22738763cb45SJérôme Glisse again:
22748763cb45SJérôme Glisse 	if (pmd_none(*pmdp))
2275b7a16c7aSSteven Price 		return migrate_vma_collect_hole(start, end, -1, walk);
22768763cb45SJérôme Glisse 
22778763cb45SJérôme Glisse 	if (pmd_trans_huge(*pmdp)) {
22788763cb45SJérôme Glisse 		struct page *page;
22798763cb45SJérôme Glisse 
22808763cb45SJérôme Glisse 		ptl = pmd_lock(mm, pmdp);
22818763cb45SJérôme Glisse 		if (unlikely(!pmd_trans_huge(*pmdp))) {
22828763cb45SJérôme Glisse 			spin_unlock(ptl);
22838763cb45SJérôme Glisse 			goto again;
22848763cb45SJérôme Glisse 		}
22858763cb45SJérôme Glisse 
22868763cb45SJérôme Glisse 		page = pmd_page(*pmdp);
22878763cb45SJérôme Glisse 		if (is_huge_zero_page(page)) {
22888763cb45SJérôme Glisse 			spin_unlock(ptl);
22898763cb45SJérôme Glisse 			split_huge_pmd(vma, pmdp, addr);
22908763cb45SJérôme Glisse 			if (pmd_trans_unstable(pmdp))
22918315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
22928763cb45SJérôme Glisse 								walk);
22938763cb45SJérôme Glisse 		} else {
22948763cb45SJérôme Glisse 			int ret;
22958763cb45SJérôme Glisse 
22968763cb45SJérôme Glisse 			get_page(page);
22978763cb45SJérôme Glisse 			spin_unlock(ptl);
22988763cb45SJérôme Glisse 			if (unlikely(!trylock_page(page)))
22998315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
23008763cb45SJérôme Glisse 								walk);
23018763cb45SJérôme Glisse 			ret = split_huge_page(page);
23028763cb45SJérôme Glisse 			unlock_page(page);
23038763cb45SJérôme Glisse 			put_page(page);
23048315ada7SJérôme Glisse 			if (ret)
23058315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
23068315ada7SJérôme Glisse 								walk);
23078315ada7SJérôme Glisse 			if (pmd_none(*pmdp))
2308b7a16c7aSSteven Price 				return migrate_vma_collect_hole(start, end, -1,
23098763cb45SJérôme Glisse 								walk);
23108763cb45SJérôme Glisse 		}
23118763cb45SJérôme Glisse 	}
23128763cb45SJérôme Glisse 
23138763cb45SJérôme Glisse 	if (unlikely(pmd_bad(*pmdp)))
23148315ada7SJérôme Glisse 		return migrate_vma_collect_skip(start, end, walk);
23158763cb45SJérôme Glisse 
23168763cb45SJérôme Glisse 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
23178c3328f1SJérôme Glisse 	arch_enter_lazy_mmu_mode();
23188c3328f1SJérôme Glisse 
23198763cb45SJérôme Glisse 	for (; addr < end; addr += PAGE_SIZE, ptep++) {
2320800bb1c8SChristoph Hellwig 		unsigned long mpfn = 0, pfn;
23218763cb45SJérôme Glisse 		struct page *page;
23228c3328f1SJérôme Glisse 		swp_entry_t entry;
23238763cb45SJérôme Glisse 		pte_t pte;
23248763cb45SJérôme Glisse 
23258763cb45SJérôme Glisse 		pte = *ptep;
23268763cb45SJérôme Glisse 
2327a5430ddaSJérôme Glisse 		if (pte_none(pte)) {
23280744f280SRalph Campbell 			if (vma_is_anonymous(vma)) {
23298315ada7SJérôme Glisse 				mpfn = MIGRATE_PFN_MIGRATE;
23308315ada7SJérôme Glisse 				migrate->cpages++;
23310744f280SRalph Campbell 			}
23328763cb45SJérôme Glisse 			goto next;
23338763cb45SJérôme Glisse 		}
23348763cb45SJérôme Glisse 
2335a5430ddaSJérôme Glisse 		if (!pte_present(pte)) {
2336a5430ddaSJérôme Glisse 			/*
2337a5430ddaSJérôme Glisse 			 * Only care about the special page table entries of
2338a5430ddaSJérôme Glisse 			 * unaddressable device pages. Other special swap entries
2339a5430ddaSJérôme Glisse 			 * are not migratable, and regular swapped pages are ignored.
2340a5430ddaSJérôme Glisse 			 */
2341a5430ddaSJérôme Glisse 			entry = pte_to_swp_entry(pte);
2342a5430ddaSJérôme Glisse 			if (!is_device_private_entry(entry))
2343a5430ddaSJérôme Glisse 				goto next;
2344a5430ddaSJérôme Glisse 
2345af5cdaf8SAlistair Popple 			page = pfn_swap_entry_to_page(entry);
23465143192cSRalph Campbell 			if (!(migrate->flags &
23475143192cSRalph Campbell 				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
23485143192cSRalph Campbell 			    page->pgmap->owner != migrate->pgmap_owner)
2349800bb1c8SChristoph Hellwig 				goto next;
2350800bb1c8SChristoph Hellwig 
2351a5430ddaSJérôme Glisse 			mpfn = migrate_pfn(page_to_pfn(page)) |
235206d462beSChristoph Hellwig 					MIGRATE_PFN_MIGRATE;
23534dd845b5SAlistair Popple 			if (is_writable_device_private_entry(entry))
2354a5430ddaSJérôme Glisse 				mpfn |= MIGRATE_PFN_WRITE;
2355a5430ddaSJérôme Glisse 		} else {
23565143192cSRalph Campbell 			if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
2357800bb1c8SChristoph Hellwig 				goto next;
2358276f756dSPingfan Liu 			pfn = pte_pfn(pte);
23598315ada7SJérôme Glisse 			if (is_zero_pfn(pfn)) {
23608315ada7SJérôme Glisse 				mpfn = MIGRATE_PFN_MIGRATE;
23618315ada7SJérôme Glisse 				migrate->cpages++;
23628315ada7SJérôme Glisse 				goto next;
23638315ada7SJérôme Glisse 			}
236425b2995aSChristoph Hellwig 			page = vm_normal_page(migrate->vma, addr, pte);
2365a5430ddaSJérôme Glisse 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2366a5430ddaSJérôme Glisse 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2367a5430ddaSJérôme Glisse 		}
2368a5430ddaSJérôme Glisse 
2369a5430ddaSJérôme Glisse 		/* FIXME support THP */
23708763cb45SJérôme Glisse 		if (!page || !page->mapping || PageTransCompound(page)) {
2371276f756dSPingfan Liu 			mpfn = 0;
23728763cb45SJérôme Glisse 			goto next;
23738763cb45SJérôme Glisse 		}
23748763cb45SJérôme Glisse 
23758763cb45SJérôme Glisse 		/*
23768763cb45SJérôme Glisse 		 * By getting a reference on the page we pin it and that blocks
23778763cb45SJérôme Glisse 		 * any kind of migration. Side effect is that it "freezes" the
23788763cb45SJérôme Glisse 		 * pte.
23798763cb45SJérôme Glisse 		 *
23808763cb45SJérôme Glisse 		 * We drop this reference after isolating the page from the lru
23818763cb45SJérôme Glisse 		 * for non-device pages (device pages are not on the lru and thus
23828763cb45SJérôme Glisse 		 * can't be dropped from it).
23838763cb45SJérôme Glisse 		 */
23848763cb45SJérôme Glisse 		get_page(page);
23858763cb45SJérôme Glisse 		migrate->cpages++;
23868763cb45SJérôme Glisse 
23878c3328f1SJérôme Glisse 		/*
23888c3328f1SJérôme Glisse 		 * Optimize for the common case where the page is only mapped once
23898c3328f1SJérôme Glisse 		 * in one process. If we can lock the page, then we can safely
23908c3328f1SJérôme Glisse 		 * set up a special migration page table entry now.
23918c3328f1SJérôme Glisse 		 */
23928c3328f1SJérôme Glisse 		if (trylock_page(page)) {
23938c3328f1SJérôme Glisse 			pte_t swp_pte;
23948c3328f1SJérôme Glisse 
23958c3328f1SJérôme Glisse 			mpfn |= MIGRATE_PFN_LOCKED;
23968c3328f1SJérôme Glisse 			ptep_get_and_clear(mm, addr, ptep);
23978c3328f1SJérôme Glisse 
23988c3328f1SJérôme Glisse 			/* Setup special migration page table entry */
23994dd845b5SAlistair Popple 			if (mpfn & MIGRATE_PFN_WRITE)
24004dd845b5SAlistair Popple 				entry = make_writable_migration_entry(
24014dd845b5SAlistair Popple 							page_to_pfn(page));
24024dd845b5SAlistair Popple 			else
24034dd845b5SAlistair Popple 				entry = make_readable_migration_entry(
24044dd845b5SAlistair Popple 							page_to_pfn(page));
24058c3328f1SJérôme Glisse 			swp_pte = swp_entry_to_pte(entry);
2406ad7df764SAlistair Popple 			if (pte_present(pte)) {
24078c3328f1SJérôme Glisse 				if (pte_soft_dirty(pte))
24088c3328f1SJérôme Glisse 					swp_pte = pte_swp_mksoft_dirty(swp_pte);
2409f45ec5ffSPeter Xu 				if (pte_uffd_wp(pte))
2410f45ec5ffSPeter Xu 					swp_pte = pte_swp_mkuffd_wp(swp_pte);
2411ad7df764SAlistair Popple 			} else {
2412ad7df764SAlistair Popple 				if (pte_swp_soft_dirty(pte))
2413ad7df764SAlistair Popple 					swp_pte = pte_swp_mksoft_dirty(swp_pte);
2414ad7df764SAlistair Popple 				if (pte_swp_uffd_wp(pte))
2415ad7df764SAlistair Popple 					swp_pte = pte_swp_mkuffd_wp(swp_pte);
2416ad7df764SAlistair Popple 			}
24178c3328f1SJérôme Glisse 			set_pte_at(mm, addr, ptep, swp_pte);
24188c3328f1SJérôme Glisse 
24198c3328f1SJérôme Glisse 			/*
24208c3328f1SJérôme Glisse 			 * This is like regular unmap: we remove the rmap and
24218c3328f1SJérôme Glisse 			 * drop the page refcount. The page won't be freed, as we took
24228c3328f1SJérôme Glisse 			 * a reference just above.
24238c3328f1SJérôme Glisse 			 */
24248c3328f1SJérôme Glisse 			page_remove_rmap(page, false);
24258c3328f1SJérôme Glisse 			put_page(page);
2426a5430ddaSJérôme Glisse 
2427a5430ddaSJérôme Glisse 			if (pte_present(pte))
24288c3328f1SJérôme Glisse 				unmapped++;
24298c3328f1SJérôme Glisse 		}
24308c3328f1SJérôme Glisse 
24318763cb45SJérôme Glisse next:
2432a5430ddaSJérôme Glisse 		migrate->dst[migrate->npages] = 0;
24338763cb45SJérôme Glisse 		migrate->src[migrate->npages++] = mpfn;
24348763cb45SJérôme Glisse 	}
24358c3328f1SJérôme Glisse 	arch_leave_lazy_mmu_mode();
24368763cb45SJérôme Glisse 	pte_unmap_unlock(ptep - 1, ptl);
24378763cb45SJérôme Glisse 
24388c3328f1SJérôme Glisse 	/* Only flush the TLB if we actually modified any entries */
24398c3328f1SJérôme Glisse 	if (unmapped)
24408c3328f1SJérôme Glisse 		flush_tlb_range(walk->vma, start, end);
24418c3328f1SJérôme Glisse 
24428763cb45SJérôme Glisse 	return 0;
24438763cb45SJérôme Glisse }
24448763cb45SJérôme Glisse 
24457b86ac33SChristoph Hellwig static const struct mm_walk_ops migrate_vma_walk_ops = {
24467b86ac33SChristoph Hellwig 	.pmd_entry		= migrate_vma_collect_pmd,
24477b86ac33SChristoph Hellwig 	.pte_hole		= migrate_vma_collect_hole,
24487b86ac33SChristoph Hellwig };
24497b86ac33SChristoph Hellwig 
24508763cb45SJérôme Glisse /*
24518763cb45SJérôme Glisse  * migrate_vma_collect() - collect pages over a range of virtual addresses
24528763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
24538763cb45SJérôme Glisse  *
24548763cb45SJérôme Glisse  * This will walk the CPU page table. For each virtual address backed by a
24558763cb45SJérôme Glisse  * valid page, it updates the src array and takes a reference on the page, in
24568763cb45SJérôme Glisse  * order to pin the page until we lock it and unmap it.
24578763cb45SJérôme Glisse  */
24588763cb45SJérôme Glisse static void migrate_vma_collect(struct migrate_vma *migrate)
24598763cb45SJérôme Glisse {
2460ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
24618763cb45SJérôme Glisse 
2462998427b3SRalph Campbell 	/*
2463998427b3SRalph Campbell 	 * Note that the pgmap_owner is passed to the mmu notifier callback so
2464998427b3SRalph Campbell 	 * that the registered device driver can skip invalidating device
2465998427b3SRalph Campbell 	 * private page mappings that won't be migrated.
2466998427b3SRalph Campbell 	 */
24676b49bf6dSAlistair Popple 	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
24686b49bf6dSAlistair Popple 		migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
2469c1a06df6SRalph Campbell 		migrate->pgmap_owner);
2470ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
24718763cb45SJérôme Glisse 
24727b86ac33SChristoph Hellwig 	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
24737b86ac33SChristoph Hellwig 			&migrate_vma_walk_ops, migrate);
24747b86ac33SChristoph Hellwig 
24757b86ac33SChristoph Hellwig 	mmu_notifier_invalidate_range_end(&range);
24768763cb45SJérôme Glisse 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
24778763cb45SJérôme Glisse }
24788763cb45SJérôme Glisse 
24798763cb45SJérôme Glisse /*
24808763cb45SJérôme Glisse  * migrate_vma_check_page() - check if page is pinned or not
24818763cb45SJérôme Glisse  * @page: struct page to check
24828763cb45SJérôme Glisse  *
24838763cb45SJérôme Glisse  * Pinned pages cannot be migrated. This is the same test as in
24848763cb45SJérôme Glisse  * migrate_page_move_mapping(), except that here we allow migration of a
24858763cb45SJérôme Glisse  * ZONE_DEVICE page.
24868763cb45SJérôme Glisse  */
24878763cb45SJérôme Glisse static bool migrate_vma_check_page(struct page *page)
24888763cb45SJérôme Glisse {
24898763cb45SJérôme Glisse 	/*
24908763cb45SJérôme Glisse 	 * One extra ref because the caller holds an extra reference, either from
24918763cb45SJérôme Glisse 	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
24928763cb45SJérôme Glisse 	 * a device page.
24938763cb45SJérôme Glisse 	 */
24948763cb45SJérôme Glisse 	int extra = 1;
24958763cb45SJérôme Glisse 
24968763cb45SJérôme Glisse 	/*
24978763cb45SJérôme Glisse 	 * FIXME support THP (transparent huge page): it is a bit more complex to
24988763cb45SJérôme Glisse 	 * check them than regular pages, because they can be mapped with a pmd
24998763cb45SJérôme Glisse 	 * or with a pte (split pte mapping).
25008763cb45SJérôme Glisse 	 */
25018763cb45SJérôme Glisse 	if (PageCompound(page))
25028763cb45SJérôme Glisse 		return false;
25038763cb45SJérôme Glisse 
2504a5430ddaSJérôme Glisse 	/* Pages from ZONE_DEVICE have one extra reference */
2505a5430ddaSJérôme Glisse 	if (is_zone_device_page(page)) {
2506a5430ddaSJérôme Glisse 		/*
2507a5430ddaSJérôme Glisse 		 * Private pages can never be pinned: they have no valid pte and
2508a5430ddaSJérôme Glisse 		 * GUP will fail for them. Yet if there is a pending migration,
2509a5430ddaSJérôme Glisse 		 * a thread might try to wait on the pte migration entry and
2510a5430ddaSJérôme Glisse 		 * will bump the page reference count. Sadly, there is no way to
2511a5430ddaSJérôme Glisse 		 * differentiate a regular pin from a migration wait. Hence, to
2512a5430ddaSJérôme Glisse 		 * avoid two racing threads migrating back to the CPU and entering
25138958b249SHaitao Shi 		 * an infinite loop (one stopping migration because the other is
2514a5430ddaSJérôme Glisse 		 * waiting on a pte migration entry), we always return true here.
2515a5430ddaSJérôme Glisse 		 *
2516a5430ddaSJérôme Glisse 		 * FIXME proper solution is to rework migration_entry_wait() so
2517a5430ddaSJérôme Glisse 		 * it does not need to take a reference on page.
2518a5430ddaSJérôme Glisse 		 */
251925b2995aSChristoph Hellwig 		return is_device_private_page(page);
2520a5430ddaSJérôme Glisse 	}
2521a5430ddaSJérôme Glisse 
2522df6ad698SJérôme Glisse 	/* For file-backed pages */
2523df6ad698SJérôme Glisse 	if (page_mapping(page))
2524df6ad698SJérôme Glisse 		extra += 1 + page_has_private(page);
2525df6ad698SJérôme Glisse 
25268763cb45SJérôme Glisse 	if ((page_count(page) - extra) > page_mapcount(page))
25278763cb45SJérôme Glisse 		return false;
25288763cb45SJérôme Glisse 
25298763cb45SJérôme Glisse 	return true;
25308763cb45SJérôme Glisse }
25318763cb45SJérôme Glisse 
25328763cb45SJérôme Glisse /*
25338763cb45SJérôme Glisse  * migrate_vma_prepare() - lock pages and isolate them from the lru
25348763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
25358763cb45SJérôme Glisse  *
25368763cb45SJérôme Glisse  * This locks pages that have been collected by migrate_vma_collect(). Once each
25378763cb45SJérôme Glisse  * page is locked it is isolated from the lru (for non-device pages). Finally,
25388763cb45SJérôme Glisse  * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
25398763cb45SJérôme Glisse  * migrated by concurrent kernel threads.
25408763cb45SJérôme Glisse  */
25418763cb45SJérôme Glisse static void migrate_vma_prepare(struct migrate_vma *migrate)
25428763cb45SJérôme Glisse {
25438763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
25448c3328f1SJérôme Glisse 	const unsigned long start = migrate->start;
25458c3328f1SJérôme Glisse 	unsigned long addr, i, restore = 0;
25468763cb45SJérôme Glisse 	bool allow_drain = true;
25478763cb45SJérôme Glisse 
25488763cb45SJérôme Glisse 	lru_add_drain();
25498763cb45SJérôme Glisse 
25508763cb45SJérôme Glisse 	for (i = 0; (i < npages) && migrate->cpages; i++) {
25518763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25528c3328f1SJérôme Glisse 		bool remap = true;
25538763cb45SJérôme Glisse 
25548763cb45SJérôme Glisse 		if (!page)
25558763cb45SJérôme Glisse 			continue;
25568763cb45SJérôme Glisse 
25578c3328f1SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
25588763cb45SJérôme Glisse 			/*
25598763cb45SJérôme Glisse 			 * Because we are migrating several pages there can be
25608763cb45SJérôme Glisse 			 * a deadlock between two concurrent migrations where
25618763cb45SJérôme Glisse 			 * each is waiting on the other's page lock.
25628763cb45SJérôme Glisse 			 *
25638763cb45SJérôme Glisse 			 * Make migrate_vma() a best-effort thing and back off
25648763cb45SJérôme Glisse 			 * for any page we cannot lock right away.
25658763cb45SJérôme Glisse 			 */
25668763cb45SJérôme Glisse 			if (!trylock_page(page)) {
25678763cb45SJérôme Glisse 				migrate->src[i] = 0;
25688763cb45SJérôme Glisse 				migrate->cpages--;
25698763cb45SJérôme Glisse 				put_page(page);
25708763cb45SJérôme Glisse 				continue;
25718763cb45SJérôme Glisse 			}
25728c3328f1SJérôme Glisse 			remap = false;
25738763cb45SJérôme Glisse 			migrate->src[i] |= MIGRATE_PFN_LOCKED;
25748c3328f1SJérôme Glisse 		}
25758763cb45SJérôme Glisse 
2576a5430ddaSJérôme Glisse 		/* ZONE_DEVICE pages are not on LRU */
2577a5430ddaSJérôme Glisse 		if (!is_zone_device_page(page)) {
25788763cb45SJérôme Glisse 			if (!PageLRU(page) && allow_drain) {
25798763cb45SJérôme Glisse 				/* Drain CPU's pagevec */
25808763cb45SJérôme Glisse 				lru_add_drain_all();
25818763cb45SJérôme Glisse 				allow_drain = false;
25828763cb45SJérôme Glisse 			}
25838763cb45SJérôme Glisse 
25848763cb45SJérôme Glisse 			if (isolate_lru_page(page)) {
25858c3328f1SJérôme Glisse 				if (remap) {
25868c3328f1SJérôme Glisse 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
25878c3328f1SJérôme Glisse 					migrate->cpages--;
25888c3328f1SJérôme Glisse 					restore++;
25898c3328f1SJérôme Glisse 				} else {
25908763cb45SJérôme Glisse 					migrate->src[i] = 0;
25918763cb45SJérôme Glisse 					unlock_page(page);
25928763cb45SJérôme Glisse 					migrate->cpages--;
25938763cb45SJérôme Glisse 					put_page(page);
25948c3328f1SJérôme Glisse 				}
25958763cb45SJérôme Glisse 				continue;
25968763cb45SJérôme Glisse 			}
25978763cb45SJérôme Glisse 
2598a5430ddaSJérôme Glisse 			/* Drop the reference we took in collect */
2599a5430ddaSJérôme Glisse 			put_page(page);
2600a5430ddaSJérôme Glisse 		}
2601a5430ddaSJérôme Glisse 
26028763cb45SJérôme Glisse 		if (!migrate_vma_check_page(page)) {
26038c3328f1SJérôme Glisse 			if (remap) {
26048c3328f1SJérôme Glisse 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
26058c3328f1SJérôme Glisse 				migrate->cpages--;
26068c3328f1SJérôme Glisse 				restore++;
26078c3328f1SJérôme Glisse 
2608a5430ddaSJérôme Glisse 				if (!is_zone_device_page(page)) {
26098c3328f1SJérôme Glisse 					get_page(page);
26108c3328f1SJérôme Glisse 					putback_lru_page(page);
2611a5430ddaSJérôme Glisse 				}
26128c3328f1SJérôme Glisse 			} else {
26138763cb45SJérôme Glisse 				migrate->src[i] = 0;
26148763cb45SJérôme Glisse 				unlock_page(page);
26158763cb45SJérôme Glisse 				migrate->cpages--;
26168763cb45SJérôme Glisse 
2617a5430ddaSJérôme Glisse 				if (!is_zone_device_page(page))
26188763cb45SJérôme Glisse 					putback_lru_page(page);
2619a5430ddaSJérôme Glisse 				else
2620a5430ddaSJérôme Glisse 					put_page(page);
26218763cb45SJérôme Glisse 			}
26228763cb45SJérôme Glisse 		}
26238763cb45SJérôme Glisse 	}
26248763cb45SJérôme Glisse 
26258c3328f1SJérôme Glisse 	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
26268c3328f1SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
26278c3328f1SJérôme Glisse 
26288c3328f1SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
26298c3328f1SJérôme Glisse 			continue;
26308c3328f1SJérôme Glisse 
26318c3328f1SJérôme Glisse 		remove_migration_pte(page, migrate->vma, addr, page);
26328c3328f1SJérôme Glisse 
26338c3328f1SJérôme Glisse 		migrate->src[i] = 0;
26348c3328f1SJérôme Glisse 		unlock_page(page);
26358c3328f1SJérôme Glisse 		put_page(page);
26368c3328f1SJérôme Glisse 		restore--;
26378c3328f1SJérôme Glisse 	}
26388c3328f1SJérôme Glisse }
26398c3328f1SJérôme Glisse 
26408763cb45SJérôme Glisse /*
26418763cb45SJérôme Glisse  * migrate_vma_unmap() - replace page mapping with special migration pte entry
26428763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
26438763cb45SJérôme Glisse  *
26448763cb45SJérôme Glisse  * Replace page mapping (CPU page table pte) with a special migration pte entry
26458763cb45SJérôme Glisse  * and check again if it has been pinned. Pinned pages are restored because we
26468763cb45SJérôme Glisse  * cannot migrate them.
26478763cb45SJérôme Glisse  *
26488763cb45SJérôme Glisse  * This is the last step before we call the device driver callback to allocate
26498763cb45SJérôme Glisse  * destination memory and copy the contents of the original page to the new page.
26508763cb45SJérôme Glisse  */
26518763cb45SJérôme Glisse static void migrate_vma_unmap(struct migrate_vma *migrate)
26528763cb45SJérôme Glisse {
26538763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
26548763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
26558763cb45SJérôme Glisse 	unsigned long addr, i, restore = 0;
26568763cb45SJérôme Glisse 
26578763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
26588763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
26598763cb45SJérôme Glisse 
26608763cb45SJérôme Glisse 		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
26618763cb45SJérôme Glisse 			continue;
26628763cb45SJérôme Glisse 
26638c3328f1SJérôme Glisse 		if (page_mapped(page)) {
2664a98a2f0cSAlistair Popple 			try_to_migrate(page, 0);
26658c3328f1SJérôme Glisse 			if (page_mapped(page))
26668c3328f1SJérôme Glisse 				goto restore;
26678c3328f1SJérôme Glisse 		}
26688c3328f1SJérôme Glisse 
26698c3328f1SJérôme Glisse 		if (migrate_vma_check_page(page))
26708c3328f1SJérôme Glisse 			continue;
26718c3328f1SJérôme Glisse 
26728c3328f1SJérôme Glisse restore:
26738763cb45SJérôme Glisse 		migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
26748763cb45SJérôme Glisse 		migrate->cpages--;
26758763cb45SJérôme Glisse 		restore++;
26768763cb45SJérôme Glisse 	}
26778763cb45SJérôme Glisse 
26788763cb45SJérôme Glisse 	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
26798763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
26808763cb45SJérôme Glisse 
26818763cb45SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
26828763cb45SJérôme Glisse 			continue;
26838763cb45SJérôme Glisse 
26848763cb45SJérôme Glisse 		remove_migration_ptes(page, page, false);
26858763cb45SJérôme Glisse 
26868763cb45SJérôme Glisse 		migrate->src[i] = 0;
26878763cb45SJérôme Glisse 		unlock_page(page);
26888763cb45SJérôme Glisse 		restore--;
26898763cb45SJérôme Glisse 
2690a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
2691a5430ddaSJérôme Glisse 			put_page(page);
2692a5430ddaSJérôme Glisse 		else
26938763cb45SJérôme Glisse 			putback_lru_page(page);
26948763cb45SJérôme Glisse 	}
26958763cb45SJérôme Glisse }
26968763cb45SJérôme Glisse 
2697a7d1f22bSChristoph Hellwig /**
2698a7d1f22bSChristoph Hellwig  * migrate_vma_setup() - prepare to migrate a range of memory
2699eaf444deSRandy Dunlap  * @args: contains the vma, start, and pfns arrays for the migration
2700a7d1f22bSChristoph Hellwig  *
2701a7d1f22bSChristoph Hellwig  * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2702a7d1f22bSChristoph Hellwig  * without an error.
2703a7d1f22bSChristoph Hellwig  *
2704a7d1f22bSChristoph Hellwig  * Prepare to migrate a range of memory virtual address range by collecting all
2705a7d1f22bSChristoph Hellwig  * the pages backing each virtual address in the range, saving them inside the
2706a7d1f22bSChristoph Hellwig  * src array.  Then lock those pages and unmap them. Once the pages are locked
2707a7d1f22bSChristoph Hellwig  * and unmapped, check whether each page is pinned or not.  Pages that aren't
2708a7d1f22bSChristoph Hellwig  * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2709a7d1f22bSChristoph Hellwig  * corresponding src array entry.  Then restores any pages that are pinned, by
2710a7d1f22bSChristoph Hellwig  * remapping and unlocking those pages.
2711a7d1f22bSChristoph Hellwig  *
2712a7d1f22bSChristoph Hellwig  * The caller should then allocate destination memory and copy source memory to
2713a7d1f22bSChristoph Hellwig  * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2714a7d1f22bSChristoph Hellwig  * flag set).  Once these are allocated and copied, the caller must update each
2715a7d1f22bSChristoph Hellwig  * corresponding entry in the dst array with the pfn value of the destination
2716a7d1f22bSChristoph Hellwig  * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2717a7d1f22bSChristoph Hellwig  * (destination pages must have their struct pages locked, via lock_page()).
2718a7d1f22bSChristoph Hellwig  *
2719a7d1f22bSChristoph Hellwig  * Note that the caller does not have to migrate all the pages that are marked
2720a7d1f22bSChristoph Hellwig  * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2721a7d1f22bSChristoph Hellwig  * device memory to system memory.  If the caller cannot migrate a device page
2722a7d1f22bSChristoph Hellwig  * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2723a7d1f22bSChristoph Hellwig  * consequences for the userspace process, so it must be avoided if at all
2724a7d1f22bSChristoph Hellwig  * possible.
2725a7d1f22bSChristoph Hellwig  *
2726a7d1f22bSChristoph Hellwig  * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
2727a7d1f22bSChristoph Hellwig  * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
2728f0953a1bSIngo Molnar  * allowing the caller to allocate device memory for those unbacked virtual
2729f0953a1bSIngo Molnar  * addresses.  For this the caller simply has to allocate device memory and
2730a7d1f22bSChristoph Hellwig  * properly set the destination entry like for regular migration.  Note that
2731f0953a1bSIngo Molnar  * this can still fail, and thus inside the device driver you must check if the
2732f0953a1bSIngo Molnar  * migration was successful for those entries after calling migrate_vma_pages(),
2733a7d1f22bSChristoph Hellwig  * just like for regular migration.
2734a7d1f22bSChristoph Hellwig  *
2735a7d1f22bSChristoph Hellwig  * After that, the callers must call migrate_vma_pages() to go over each entry
2736a7d1f22bSChristoph Hellwig  * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2737a7d1f22bSChristoph Hellwig  * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2738a7d1f22bSChristoph Hellwig  * then migrate_vma_pages() migrates the struct page information from the source
2739a7d1f22bSChristoph Hellwig  * struct page to the destination struct page.  If it fails to migrate the
2740a7d1f22bSChristoph Hellwig  * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2741a7d1f22bSChristoph Hellwig  * src array.
2742a7d1f22bSChristoph Hellwig  *
2743a7d1f22bSChristoph Hellwig  * At this point all successfully migrated pages have an entry in the src
2744a7d1f22bSChristoph Hellwig  * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2745a7d1f22bSChristoph Hellwig  * array entry with MIGRATE_PFN_VALID flag set.
2746a7d1f22bSChristoph Hellwig  *
2747a7d1f22bSChristoph Hellwig  * Once migrate_vma_pages() returns the caller may inspect which pages were
2748a7d1f22bSChristoph Hellwig  * successfully migrated, and which were not.  Successfully migrated pages will
2749a7d1f22bSChristoph Hellwig  * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2750a7d1f22bSChristoph Hellwig  *
2751a7d1f22bSChristoph Hellwig  * It is safe to update device page table after migrate_vma_pages() because
2752c1e8d7c6SMichel Lespinasse  * both destination and source page are still locked, and the mmap_lock is held
2753a7d1f22bSChristoph Hellwig  * in read mode (hence no one can unmap the range being migrated).
2754a7d1f22bSChristoph Hellwig  *
2755a7d1f22bSChristoph Hellwig  * Once the caller is done cleaning up things and updating its page table (if it
2756a7d1f22bSChristoph Hellwig  * chose to do so, this is not an obligation) it finally calls
2757a7d1f22bSChristoph Hellwig  * migrate_vma_finalize() to update the CPU page table to point to new pages
2758a7d1f22bSChristoph Hellwig  * for successfully migrated pages or otherwise restore the CPU page table to
2759a7d1f22bSChristoph Hellwig  * point to the original source pages.
2760a7d1f22bSChristoph Hellwig  */
2761a7d1f22bSChristoph Hellwig int migrate_vma_setup(struct migrate_vma *args)
2762a7d1f22bSChristoph Hellwig {
2763a7d1f22bSChristoph Hellwig 	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2764a7d1f22bSChristoph Hellwig 
2765a7d1f22bSChristoph Hellwig 	args->start &= PAGE_MASK;
2766a7d1f22bSChristoph Hellwig 	args->end &= PAGE_MASK;
2767a7d1f22bSChristoph Hellwig 	if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2768a7d1f22bSChristoph Hellwig 	    (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2769a7d1f22bSChristoph Hellwig 		return -EINVAL;
2770a7d1f22bSChristoph Hellwig 	if (nr_pages <= 0)
2771a7d1f22bSChristoph Hellwig 		return -EINVAL;
2772a7d1f22bSChristoph Hellwig 	if (args->start < args->vma->vm_start ||
2773a7d1f22bSChristoph Hellwig 	    args->start >= args->vma->vm_end)
2774a7d1f22bSChristoph Hellwig 		return -EINVAL;
2775a7d1f22bSChristoph Hellwig 	if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2776a7d1f22bSChristoph Hellwig 		return -EINVAL;
2777a7d1f22bSChristoph Hellwig 	if (!args->src || !args->dst)
2778a7d1f22bSChristoph Hellwig 		return -EINVAL;
2779a7d1f22bSChristoph Hellwig 
2780a7d1f22bSChristoph Hellwig 	memset(args->src, 0, sizeof(*args->src) * nr_pages);
2781a7d1f22bSChristoph Hellwig 	args->cpages = 0;
2782a7d1f22bSChristoph Hellwig 	args->npages = 0;
2783a7d1f22bSChristoph Hellwig 
2784a7d1f22bSChristoph Hellwig 	migrate_vma_collect(args);
2785a7d1f22bSChristoph Hellwig 
2786a7d1f22bSChristoph Hellwig 	if (args->cpages)
2787a7d1f22bSChristoph Hellwig 		migrate_vma_prepare(args);
2788a7d1f22bSChristoph Hellwig 	if (args->cpages)
2789a7d1f22bSChristoph Hellwig 		migrate_vma_unmap(args);
2790a7d1f22bSChristoph Hellwig 
2791a7d1f22bSChristoph Hellwig 	/*
2792a7d1f22bSChristoph Hellwig 	 * At this point pages are locked and unmapped, and thus they have
2793a7d1f22bSChristoph Hellwig 	 * stable content and can safely be copied to destination memory that
2794a7d1f22bSChristoph Hellwig 	 * is allocated by the drivers.
2795a7d1f22bSChristoph Hellwig 	 */
2796a7d1f22bSChristoph Hellwig 	return 0;
2797a7d1f22bSChristoph Hellwig 
2798a7d1f22bSChristoph Hellwig }
2799a7d1f22bSChristoph Hellwig EXPORT_SYMBOL(migrate_vma_setup);
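
/*
 * Illustrative driver-side sketch (not part of the original file) of the
 * flow documented above, assuming the range spans at most 64 pages and
 * using ordinary system pages as the destination; a real driver would
 * allocate device memory, do the copy with its own engine, and handle
 * errors.  The function name is hypothetical.
 */
static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	unsigned long src_pfns[64] = { 0 };
	unsigned long dst_pfns[64] = { 0 };
	struct migrate_vma args = {
		.vma	= vma,
		.src	= src_pfns,
		.dst	= dst_pfns,
		.start	= start,
		.end	= end,
		.flags	= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i, addr;
	int ret;

	/* Collect, lock and unmap the source pages. */
	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	for (i = 0, addr = args.start; i < args.npages;
	     i++, addr += PAGE_SIZE) {
		struct page *spage = migrate_pfn_to_page(src_pfns[i]);
		struct page *dpage;

		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
		if (!dpage)
			continue;

		/* Destination pages must be locked and filled by the caller. */
		lock_page(dpage);
		if (spage)
			copy_highpage(dpage, spage);
		else
			clear_highpage(dpage);	/* pte_none() hole */
		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
			      MIGRATE_PFN_LOCKED;
	}

	/* Switch the struct page metadata, then fix up the CPU page table. */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}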
2800a7d1f22bSChristoph Hellwig 
280134290e2cSRalph Campbell /*
280234290e2cSRalph Campbell  * This code closely matches the code in:
280334290e2cSRalph Campbell  *   __handle_mm_fault()
280434290e2cSRalph Campbell  *     handle_pte_fault()
280534290e2cSRalph Campbell  *       do_anonymous_page()
280634290e2cSRalph Campbell  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
280734290e2cSRalph Campbell  * private page.
280834290e2cSRalph Campbell  */
28098315ada7SJérôme Glisse static void migrate_vma_insert_page(struct migrate_vma *migrate,
28108315ada7SJérôme Glisse 				    unsigned long addr,
28118315ada7SJérôme Glisse 				    struct page *page,
2812d85c6db4SStephen Zhang 				    unsigned long *src)
28138315ada7SJérôme Glisse {
28148315ada7SJérôme Glisse 	struct vm_area_struct *vma = migrate->vma;
28158315ada7SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
28168315ada7SJérôme Glisse 	bool flush = false;
28178315ada7SJérôme Glisse 	spinlock_t *ptl;
28188315ada7SJérôme Glisse 	pte_t entry;
28198315ada7SJérôme Glisse 	pgd_t *pgdp;
28208315ada7SJérôme Glisse 	p4d_t *p4dp;
28218315ada7SJérôme Glisse 	pud_t *pudp;
28228315ada7SJérôme Glisse 	pmd_t *pmdp;
28238315ada7SJérôme Glisse 	pte_t *ptep;
28248315ada7SJérôme Glisse 
28258315ada7SJérôme Glisse 	/* Only allow populating anonymous memory */
28268315ada7SJérôme Glisse 	if (!vma_is_anonymous(vma))
28278315ada7SJérôme Glisse 		goto abort;
28288315ada7SJérôme Glisse 
28298315ada7SJérôme Glisse 	pgdp = pgd_offset(mm, addr);
28308315ada7SJérôme Glisse 	p4dp = p4d_alloc(mm, pgdp, addr);
28318315ada7SJérôme Glisse 	if (!p4dp)
28328315ada7SJérôme Glisse 		goto abort;
28338315ada7SJérôme Glisse 	pudp = pud_alloc(mm, p4dp, addr);
28348315ada7SJérôme Glisse 	if (!pudp)
28358315ada7SJérôme Glisse 		goto abort;
28368315ada7SJérôme Glisse 	pmdp = pmd_alloc(mm, pudp, addr);
28378315ada7SJérôme Glisse 	if (!pmdp)
28388315ada7SJérôme Glisse 		goto abort;
28398315ada7SJérôme Glisse 
28408315ada7SJérôme Glisse 	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
28418315ada7SJérôme Glisse 		goto abort;
28428315ada7SJérôme Glisse 
28438315ada7SJérôme Glisse 	/*
28448315ada7SJérôme Glisse 	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
28458315ada7SJérôme Glisse 	 * pte_offset_map() on pmds where a huge pmd might be created
28468315ada7SJérôme Glisse 	 * from a different thread.
28478315ada7SJérôme Glisse 	 *
28483e4e28c5SMichel Lespinasse 	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
28498315ada7SJérôme Glisse 	 * parallel threads are excluded by other means.
28508315ada7SJérôme Glisse 	 *
28513e4e28c5SMichel Lespinasse 	 * Here we only have mmap_read_lock(mm).
28528315ada7SJérôme Glisse 	 */
28534cf58924SJoel Fernandes (Google) 	if (pte_alloc(mm, pmdp))
28548315ada7SJérôme Glisse 		goto abort;
28558315ada7SJérôme Glisse 
28568315ada7SJérôme Glisse 	/* See the comment in pte_alloc_one_map() */
28578315ada7SJérôme Glisse 	if (unlikely(pmd_trans_unstable(pmdp)))
28588315ada7SJérôme Glisse 		goto abort;
28598315ada7SJérôme Glisse 
28608315ada7SJérôme Glisse 	if (unlikely(anon_vma_prepare(vma)))
28618315ada7SJérôme Glisse 		goto abort;
2862d9eb1ea2SJohannes Weiner 	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
28638315ada7SJérôme Glisse 		goto abort;
28648315ada7SJérôme Glisse 
28658315ada7SJérôme Glisse 	/*
28668315ada7SJérôme Glisse 	 * The memory barrier inside __SetPageUptodate makes sure that
28678315ada7SJérôme Glisse 	 * preceding stores to the page contents become visible before
28688315ada7SJérôme Glisse 	 * the set_pte_at() write.
28698315ada7SJérôme Glisse 	 */
28708315ada7SJérôme Glisse 	__SetPageUptodate(page);
28718315ada7SJérôme Glisse 
2872df6ad698SJérôme Glisse 	if (is_zone_device_page(page)) {
2873df6ad698SJérôme Glisse 		if (is_device_private_page(page)) {
28748315ada7SJérôme Glisse 			swp_entry_t swp_entry;
28758315ada7SJérôme Glisse 
28764dd845b5SAlistair Popple 			if (vma->vm_flags & VM_WRITE)
28774dd845b5SAlistair Popple 				swp_entry = make_writable_device_private_entry(
28784dd845b5SAlistair Popple 							page_to_pfn(page));
28794dd845b5SAlistair Popple 			else
28804dd845b5SAlistair Popple 				swp_entry = make_readable_device_private_entry(
28814dd845b5SAlistair Popple 							page_to_pfn(page));
28828315ada7SJérôme Glisse 			entry = swp_entry_to_pte(swp_entry);
288334f5e9b9SMiaohe Lin 		} else {
288434f5e9b9SMiaohe Lin 			/*
288534f5e9b9SMiaohe Lin 			 * For now we only support migrating to un-addressable
288634f5e9b9SMiaohe Lin 			 * device memory.
288734f5e9b9SMiaohe Lin 			 */
288834f5e9b9SMiaohe Lin 			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
288934f5e9b9SMiaohe Lin 			goto abort;
2890df6ad698SJérôme Glisse 		}
28918315ada7SJérôme Glisse 	} else {
28928315ada7SJérôme Glisse 		entry = mk_pte(page, vma->vm_page_prot);
28938315ada7SJérôme Glisse 		if (vma->vm_flags & VM_WRITE)
28948315ada7SJérôme Glisse 			entry = pte_mkwrite(pte_mkdirty(entry));
28958315ada7SJérôme Glisse 	}
28968315ada7SJérôme Glisse 
28978315ada7SJérôme Glisse 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
28988315ada7SJérôme Glisse 
289934290e2cSRalph Campbell 	if (check_stable_address_space(mm))
290034290e2cSRalph Campbell 		goto unlock_abort;
290134290e2cSRalph Campbell 
29028315ada7SJérôme Glisse 	if (pte_present(*ptep)) {
29038315ada7SJérôme Glisse 		unsigned long pfn = pte_pfn(*ptep);
29048315ada7SJérôme Glisse 
2905c23a0c99SRalph Campbell 		if (!is_zero_pfn(pfn))
2906c23a0c99SRalph Campbell 			goto unlock_abort;
29078315ada7SJérôme Glisse 		flush = true;
2908c23a0c99SRalph Campbell 	} else if (!pte_none(*ptep))
2909c23a0c99SRalph Campbell 		goto unlock_abort;
29108315ada7SJérôme Glisse 
29118315ada7SJérôme Glisse 	/*
2912c23a0c99SRalph Campbell 	 * Check for userfaultfd but do not deliver the fault. Instead,
29138315ada7SJérôme Glisse 	 * just back off.
29148315ada7SJérôme Glisse 	 */
2915c23a0c99SRalph Campbell 	if (userfaultfd_missing(vma))
2916c23a0c99SRalph Campbell 		goto unlock_abort;
29178315ada7SJérôme Glisse 
29188315ada7SJérôme Glisse 	inc_mm_counter(mm, MM_ANONPAGES);
2919be5d0a74SJohannes Weiner 	page_add_new_anon_rmap(page, vma, addr, false);
29208315ada7SJérôme Glisse 	if (!is_zone_device_page(page))
2921b518154eSJoonsoo Kim 		lru_cache_add_inactive_or_unevictable(page, vma);
29228315ada7SJérôme Glisse 	get_page(page);
29238315ada7SJérôme Glisse 
29248315ada7SJérôme Glisse 	if (flush) {
29258315ada7SJérôme Glisse 		flush_cache_page(vma, addr, pte_pfn(*ptep));
29268315ada7SJérôme Glisse 		ptep_clear_flush_notify(vma, addr, ptep);
29278315ada7SJérôme Glisse 		set_pte_at_notify(mm, addr, ptep, entry);
29288315ada7SJérôme Glisse 		update_mmu_cache(vma, addr, ptep);
29298315ada7SJérôme Glisse 	} else {
29308315ada7SJérôme Glisse 		/* No need to invalidate - it was non-present before */
29318315ada7SJérôme Glisse 		set_pte_at(mm, addr, ptep, entry);
29328315ada7SJérôme Glisse 		update_mmu_cache(vma, addr, ptep);
29338315ada7SJérôme Glisse 	}
29348315ada7SJérôme Glisse 
29358315ada7SJérôme Glisse 	pte_unmap_unlock(ptep, ptl);
29368315ada7SJérôme Glisse 	*src = MIGRATE_PFN_MIGRATE;
29378315ada7SJérôme Glisse 	return;
29388315ada7SJérôme Glisse 
2939c23a0c99SRalph Campbell unlock_abort:
2940c23a0c99SRalph Campbell 	pte_unmap_unlock(ptep, ptl);
29418315ada7SJérôme Glisse abort:
29428315ada7SJérôme Glisse 	*src &= ~MIGRATE_PFN_MIGRATE;
29438315ada7SJérôme Glisse }
29448315ada7SJérôme Glisse 
2945a7d1f22bSChristoph Hellwig /**
29468763cb45SJérôme Glisse  * migrate_vma_pages() - migrate meta-data from src page to dst page
29478763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
29488763cb45SJérôme Glisse  *
29498763cb45SJérôme Glisse  * This migrates struct page meta-data from source struct page to destination
29508763cb45SJérôme Glisse  * struct page. This effectively finishes the migration from source page to the
29518763cb45SJérôme Glisse  * destination page.
29528763cb45SJérôme Glisse  */
2953a7d1f22bSChristoph Hellwig void migrate_vma_pages(struct migrate_vma *migrate)
29548763cb45SJérôme Glisse {
29558763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
29568763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
2957ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
2958ac46d4f3SJérôme Glisse 	unsigned long addr, i;
29598315ada7SJérôme Glisse 	bool notified = false;
29608763cb45SJérôme Glisse 
29618763cb45SJérôme Glisse 	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
29628763cb45SJérôme Glisse 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
29638763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
29648763cb45SJérôme Glisse 		struct address_space *mapping;
29658763cb45SJérôme Glisse 		int r;
29668763cb45SJérôme Glisse 
29678315ada7SJérôme Glisse 		if (!newpage) {
29688315ada7SJérôme Glisse 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
29698763cb45SJérôme Glisse 			continue;
29708315ada7SJérôme Glisse 		}
29718315ada7SJérôme Glisse 
29728315ada7SJérôme Glisse 		if (!page) {
2973c23a0c99SRalph Campbell 			if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
29748763cb45SJérôme Glisse 				continue;
29758315ada7SJérôme Glisse 			if (!notified) {
29768315ada7SJérôme Glisse 				notified = true;
2977ac46d4f3SJérôme Glisse 
29786b49bf6dSAlistair Popple 				mmu_notifier_range_init_owner(&range,
29796b49bf6dSAlistair Popple 					MMU_NOTIFY_MIGRATE, 0, migrate->vma,
29806b49bf6dSAlistair Popple 					migrate->vma->vm_mm, addr, migrate->end,
29815e5dda81SRalph Campbell 					migrate->pgmap_owner);
2982ac46d4f3SJérôme Glisse 				mmu_notifier_invalidate_range_start(&range);
29838315ada7SJérôme Glisse 			}
29848315ada7SJérôme Glisse 			migrate_vma_insert_page(migrate, addr, newpage,
2985d85c6db4SStephen Zhang 						&migrate->src[i]);
29868315ada7SJérôme Glisse 			continue;
29878315ada7SJérôme Glisse 		}
29888763cb45SJérôme Glisse 
29898763cb45SJérôme Glisse 		mapping = page_mapping(page);
29908763cb45SJérôme Glisse 
2991a5430ddaSJérôme Glisse 		if (is_zone_device_page(newpage)) {
2992a5430ddaSJérôme Glisse 			if (is_device_private_page(newpage)) {
2993a5430ddaSJérôme Glisse 				/*
2994a5430ddaSJérôme Glisse 				 * For now only support private anonymous pages when
2995a5430ddaSJérôme Glisse 				 * migrating to un-addressable device memory.
2996a5430ddaSJérôme Glisse 				 */
2997a5430ddaSJérôme Glisse 				if (mapping) {
2998a5430ddaSJérôme Glisse 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2999a5430ddaSJérôme Glisse 					continue;
3000a5430ddaSJérôme Glisse 				}
300125b2995aSChristoph Hellwig 			} else {
3002a5430ddaSJérôme Glisse 				/*
3003a5430ddaSJérôme Glisse 				 * Other types of ZONE_DEVICE page are not
3004a5430ddaSJérôme Glisse 				 * supported.
3005a5430ddaSJérôme Glisse 				 */
3006a5430ddaSJérôme Glisse 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3007a5430ddaSJérôme Glisse 				continue;
3008a5430ddaSJérôme Glisse 			}
3009a5430ddaSJérôme Glisse 		}
3010a5430ddaSJérôme Glisse 
30118763cb45SJérôme Glisse 		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
30128763cb45SJérôme Glisse 		if (r != MIGRATEPAGE_SUCCESS)
30138763cb45SJérôme Glisse 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
30148763cb45SJérôme Glisse 	}
30158315ada7SJérôme Glisse 
30164645b9feSJérôme Glisse 	/*
30174645b9feSJérôme Glisse 	 * No need to double call mmu_notifier->invalidate_range() callback as
30184645b9feSJérôme Glisse 	 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
30194645b9feSJérôme Glisse 	 * did already call it.
30204645b9feSJérôme Glisse 	 */
30218315ada7SJérôme Glisse 	if (notified)
3022ac46d4f3SJérôme Glisse 		mmu_notifier_invalidate_range_only_end(&range);
30238763cb45SJérôme Glisse }
3024a7d1f22bSChristoph Hellwig EXPORT_SYMBOL(migrate_vma_pages);
30258763cb45SJérôme Glisse 
3026a7d1f22bSChristoph Hellwig /**
30278763cb45SJérôme Glisse  * migrate_vma_finalize() - restore CPU page table entry
30288763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
30298763cb45SJérôme Glisse  *
30308763cb45SJérôme Glisse  * This replaces the special migration pte entry with either a mapping to the
30318763cb45SJérôme Glisse  * new page if migration was successful for that page, or to the original page
30328763cb45SJérôme Glisse  * otherwise.
30338763cb45SJérôme Glisse  *
30348763cb45SJérôme Glisse  * This also unlocks the pages and puts them back on the lru, or drops the extra
30358763cb45SJérôme Glisse  * refcount, for device pages.
30368763cb45SJérôme Glisse  */
3037a7d1f22bSChristoph Hellwig void migrate_vma_finalize(struct migrate_vma *migrate)
30388763cb45SJérôme Glisse {
30398763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
30408763cb45SJérôme Glisse 	unsigned long i;
30418763cb45SJérôme Glisse 
30428763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
30438763cb45SJérôme Glisse 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
30448763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
30458763cb45SJérôme Glisse 
30468315ada7SJérôme Glisse 		if (!page) {
30478315ada7SJérôme Glisse 			if (newpage) {
30488315ada7SJérôme Glisse 				unlock_page(newpage);
30498315ada7SJérôme Glisse 				put_page(newpage);
30508315ada7SJérôme Glisse 			}
30518763cb45SJérôme Glisse 			continue;
30528315ada7SJérôme Glisse 		}
30538315ada7SJérôme Glisse 
30548763cb45SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
30558763cb45SJérôme Glisse 			if (newpage) {
30568763cb45SJérôme Glisse 				unlock_page(newpage);
30578763cb45SJérôme Glisse 				put_page(newpage);
30588763cb45SJérôme Glisse 			}
30598763cb45SJérôme Glisse 			newpage = page;
30608763cb45SJérôme Glisse 		}
30618763cb45SJérôme Glisse 
30628763cb45SJérôme Glisse 		remove_migration_ptes(page, newpage, false);
30638763cb45SJérôme Glisse 		unlock_page(page);
30648763cb45SJérôme Glisse 
3065a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
3066a5430ddaSJérôme Glisse 			put_page(page);
3067a5430ddaSJérôme Glisse 		else
30688763cb45SJérôme Glisse 			putback_lru_page(page);
30698763cb45SJérôme Glisse 
30708763cb45SJérôme Glisse 		if (newpage != page) {
30718763cb45SJérôme Glisse 			unlock_page(newpage);
3072a5430ddaSJérôme Glisse 			if (is_zone_device_page(newpage))
3073a5430ddaSJérôme Glisse 				put_page(newpage);
3074a5430ddaSJérôme Glisse 			else
30758763cb45SJérôme Glisse 				putback_lru_page(newpage);
30768763cb45SJérôme Glisse 		}
30778763cb45SJérôme Glisse 	}
30788763cb45SJérôme Glisse }
3079a7d1f22bSChristoph Hellwig EXPORT_SYMBOL(migrate_vma_finalize);
30809b2ed9cbSChristoph Hellwig #endif /* CONFIG_DEVICE_PRIVATE */
308179c28a41SDave Hansen 
3082884a6e5dSDave Hansen #if defined(CONFIG_MEMORY_HOTPLUG)
308379c28a41SDave Hansen /* Disable reclaim-based migration. */
308479c28a41SDave Hansen static void __disable_all_migrate_targets(void)
308579c28a41SDave Hansen {
308679c28a41SDave Hansen 	int node;
308779c28a41SDave Hansen 
308879c28a41SDave Hansen 	for_each_online_node(node)
308979c28a41SDave Hansen 		node_demotion[node] = NUMA_NO_NODE;
309079c28a41SDave Hansen }
309179c28a41SDave Hansen 
309279c28a41SDave Hansen static void disable_all_migrate_targets(void)
309379c28a41SDave Hansen {
309479c28a41SDave Hansen 	__disable_all_migrate_targets();
309579c28a41SDave Hansen 
309679c28a41SDave Hansen 	/*
309779c28a41SDave Hansen 	 * Ensure that the "disable" is visible across the system.
309879c28a41SDave Hansen 	 * Readers will see either a combination of before+disable
309979c28a41SDave Hansen 	 * state or disable+after.  They will never see before and
310079c28a41SDave Hansen 	 * after state together.
310179c28a41SDave Hansen 	 *
310279c28a41SDave Hansen 	 * The before+after state together might have cycles and
310379c28a41SDave Hansen 	 * could cause readers to do things like loop until this
310479c28a41SDave Hansen 	 * function finishes.  This ensures they can only see a
310579c28a41SDave Hansen 	 * single "bad" read and would, for instance, only loop
310679c28a41SDave Hansen 	 * once.
310779c28a41SDave Hansen 	 */
310879c28a41SDave Hansen 	synchronize_rcu();
310979c28a41SDave Hansen }
311079c28a41SDave Hansen 
311179c28a41SDave Hansen /*
311279c28a41SDave Hansen  * Find an automatic demotion target for 'node'.
311379c28a41SDave Hansen  * Failing here is OK.  It might just indicate
311479c28a41SDave Hansen  * being at the end of a chain.
311579c28a41SDave Hansen  */
311679c28a41SDave Hansen static int establish_migrate_target(int node, nodemask_t *used)
311779c28a41SDave Hansen {
311879c28a41SDave Hansen 	int migration_target;
311979c28a41SDave Hansen 
312079c28a41SDave Hansen 	/*
312179c28a41SDave Hansen 	 * Can not set a migration target on a
312279c28a41SDave Hansen 	 * node with it already set.
312379c28a41SDave Hansen 	 *
312479c28a41SDave Hansen 	 * No need for READ_ONCE() here since this
312579c28a41SDave Hansen 	 * is in the write path for node_demotion[].
312679c28a41SDave Hansen 	 * This should be the only thread writing.
312779c28a41SDave Hansen 	 */
312879c28a41SDave Hansen 	if (node_demotion[node] != NUMA_NO_NODE)
312979c28a41SDave Hansen 		return NUMA_NO_NODE;
313079c28a41SDave Hansen 
313179c28a41SDave Hansen 	migration_target = find_next_best_node(node, used);
313279c28a41SDave Hansen 	if (migration_target == NUMA_NO_NODE)
313379c28a41SDave Hansen 		return NUMA_NO_NODE;
313479c28a41SDave Hansen 
313579c28a41SDave Hansen 	node_demotion[node] = migration_target;
313679c28a41SDave Hansen 
313779c28a41SDave Hansen 	return migration_target;
313879c28a41SDave Hansen }
313979c28a41SDave Hansen 
314079c28a41SDave Hansen /*
314179c28a41SDave Hansen  * When memory fills up on a node, memory contents can be
314279c28a41SDave Hansen  * automatically migrated to another node instead of
314379c28a41SDave Hansen  * discarded at reclaim.
314479c28a41SDave Hansen  *
314579c28a41SDave Hansen  * Establish a "migration path" which will start at nodes
314679c28a41SDave Hansen  * with CPUs and will follow the priorities used to build the
314779c28a41SDave Hansen  * page allocator zonelists.
314879c28a41SDave Hansen  *
314979c28a41SDave Hansen  * The difference here is that cycles must be avoided.  If
315079c28a41SDave Hansen  * node0 migrates to node1, then neither node1, nor anything
315179c28a41SDave Hansen  * node1 migrates to can migrate to node0.
315279c28a41SDave Hansen  *
315379c28a41SDave Hansen  * This function can run simultaneously with readers of
315479c28a41SDave Hansen  * node_demotion[].  However, it can not run simultaneously
315579c28a41SDave Hansen  * with itself.  Exclusion is provided by memory hotplug events
315679c28a41SDave Hansen  * being single-threaded.
315779c28a41SDave Hansen  */
315879c28a41SDave Hansen static void __set_migration_target_nodes(void)
315979c28a41SDave Hansen {
316079c28a41SDave Hansen 	nodemask_t next_pass	= NODE_MASK_NONE;
316179c28a41SDave Hansen 	nodemask_t this_pass	= NODE_MASK_NONE;
316279c28a41SDave Hansen 	nodemask_t used_targets = NODE_MASK_NONE;
316379c28a41SDave Hansen 	int node;
316479c28a41SDave Hansen 
316579c28a41SDave Hansen 	/*
316679c28a41SDave Hansen 	 * Avoid any oddities like cycles that could occur
316779c28a41SDave Hansen 	 * from changes in the topology.  This will leave
316879c28a41SDave Hansen 	 * a momentary gap when migration is disabled.
316979c28a41SDave Hansen 	 */
317079c28a41SDave Hansen 	disable_all_migrate_targets();
317179c28a41SDave Hansen 
317279c28a41SDave Hansen 	/*
317379c28a41SDave Hansen 	 * Allocations go close to CPUs, first.  Assume that
317479c28a41SDave Hansen 	 * the migration path starts at the nodes with CPUs.
317579c28a41SDave Hansen 	 */
317679c28a41SDave Hansen 	next_pass = node_states[N_CPU];
317779c28a41SDave Hansen again:
317879c28a41SDave Hansen 	this_pass = next_pass;
317979c28a41SDave Hansen 	next_pass = NODE_MASK_NONE;
318079c28a41SDave Hansen 	/*
318179c28a41SDave Hansen 	 * To avoid cycles in the migration "graph", ensure
318279c28a41SDave Hansen 	 * that migration sources are not future targets by
318379c28a41SDave Hansen 	 * setting them in 'used_targets'.  Do this only
318479c28a41SDave Hansen 	 * once per pass so that multiple source nodes can
318579c28a41SDave Hansen 	 * share a target node.
318679c28a41SDave Hansen 	 *
318779c28a41SDave Hansen 	 * 'used_targets' will become unavailable in future
318879c28a41SDave Hansen 	 * passes.  This limits some opportunities for
318979c28a41SDave Hansen 	 * multiple source nodes to share a destination.
319079c28a41SDave Hansen 	 */
319179c28a41SDave Hansen 	nodes_or(used_targets, used_targets, this_pass);
319279c28a41SDave Hansen 	for_each_node_mask(node, this_pass) {
319379c28a41SDave Hansen 		int target_node = establish_migrate_target(node, &used_targets);
319479c28a41SDave Hansen 
319579c28a41SDave Hansen 		if (target_node == NUMA_NO_NODE)
319679c28a41SDave Hansen 			continue;
319779c28a41SDave Hansen 
319879c28a41SDave Hansen 		/*
319979c28a41SDave Hansen 		 * Visit targets from this pass in the next pass.
320079c28a41SDave Hansen 		 * Eventually, every node will have been part of
320179c28a41SDave Hansen 		 * a pass, and will become set in 'used_targets'.
320279c28a41SDave Hansen 		 */
320379c28a41SDave Hansen 		node_set(target_node, next_pass);
320479c28a41SDave Hansen 	}
320579c28a41SDave Hansen 	/*
320679c28a41SDave Hansen 	 * 'next_pass' contains nodes which became migration
320779c28a41SDave Hansen 	 * targets in this pass.  Make additional passes until
320879c28a41SDave Hansen 	 * no more migration targets are available.
320979c28a41SDave Hansen 	 */
321079c28a41SDave Hansen 	if (!nodes_empty(next_pass))
321179c28a41SDave Hansen 		goto again;
321279c28a41SDave Hansen }
321379c28a41SDave Hansen 
321479c28a41SDave Hansen /*
321579c28a41SDave Hansen  * For callers that do not hold get_online_mems() already.
321679c28a41SDave Hansen  */
321779c28a41SDave Hansen static void set_migration_target_nodes(void)
321879c28a41SDave Hansen {
321979c28a41SDave Hansen 	get_online_mems();
322079c28a41SDave Hansen 	__set_migration_target_nodes();
322179c28a41SDave Hansen 	put_online_mems();
322279c28a41SDave Hansen }
3223884a6e5dSDave Hansen 
3224884a6e5dSDave Hansen /*
3225884a6e5dSDave Hansen  * React to hotplug events that might affect the migration targets,
3226884a6e5dSDave Hansen  * such as events that online or offline NUMA nodes.
3227884a6e5dSDave Hansen  *
3228884a6e5dSDave Hansen  * The ordering is also currently dependent on which nodes have
3229884a6e5dSDave Hansen  * CPUs.  That means we need CPU on/offline notification too.
3230884a6e5dSDave Hansen  */
3231884a6e5dSDave Hansen static int migration_online_cpu(unsigned int cpu)
3232884a6e5dSDave Hansen {
3233884a6e5dSDave Hansen 	set_migration_target_nodes();
3234884a6e5dSDave Hansen 	return 0;
3235884a6e5dSDave Hansen }
3236884a6e5dSDave Hansen 
3237884a6e5dSDave Hansen static int migration_offline_cpu(unsigned int cpu)
3238884a6e5dSDave Hansen {
3239884a6e5dSDave Hansen 	set_migration_target_nodes();
3240884a6e5dSDave Hansen 	return 0;
3241884a6e5dSDave Hansen }
3242884a6e5dSDave Hansen 
3243884a6e5dSDave Hansen /*
3244884a6e5dSDave Hansen  * This leaves migrate-on-reclaim transiently disabled between
3245884a6e5dSDave Hansen  * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
3246884a6e5dSDave Hansen  * whether reclaim-based migration is enabled or not, which
3247884a6e5dSDave Hansen  * ensures that the user can turn reclaim-based migration on or
3248884a6e5dSDave Hansen  * off at any time without needing to recalculate migration targets.
3249884a6e5dSDave Hansen  *
3250884a6e5dSDave Hansen  * These callbacks already hold get_online_mems().  That is why
3251884a6e5dSDave Hansen  * __set_migration_target_nodes() can be used as opposed to
3252884a6e5dSDave Hansen  * set_migration_target_nodes().
3253884a6e5dSDave Hansen  */
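/*
 * Expected event ordering, as handled by the switch below (editor's
 * sketch): a successful offline delivers MEM_GOING_OFFLINE (all
 * targets disabled) followed by MEM_OFFLINE (targets recalculated).
 * A failed offline delivers MEM_GOING_OFFLINE followed by
 * MEM_CANCEL_OFFLINE, which also recalculates the targets and thereby
 * re-enables the ones that were just disabled.
 */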
3254884a6e5dSDave Hansen static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
3255884a6e5dSDave Hansen 						 unsigned long action, void *arg)
3256884a6e5dSDave Hansen {
3257884a6e5dSDave Hansen 	switch (action) {
3258884a6e5dSDave Hansen 	case MEM_GOING_OFFLINE:
3259884a6e5dSDave Hansen 		/*
3260884a6e5dSDave Hansen 		 * Make sure there are no transient states where
3261884a6e5dSDave Hansen 		 * an offline node is a migration target.  This
3262884a6e5dSDave Hansen 		 * will leave migration disabled until the offline
3263884a6e5dSDave Hansen 		 * completes and the MEM_OFFLINE case below runs.
3264884a6e5dSDave Hansen 		 */
3265884a6e5dSDave Hansen 		disable_all_migrate_targets();
3266884a6e5dSDave Hansen 		break;
3267884a6e5dSDave Hansen 	case MEM_OFFLINE:
3268884a6e5dSDave Hansen 	case MEM_ONLINE:
3269884a6e5dSDave Hansen 		/*
3270884a6e5dSDave Hansen 		 * Recalculate the target nodes once the node
3271884a6e5dSDave Hansen 		 * reaches its final state (online or offline).
3272884a6e5dSDave Hansen 		 */
3273884a6e5dSDave Hansen 		__set_migration_target_nodes();
3274884a6e5dSDave Hansen 		break;
3275884a6e5dSDave Hansen 	case MEM_CANCEL_OFFLINE:
3276884a6e5dSDave Hansen 		/*
3277884a6e5dSDave Hansen 		 * MEM_GOING_OFFLINE disabled all the migration
3278884a6e5dSDave Hansen 		 * targets.  Reenable them.
3279884a6e5dSDave Hansen 		 */
3280884a6e5dSDave Hansen 		__set_migration_target_nodes();
3281884a6e5dSDave Hansen 		break;
3282884a6e5dSDave Hansen 	case MEM_GOING_ONLINE:
3283884a6e5dSDave Hansen 	case MEM_CANCEL_ONLINE:
3284884a6e5dSDave Hansen 		break;
3285884a6e5dSDave Hansen 	}
3286884a6e5dSDave Hansen 
3287884a6e5dSDave Hansen 	return notifier_from_errno(0);
3288884a6e5dSDave Hansen }
3289884a6e5dSDave Hansen 
3290884a6e5dSDave Hansen static int __init migrate_on_reclaim_init(void)
3291884a6e5dSDave Hansen {
3292884a6e5dSDave Hansen 	int ret;
3293884a6e5dSDave Hansen 
3294884a6e5dSDave Hansen 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
3295884a6e5dSDave Hansen 				migration_online_cpu,
3296884a6e5dSDave Hansen 				migration_offline_cpu);
3297884a6e5dSDave Hansen 	/*
3298884a6e5dSDave Hansen 	 * In the unlikely case that this fails, the automatic
3299884a6e5dSDave Hansen 	 * migration targets may become suboptimal for nodes
3300884a6e5dSDave Hansen 	 * where N_CPU changes.  With such a small impact in a
3301884a6e5dSDave Hansen 	 * rare case, do not bother trying to do anything special.
3302884a6e5dSDave Hansen 	 */
3303884a6e5dSDave Hansen 	WARN_ON(ret < 0);
3304884a6e5dSDave Hansen 
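	/*
	 * Register the memory hotplug notifier; the second argument
	 * (100) is the notifier priority.
	 */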
3305884a6e5dSDave Hansen 	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
3306884a6e5dSDave Hansen 	return 0;
3307884a6e5dSDave Hansen }
3308884a6e5dSDave Hansen late_initcall(migrate_on_reclaim_init);
3309884a6e5dSDave Hansen #endif /* CONFIG_MEMORY_HOTPLUG */
3310